## Combine Domain Parquet Files

This short Jupyter notebook takes the per-batch domain parquet files and combines them into a single parquet file. All cells in this notebook must be run, in order, to do so.

First, we perform the necessary imports and create the Spark session.

In [3]:
from pyspark.sql import SparkSession, functions as F
from functools import reduce

# Build the Spark session that runs every Spark job in this notebook.
# The options are applied to the builder one at a time, in the order listed.
session_builder = SparkSession.builder.appName("MAST30034 Crime Curated")
for option, setting in (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.session.timeZone", "Etc/UTC"),  # session time zone pinned to UTC
    ("spark.driver.memory", "4g"),
    ("spark.executor.memory", "2g"),
):
    session_builder = session_builder.config(option, setting)

# Reuse an already-running session if there is one, otherwise start a new one
spark = session_builder.getOrCreate()

# Directory holding the per-batch parquet files that get combined below
PAGE_DIRECTORY = "../../../data/1. landing/domain/pages"

In [None]:
# The listing links file is only used to work out how many batch files
# should exist, so that every batch is picked up when reading the data in.
listings_links_path = "../../../data/1. landing/domain/metadata/listings_links.csv"

# Read the CSV file and keep its distinct lines; the number of distinct
# lines decides how many batch files are read below.
with open(listings_links_path, 'r') as file:
    listing_urls = set(file.readlines())

# Fail with a clear message instead of the opaque "reduce() of empty
# sequence" TypeError that an empty links file would otherwise trigger.
if not listing_urls:
    raise ValueError(
        f"No listing links found in {listings_links_path!r}; nothing to combine"
    )

# Load one dataframe per batch file; files are named
# '<start>-<start + BATCH_SIZE>.parquet' inside PAGE_DIRECTORY.
BATCH_SIZE = 500
dfs = [
    spark.read.parquet(
        f'{PAGE_DIRECTORY}/{batch_start}-{batch_start + BATCH_SIZE}.parquet'
    )
    for batch_start in range(0, len(listing_urls), BATCH_SIZE)
]

# Stack the batches into a single dataframe. Columns are matched by NAME
# (unionByName) rather than by position (union), so a batch whose columns
# are in a different order cannot be silently misaligned; a batch with
# different column names raises an error instead.
union_df = reduce(lambda df1, df2: df1.unionByName(df2), dfs)

In [None]:
output_path = "../../../data/1. landing/domain/domain_current_listings.parquet"

# Reduce the number of partitions before writing (450+ partitions is very
# excessive for this output); coalesce brings it down to 4.
repartitioned = union_df.coalesce(4)

# Finally write to file. Overwrite the output of any previous run: with the
# default save mode the write raises an error when the path already exists,
# which would make this cell fail whenever the notebook is re-run.
repartitioned.write.mode("overwrite").parquet(output_path)