In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, upper, sum as spark_sum, count, lit
from pyspark.sql.window import Window
import pandas as pd

# Start the Spark session for this preprocessing job (an already-running
# session is reused by getOrCreate).
spark = (
    SparkSession.builder
    .appName("DataPreprocessing")
    .getOrCreate()
)

# Mounted ADLS location that holds the raw CSV input
read_data_path = "/mnt/blobstorage1"

# Load the CSV data, taking column names from the first row.
# No schema is supplied or inferred here.
df = spark.read.load(read_data_path, format="csv", header="true")

In [0]:
# Drop the 'reservation_status_date', 'reservation_status' and
# 'assigned_room_type' columns
status_and_room_columns = (
    'reservation_status_date',
    'reservation_status',
    'assigned_room_type',
)
df = df.drop(*status_and_room_columns)


In [0]:
# Remove the 'agent' column, then the 'company' column
df = df.drop('agent').drop('company')

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Relies on the `spark` session and the `df` DataFrame created in the earlier cells.

# Countries must have MORE than this many rows to keep their own label (step 3).
MIN_COUNTRY_ROWS = 1000

# Lower-case English month names in calendar order, used to rank textual months.
MONTH_NAMES = [
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
]


def _int_sort_key(column_name):
    """Return a Column holding `column_name` parsed as an integer.

    The CSV is loaded without a schema, so the date parts arrive as strings and
    ordering them as text would place '10' before '2'. Values that are not
    plain digits yield null instead of raising, so the key is safe to build
    regardless of the cast/ANSI settings of the session.
    """
    text = F.trim(F.col(column_name).cast('string'))
    return F.when(text.rlike(r'^\d{1,9}$'), text.cast('int'))


# 1. Creating a 'num_bookings' column: the number of earlier bookings made
#    under the same 'name', counted in chronological order of arrival.
#    No global orderBy is done first: the window below repartitions by 'name'
#    and orders rows itself, so a preceding full sort would only add cost.
# The month may be numeric ('7') or an English month name ('July'); both are
# mapped to 1-12. Anything else becomes null (nulls sort first when ascending).
month_text = F.lower(F.trim(F.col('arrival_date_month').cast('string')))
month_key = _int_sort_key('arrival_date_month')
for month_number, month_name in enumerate(MONTH_NAMES, start=1):
    month_key = month_key.when(month_text == month_name, F.lit(month_number))

window_spec = Window.partitionBy('name').orderBy(
    _int_sort_key('arrival_date_year'),
    month_key,
    _int_sort_key('arrival_date_day_of_month'),
)
df = df.withColumn('num_bookings', F.row_number().over(window_spec) - 1)

# 2. Dropping the 'arrival_date_year' column
df = df.drop('arrival_date_year')

# 3. Conditionally replacing country names: countries with more than
#    MIN_COUNTRY_ROWS rows keep their value, every other value (including
#    null) becomes 'Other'.
# The filter runs in Spark, so only the frequent country codes reach the
# driver. No broadcast variable is needed because the list is embedded in the
# query expression itself.
country_counts = df.groupBy('country').count()
frequent_countries = [
    row['country']
    for row in country_counts.filter(F.col('count') > MIN_COUNTRY_ROWS).collect()
    if row['country'] is not None
]
df = df.withColumn(
    'country',
    F.when(F.col('country').isin(frequent_countries), F.col('country'))
    .otherwise(F.lit('Other')),
)


In [0]:
# Remove the guest contact and payment columns
guest_identifier_columns = ['name', 'email', 'phone-number', 'credit_card']
df = df.drop(*guest_identifier_columns)

In [0]:
# Write the DataFrame as a single CSV part file (coalesce(1)) with a header
# row, replacing any previous output in the target directory.
#
# The target must NOT be the directory the data was read from: `df` is
# evaluated lazily, so an 'overwrite' aimed at the input directory would clear
# the source files before they have been read (the job fails or the raw data
# is lost).
# TODO(review): confirm the intended location for the preprocessed output.
output_path_dir = "/mnt/blobstorage1_preprocessed"
if output_path_dir.rstrip("/") == read_data_path.rstrip("/"):
    raise ValueError(
        "output_path_dir must differ from read_data_path: overwriting the "
        "input directory would delete the data that is still being read"
    )
df.coalesce(1).write.mode('overwrite').option("header", "true").csv(output_path_dir)