### Step 1: Load Bronze data into notebook

In [0]:
# Read the bronze-layer game_plays CSV into a Spark DataFrame.
# The first row supplies the column names and Spark infers each column's type.
game_plays_csv_path = "/mnt/nhl-finalproject/bronze/nhlkaggle.zip/game_plays.csv"
xy_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(game_plays_csv_path)
)


### Step 2: Cleaning 
- drop all duplicate rows
- rename all columns to lower case


In [0]:
# Cleaning: keep a single copy of every fully identical row
xy_df = xy_df.distinct()


In [0]:
# Rename every column to its lower-case form, preserving column order
xy_df = xy_df.toDF(*map(str.lower, xy_df.columns))


### Step 3: Cleaning 
- convert columns x and y to a numeric (double) type
- drop rows where x is null
- drop rows where x is less than -89 or greater than 89 (i.e. keep -89 <= x <= 89)

In [0]:
from pyspark.sql.types import DoubleType

# Cast both coordinate columns to double so the comparisons below are numeric
for coord in ('x', 'y'):
    xy_df = xy_df.withColumn(coord, xy_df[coord].cast(DoubleType()))

# Cleaning: discard rows that have no x value
x_col = xy_df['x']
xy_df = xy_df.filter(x_col.isNotNull())

# Transform: keep only rows with x inside the inclusive range [-89, 89]
xy_df = xy_df.filter(x_col.between(-89, 89))



In [0]:
# Persist the cleaned DataFrame to the silver layer in Delta format,
# overwriting whatever is already stored at the target path.
(
    xy_df.write
    .format('delta')
    .mode('overwrite')
    .save('/mnt/nhl-finalproject/silver/xy_game_plays')
)