#### Step 1: Import data and convert to dataframes

In [0]:
import pandas as pd
from datetime import datetime

# Locations of the silver-layer Delta tables, one per source dataset
player_info_delta_path = "/mnt/nhl-finalproject/silver/goat_player_info/"
game_skater_stats_delta_path = "/mnt/nhl-finalproject/silver/goat_game_skater_stats/"
team_info_delta_path = "/mnt/nhl-finalproject/silver/goat_team_info/"
game_goalie_stats_delta_path = "/mnt/nhl-finalproject/silver/goat_game_goalie_stats/"


def _load_delta_as_pandas(delta_path):
    """Read the Delta table at `delta_path` with Spark and return it as a pandas DataFrame."""
    return spark.read.format("delta").load(delta_path).toPandas()


# Load each Delta table through Spark, then materialise it as a pandas DataFrame
player_info = _load_delta_as_pandas(player_info_delta_path)
game_skater_stats = _load_delta_as_pandas(game_skater_stats_delta_path)
team_info = _load_delta_as_pandas(team_info_delta_path)
game_goalie_stats = _load_delta_as_pandas(game_goalie_stats_delta_path)



In [0]:

# Registry of the loaded pandas DataFrames, keyed by table name
dfs = dict(
    player_info=player_info,
    game_skater_stats=game_skater_stats,
    team_info=team_info,
    game_goalie_stats=game_goalie_stats,
)



#### Dropping columns

In [0]:
# Drop the columns that are not needed downstream, frame by frame.
# The drops are done in place so every existing reference to these frames sees the change.
for frame, unused_columns in (
    (
        player_info,
        ['nationality', 'birthcity', 'birthstateprovince', 'height', 'shootscatches'],
    ),
    (
        game_skater_stats,
        [
            'hits', 'powerplaygoals', 'powerplayassists', 'penaltyminutes',
            'faceoffwins', 'faceofftaken', 'giveaways', 'shorthandedgoals',
            'shorthandedassists', 'plusminus', 'eventimeonice',
            'shorthandedtimeonice', 'powerplaytimeonice',
        ],
    ),
    (
        team_info,
        ['franchiseid', 'abbreviation', 'link'],
    ),
):
    frame.drop(columns=unused_columns, inplace=True)



In [0]:
# Goalie-stat columns that are not needed downstream; removed in place
goalie_unused_columns = [
    'assists', 'goals', 'pim', 'shots',
    'powerplaysaves', 'shorthandedsaves', 'evensaves',
    'shorthandedshotsagainst', 'evenshotsagainst', 'powerplayshotsagainst',
    'decision', 'savepercentage', 'powerplaysavepercentage',
    'evenstrengthsavepercentage',
]
game_goalie_stats.drop(columns=goalie_unused_columns, inplace=True)

### Cleaning/Transforming player_info
- create age column 
- create weight_kg column 

In [0]:
# Create an 'age' column from 'birthdate': calculate_age returns the number of
# completed years as of a reference date (2021-01-01 unless another is passed).
def calculate_age(birth_date, reference_date=datetime(2021, 1, 1)):
    """Return the age in completed years on `reference_date`.

    Args:
        birth_date: Any object exposing `.year`, `.month` and `.day`
            (datetime, date, pandas Timestamp), or a null value
            (None / NaN / NaT).
        reference_date: Date on which the age is evaluated. Defaults to
            2021-01-01.

    Returns:
        The age as an int, or None when `birth_date` is null.
    """
    if pd.isnull(birth_date):
        return None

    # Compare calendar fields instead of dividing a day count by 365: the
    # day-count approach ignores leap days and reports a person one year too
    # old when their birthday falls shortly after the reference date.
    age = reference_date.year - birth_date.year
    birthday_not_yet_reached = (
        (reference_date.month, reference_date.day) < (birth_date.month, birth_date.day)
    )
    if birthday_not_yet_reached:
        age -= 1
    return age

# Derive each player's age from their birth date, one value per row
birthdates = player_info['birthdate']
player_info['age'] = birthdates.apply(calculate_age)

# Add the weight expressed in kilograms.
# 0.453592 is the pounds-to-kilograms factor, so this presumably assumes
# 'weight' is stored in pounds — TODO confirm against the source data.
KG_PER_LB = 0.453592
player_info['weight_kg'] = player_info['weight'] * KG_PER_LB



### Cleaning/Transforming game_skater_stats and game_goalie_stats
- Remove rows where team_id is 87, 88, 89, or 90, as these correspond to All-Star teams and are not part of the season.

In [0]:
# team_id values whose rows are excluded from both stats tables
values_to_remove = [87, 88, 89, 90]

# Keep only the skater rows whose 'team_id' is not in the excluded list
skater_row_excluded = game_skater_stats['team_id'].isin(values_to_remove)
game_skater_stats = game_skater_stats.loc[~skater_row_excluded]

# Same filter for the goalie rows
goalie_row_excluded = game_goalie_stats['team_id'].isin(values_to_remove)
game_goalie_stats = game_goalie_stats.loc[~goalie_row_excluded]

In [0]:
# Render the filtered skater stats in the notebook output for a visual check.
# NOTE(review): `.display()` is presumably provided by the notebook runtime
# rather than by pandas itself — confirm before running outside this environment.
game_skater_stats.display()


game_id,player_id,team_id,timeonice,assists,goals,shots,takeaways,blocked
2012020453,8469490,20,517,0,0,1,0.0,1.0
2012020453,8471260,20,975,0,0,2,0.0,0.0
2012020453,8474673,20,1122,0,0,0,1.0,3.0
2012020453,8469770,20,1551,0,0,2,0.0,0.0
2012020453,8470151,20,1433,0,0,2,0.0,2.0
2012020453,8470162,20,878,1,0,0,0.0,0.0
2012020453,8471185,20,873,0,1,1,2.0,1.0
2012020453,8469500,20,892,0,0,5,1.0,0.0
2012020453,8467977,20,524,0,0,2,0.0,0.0
2012020453,8467338,20,1089,0,0,0,0.0,1.0


In [0]:
# List the current contents of the gold directory before writing to it
# (the recorded output below shows it was empty at the time).
dbutils.fs.ls("/mnt/nhl-finalproject/gold/")

[]

In [0]:
# Cleaned pandas DataFrames to publish, keyed by output table name
dfs_final = dict(
    player_info=player_info,
    game_skater_stats=game_skater_stats,
    team_info=team_info,
    game_goalie_stats=game_goalie_stats,
)

# Convert every pandas frame to a Spark DataFrame first, so that all
# conversions have completed before any table is written
spark_dfs = {
    table_name: spark.createDataFrame(pandas_df)
    for table_name, pandas_df in dfs_final.items()
}

# Write each Spark DataFrame as a Delta table under the gold directory,
# replacing whatever is already stored at that path
for name in spark_dfs:
    file_path = f"/mnt/nhl-finalproject/gold/{name}"
    delta_writer = spark_dfs[name].write.format('delta').mode('overwrite')
    delta_writer.save(file_path)





In [0]:
# Render the transformed player_info frame in the notebook output for a visual check.
# NOTE(review): `.display()` is presumably provided by the notebook runtime
# rather than by pandas itself — confirm before running outside this environment.
player_info.display()