#### Step 1: Import data and convert to pandas DataFrames

In [0]:
import pandas as pd
from datetime import datetime

# Bronze-layer locations of the raw CSV files.
player_info_csv_path = "/mnt/nhl-finalproject/bronze/nhlkaggle.zip/player_info.csv"
game_skater_stats_csv_path = "/mnt/nhl-finalproject/bronze/nhlkaggle.zip/game_skater_stats.csv"
team_info_csv_path = "/mnt/nhl-finalproject/bronze/nhlkaggle.zip/team_info.csv"
game_goalie_stats_csv_path = "/mnt/nhl-finalproject/bronze/nhlkaggle.zip/game_goalie_stats.csv"


def _read_csv_as_pandas(csv_path):
    """Read a headered CSV through Spark (schema inferred) and return it as a pandas DataFrame."""
    return spark.read.csv(csv_path, header=True, inferSchema=True).toPandas()


# Each table is loaded with Spark and immediately converted to pandas.
player_info = _read_csv_as_pandas(player_info_csv_path)
game_skater_stats = _read_csv_as_pandas(game_skater_stats_csv_path)
team_info = _read_csv_as_pandas(team_info_csv_path)
game_goalie_stats = _read_csv_as_pandas(game_goalie_stats_csv_path)

#### Step 2: Cleaning
- Drop duplicates
- Renaming column names to lower case

In [0]:
# Tables keyed by name so the same cleaning step can be applied to all of them.
dfs = {
    'player_info': player_info,
    'game_skater_stats': game_skater_stats,
    'team_info': team_info,
    'game_goalie_stats': game_goalie_stats,
}

# Swap each entry for a copy with exact duplicate rows removed.
for table_name in list(dfs):
    dfs[table_name] = dfs[table_name].drop_duplicates()

# Rebind the module-level names to the de-duplicated frames held in the dictionary.
player_info = dfs['player_info']
game_skater_stats = dfs['game_skater_stats']
team_info = dfs['team_info']
game_goalie_stats = dfs['game_goalie_stats']



In [0]:
# Lower-case the column labels of every table; the frames in `dfs` are modified in place.
for frame in dfs.values():
    frame.columns = frame.columns.str.lower()


#### Step 3: Cleaning player_info
- changing column datatype 
- handling null data 

In [0]:
import numpy as np

# For each measurement column: turn the literal "NA" placeholder into NaN,
# then cast the column to float64.
for measure_col in ('weight', 'height_cm'):
    player_info[measure_col] = (
        player_info[measure_col]
        .replace("NA", np.nan)
        .astype('float64')
    )


In [0]:
# Cast the player id, name and position columns to the pandas 'string' dtype.
for text_col in ('player_id', 'firstname', 'lastname', 'primaryposition'):
    player_info[text_col] = player_info[text_col].astype('string')

In [0]:
# Missing height_cm / weight values are filled with 0.0.
for measure_col in ('height_cm', 'weight'):
    player_info[measure_col] = player_info[measure_col].fillna(0.0)

# Re-count missing values per column to confirm none remain.
player_info.isna().sum()


player_id             0
firstname             0
lastname              0
nationality           0
birthcity             0
primaryposition       0
birthdate             0
birthstateprovince    0
height                0
height_cm             0
weight                0
shootscatches         0
dtype: int64

#### Step 4: Cleaning/Transforming game_skater_stats 
- changing column datatype 
- handling null data 


In [0]:
# Turn the literal "NA" placeholder into NaN and cast the stat columns to float64.
for stat_col in ('takeaways', 'blocked'):
    game_skater_stats[stat_col] = (
        game_skater_stats[stat_col]
        .replace("NA", np.nan)
        .astype('float64')
    )

# Identifier columns are cast to the pandas 'string' dtype.
for id_col in ('game_id', 'player_id', 'team_id'):
    game_skater_stats[id_col] = game_skater_stats[id_col].astype('string')



In [0]:
# Count missing values per column.
game_skater_stats.isna().sum()

game_id                      0
player_id                    0
team_id                      0
timeonice                    0
assists                      0
goals                        0
shots                        0
hits                         0
powerplaygoals               0
powerplayassists             0
penaltyminutes               0
faceoffwins                  0
faceofftaken                 0
takeaways               398107
giveaways                    0
shorthandedgoals             0
shorthandedassists           0
blocked                 398107
plusminus                    0
eventimeonice                0
shorthandedtimeonice         0
powerplaytimeonice           0
dtype: int64

In [0]:
# Missing takeaways / blocked values are filled with 0.0.
for stat_col in ('takeaways', 'blocked'):
    game_skater_stats[stat_col] = game_skater_stats[stat_col].fillna(0.0)

# Re-count missing values per column to confirm none remain.
game_skater_stats.isna().sum()

game_id                 0
player_id               0
team_id                 0
timeonice               0
assists                 0
goals                   0
shots                   0
hits                    0
powerplaygoals          0
powerplayassists        0
penaltyminutes          0
faceoffwins             0
faceofftaken            0
takeaways               0
giveaways               0
shorthandedgoals        0
shorthandedassists      0
blocked                 0
plusminus               0
eventimeonice           0
shorthandedtimeonice    0
powerplaytimeonice      0
dtype: int64

In [0]:
# Cast id and name columns of the remaining tables to the pandas 'string' dtype.
for frame, text_cols in (
    (team_info, ('team_id', 'shortname', 'teamname')),
    (game_goalie_stats, ('game_id', 'player_id', 'team_id')),
):
    for text_col in text_cols:
        frame[text_col] = frame[text_col].astype('string')


In [0]:
# check for duplicates
#game_skater_stats.duplicated().sum()


0

In [0]:

#team_info.duplicated().sum()

0

In [0]:
#game_goalie_stats.duplicated().sum()


0

In [0]:

#team_info.isna().sum()


team_id         0
franchiseid     0
shortname       0
teamname        0
abbreviation    0
link            0
dtype: int64

In [0]:
#game_goalie_stats.isna().sum()

game_id                          0
player_id                        0
team_id                          0
timeonice                        0
assists                          0
goals                            0
pim                              0
shots                            0
saves                            0
powerplaysaves                   0
shorthandedsaves                 0
evensaves                        0
shorthandedshotsagainst          0
evenshotsagainst                 0
powerplayshotsagainst            0
decision                      3719
savepercentage                   0
powerplaysavepercentage          0
evenstrengthsavepercentage       0
dtype: int64

In [0]:
# Fill missing 'decision' values with the string placeholder 'NA' (not 0.0).
game_goalie_stats['decision'] = game_goalie_stats['decision'].fillna('NA')


In [0]:
# Print column dtypes and non-null counts for the goalie stats table.
game_goalie_stats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51163 entries, 0 to 56653
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   game_id                     51163 non-null  string
 1   player_id                   51163 non-null  string
 2   team_id                     51163 non-null  string
 3   timeonice                   51163 non-null  int32 
 4   assists                     51163 non-null  int32 
 5   goals                       51163 non-null  int32 
 6   pim                         51163 non-null  int32 
 7   shots                       51163 non-null  int32 
 8   saves                       51163 non-null  int32 
 9   powerplaysaves              51163 non-null  int32 
 10  shorthandedsaves            51163 non-null  int32 
 11  evensaves                   51163 non-null  int32 
 12  shorthandedshotsagainst     51163 non-null  int32 
 13  evenshotsagainst            51163 non-null  in

In [0]:
# Print column dtypes and non-null counts for the player table.
player_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3925 entries, 0 to 3924
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   player_id           3925 non-null   string        
 1   firstname           3925 non-null   string        
 2   lastname            3925 non-null   string        
 3   nationality         3925 non-null   object        
 4   birthcity           3925 non-null   object        
 5   primaryposition     3925 non-null   string        
 6   birthdate           3925 non-null   datetime64[ns]
 7   birthstateprovince  3925 non-null   object        
 8   height              3925 non-null   object        
 9   height_cm           3925 non-null   float64       
 10  weight              3925 non-null   float64       
 11  shootscatches       3925 non-null   object        
dtypes: datetime64[ns](1), float64(2), object(5), string(4)
memory usage: 398.6+ KB


In [0]:
# Print column dtypes and non-null counts for the team table.
team_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33 entries, 0 to 32
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   team_id       33 non-null     string
 1   franchiseid   33 non-null     int32 
 2   shortname     33 non-null     string
 3   teamname      33 non-null     string
 4   abbreviation  33 non-null     object
 5   link          33 non-null     object
dtypes: int32(1), object(2), string(3)
memory usage: 1.7+ KB


In [0]:
# Cleaned pandas tables, keyed by the name used in the output path.
dfs_final = {
    'player_info': player_info,
    'game_skater_stats': game_skater_stats,
    'team_info': team_info,
    'game_goalie_stats': game_goalie_stats,
}

# Build a Spark DataFrame from each pandas table, keeping the same keys.
spark_dfs = {
    table_name: spark.createDataFrame(pandas_frame)
    for table_name, pandas_frame in dfs_final.items()
}

# Write every table in Delta format under the silver directory,
# overwriting whatever is already stored at that path.
for name, spark_df in spark_dfs.items():
    file_path = f"/mnt/nhl-finalproject/silver/goat_{name}"
    delta_writer = spark_df.write.format('delta').mode('overwrite')
    delta_writer.save(file_path)


In [0]:
# List the contents of the silver directory (the Delta outputs are saved under this path).
dbutils.fs.ls("/mnt/nhl-finalproject/silver")

[FileInfo(path='dbfs:/mnt/nhl-finalproject/silver/goat_game_goalie_stats/', name='goat_game_goalie_stats/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/mnt/nhl-finalproject/silver/goat_game_skater_stats/', name='goat_game_skater_stats/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/mnt/nhl-finalproject/silver/goat_player_info/', name='goat_player_info/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/mnt/nhl-finalproject/silver/goat_team_info/', name='goat_team_info/', size=0, modificationTime=0)]