## This notebook takes the database created by the raw scrape of game results, cleans up some problems with the data 

### Task List
- the advanced metrics tables should have the team name added to each player's row
    - can also probably store as a single table instead of two tables
        - would also want to add home or away to each row along with Team Name
        Should be able to do it with a rather simple if then and the team names in the Game_ID

- Import the master rosters that are scraped and stored as CSV into the database so we can join data on age, class rank, ect 
    - I 

In [15]:
## Dependencies

import pandas as pd
import numpy as np

import sqlite3

# db_path = '../data/2022-2023 Season Data.db' # Set FOr 2022-2023 Season

db_path = '2023_Season_10_31_23_Game_Stats.db'

conn = sqlite3.connect(db_path)

# Roster data
folder = '../data/rosters/'

df_2023 = pd.read_csv(folder + '2023_master_roster.csv')
df_2022 = pd.read_csv(folder + '2022_master_roster.csv')
df_2021 = pd.read_csv(folder + '2021_master_roster.csv')
df_2020 = pd.read_csv(folder + '2020_master_roster.csv')




## Clean and Transform Advanced Metrics
- add, Team and Home-Away columns, combine the two tables into a single table

In [16]:
# Load the data from the advanced metrics tables
query_team1 = "SELECT * FROM advanced_metrics_team1;"
query_team2 = "SELECT * FROM advanced_metrics_team2;"

df_advanced_team1 = pd.read_sql_query(query_team1, conn)
df_advanced_team2 = pd.read_sql_query(query_team2, conn)

# # Show some sample data to verify
# df_advanced_team1.head(), df_advanced_team2.head()

# Correctly extract team names from Game_ID
df_advanced_team1['Team'] = df_advanced_team1['Game_ID'].apply(lambda x: x.split('-')[3])
df_advanced_team1['Home/Away'] = 'Away'  # Team1 is the away team
df_advanced_team2['Team'] = df_advanced_team2['Game_ID'].apply(lambda x: x.split('-')[4])
df_advanced_team2['Home/Away'] = 'Home'  # Team2 is the home team

# Recombine the two dataframes into a single one, again
df_advanced_combined_corrected = pd.concat([df_advanced_team1, df_advanced_team2], ignore_index=True)

# Show some sample data to verify the correction
# df_advanced_combined_corrected.sample(10)

# Save the combined and corrected dataframe back to the database as a new table
new_table_name = 'advanced_metrics_combined'
df_advanced_combined_corrected.to_sql(new_table_name, conn, if_exists='replace', index=False)

# Drop the original advanced metrics tables from the database
drop_table1_query = "DROP TABLE IF EXISTS advanced_metrics_team1;"
drop_table2_query = "DROP TABLE IF EXISTS advanced_metrics_team2;"

# conn.execute(drop_table1_query)
# conn.execute(drop_table2_query)

# Verify by listing all the tables in the database again
tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = conn.execute(tables_query).fetchall()
table_names = [table[0] for table in tables]
table_names



['scoring_summary',
 'penalty_summary',
 'goalie_stats',
 'player_stats',
 'line_chart',
 'linescore',
 'advanced_metrics_team1',
 'advanced_metrics_team2',
 'master_roster',
 'game_details',
 'player_stats_ytd',
 'advanced_metrics_combined']

In [17]:
## Additional cleaning steps for the advanced metrics table
# Remove rows where Player is "TOTAL"
df_advanced_combined_cleaned = df_advanced_combined_corrected[df_advanced_combined_corrected['Player'] != 'TOTAL']
df_advanced_combined_cleaned = df_advanced_combined_cleaned[df_advanced_combined_cleaned['Player'] != 'Name: Clean_Name, dtype: object,']





## Add Home and Away Columns to game_details table

In [18]:
# Step 1: Read the game_details table into a DataFrame
df_game_details = pd.read_sql("SELECT * FROM game_details", conn)

# Step 2: Create new columns for Home and Away Teams by parsing Game_ID
df_game_details['Away_Team'] = df_game_details['Game_ID'].apply(lambda x: x.split('-')[3])
df_game_details['Home_Team'] = df_game_details['Game_ID'].apply(lambda x: x.split('-')[4])

# Step 3: Write this updated DataFrame back to the game_details table
df_game_details.to_sql('game_details', conn, if_exists='replace', index=False)


174

## Clean up The Column Names in the Player Stats table

In [19]:

############ 'Pt.' should be 'Pts' and '+/-' should be 'plus_minus'
#################################
player_stats_df = pd.read_sql_query("SELECT * FROM player_stats", conn)

if 'Pt.' in player_stats_df.columns:
    player_stats_df.rename(columns={'Pt.': 'Pts'}, inplace=True)
else:
    print("Column 'Pt.' not found.")

if '+/-' in player_stats_df.columns:
    player_stats_df.rename(columns={'+/-': 'plus_minus'}, inplace=True)
else:
    print("Column '+/-' not found.")


#################################

## CREATE A NEW TABLE WITH AGGRIGATED PLAYER STATS YEAR TO DATE

In [20]:
# Use player_stats_df from here on, instead of running another SQL query.
df_player_stats = player_stats_df.copy()


# Clean up the name format in player_stats for easier matching
# Replace the non-breaking space with a regular space
df_player_stats['Clean_Player'] = df_player_stats['Player'].apply(lambda x: x.replace('\xa0', ' '))

# Remove rows where Player is the team name (e.g., "Michigan State")
df_player_stats_cleaned = df_player_stats[df_player_stats['Player'] != df_player_stats['Team']]

# Convert relevant columns to integers for correct aggregation
cols_to_convert = ['G', 'A', 'Pts', 'plus_minus', 'Sh', 'PIM']
for col in cols_to_convert:
    df_player_stats_cleaned[col] = pd.to_numeric(df_player_stats_cleaned[col], errors='coerce')

# Aggregate the data for year-to-date stats
# Add a column for counting the number of games each player has played
agg_player_stats_corrected_with_games = df_player_stats_cleaned.groupby(['Clean_Player', 'Team']).agg({
    'G': 'sum',
    'A': 'sum',
    'Pts': 'sum',
    'plus_minus': 'sum',
    'Sh': 'sum',
    'PIM': 'sum',
    'Game_ID': 'count'  # Counting the number of unique Game_IDs for each player
}).reset_index()

# Rename the Game_ID column to Games_Played
agg_player_stats_corrected_with_games.rename(columns={'Game_ID': 'Games_Played'}, inplace=True)

# Save the updated aggregated data back to the database, replacing the existing table
agg_player_stats_corrected_with_games.to_sql('player_stats_ytd', conn, if_exists='replace', index=False)

# Verify by loading some sample data from the updated table
sample_updated_ytd = pd.read_sql_query("SELECT * FROM player_stats_ytd LIMIT 5;", conn)
sample_updated_ytd


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_stats_cleaned[col] = pd.to_numeric(df_player_stats_cleaned[col], errors='coerce')


Unnamed: 0,Clean_Player,Team,G,A,Pts,plus_minus,Sh,PIM,Games_Played
0,A.J. Hodges,Bentley,2,1,3,-1,17,0,6
1,A.J. Macaulay,Alaska,0,1,1,-3,11,0,6
2,AJ Casperson,Long Island,0,0,0,0,0,0,1
3,Aaron Bohlinger,Massachusetts,1,3,4,-1,2,0,5
4,Aaron Grounds,Long Island,1,2,3,0,8,6,7


## Add the Roster data from the CSVs to the Database

In [21]:
################## SET THE ROSTER DATAFRAME TO THE CORRECT YEAR ####################
## MATCH THE DATAFRAME NAMES
df_master_roster = df_2023

## Season Year Value
season_year = 2023

# Clean up the name formats for joining
# Master roster: Convert "Last Name, First Name" to "First Name Last Name"
# df_master_roster['Clean_Name'] = df_master_roster['Player'].apply(lambda x: ' '.join(reversed(x.split(', '))))

# Rename Player to Clean_Name
df_master_roster.rename(columns={'Player': 'Clean_Name'}, inplace=True)
# Rename School to Team
df_master_roster.rename(columns={'School': 'Team'}, inplace=True)

# Clean up the Team column, remove '-' and replace with ' '
# df_master_roster['School'] = df_master_roster['Team'].apply(lambda x: x.replace('-', ' '))

## If there are an period in the column names, remove them
df_master_roster.columns = df_master_roster.columns.str.replace('.', '')



In [22]:
### Finally add the roster to the database as it's own table

df_master_roster['SeasonYear'] = season_year

# Save the roster data as a new table in the database
roster_table_name = 'master_roster'
df_master_roster.to_sql(roster_table_name, conn, if_exists='replace', index=False)
############################################################

# Verify by listing all the tables in the database again
tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = conn.execute(tables_query).fetchall()
table_names_updated = [table[0] for table in tables]
table_names_updated


['scoring_summary',
 'penalty_summary',
 'goalie_stats',
 'player_stats',
 'line_chart',
 'linescore',
 'advanced_metrics_team1',
 'advanced_metrics_team2',
 'advanced_metrics_combined',
 'game_details',
 'player_stats_ytd',
 'master_roster']

## Save a backup of the transformed database and proceed to adding the roster info


In [23]:
# Output the current state of the database to a new SQLite file in the temp folder
backup_db_path = '../data/2023_YTD_Game_Stats_NEW.db'

# Create a new SQLite database for backup and connect to it
conn_backup = sqlite3.connect(backup_db_path)

# Copy each table from the original database to the backup database
for table in table_names:
    query = f"SELECT * FROM {table};"
    df_table = pd.read_sql_query(query, conn)
    df_table.to_sql(table, conn_backup, if_exists='replace', index=False)

# Close the backup database connection
conn_backup.close()

backup_db_path


'../data/2023_YTD_Game_Stats_NEW.db'