## This notebook takes the database created by the raw scrape of game results and cleans up some problems with the data

### Task List
- the advanced metrics tables should have the team name added to each player's row
    - can also probably store as a single table instead of two tables
        - would also want to add home or away to each row along with Team Name
        This should be doable with a simple if/then check using the team names in the Game_ID

- Import the master rosters that are scraped and stored as CSV into the database so we can join data on age, class rank, etc.
    - I 

In [None]:
## Dependencies

import pandas as pd
import numpy as np

import sqlite3

# SQLite database that holds the scraped game results.
# NOTE(review): the filename begins with a space — confirm it matches the file on disk,
# since sqlite3.connect() silently creates a new, empty database when the path does not exist.
db_path = ' Game Stats.db'
conn = sqlite3.connect(db_path)

# Roster data: one master-roster CSV per season, each loaded into its own DataFrame
folder = '../data/rosters/'

df_2023, df_2022, df_2021, df_2020 = (
    pd.read_csv(f'{folder}{season}_master_roster.csv')
    for season in (2023, 2022, 2021, 2020)
)




## Clean and Transform Advanced Metrics
- Add Team and Home/Away columns, then combine the two tables into a single table

In [None]:
# Read each per-team advanced metrics table into its own DataFrame
query_team1 = "SELECT * FROM advanced_metrics_team1;"
query_team2 = "SELECT * FROM advanced_metrics_team2;"
df_advanced_team1 = pd.read_sql_query(query_team1, conn)
df_advanced_team2 = pd.read_sql_query(query_team2, conn)

# Tag every row with its team and side. Game_ID is split on '-': element 3 is used
# as the team1 name (labelled Away) and element 4 as the team2 name (labelled Home).
for _frame, _slot, _side in (
    (df_advanced_team1, 3, 'Away'),
    (df_advanced_team2, 4, 'Home'),
):
    _frame['Team'] = _frame['Game_ID'].map(
        lambda game_id, _slot=_slot: game_id.split('-')[_slot]
    )
    _frame['Home/Away'] = _side

# Stack the two tagged frames into one table with a fresh 0..n-1 index
df_advanced_combined_corrected = pd.concat(
    [df_advanced_team1, df_advanced_team2],
    ignore_index=True,
)

# Quick visual check, if needed:
# df_advanced_combined_corrected.sample(10)

# Persist the combined frame as a new table, overwriting any previous version
new_table_name = 'advanced_metrics_combined'
df_advanced_combined_corrected.to_sql(
    name=new_table_name,
    con=conn,
    if_exists='replace',
    index=False,
)

# Statements for removing the original per-team tables. They are only defined here;
# the executions below are intentionally left disabled.
drop_table1_query = "DROP TABLE IF EXISTS advanced_metrics_team1;"
drop_table2_query = "DROP TABLE IF EXISTS advanced_metrics_team2;"

# conn.execute(drop_table1_query)
# conn.execute(drop_table2_query)

# List every table currently in the database to confirm the new one was written
tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = conn.execute(tables_query).fetchall()
table_names = [row[0] for row in tables]
table_names



In [None]:
## Additional cleaning steps for the advanced metrics table
# Drop non-player rows: those whose Player is "TOTAL" (presumably team summary lines)
# and a stray artifact row whose Player value is the text of a pandas Series repr footer.
non_player_labels = ['TOTAL', 'Name: Clean_Name, dtype: object,']

# .copy() makes the filtered result an independent frame, so adding the Clean_Player
# column further down does not raise pandas' SettingWithCopyWarning.
df_advanced_combined_cleaned = df_advanced_combined_corrected[
    ~df_advanced_combined_corrected['Player'].isin(non_player_labels)
].copy()

## MATCH THE DATAFRAME NAMES
# This is an alias, not a copy: the Clean_Name column added below also appears on df_2023.
df_2023_master_roster = df_2023

# Clean up the name formats for joining
# Master roster: convert "Last Name, First Name" to "First Name Last Name"
df_2023_master_roster['Clean_Name'] = df_2023_master_roster['Name'].apply(
    lambda full_name: ' '.join(reversed(full_name.split(', ')))
)

# Advanced metrics: replace the non-breaking space with a regular space.
# The vectorized .str accessor leaves a missing Player value as NaN instead of raising.
df_advanced_combined_cleaned['Clean_Player'] = (
    df_advanced_combined_cleaned['Player'].str.replace('\xa0', ' ', regex=False)
)

# Show a few cleaned names from each source to verify the two formats now line up
sample_clean_master_roster_names = df_2023_master_roster['Clean_Name'].sample(5)
sample_clean_advanced_metrics_names = df_advanced_combined_cleaned['Clean_Player'].sample(5)

sample_clean_master_roster_names, sample_clean_advanced_metrics_names



## Add the Roster data from the CSVs to the Database