## DATA CLEANING AND COMBINING

### College Hockey Box score scrape data

-Opened 4/3/2024

In [1]:
# Dependencies

import sqlite3
import pandas as pd
import numpy as np
import os



In [2]:
# paths and setup

roster_path = os.path.join('..', 'data', 'rosters', 'all_time_combined_roster.csv')
# read in the roster
roster = pd.read_csv(roster_path)
# roster.info() # Display the data types of each column


# Database folder path
db_folder = os.path.join('..', 'data', 'db', 'box_scores')

# list the db files in the directory
db_files = os.listdir(db_folder)
# db_files

  roster = pd.read_csv(roster_path)


## Check the roster data in each .db file
### Make corrections - also add Season column to all tables

In [3]:
## Verify and correct the data in the master_roster table of a .db file

# Includes a check to make sure the table is present in the database



In [4]:
def correct_db_file(db_file, roster_df):
    year = int(db_file[:4])  # Extract the year from the filename
    db_path = os.path.join(db_folder, db_file)
    conn = sqlite3.connect(db_path)
    
    cursor = conn.cursor()
    # Check if master_roster table exists
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='master_roster';")
    if cursor.fetchone():
        # Check if the Season column exists in the master_roster table
        cursor.execute("PRAGMA table_info(master_roster);")
        columns = [row[1] for row in cursor.fetchall()]
        if "Season" not in columns:
            cursor.execute("ALTER TABLE master_roster ADD COLUMN Season INTEGER")
        
        # Correct the master_roster table if needed
        cursor.execute("SELECT DISTINCT Season FROM master_roster")
        seasons = cursor.fetchall()
        if len(seasons) != 1 or seasons[0][0] != year:
            cursor.execute("DELETE FROM master_roster")
            # Handle column name variations
            correct_roster = roster_df[roster_df['Season'] == year].copy()
            if 'Clean_Name' not in correct_roster.columns and 'Clean-Name' in correct_roster.columns:
                correct_roster.rename(columns={'Clean-Name': 'Clean_Name'}, inplace=True)
            elif 'Clean_Name' not in correct_roster.columns and 'player' in correct_roster.columns:
                correct_roster.rename(columns={'player': 'Clean_Name'}, inplace=True)
            correct_roster.to_sql('master_roster', conn, if_exists='append', index=False)
    
    # Add Season column to all tables as redundancy, checking first if it exists
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
    tables = cursor.fetchall()
    for table in tables:
        table_name = table[0]
        cursor.execute(f"PRAGMA table_info({table_name});")
        columns = [row[1] for row in cursor.fetchall()]
        if "Season" not in columns:
            try:
                cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN Season INTEGER")
                cursor.execute(f"UPDATE {table_name} SET Season = {year}")
            except sqlite3.OperationalError as e:
                print(f"Error updating table {table_name}: {str(e)}")
    
    conn.commit()
    conn.close()
    print(f"Database {db_file} has been corrected.")

In [5]:
for db_file in db_files:
    if db_file.endswith('.db'):  # Ensure it's a database file
        correct_db_file(db_file, roster)


Database 2002_Box_Scores_v2.db has been corrected.
Database 2003_Box_Scores_v2.db has been corrected.
Database 2004_Box_Scores_v2.db has been corrected.
Database 2005_Box_Scores_v2.db has been corrected.
Database 2006_Box_Scores_v2.db has been corrected.
Database 2007_Box_Scores_v2.db has been corrected.
Database 2008_Box_Scores_v2.db has been corrected.
Database 2009_Box_Scores_v2.db has been corrected.
Database 2010_Box_Scores_v2.db has been corrected.
Database 2011_Box_Scores_v2.db has been corrected.
Database 2012_Box_Scores_v2.db has been corrected.
Database 2013_Box_Scores_v2.db has been corrected.
Database 2014_Box_Scores_v3.db has been corrected.
Database 2015_Box_Scores_v2.db has been corrected.
Database 2016_Box_Scores_v2.db has been corrected.
Database 2017_Box_Scores_v1.db has been corrected.
Database 2018_Box_Scores_v2.db has been corrected.
Database 2019_Box_Scores_v1.db has been corrected.
Database 2020_Box_Scores_v2.db has been corrected.


OperationalError: table master_roster has no column named Clean_Name