## DATA CLEANING AND COMBINING

### College Hockey Box score scrape data

-Opened 4/3/2024

# NOTE - Current version creates combined db with lots of duplicate rows.
## Run through the DB_DUP_CHECK_SCRATCH to remove until this can be fixed

In [1]:
# Dependencies

import sqlite3
import pandas as pd
import numpy as np
import os



In [2]:
# paths and setup

roster_path = os.path.join('..', 'data', 'rosters', 'all_time_combined_roster.csv')
# read in the roster
roster = pd.read_csv(roster_path)
# roster.info() # Display the data types of each column


# Database folder path
db_folder = os.path.join('..', 'data', 'db', 'box_scores')

# list the db files in the directory
db_files = os.listdir(db_folder)
# db_files

  roster = pd.read_csv(roster_path)


## Check the roster data in each .db file
### Make corrections - also add Season column to all tables

In [3]:
## Verify and correct the data in the master_roster table of a .db file

# Includes a check to make sure the table is present in the database



In [4]:
def correct_db_file(db_file, roster_df):
    year = int(db_file[:4])  # Extract the year from the filename
    db_path = os.path.join(db_folder, db_file)
    conn = sqlite3.connect(db_path)
    
    cursor = conn.cursor()
    # Check if master_roster table exists
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='master_roster';")
    if cursor.fetchone():
        # Check if the Season column exists in the master_roster table
        cursor.execute("PRAGMA table_info(master_roster);")
        columns = [row[1] for row in cursor.fetchall()]
        if "Season" not in columns:
            cursor.execute("ALTER TABLE master_roster ADD COLUMN Season INTEGER")
        
        # Correct the master_roster table if needed
        cursor.execute("SELECT DISTINCT Season FROM master_roster")
        seasons = cursor.fetchall()
        if len(seasons) != 1 or seasons[0][0] != year:
            cursor.execute("DELETE FROM master_roster")
            # Handle column name variations
            correct_roster = roster_df[roster_df['Season'] == year].copy()
            if 'Clean_Name' not in correct_roster.columns and 'Clean-Name' in correct_roster.columns:
                correct_roster.rename(columns={'Clean-Name': 'Clean_Name'}, inplace=True)
            elif 'Clean_Name' not in correct_roster.columns and 'player' in correct_roster.columns:
                correct_roster.rename(columns={'player': 'Clean_Name'}, inplace=True)
            correct_roster.to_sql('master_roster', conn, if_exists='append', index=False)
    
    # Add Season column to all tables as redundancy, checking first if it exists
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
    tables = cursor.fetchall()
    for table in tables:
        table_name = table[0]
        cursor.execute(f"PRAGMA table_info({table_name});")
        columns = [row[1] for row in cursor.fetchall()]
        if "Season" not in columns:
            try:
                cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN Season INTEGER")
                cursor.execute(f"UPDATE {table_name} SET Season = {year}")
            except sqlite3.OperationalError as e:
                print(f"Error updating table {table_name}: {str(e)}")
    
    conn.commit()
    conn.close()
    print(f"Database {db_file} has been corrected.")

In [5]:
# for db_file in db_files:
#     if db_file.endswith('.db'):  # Ensure it's a database file
#         correct_db_file(db_file, roster)


In [6]:
# ## FIX ie - add season column to the two Full_Stats.db files' tables

# # list the files in the directory that contain the string 'Full_Stats'
# full_stats_files = [file for file in db_files if 'Full_Stats' in file]
# full_stats_files



In [7]:
# def add_season_column_to_dbs(db_path, full_stats_files):
#     for db_file in full_stats_files:
#         year = int(db_file[:4])  # Extract the year from the filename
#         full_db_path = os.path.join(db_path, db_file)  # Construct the full path to the database file
#         conn = sqlite3.connect(full_db_path)
#         cursor = conn.cursor()
        
#         # Retrieve all table names in the database
#         cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
#         tables = cursor.fetchall()
        
#         for table in tables:
#             table_name = table[0]
#             # Check if the Season column exists
#             cursor.execute(f"PRAGMA table_info({table_name});")
#             columns = [row[1] for row in cursor.fetchall()]
#             if "Season" not in columns:
#                 cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN Season INTEGER DEFAULT {year}")
#                 cursor.execute(f"UPDATE {table_name} SET Season = {year}")
        
#         conn.commit()
#         conn.close()

# # Assume db_path is the directory where your DB files are stored
# db_path = '/mnt/data'  # Update this to your actual db directory path

# # List of database filenames to process
# full_stats_files = ['2021_Full_Stats.db', '2022_Full_Stats.db']

# # Add Season column to each table in the listed databases
# add_season_column_to_dbs(db_folder, full_stats_files)

# print("Season column added to all tables in the listed databases.")


In [8]:
### OLD CODE
# def adjust_table_schema(new_db_conn, source_conn, table_name):
#     """
#     Adjusts the table schema in the new database based on the source table schema,
#     adding any missing columns and renaming 'Clean_Name' to 'Player' if necessary.
#     """
#     source_cur = source_conn.cursor()
#     new_db_cur = new_db_conn.cursor()
    
#     # Get column info from the source table
#     source_cur.execute(f"PRAGMA table_info({table_name})")
#     source_columns = source_cur.fetchall()
    
#     # Get column info from the new (combined) table
#     new_db_cur.execute(f"PRAGMA table_info({table_name})")
#     new_columns = new_db_cur.fetchall()
    
#     # Convert fetched info into sets of column names for easy comparison
#     source_column_names = {col[1] for col in source_columns}
#     new_column_names = {col[1] for col in new_columns}
    
#     # Identify if we're dealing with a 'Clean_Name' to 'Player' transition
#     rename_clean_name_to_player = "Clean_Name" in new_column_names and "Player" not in source_column_names
    
#     # Determine missing columns and add them to the new table
#     missing_columns = source_column_names - new_column_names
#     for col in source_columns:
#         col_name, col_type = col[1], col[2]
#         if col_name in missing_columns:
#             # Handle the special case for renaming 'Clean_Name' to 'Player'
#             if table_name == "master_roster" and rename_clean_name_to_player:
#                 if col_name == "Clean_Name":
#                     # If "Clean_Name" should be renamed to "Player" and doesn't exist, skip adding "Clean_Name"
#                     continue
#                 elif col_name == "Player":
#                     # If dealing with "Player", ensure it's correctly added or renamed
#                     alter_stmt = f"ALTER TABLE \"{table_name}\" RENAME COLUMN \"Clean_Name\" TO \"Player\""
#                     new_db_cur.execute(alter_stmt)
#                     continue
#             else:
#                 # Regular column addition for non-special cases
#                 alter_stmt = f"ALTER TABLE \"{table_name}\" ADD COLUMN \"{col_name}\" {col_type}"
#                 new_db_cur.execute(alter_stmt)
    
#     new_db_conn.commit()


In [9]:
def adjust_table_schema(new_db_conn, source_conn, table_name):
    """
    Adjusts the table schema in the new database based on the source table schema,
    adding any missing columns and ensuring no duplicate additions.
    """
    source_cur = source_conn.cursor()
    new_db_cur = new_db_conn.cursor()
    
    # Get column info from the source table
    source_cur.execute(f"PRAGMA table_info({table_name})")
    source_columns = source_cur.fetchall()
    
    # Get column info from the new (combined) table
    new_db_cur.execute(f"PRAGMA table_info({table_name})")
    new_columns = new_db_cur.fetchall()
    
    new_column_names = {col[1] for col in new_columns}
    
    for col in source_columns:
        col_name, col_type = col[1], col[2]
        
        # Check directly if the column already exists in the new table
        if col_name not in new_column_names:
            # Proceed with adding the column
            try:
                alter_stmt = f"ALTER TABLE \"{table_name}\" ADD COLUMN \"{col_name}\" {col_type}"
                new_db_cur.execute(alter_stmt)
            except sqlite3.OperationalError as e:
                print(f"Error adding column {col_name} to table {table_name}: {e}")
    
    new_db_conn.commit()


## Combine the databases into a new file

- version 1 simple logic

In [10]:
import sqlite3
import os

def create_table_from_source(new_db_conn, source_conn, table_name):
    """
    Creates a table in the new database based on the schema of the source table.
    """
    source_cur = source_conn.cursor()
    source_cur.execute(f"PRAGMA table_info({table_name})")
    columns = source_cur.fetchall()
    
    # Rename 'Clean_Name' column to 'Player' for the master_roster table
    if table_name == "master_roster":
        columns = [(col[0], 'Player' if col[1] == 'Clean_Name' else col[1], col[2], col[3], col[4], col[5]) for col in columns]
    
    # Construct the CREATE TABLE statement
    col_defs = ', '.join([f'"{col[1]}" {col[2]}' for col in columns])
    create_stmt = f"CREATE TABLE IF NOT EXISTS {table_name} ({col_defs})"
    
    new_db_conn.execute(create_stmt)

def insert_data_from_source(new_db_conn, source_conn, table_name):
    """
    Inserts data from the source table into the corresponding table in the new database.
    Handles renaming 'Clean_Name' to 'Player' if necessary and quotes column names to avoid syntax errors.
    """
    source_cur = source_conn.cursor()
    source_cur.execute(f"SELECT * FROM {table_name}")
    column_names = [description[0] for description in source_cur.description]
    
    # Handle potential 'Clean_Name' to 'Player' renaming
    if 'Clean_Name' in column_names and table_name == "master_roster":
        column_names = ['Player' if col == 'Clean_Name' else col for col in column_names]
    
    # Quote column names to handle reserved words or special characters
    columns_str = ', '.join([f'"{col}"' for col in column_names])
    placeholders = ', '.join(['?'] * len(column_names))
    
    data = source_cur.fetchall()
    new_db_cur = new_db_conn.cursor()
    
    try:
        new_db_cur.executemany(f"INSERT INTO \"{table_name}\" ({columns_str}) VALUES ({placeholders})", data)
        new_db_conn.commit()
    except sqlite3.IntegrityError as e:
        print(f"Integrity error occurred while inserting data into {table_name}: {e}")


def combine_databases(db_folder, output_db_path):
    # Connect to the new database (this will create it if it doesn't exist)
    new_db_conn = sqlite3.connect(output_db_path)
    
    for db_file in os.listdir(db_folder):
        if db_file.endswith('.db'):
            source_db_path = os.path.join(db_folder, db_file)
            source_conn = sqlite3.connect(source_db_path)
            
            # Get list of tables from the source DB
            source_cur = source_conn.cursor()
            source_cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
            tables = source_cur.fetchall()
            
            for table_info in tables:
                table_name = table_info[0]
                
                # Ensure the table exists and its schema is up-to-date
                create_table_from_source(new_db_conn, source_conn, table_name)
                adjust_table_schema(new_db_conn, source_conn, table_name)
                
                # Then insert data
                insert_data_from_source(new_db_conn, source_conn, table_name)

            
            source_conn.close()
    
    new_db_conn.close()

# Example usage
# db_folder = 'path/to/your/db_folder'  # DEFINED ABOVE IN SETUP
output_db_path = f'{db_folder}/../Combined_DB_v2.db'  # Choose your output path
combine_databases(db_folder, output_db_path)

print("Databases combined successfully.")


Error adding column Game_ID to table goalie_stats: duplicate column name: Game_ID
Error adding column Game_ID to table game_details: duplicate column name: Game_ID
Error adding column Game_ID to table player_stats: duplicate column name: Game_ID
Error adding column Game_ID to table linescore: duplicate column name: Game_ID
Error adding column Game_ID to table penalty_summary: duplicate column name: Game_ID
Error adding column Game_ID to table scoring_summary: duplicate column name: Game_ID
Error adding column Game_ID to table goalie_stats: duplicate column name: Game_ID
Error adding column Game_ID to table game_details: duplicate column name: Game_ID
Error adding column Game_ID to table player_stats: duplicate column name: Game_ID
Error adding column Game_ID to table linescore: duplicate column name: Game_ID
Error adding column Game_ID to table penalty_summary: duplicate column name: Game_ID
Error adding column Game_ID to table scoring_summary: duplicate column name: Game_ID
Error addi

In [11]:
# Define your db_folder and the path for the new, combined database
# db_folder = 'path/to/your/db_folder'  # Update this path
