# Quick Utility to Clean DB Tables
- Created 12-14-24
    - Replace abbreviations with full team names

In [1]:
## Dependencies

import os
import pandas as pd
import numpy as np
import sqlite3


# Project directories, expressed relative to the current working directory
data_folder = os.path.join(os.pardir, 'data')      # ../data
db_folder = os.path.join(data_folder, 'db')        # ../data/db  (database files)
temp_folder = os.path.join(data_folder, 'TEMP')    # ../data/TEMP

## Clean Team Name Function

In [2]:
def clean_team_name(team_name):
    """
    Normalizes a team or school name by stripping punctuation.

    Hyphens are replaced with a space, periods and apostrophes are removed,
    and leading/trailing whitespace is trimmed.

    Args:
        team_name (str): Team name. Non-string values (e.g. ``None`` or
            ``NaN`` produced by NULL database cells) are returned unchanged.

    Returns:
        str: Cleaned team name, or the original value if it is not a string.
    """
    # NULL cells show up as None/NaN once a table is loaded into pandas;
    # pass them through instead of raising AttributeError on .replace().
    if not isinstance(team_name, str):
        return team_name
    return team_name.replace('-', ' ').replace('.', '').replace("'", '').strip()

### Team info (Abbreviation, Full Name)

In [3]:
# School info lookup file (read from data_folder); the code below uses its
# 'abv', 'Team' and 'School' columns.
school_info_file = 'arena_school_info.csv'
school_info_df = pd.read_csv(os.path.join(data_folder, school_info_file))

# Clean the Team and School names with the same rules used for the database tables
school_info_df['Team'] = school_info_df['Team'].apply(clean_team_name)
school_info_df['School'] = school_info_df['School'].apply(clean_team_name)

# Map abbreviation -> full team name.
# Rows with a missing abbreviation or team are skipped: a NaN key in the
# mapping would make Series.replace() overwrite every NULL Team value in
# the database tables with that row's team name.
valid_rows = school_info_df.dropna(subset=['abv', 'Team'])
abbreviation_dict = dict(zip(valid_rows['abv'], valid_rows['Team']))

In [4]:
# Define a function to replace abbreviations in a column with full team names
def replace_abbreviations_with_fullnames(df, column_name, abbreviation_dict):
    """
    Expands team abbreviations in one column of a DataFrame to full names.

    The column is overwritten on ``df`` itself (the input is modified in
    place) and the same DataFrame object is handed back to the caller.

    Args:
        df (pd.DataFrame): Frame holding the column to update.
        column_name (str): Name of the column whose values are looked up.
        abbreviation_dict (dict): Mapping of abbreviation -> full team name.
            Values that are not keys of the mapping are left as they are.

    Returns:
        pd.DataFrame: The same ``df`` object, with the column updated.
    """
    expanded = df[column_name].replace(abbreviation_dict)
    df[column_name] = expanded
    return df






## Connect to Database

### Call The Function

In [5]:
# Source database file (located in db_folder)
filename = '2025_JAN_16_ROUGH.db'
## data\db\2024_Dec_10_CLEANED_OLD_METHOD.db

db_path = os.path.join(db_folder, filename)
print(db_path)

# sqlite3.connect() silently creates a new, empty database when the file is
# missing, so fail loudly instead of "cleaning" a brand-new empty file.
if not os.path.isfile(db_path):
    raise FileNotFoundError(f"Database file not found: {db_path}")

# Database connection (closed by the cell that writes the tables back)
conn = sqlite3.connect(db_path)

# List the user tables. SQLite's internal sqlite_* tables are skipped because
# they cannot be dropped and re-created when the tables are written back.
cursor = conn.cursor()
cursor.execute(
    "SELECT name FROM sqlite_master "
    "WHERE type='table' AND name NOT LIKE 'sqlite_%';"
)
tables = cursor.fetchall()  # list of 1-tuples: [(table_name,), ...]

# Print table names
print(tables)

..\data\db\2025_JAN_16_ROUGH.db
[('player_stats_ytd',), ('master_roster',), ('advanced_metrics',), ('game_details',), ('goalie_stats',), ('line_chart',), ('linescore',), ('penalty_summary',), ('player_stats',), ('scoring_summary',)]


In [6]:
## Replace team abbreviations with full team names in the Team column of every table

# Table name -> DataFrame for every table in the database
tables_dict = {}

try:
    # Load the tables into pandas
    for table in tables:
        table_name = table[0]
        # Double-quote the identifier so names with spaces or keywords still work
        quoted_name = '"' + table_name.replace('"', '""') + '"'
        tables_dict[table_name] = pd.read_sql_query(f"SELECT * FROM {quoted_name};", conn)

    # Replace abbreviations only in tables that actually have a Team column
    updated_tables = []
    for table_name, table_df in tables_dict.items():
        if 'Team' in table_df.columns:
            tables_dict[table_name] = replace_abbreviations_with_fullnames(
                table_df, 'Team', abbreviation_dict
            )
            updated_tables.append(table_name)

    # Write back ONLY the tables that were changed.
    # to_sql(if_exists='replace') drops and re-creates the table, so rewriting
    # untouched tables would needlessly discard their keys, indexes and column types.
    for table_name in updated_tables:
        tables_dict[table_name].to_sql(table_name, conn, if_exists='replace', index=False)
finally:
    # Always release the connection, even if a table fails to load or write
    conn.close()

print('Done')


Done


### Clean the Team Names

In [7]:
#### Remove periods and apostrophes (and turn dashes into spaces) in the
#### Team, Away_Team, Home_Team and School columns of every table in the database

# Columns that hold team / school names
name_columns = ['Team', 'Away_Team', 'Home_Team', 'School']

# Open the connection to the database
conn = sqlite3.connect(db_path)

try:
    # Get the list of user tables. SQLite's internal sqlite_* tables are skipped
    # because they cannot be dropped and re-created on write-back.
    cursor = conn.cursor()
    cursor.execute(
        "SELECT name FROM sqlite_master "
        "WHERE type='table' AND name NOT LIKE 'sqlite_%';"
    )
    tables = cursor.fetchall()

    # Table name -> DataFrame for every table in the database
    tables_dict = {}
    for table in tables:
        table_name = table[0]
        # Double-quote the identifier so names with spaces or keywords still work
        quoted_name = '"' + table_name.replace('"', '""') + '"'
        tables_dict[table_name] = pd.read_sql_query(f"SELECT * FROM {quoted_name};", conn)

    # Clean every name column that is present in each table
    cleaned_tables = []
    for table_name, table_df in tables_dict.items():
        present_columns = [col for col in name_columns if col in table_df.columns]
        for col in present_columns:
            # na_action='ignore' leaves NULL cells untouched instead of
            # handing None/NaN to clean_team_name
            table_df[col] = table_df[col].map(clean_team_name, na_action='ignore')
        if present_columns:
            cleaned_tables.append(table_name)

    # Write back ONLY the tables that were changed.
    # to_sql(if_exists='replace') drops and re-creates the table, so rewriting
    # untouched tables would needlessly discard their keys, indexes and column types.
    for table_name in cleaned_tables:
        tables_dict[table_name].to_sql(table_name, conn, if_exists='replace', index=False)
finally:
    # Always release the connection, even if a table fails to load or write
    conn.close()

print('Done')



Done
