In [1]:
### Book to collect data about the current college hockey season from College Hockey News

## Dependencies
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import logging
from tqdm import tqdm
import re

from sqlalchemy import create_engine


import sqlite3


FILE_TAG = 'MAR_18_Current_YTD_Stats'

## global variables
# # current_year_url = 'https://www.collegehockeynews.com/schedules/?season=20232024'
# tag = '2023_Season_Nov 2'

##### TEMP####
## global variables
current_year_url = 'https://www.collegehockeynews.com/schedules/?season=20232024'
# tag = 'NOV_13_Current Season YTD'
tag = FILE_TAG

result_table_name = '2023'


## Base usl for box scores and metrics

base_url = 'https://www.collegehockeynews.com'



### Download a table of every game in CHN database for the selected season

In [2]:
## Functions
### Parse the current season schedule / results page

def parse_current_season(url):
        # Initialize variables
    current_date = None
    current_conference = None
    game_notes = None

    # Initialize an empty list to hold the data
    data = []

    # Parse the page with BeautifulSoup
    # Get the page with requests
    response = requests.get(url)

    # Create a BeautifulSoup object
    soup = BeautifulSoup(response.text, 'html.parser')

    # select the table or tables
    tables = soup.find_all('table')

    rows = soup.find_all('tr')

    # Loop through each row to find relevant information
    for row in rows:
        # Check for date row
        if row.get('class') == ['stats-section']:
            current_date = row.find('td').text.strip()
        # Check for conference row
        elif row.get('class') == ['sked-header']:
            current_conference = row.find('td').text.strip()
        # Check for game notes
        elif len(row.find_all('td')) == 2:
            game_notes = row.find_all('td')[1].text.strip()
        # Process rows with game data
        elif row.get('valign') == 'top':
            cells = row.find_all('td')
            if len(cells) >= 9:
                home_team = cells[0].text.strip()
                # Remove any hyphens from the team name
                home_team = home_team.replace('-', ' ')
                home_team_link = cells[0].find('a')['href'] if cells[0].find('a') else None
                home_score = cells[1].text.strip()
                away_team = cells[3].text.strip()
                away_team_link = cells[3].find('a')['href'] if cells[3].find('a') else None
                away_score = cells[4].text.strip()
                ot = cells[5].text.strip()
                box_link = cells[7].find('a')['href'] if cells[7].find('a') else None
                metrics_link = cells[8].find('a')['href'] if cells[8].find('a') else None
                # Capture Game Notes
                game_notes_cell = cells[-1].find('small')
                game_notes = game_notes_cell.text.strip() if game_notes_cell else None

                # Append data to the list
                data.append([current_date, current_conference, game_notes, home_team, home_team_link, home_score, away_team, away_team_link, away_score, ot, box_link, metrics_link])
                game_notes = None  # Reset game notes for the next row
    return data

## call the function
data = parse_current_season(current_year_url)


# Create a dataframe from the list

columns = ['Date', 'Conference', 'Game_Notes', 'Home_Team', 'Home_Team_Link', 'Home_Score', 'Away_Team', 'Away_Team_Link', 'Away_Score', 'OT', 'Box_Link', 'Metrics_Link']
df = pd.DataFrame(data, columns=columns)
            
## Extract the day of the week from the date and save in new column
df['Day'] = pd.to_datetime(df['Date']).dt.day_name()
# remove day of the week from date
# format data column as YYYY-MM-DD
df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')

### Create a new column for the game ID
## Game ID will be a combination of the date and abbreviated team names

# Function to abbreviate the team names
for row in df.itertuples():
    home_team = row.Home_Team
    away_team = row.Away_Team
    home_team_abbr = home_team.split(' ')[-1]
    away_team_abbr = away_team.split(' ')[-1]
    # Remove any hyphens from the team name if there are any
    home_team_abbr = home_team_abbr.replace('-', ' ')
    away_team_abbr = away_team_abbr.replace('-', ' ')
    game_id = f'{row.Date}-{home_team_abbr}-{away_team_abbr}'
    df.loc[row.Index, 'Game_ID'] = game_id

# Create a new column for the game ID
df['Game_ID'] = df['Game_ID'].str.replace(',', '')

# Remove any hyphens from the team names if any
df['Home_Team'] = df['Home_Team'].str.replace('-', ' ')
df['Away_Team'] = df['Away_Team'].str.replace('-', ' ')

# Apply the function to the DataFrame
df['Game_ID'] = df.apply(lambda row: f'{row.Date}-{row.Home_Team}-{row.Away_Team}', axis=1)

## Filter out games that have not been played yet
df = df[df['Home_Score'] != '']

# Replace Nan values in metrics column with empty string
df['Metrics_Link'] = df['Metrics_Link'].fillna('')

# save the dataframe to a csv file for manual review
df.to_csv('../TEMP/season_table_test.csv')


In [3]:
print(len(df))
df.tail(10)

## Save csv of just the current season results
df.to_csv(f'../TEMP/{result_table_name}.csv', index=False)

# Store the dataframe as games_df
games_df = df.copy()

1186


# FUNCTIONS

In [4]:
## Functions for parsing the box score and metrics pages

# Initialize logging for Error and Warning messages
logging.basicConfig(filename='../TEMP/current_scrape.log', level=logging.INFO)

#### PARSE PLAYER STATS TABLE ####
def parse_player_summary(html_content):
    # Initialize BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find the playersums div
    playersums_div = soup.find('div', id='playersums')
    if playersums_div is None:
        return "Player summaries div not found"

    # Initialize list to store player stats
    player_stats = []

    # Loop through each playersum div
    for player_sum in playersums_div.find_all('div', class_='playersum'):
        team = player_sum.find('td').text.strip()
        
        # Loop through table rows
        for row in player_sum.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) > 1:
                player = cols[0].text.strip()
                goals = cols[1].text.strip()
                assists = cols[2].text.strip()
                points = cols[3].text.strip()
                plus_minus = cols[4].text.strip()
                shots = cols[5].text.strip()
                pim = cols[6].text.strip()
                fowl = cols[7].text.strip() if len(cols) > 7 else None
                
                fow, fol = None, None
                win_percentage = None
                
                

                try:
                    if fowl and '‑' in fowl:  # Checking if it contains a hyphen
                        fow, fol = map(int, fowl.split('‑'))
                        total_fo = fow + fol
                        win_percentage = (fow / total_fo) * 100 if total_fo > 0 else 0
                except ValueError:
                    fow, fol, win_percentage = None, None, None

                

                
                player_stat = {
                    'Team': team,
                    'Player': player,
                    'G': goals,
                    'A': assists,
                    'Pt.': points,
                    '+/-': plus_minus,
                    'Sh': shots,
                    'PIM': pim,
                    'FOW': fow,
                    'FOL': fol,
                    'FO%': win_percentage
                }
                player_stats.append(player_stat)

    return pd.DataFrame(player_stats)


############# PARSEING SCORING SUMMARY WITH BS4

def parse_scoring_summary(html_content):
    # Initialize BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the scoring div and table
    scoring_div = soup.find('div', id='scoring')
    if scoring_div is None:
        logging.error("Scoring div not found")
        return None

    scoring_table = scoring_div.find('table')
    if scoring_table is None:
        logging.error("Scoring table not found within the scoring div")
        return None

    # Initialize list to store scoring events
    scoring_events = []
    period = None

    # Loop through table rows
    for row in scoring_table.find_all('tr'):
        if 'stats-section' in row.get('class', []):
            td = row.find('td')
            if td:
                period = td.text.strip()
            else:
                logging.warning("Period name not found in 'stats-section' row")
                period = "Unknown"
        else:
            cols = row.find_all('td')
            if len(cols) > 1:
                try:
                    team = cols[0].text.strip()
                    pp = cols[1].text.strip()

                    player_data = cols[3].text.strip()
                    match = re.match(r"(.+)\s\((\d+)\)", player_data)
                    player = match.group(1) if match else player_data
                    goals = int(match.group(2)) if match else None

                    assist_data_raw = cols[4].text.strip()
                    assist_data = assist_data_raw.split(", ") if assist_data_raw else []
                    assist1 = assist_data[0] if len(assist_data) > 0 else None
                    assist2 = assist_data[1] if len(assist_data) > 1 else None

                    time = cols[5].text.strip()

                    scoring_event = {
                        'Period': period,
                        'Team': team,
                        'PP': pp,
                        'Player': player,
                        'Player_Goals': goals,
                        'Assist1': assist1,
                        'Assist2': assist2,
                        'Time': time
                    }
                    scoring_events.append(scoring_event)
                except Exception as e:
                    logging.error(f"An error occurred while parsing a scoring event row: {e}")
            else:
                logging.warning(f"Insufficient columns in scoring row: {len(cols)}")

    return pd.DataFrame(scoring_events)


############# PARSEING PENALTY SUMMARY WITH BS4

def parse_penalty_summary(html_content):
    # Initialize BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the penalties div and table
    penalties_div = soup.find('div', id='penalties')
    if penalties_div is None:
        logging.error("Penalties div not found")
        return None

    penalties_table = penalties_div.find('table')
    if penalties_table is None:
        logging.error("Penalties table not found within the penalties div")
        return None

    # Initialize list to store penalty events
    penalty_events = []
    period = None

    # Loop through table rows
    for row in penalties_table.find_all('tr'):
        if 'stats-section' in row.get('class', []):
            td = row.find('td')
            if td:
                period = td.text.strip()
            else:
                logging.warning("Period name not found in 'stats-section' row")
                period = "Unknown"
        else:
            cols = row.find_all('td')
            if len(cols) > 1:
                team = cols[0].text.strip()
                player = cols[1].text.strip()
                pen_length = cols[2].text.strip()
                penalty_type = cols[3].text.strip()
                time = cols[4].text.strip()

                penalty_event = {
                    'Period': period,
                    'Team': team,
                    'Player': player,
                    'Pen_Length': pen_length,
                    'Penalty_Type': penalty_type,
                    'Time': time
                }
                penalty_events.append(penalty_event)

    return pd.DataFrame(penalty_events)


############# GOALIE SUMMARY WITH BS4
def parse_goalie_stats(html_content):
    # Initialize BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the goalies div and table
    goalies_div = soup.find('div', id='goalies')
    if goalies_div is None:
        logging.error("Goalies div not found")
        return None

    goalies_table = goalies_div.find('table')
    if goalies_table is None:
        logging.error("Goalies table not found within the goalies div")
        return None

    # Initialize list to store goalie stats
    goalie_stats = []
    team = None

    # Loop through table rows
    for row in goalies_table.find_all('tr'):
        if 'stats-header' in row.get('class', []):
            td = row.find('td')
            if td:
                team = td.text.strip()
            else:
                logging.warning("Team name not found in 'stats-header' row")
                team = "Unknown"
        else:
            cols = row.find_all('td')
            if len(cols) > 1:
                goalie = cols[0].text.strip()
                sv = cols[1].text.strip()
                ga = cols[2].text.strip()
                minutes = cols[3].text.strip()

                goalie_stat = {
                    'Team': team,
                    'Goalie': goalie,
                    'SV': sv,
                    'GA': ga,
                    'Minutes': minutes
                }
                goalie_stats.append(goalie_stat)

    return pd.DataFrame(goalie_stats)


#### PARSE THE ADVANCED TEAM METRICS TABLES ####
### NEW - IT WORKS IN THE NOTEBOOK BUT NOT IN THE FUNCTION
### RETURNS WHOLE ADVANCED METRICS AS SINGLE TABLE
####################################
def parse_new_advanced_metrics(html_content):
    # Parse HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find all tables with advanced metrics
    tables = soup.find_all('table', {'class': 'sortable metrics'})
    
    # List to store all parsed data
    all_data = []
    
    for table in tables:
        # Extract team name from the table header
        team_name = table.find('td').text.strip()
        
        # Extract headers (skipping the Player header)
        headers = [header.text for header in table.find_all('th')][1:]
        
        # Prepare final column headers
        col_names = ['Team', 'Player']
        for header in headers:
            col_names.append(header)
        
        # Extract player data
        rows = table.find_all('tr')[2:]  # skipping the two header rows
        for row in rows:
            player_data = [team_name]  # start with team name
            cells = row.find_all('td')
            player_data.append(cells[0].text.strip())  # player name
            for cell in cells[1:]:
                player_data.append(cell.text.strip())
            all_data.append(player_data)
    
    # Convert the list of data to a DataFrame
    df = pd.DataFrame(all_data, columns=col_names)
    return df

# Testing the new function with the provided HTML content
# advanced_df = parse_new_advanced_metrics(advanced_tie_game_html_content)
# advanced_df


######## NEWS TEST ###############  
def parse_advanced_metrics_tables(html_content):
    # Parse HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find all tables with advanced metrics
    tables = soup.find_all('table', {'class': 'sortable metrics'})
    
    # List to store all parsed data
    all_data = []
    
    for table in tables:
        # Extract team name from the table header
        team_name = table.find('td').text.strip()
        
        # Extract headers (skipping the Player header)
        headers = [header.text for header in table.find_all('th')][1:]
        
        # Prepare final column headers
        col_names = ['Team', 'Player']
        for header in headers:
            col_names.append(header)
        
        # Extract player data
        rows = table.find_all('tr')[2:]  # skipping the two header rows
        for row in rows:
            player_data = [team_name]  # start with team name
            cells = row.find_all('td')
            player_data.append(cells[0].text.strip())  # player name
            for cell in cells[1:]:
                player_data.append(cell.text.strip())
            all_data.append(player_data)
    
    # Convert the list of data to a DataFrame
    df = pd.DataFrame(all_data, columns=col_names)
    return df


######### OLDER FUNCTION for Test ################
# ########### UPDATED FUNCTION TO STORE TEAM NAMES IN THE DATAFRAMES
# def parse_advanced_metrics_tables(html_content):
#     # Initialize list to store DataFrames
#     dfs = []
    
#     # Parse HTML content
#     soup = BeautifulSoup(html_content, 'html.parser')
    
#     # Find all tables
#     tables = soup.find_all('table', {'class': 'sortable metrics'})
    
#     for table in tables:
#         # Extract team name
#         team_name = table.find('td').text
        
#         # Initialize list to store column names and data
#         col_names = []
#         col_names_final = []
#         data = []
        
#         # Get headers
#         headers = table.find_all('th')
#         for header in headers:
#             col_names.append(header.text)
        
#         # Add TOTAL, EVEN STRENGTH, POWER PLAY, CLOSE to column names
#         section_headers = ['TOTAL', 'EVEN STRENGTH', 'POWER PLAY', 'CLOSE']
#         for col in col_names:
#             for section in section_headers:
#                 if col in section_headers:
#                     temp_col = section
#                 else:
#                     temp_col = f"{section}_{col}"
#             col_names_final.append(temp_col)
        
#         # Get data rows
#         rows = table.find_all('tr')[2:]  # skip header rows
#         for row in rows:
#             row_data = [team_name]  # Add team name as the first element
#             cells = row.find_all('td')
#             for cell in cells:
#                 row_data.append(cell.text.strip())
#             data.append(row_data)
        
#         # Add "Team" to the column names
#         new_names = ['Team', 'Player', 'TOTAL_Block', 'TOTAL_Miss', 'TOTAL_Saved', 'TOTAL_Goals', 'TOTAL_Total_Shots',
#                      'EVEN_Block', 'EVEN_Miss', 'EVEN_Saved', 'EVEN_Goals', 'EVEN_Total_Shots',
#                      'PP_Block', 'PP_Miss', 'PP_Saved', 'PP_Goals', 'PP_Total_Shots',
#                      'CLOSE_Block', 'CLOSE_Miss', 'CLOSE_Saved', 'CLOSE_Goals', 'CLOSE_Total_Shots',
#                      'D_Blocks', 'Faceoffs']

#         # Create DataFrame and set the column names
#         df = pd.DataFrame(data, columns=new_names)

#         # Append DataFrame to list
#         dfs.append(df)
    
#     return dfs


# Complete code for parsing the line chart information with specific positions for forwards and defensemen.


def parse_line_chart(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    line_chart_div = soup.find('div', id='linechart')

    if line_chart_div is None:
        logging.error("Line chart div not found")
        return pd.DataFrame()

    line_data = []

    for team_div in line_chart_div.find_all('div', recursive=False):
        h3 = team_div.find('h3')
        if h3 is None:
            logging.warning("Team name not found")
            continue
        
        team_name = h3.text.strip()
        
        for line_type_div in team_div.find_all('div', recursive=False):
            line_type = line_type_div.get('class')[0] if line_type_div.get('class') else None
            if line_type is None:
                logging.warning("Line type not found")
                continue
            
            if line_type == 'f':
                position_types = ['Left Wing', 'Center', 'Right Wing']
            elif line_type == 'd':
                position_types = ['Left D', 'Right D']
            elif line_type == 'x':
                position_types = ['Extra']
            elif line_type == 'g':
                position_types = ['Goalie']
                goalie_count = 1  # Initialize goalie count
            else:
                continue

            players = line_type_div.find_all('div')
            if not players:
                logging.warning(f"No players found for {team_name} in {line_type}")
                continue
            
            for i, player in enumerate(players):
                player_name = player.text.strip()
                if line_type == 'x':
                    player_name = player_name.split(' ')[0]
                if line_type == 'g':
                    line_number = f"Goalie {goalie_count}"
                    goalie_count += 1
                else:
                    line_number = i // len(position_types) + 1

                position = position_types[i % len(position_types)]
                line_data.append({
                    'Team': team_name,
                    'Line': line_number,
                    'Position': position,
                    'Player': player_name
                })

    if not line_data:
        logging.error("No line data was collected")

    df = pd.DataFrame(line_data)
    
    # # Log DataFrame info for debugging
    # if df.empty:
    #     logging.warning("Generated line chart DataFrame is empty.")
    # else:
    #     logging.info(f"Generated line chart DataFrame with columns: {df.columns.tolist()}")

    return df

### Get the Linescore Elements - Score, shots, ect by period####
### NEEDS UPDATE NOW THAT POSTSEASON MEAND 5th, 6th, ect PERIODS

def parse_linescore(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    linescore_data = []
    
    # Parsing the Goals table
    goals_table = soup.select_one("#goals table")
    if goals_table is None:
        logging.error("Goals table not found")
        return None
    
    rows = goals_table.select('tbody tr')
    if not rows:
        logging.warning("No rows found in Goals table")
        return None
    
    for row in rows:
        team_data = {}
        td = row.select_one('td')
        if td:
            team_data['Team'] = td.text
        else:
            logging.warning("Team name not found in Goals table")
            continue

        goals = row.select('td')[1:]
        for i, goal in enumerate(goals):
            period = i + 1
            column_name = f'goals{period}' if i < len(goals) - 1 else 'goalsT'
            team_data[column_name] = int(goal.text)
        
        linescore_data.append(team_data)
    

    # Parsing the Shots table
    shots_table = soup.select_one("#shots table")
    if shots_table is None:
        logging.error("Shots table not found")
        return None

    rows = shots_table.select('tbody tr')
    if not rows:
        logging.warning("No rows found in Shots table")
        return None

    for i, row in enumerate(rows):
        shots = row.select('td')[1:]
        if not shots:
            logging.warning(f"No shot data found for row {i+1} in Shots table")
            continue

        for j, shot in enumerate(shots):
            period = j + 1
            column_name = f'shots{period}' if j < len(shots) - 1 else 'shotsT'
            try:
                linescore_data[i][column_name] = int(shot.text.strip())
            except ValueError:
                logging.warning(f"Could not convert shot data to integer for row {i+1}, column {j+1}")
                linescore_data[i][column_name] = None

    # Parsing the PP table
    pp_table = soup.select_one("#pp table")
    if pp_table is None:
        logging.error("PP table not found")
        return None

    rows = pp_table.select('tbody tr')
    if not rows:
        logging.warning("No rows found in PP table")
        return None

    for i, row in enumerate(rows):
        try:
            pen_pim = row.select('td')[1].text.split('‑')
            linescore_data[i]['Pen'] = int(pen_pim[0])
            linescore_data[i]['PIM'] = int(pen_pim[1])

            ppg_ppo = row.select('td')[2].text.split('‑')
            linescore_data[i]['PPG'] = int(ppg_ppo[0])
            linescore_data[i]['PPO'] = int(ppg_ppo[1])

            fow_fol = row.select('td')[3].text.split('‑')
            linescore_data[i]['FOW'] = int(fow_fol[0])
            linescore_data[i]['FOL'] = int(fow_fol[1])
            linescore_data[i]['FOW%'] = (linescore_data[i]['FOW'] / (linescore_data[i]['FOW'] + linescore_data[i]['FOL'])) * 100

        except (ValueError, IndexError) as e:
            logging.warning(f"Could not process PP data for row {i+1}. Error: {e}")
            continue

    # Convert to DataFrame early
    df = pd.DataFrame(linescore_data)

    # Ensure all columns exist
    expected_goals_columns = [f'goals{i}' for i in range(1, 7)] + ['goalsT']
    expected_shots_columns = [f'shots{i}' for i in range(1, 7)] + ['shotsT']

    for col in expected_goals_columns + expected_shots_columns:
        if col not in df.columns:
            df[col] = 0

    # Return the final DataFrame
    return df


# ############################## ORIGINAL FUNCTION #########################################
# def parse_linescore(html_content):
#     soup = BeautifulSoup(html_content, 'html.parser')
#     linescore_data = []
    
#     # Parsing the Goals table
#     goals_table = soup.select_one("#goals table")
#     if goals_table is None:
#         logging.error("Goals table not found")
#         return None
    
#     rows = goals_table.select('tbody tr')
#     if not rows:
#         logging.warning("No rows found in Goals table")
#         return None
    
#     for row in rows:
#         team_data = {}
#         td = row.select_one('td')
#         if td:
#             team_data['Team'] = td.text
#         else:
#             logging.warning("Team name not found in Goals table")
#             continue

#         goals = row.select('td')[1:]
#         for i, goal in enumerate(goals):
#             period = i + 1
#             column_name = f'goals{period}' if i < len(goals) - 1 else 'goalsT'
#             team_data[column_name] = int(goal.text)
        
#         linescore_data.append(team_data)
    

#     # Parsing the Shots table
#     shots_table = soup.select_one("#shots table")
#     if shots_table is None:
#         logging.error("Shots table not found")
#         return None

#     rows = shots_table.select('tbody tr')
#     if not rows:
#         logging.warning("No rows found in Shots table")
#         return None

#     for i, row in enumerate(rows):
#         shots = row.select('td')[1:]
#         if not shots:
#             logging.warning(f"No shot data found for row {i+1} in Shots table")
#             continue

#         for j, shot in enumerate(shots):
#             period = j + 1
#             column_name = f'shots{period}' if j < len(shots) - 1 else 'shotsT'
#             try:
#                 linescore_data[i][column_name] = int(shot.text.strip())
#             except ValueError:
#                 logging.warning(f"Could not convert shot data to integer for row {i+1}, column {j+1}")
#                 linescore_data[i][column_name] = None

#     # Parsing the PP table
#     pp_table = soup.select_one("#pp table")
#     if pp_table is None:
#         logging.error("PP table not found")
#         return None

#     rows = pp_table.select('tbody tr')
#     if not rows:
#         logging.warning("No rows found in PP table")
#         return None

#     for i, row in enumerate(rows):
#         try:
#             pen_pim = row.select('td')[1].text.split('‑')
#             linescore_data[i]['Pen'] = int(pen_pim[0])
#             linescore_data[i]['PIM'] = int(pen_pim[1])

#             ppg_ppo = row.select('td')[2].text.split('‑')
#             linescore_data[i]['PPG'] = int(ppg_ppo[0])
#             linescore_data[i]['PPO'] = int(ppg_ppo[1])

#             fow_fol = row.select('td')[3].text.split('‑')
#             linescore_data[i]['FOW'] = int(fow_fol[0])
#             linescore_data[i]['FOL'] = int(fow_fol[1])
#             linescore_data[i]['FOW%'] = (linescore_data[i]['FOW'] / (linescore_data[i]['FOW'] + linescore_data[i]['FOL'])) * 100

#         except (ValueError, IndexError) as e:
#             logging.warning(f"Could not process PP data for row {i+1}. Error: {e}")
#             continue

#     return pd.DataFrame(linescore_data)



# Function to parse game details


def parse_game_details(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    meta_div = soup.find('div', {'id': 'meta'})
    if meta_div is None:
        logging.error("Meta div not found")
        return None
    
    game_details_div = meta_div.find_all('div')[-1]
    if game_details_div is None:
        logging.error("Game details div not found")
        return None
    
    try:
        date_str = game_details_div.h4.string
        day_of_week, date = date_str.split(", ", 1)
        
        p_elements = game_details_div.find_all('p')
        
        # Extract conference and location details
        for p in p_elements:
            if "Game" in p.text:  # e.g., "Big Ten Game"
                details_strs = p.get_text(separator='|').split('|')
                conference = details_strs[0]
                location = details_strs[-1].split('at ')[-1]
                break
        else:  # Defaults if not found
            conference, location = None, None
        
        # Extract referees and assistant referees details
        for p in p_elements:
            if "Referees" in p.text:
                refs_str = p.strong.next_sibling if p.strong else None
                asst_refs_str = p.find_all('strong')[1].next_sibling if len(p.find_all('strong')) > 1 else None
                break
        else:  # Defaults if not found
            refs_str, asst_refs_str = None, None
        
        refs = refs_str.split(', ') if refs_str else []
        asst_refs = asst_refs_str.split(', ') if asst_refs_str else []
        refs = [re.sub(r'[^a-zA-Z ]+', '', ref).strip() for ref in refs]
        asst_refs = [re.sub(r'[^a-zA-Z ]+', '', ref).strip() for ref in asst_refs]
        
        # Extract attendance details using regex for better accuracy
        attendance_pattern = r"Attendance:\s?(\d+[\d,]*)"
        attendance_match = re.search(attendance_pattern, html_content)
        attendance = int(attendance_match.group(1).replace(',', '')) if attendance_match else None
        
        # Extract game details (like shootout results)
        details = None
        for p in p_elements:
            if "shootout" in p.text:
                details = p.text
                break
        
        # Clean details if present
        if details and '\n' in details:
            details = details.replace('\n', '').strip()
        if details and '\t' in details:
            details = re.sub('\t', ' ', details)
        
        game_details = {
            'Day': day_of_week,
            'Date': date,
            'Conference': conference,
            'Details': details,
            'Location': location,
            'Ref1': refs[0] if refs else None,
            'Ref2': refs[1] if len(refs) > 1 else None,
            'Asst_Ref1': asst_refs[0] if asst_refs else None,
            'Asst_Ref2': asst_refs[1] if len(asst_refs) > 1 else None,
            'Attendance': attendance
        }
        
        game_details_df = pd.DataFrame([game_details])
        return game_details_df

    except (AttributeError, IndexError, ValueError) as e:
        logging.error(f"Error while parsing game details: {e}")
        return None


def parse_box_score(box_score_html):
    # Initialize DataFrames to None
    scoring_summary = penalty_summary = goalie_stats = player_stats = line_chart = linescore = game_details = None
    
    try:
        scoring_summary = parse_scoring_summary(box_score_html)
    except Exception as e:
        print(f"Error in parse_scoring_summary: {e}")
    
    try:
        penalty_summary = parse_penalty_summary(box_score_html)
    except Exception as e:
        print(f"Error in parse_penalty_summary: {e}")
    
    try:
        goalie_stats = parse_goalie_stats(box_score_html)
    except Exception as e:
        print(f"Error in parse_goalie_stats: {e}")
    
    try:
        player_stats = parse_player_summary(box_score_html)
    except Exception as e:
        print(f"Error in parse_player_summary: {e}")
    
    try:
        line_chart = parse_line_chart(box_score_html)
        if line_chart.empty:
            logging.info("Line chart is empty. Skipping the insert for this game.")
        else:
            logging.info(f"Line chart DataFrame structure: {line_chart.dtypes}")

        # Insert into database (make sure this part works as expected)

    except Exception as e:
        logging.error(f"Error in parse_line_chart: {e}")


    
    try:
        linescore = parse_linescore(box_score_html)
    except Exception as e:
        print(f"Error in parse_linescore: {e}")
    
    try:
        game_details = parse_game_details(box_score_html)
    except Exception as e:
        print(f"Error in parse_game_details: {e}")
    
    # Combine DataFrames into a list
    all_dfs = [game_details, scoring_summary, penalty_summary, goalie_stats, player_stats, line_chart, linescore]
    
    return all_dfs


def rename_duplicate_columns(df):
    cols = pd.Series(df.columns)
    for dup in df.columns[df.columns.duplicated()].unique(): 
        cols[df.columns.get_loc(dup)] = [f"{dup}_{i}" if i != 0 else dup for i in range(df.columns.get_loc(dup).sum())]
    df.columns = cols
    return df




# Function to save DataFrames to SQLite database
import sqlite3
from sqlalchemy import create_engine

# Modified Function to save DataFrames to SQLite database
def save_to_sqlite_db(df_list, table_names, tag=''):
    db_name = f"../TEMP/{FILE_TAG}_Game_Stats.db"
    engine = create_engine(f'sqlite:///{db_name}')
    
    for df, table in zip(df_list, table_names):
        try:
            df = rename_duplicate_columns(df)
            df.to_sql(table, engine, if_exists='append', index=False)
        except Exception as e:
            print(f"Error saving to table {table}: {e}")

# ############ OLDER FUNCTION ################
# def save_to_sqlite_db(df_list, table_names, tag=''):
#     db_name = f"../TEMP/Season_Data_2020.db"  # Create DB file name using the TAG
#     engine = create_engine(f'sqlite:///{db_name}')
    
#     for df, table in zip(df_list, table_names):
#         # Assuming rename_duplicate_columns is a function you've defined earlier
#         df = rename_duplicate_columns(df)
#         df.to_sql(table, engine, if_exists='append', index=False)

# Function to fetch and save data
# Modified Function to fetch and save data
def fetch_and_save_data_to_db(box_score_url, advanced_metrics_url, tag=f'{tag}'):  # Added tag parameter
    db_name = f"{tag} CHN Data.db"  # Create DB name using the TAG
    # Fetch HTML content for box score
    box_score_response = requests.get(box_score_url)
    box_score_html = box_score_response.text
    
    # Fetch HTML content for advanced metrics
    advanced_metrics_response = requests.get(advanced_metrics_url)
    advanced_metrics_html = advanced_metrics_response.text
    
    # Parse box score into list of DataFrames
    box_score_dfs = parse_box_score(box_score_html)
    
    # Parse advanced metrics into list of DataFrames
    advanced_metrics_df = parse_advanced_metrics_tables(advanced_metrics_html)
    advanced_metrics_dfs = [advanced_metrics_df]
    
    # Combine all DataFrames into a list
    all_dfs = box_score_dfs + advanced_metrics_dfs
    
    # Define table names for these DataFrames
    table_names = ['game_details', 'scoring_summary', 'penalty_summary', 
                    'goalie_stats', 'player_stats', 'line_chart', 'linescore',
                    'advanced_metrics']


    # if len(all_dfs) != len(table_names):
    #     raise ValueError("Mismatch between number of DataFrames and table names!")

    # Diagnostic Step 1: Print the number of DataFrames and table names
    # print(f"Number of DataFrames: {len(all_dfs)}")
    # print(f"Number of table names: {len(table_names)}")

    # Diagnostic Step 2: Print the names of the columns for each DataFrame
    # for df in all_dfs:
    #     print(df.columns.tolist())


    # Create a game_id for the game and apply it to all dataframes
    # Game ID YYYMMDD-HomeTeam-AwayTeam
    for df in all_dfs:
        df['Game_ID'] = game_id

        # Diagnostic check
    if len(all_dfs) != len(table_names):
        print(f"Mismatch detected!")
        print(f"box_score_url: {box_score_url}")
        print(f"advanced_metrics_url: {advanced_metrics_url}")
    
    # Save DataFrames to SQLite database
    save_to_sqlite_db(all_dfs, table_names, tag)  # Pass the tag here
    
    return all_dfs


In [5]:
# # Example usage:
# TAG = "NEW2022-2023"
# save_to_sqlite_db(df_list, table_names, TAG)
# fetch_and_save_data_to_db(box_score_url, advanced_metrics_url, TAG)

## Compare the GAME IDS from the results tab to the current databasefile and filter out games that there are already records for to avoid scraping the 4e3ntire season worth of games every time through the code

In [6]:
# ### Path to current database

# current_db_path = f"../TEMP/{FILE_TAG}_Game_Stats.db"

# # Create a connection to the database
# # conn = sqlite3.connect(current_db_path)



# Loop and Scrape

In [7]:
## Run the scrape to get game data using the functions above and infor from games_df

## Change the variable name to reuse the old code
sampled_games = games_df

# Initialize counters & logs
error_count = 0
error_games = []

# Loop over sampled games and fetch data
for idx, row in tqdm(sampled_games.iterrows(), total=sampled_games.shape[0], desc="Scraping games"):
    retries = 3  # Number of retries
    success = False

    
    while retries > 0 and not success:
        try:
            box_score_url = base_url + row['Box_Link']
            advanced_metrics_url = base_url + row['Metrics_Link']

            # create a unique game id
            game_id = str(row['Date']) + '-' + str(row['Home_Team']) + '-' + str(row['Away_Team'])
            
            logging.info(f"Fetching data for game: {row['Home_Team']} vs {row['Away_Team']}")
            
            # Your existing function to fetch and save data
            all_dfs = fetch_and_save_data_to_db(box_score_url, advanced_metrics_url)
            
            # If reached here, the fetching was successful
            success = True
            
            # Adaptive rate limiting
            
            
        except requests.exceptions.RequestException as e:  # Network-related errors
            logging.error(f"Network error for game: {row['Home_Team']} vs {row['Away_Team']}. Error: {e}")
            retries -= 1
            time.sleep(5)  # Wait for 5 seconds before retrying
        
        except Exception as e:  # Other exceptions
            logging.error(f"An error occurred for game: {row['Game_ID']} - {row['Home_Team']} vs {row['Away_Team']}. Error: {e}")
            error_count += 1
            error_games.append((row['Home_Team'], row['Away_Team']))
            break  # Break the while loop; no retries for these types of errors


# Close the logging file
logging.shutdown()

# Close the database connection
# conn.close()



Scraping games:   0%|          | 0/1186 [00:00<?, ?it/s]

Scraping games:  97%|█████████▋| 1145/1186 [40:08<01:23,  2.03s/it]

Error saving to table linescore: (sqlite3.OperationalError) table linescore has no column named goals5
[SQL: INSERT INTO linescore ("Team", goals1, goals2, goals3, goals4, goals5, "goalsT", shots1, shots2, shots3, shots4, shots5, "shotsT", "Pen", "PIM", "PPG", "PPO", "FOW", "FOL", "FOW%", "Game_ID") VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)]
[parameters: [('Canisius', 1, 0, 0, 0, 0, 1, 9, 14, 12, 3, 1, 39, 3, 6, 0, 6, 40, 52, 43.47826086956522, '2024-03-09-Canisius-Holy Cross'), ('HC', 0, 1, 0, 0, 1, 2, 8, 13, 6, 14, 1, 42, 6, 12, 0, 3, 52, 40, 56.52173913043478, '2024-03-09-Canisius-Holy Cross')]]
(Background on this error at: https://sqlalche.me/e/20/e3q8)


Scraping games:  97%|█████████▋| 1156/1186 [40:29<00:59,  1.98s/it]

Error saving to table linescore: (sqlite3.OperationalError) table linescore has no column named goals5
[SQL: INSERT INTO linescore ("Team", goals1, goals2, goals3, goals4, goals5, "goalsT", shots1, shots2, shots3, shots4, shots5, "shotsT", "Pen", "PIM", "PPG", "PPO", "FOW", "FOL", "FOW%", "Game_ID") VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)]
[parameters: [('SLU', 0, 2, 0, 0, 1, 3, 3, 16, 2, 7, 5, 33, 2, 4, 1, 1, 28, 50, 35.8974358974359, '2024-03-15-St. Lawrence-Colgate'), ('Colgate', 0, 0, 2, 0, 0, 2, 17, 7, 9, 12, 4, 49, 1, 2, 1, 2, 50, 28, 64.1025641025641, '2024-03-15-St. Lawrence-Colgate')]]
(Background on this error at: https://sqlalche.me/e/20/e3q8)


Scraping games: 100%|██████████| 1186/1186 [41:39<00:00,  2.11s/it]


In [8]:
import datetime

## Print a summary of the errors
print(f"Total games: {sampled_games.shape[0]}")
print(f"Games With Errors: {error_count}")
for game in error_games:
    print(f"Error: {game}")

# Print Timestamp
print(f"Timestamp: {datetime.datetime.now()}")



Total games: 1186
Games With Errors: 43
Error: ('Arizona', 'Arizona State')
Error: ('McGill', 'Vermont')
Error: ('Merrimack', 'Sacred Heart')
Error: ('Union', 'Rensselaer')
Error: ('Royal Military', 'Niagara')
Error: ('New Hampshire', 'Maine')
Error: ('Western Michigan', 'US Under 18')
Error: ('Manitoba', 'North Dakota')
Error: ('Omaha', 'Minnesota State')
Error: ('Simon Fraser', 'Colorado College')
Error: ('Guelph', 'Rensselaer')
Error: ('Canisius', 'Niagara')
Error: ('Massachusetts', 'Dartmouth')
Error: ('Ottawa', 'Sacred Heart')
Error: ('Quinnipiac', 'Northeastern')
Error: ('Bemidji State', 'Minnesota')
Error: ('Guelph', 'RIT')
Error: ('Toronto', 'Princeton')
Error: ('US Under 18', 'Boston University')
Error: ('TMU', 'Cornell')
Error: ('Grand Valley St.', 'Ferris State')
Error: ('Simon Fraser', 'Lake Superior')
Error: ('US Under 18', 'Cornell')
Error: ('St. Anselm', 'Long Island')
Error: ('Simon Fraser', 'Robert Morris')
Error: ('Anna Maria', 'Stonehill')
Error: ('Simon Fraser', 'Lo

In [9]:
break



SyntaxError: 'break' outside loop (3716208712.py, line 1)

## Fix / Notes

### Added 1/29/2024



# DB Cleaning

In [None]:
### Paths and File Names for the Data

current_db_path = f"../TEMP/{FILE_TAG}.db"
current_db_path = '../TEMP/FEB_22_Current_YTD_Stats_Game_Stats.db'

conn = sqlite3.connect(current_db_path)

# Roster data
folder = '../data/rosters/'

df_2023 = pd.read_csv(folder + '2023_master_roster.csv')
df_2022 = pd.read_csv(folder + '2022_master_roster.csv')
df_2021 = pd.read_csv(folder + '2021_master_roster.csv')
df_2020 = pd.read_csv(folder + '2020_master_roster.csv')


# Load the Correct Roster into the database
roster_df = df_2023.copy()

# Set the SeasonYear in the database_roster - Set to year the season started
season_year_setting = 2023

## Print tables in database
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())


## Create dictionary of Team Primary Names to Abbreviations

- Needed to Add IVY teams becuase of low amount of games. will have to do for harvard, yale, ect next week


In [None]:
## Create dataframe from SQL query
df = pd.read_sql_query("SELECT * FROM advanced_metrics", conn)

# Function to count the occurrences of primary team names for unmatched abbreviations
def count_primary_names_for_abbreviation(abbreviation):
    filtered_rows = df[df['Team'] == abbreviation]
    team_counts = {}
    
    for _, row in filtered_rows.iterrows():
        teams = row['Game_ID'].split('-')[-2:]
        for team in teams:
            if team not in team_counts:
                team_counts[team] = 0
            team_counts[team] += 1
            
    return team_counts


# Attempt to match abbreviations to primary names based on substrings and common naming conventions
matched_dict = {}
unmatched_abbreviations = []

for abbreviation in df['Team'].unique():



# Match the abbreviation to the primary team name with the highest occurrence

    team_counts = count_primary_names_for_abbreviation(abbreviation)
    # Get the team with the highest count
    matched_team = max(team_counts, key=team_counts.get)
    matched_dict[abbreviation] = matched_team

# matched_dict

# Manually fix the unmatched abbreviations - IVY League Teams with no of very few games throw a wrench in the above method
# Brown: Brown
# Cornell: Cornell

# Make those substitutions
matched_dict['Brown'] = 'Brown'
matched_dict['Cornell'] = 'Cornell'
# yale
matched_dict['Yale'] = 'Yale'
# princeton
matched_dict['Princeton'] = 'Princeton'

# harvard
matched_dict['Harvard'] = 'Harvard'
# columbia
matched_dict['Columbia'] = 'Columbia'

# dartmouth
matched_dict['Dartmouth'] = 'Dartmouth'

# penn
matched_dict['Penn'] = 'Penn'

# BC
matched_dict['BC'] = 'Boston College'



# matched_dict

## Clean and Transform Advanced Metrics
- add, Team and Home-Away columns, combine the two tables into a single table

In [None]:
# ## NEW Handling of Advanced Stats
# # Create dataframe from SQL query
# df = pd.read_sql_query("SELECT * FROM advanced_metrics", conn)

# # Rename columns
# new_names = ['Team', 'Player', 'TOTAL_Block', 'TOTAL_Miss', 'TOTAL_Saved', 'TOTAL_Goals', 'TOTAL_Total_Shots',
#                 'EVEN_Block', 'EVEN_Miss', 'EVEN_Saved', 'EVEN_Goals', 'EVEN_Total_Shots',
#                 'PP_Block', 'PP_Miss', 'PP_Saved', 'PP_Goals', 'PP_Total_Shots',
#                 'CLOSE_Block', 'CLOSE_Miss', 'CLOSE_Saved', 'CLOSE_Goals', 'CLOSE_Total_Shots',

#                 'D_Blocks', 'Faceoffs', 'Game_ID']

# df.columns = new_names

# # # Apply the matched_dict to the Team column
# df['Team'] = df['Team'].apply(lambda x: matched_dict[x])

# ## Fill all NaN values with 0
# df = df.fillna(0)

# # Display the dataframe
# df.head()

# # Save back to the database
# df.to_sql('advanced_metrics', conn, if_exists='replace', index=False)

## Add Home and Away Columns to game_details table

In [None]:
# # Step 1: Read the game_details table into a DataFrame
# df_game_details = pd.read_sql("SELECT * FROM game_details", conn)

# # Step 2: Create new columns for Home and Away Teams by parsing Game_ID
# df_game_details['Away_Team'] = df_game_details['Game_ID'].apply(lambda x: x.split('-')[3])
# df_game_details['Home_Team'] = df_game_details['Game_ID'].apply(lambda x: x.split('-')[4])

# # Step 3: Write this updated DataFrame back to the game_details table
# df_game_details.to_sql('game_details', conn, if_exists='replace', index=False)

## Add Home_Team and Away_Team Columns to all tables
### Use the Game_ID column to determin home and away, add to all tables
- This will make infering stuff like Home Goals, away goals, ect much easier to work out

In [None]:
### Define function to add Home and Away Columns to tables

def add_team_columns_to_tables(conn):
    # Function to add team columns to a DataFrame
    def add_team_columns(df):
        df['Away_Team'] = df['Game_ID'].apply(lambda x: x.split('-')[3])
        df['Home_Team'] = df['Game_ID'].apply(lambda x: x.split('-')[4])
        return df

    # Get the list of tables
    tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
    table_names = [t[0] for t in conn.execute(tables_query).fetchall()]

    # Process each table
    for table in table_names:
        # Read the table into a DataFrame
        df = pd.read_sql(f"SELECT * FROM {table}", conn)

        # Check if the table contains the Game_ID column
        if 'Game_ID' in df.columns:
            # Add team columns
            df = add_team_columns(df)

            # Write the updated DataFrame back to the table
            df.to_sql(table, conn, if_exists='replace', index=False)

## Run The function on the open DB

add_team_columns_to_tables(conn)

            

In [None]:
# ## Clean up The Column Names and extra header rows in the Player Stats table
# ############ 'Pt.' should be 'Pts' and '+/-' should be 'plus_minus'
# #################################

# # Define a dictionary for column renaming
# column_renames = {
#     'Pt.': 'Pts',
#     '+/-': 'plus_minus'
# }

# # Rename columns based on the dictionary
# player_stats_df.rename(columns=column_renames, inplace=True)


# # Drop rows where Team name is in the Player column
# player_stats_df = player_stats_df[player_stats_df['Team'] != player_stats_df['Player']]

# # Print the length of the dataframe
# len(player_stats_df)

In [None]:
## Change the Column names to be easy to work with
############ 'Pt.' should be 'Pts' and '+/-' should be 'plus_minus'
#################################
player_stats_df = pd.read_sql_query("SELECT * FROM player_stats", conn)

if 'Pt.' in player_stats_df.columns:
    player_stats_df.rename(columns={'Pt.': 'Pts'}, inplace=True)
else:
    print("Column 'Pt.' not found.")

if '+/-' in player_stats_df.columns:
    player_stats_df.rename(columns={'+/-': 'plus_minus'}, inplace=True)
else:
    print("Column '+/-' not found.")

print(len(player_stats_df))

# Drop rows if Team name is in the player column
# If ['Team'] is the same as ['Player'] then drop that row
player_stats_df = player_stats_df[player_stats_df['Team'] != player_stats_df['Player']]

# add the dataframe back to the database
player_stats_df.to_sql('player_stats', conn, if_exists='replace', index=False)

# print(len(player_stats_df))
#################################
# player_stats_df.head()

In [None]:
## Add The primary team names to the linescores table
# Read the linescores table into a DataFrame
df_linescores = pd.read_sql("SELECT * FROM linescore", conn)

# Apply the dictionary to the Team column
# df_linescores['Team'] = df_linescores['Team'].apply(lambda x: matched_dict[x])

df_linescores.head()

In [None]:
# # Penalty Table & Scoring Summary

# ## Add The primary team names to the linescores table
# # Read the linescores table into a DataFrame

# df_penalty = pd.read_sql("SELECT * FROM penalty_summary", conn)

# # Apply the dictionary to the Team column
# #$ Skip if not found

# df_penalty['Team'] = df_penalty['Team'].apply(lambda x: matched_dict[x])

    

# # Apply same method to scorring_summary:
# df_scoring = pd.read_sql("SELECT * FROM scoring_summary", conn)
# df_scoring['Team'] = df_scoring['Team'].apply(lambda x: matched_dict[x])


# ## Add each table back to database
# # Write the updated linescores DataFrame back to the linescore table
# df_linescores.to_sql('linescore', conn, if_exists='replace', index=False)

# # Write the updated penalty DataFrame back to the penalty_summary table
# df_penalty.to_sql('penalty_summary', conn, if_exists='replace', index=False)

# # Write the updated scoring DataFrame back to the scoring_summary table
# df_scoring.to_sql('scoring_summary', conn, if_exists='replace', index=False)

In [None]:
## CREATE A NEW TABLE WITH AGGRIGATED PLAYER STATS YEAR TO DATE

# Use player_stats_df from here on, instead of running another SQL query.
df_player_stats = player_stats_df.copy()


# Clean up the name format in player_stats for easier matching
# Replace the non-breaking space with a regular space
df_player_stats['Clean_Player'] = df_player_stats['Player'].apply(lambda x: x.replace('\xa0', ' '))

# Remove rows where Player is the team name (e.g., "Michigan State")
df_player_stats_cleaned = df_player_stats[df_player_stats['Player'] != df_player_stats['Team']]

# Convert relevant columns to integers for correct aggregation
cols_to_convert = ['G', 'A', 'Pts', 'plus_minus', 'Sh', 'PIM']
for col in cols_to_convert:
    df_player_stats_cleaned[col] = pd.to_numeric(df_player_stats_cleaned[col], errors='coerce')

# Aggregate the data for year-to-date stats
# Add a column for counting the number of games each player has played
agg_player_stats_corrected_with_games = df_player_stats_cleaned.groupby(['Clean_Player', 'Team']).agg({
    'G': 'sum',
    'A': 'sum',
    'Pts': 'sum',
    'plus_minus': 'sum',
    'Sh': 'sum',
    'PIM': 'sum',
    'Game_ID': 'count'  # Counting the number of unique Game_IDs for each player
}).reset_index()

# Rename the Game_ID column to Games_Played
agg_player_stats_corrected_with_games.rename(columns={'Game_ID': 'Games_Played'}, inplace=True)

# Save the updated aggregated data back to the database, replacing the existing table
agg_player_stats_corrected_with_games.to_sql('player_stats_ytd', conn, if_exists='replace', index=False)

# Verify by loading some sample data from the updated table
sample_updated_ytd = pd.read_sql_query("SELECT * FROM player_stats_ytd LIMIT 5;", conn)
sample_updated_ytd

## Add the Roster data from the CSVs to the Database

In [None]:
################## SET THE ROSTER DATAFRAME TO THE CORRECT YEAR ####################
## MATCH THE DATAFRAME NAMES
df_master_roster = roster_df.copy()

## Season Year Value
season_year = season_year_setting

# Clean up the name formats for joining
# Master roster: Convert "Last Name, First Name" to "First Name Last Name"
# df_master_roster['Clean_Name'] = df_master_roster['Player'].apply(lambda x: ' '.join(reversed(x.split(', '))))

# Rename Player to Clean_Name
df_master_roster.rename(columns={'Player': 'Clean_Name'}, inplace=True)
# Rename School to Team
df_master_roster.rename(columns={'School': 'Team'}, inplace=True)

# Clean up the Team column, remove '-' and replace with ' '
# df_master_roster['School'] = df_master_roster['Team'].apply(lambda x: x.replace('-', ' '))

## If there are an period in the column names, remove them
df_master_roster.columns = df_master_roster.columns.str.replace('.', '')

### Finally add the roster to the database as it's own table

df_master_roster['SeasonYear'] = season_year

# Save the roster data as a new table in the database
roster_table_name = 'master_roster'
df_master_roster.to_sql(roster_table_name, conn, if_exists='replace', index=False)
############################################################

# Verify by listing all the tables in the database again
tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = conn.execute(tables_query).fetchall()
table_names_updated = [table[0] for table in tables]
table_names_updated

## Close database

In [None]:
### Close connection to database

conn.close()