# Game Data Scraper & Cleaner
- created 12/9/24 
- refactor of legacy code from Game_Data_Scraper_1 and Game_Data_Cleaner

In [1]:
## Dependencies
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import logging
from tqdm import tqdm
import re
import os 
from sqlalchemy import create_engine
import sqlite3


# Create timestamp string to use in file names
timestamp = time.strftime("%Y%m%d-%H%M%S")

## FILE PATHS and CONSTANTS
#Local Folder paths
temp_folder = os.path.join('..', 'TEMP')
data_folder = os.path.join('..', 'data')
db_folder = os.path.join(data_folder, 'db')
log_folder = os.path.join(temp_folder, 'logs')

# Check Paths
if not os.path.exists(temp_folder):
    os.makedirs(temp_folder)
if not os.path.exists(data_folder):
    os.makedirs(data_folder)
if not os.path.exists(db_folder):
    os.makedirs(db_folder)

# Remote URL
base_url = 'https://www.collegehockeynews.com' ## Base usl for box scores and metrics
current_year_url = 'https://www.collegehockeynews.com/schedules/?season=20242025' ## Current year schedule


## Logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
logger.info('Logging Started')

# Database Name
DB_FILE_NAME = 'NEW_Scrape_NEW_Code.db'
# DB_FILE_NAME = '2024_Dec_03_CLEAN.db'



2024-12-09 17:44:53,462 - INFO - Logging Started


### Initiate DB Connection
- backup and open DB file if it exists
- create new DB file with DB_FILE_NAME if none exists

In [2]:
## Database Path
db_path = os.path.join(db_folder, DB_FILE_NAME)
# Create Backup in TEMP folder with datetime stamp
backup_db_path = os.path.join(temp_folder, f"{DB_FILE_NAME}_{timestamp}.backup")
# backup_db_path = os.path.join(temp_folder, f"{DB_FILE_NAME}.backup")

# Check if the database file exists
if os.path.exists(db_path):
    # Backup the existing database
    logger.info(f"Database file found at {db_path}. Backing it up to {backup_db_path}.")
    try:
        # Perform the backup
        with open(db_path, 'rb') as original_db, open(backup_db_path, 'wb') as backup_db:
            backup_db.write(original_db.read())
        logger.info(f"Backup successful: {backup_db_path}")
    except Exception as e:
        logger.error(f"Failed to back up the database: {e}")
else:
    logger.info(f"No database file found at {db_path}. A new database will be created.")

# Create a database connection
try:
    engine = create_engine(f"sqlite:///{db_path}")
    conn = sqlite3.connect(db_path)
    logger.info(f"Database connection established at {db_path}")
except Exception as e:
    logger.error(f"Failed to establish database connection: {e}")
    raise


2024-12-09 17:44:53,480 - INFO - No database file found at ..\data\db\NEW_Scrape_NEW_Code.db. A new database will be created.
2024-12-09 17:44:53,499 - INFO - Database connection established at ..\data\db\NEW_Scrape_NEW_Code.db


#### Create a Dictionary of Team names and abbreviations
- from arena_school_info table

In [3]:
## Load school infomation from arena_school_info.csv
school_info_df = pd.read_csv(os.path.join(data_folder, 'arena_school_info.csv'))

# Create a dictionary for abbreviations to full team names
abbreviation_to_fullname = school_info_df.set_index('abv')['School'].to_dict()

# Define a function to replace abbreviations in a column with full team names
def replace_abbreviations_with_fullnames(df, column_name, abbreviation_dict):
    """
    Replaces abbreviations in the specified column of a DataFrame with full team names.
    
    Args:
        df (pd.DataFrame): The DataFrame containing the column to process.
        column_name (str): The column name where abbreviations need to be replaced.
        abbreviation_dict (dict): Dictionary mapping abbreviations to full names.
    
    Returns:
        pd.DataFrame: DataFrame with updated column values.
    """
    df[column_name] = df[column_name].replace(abbreviation_dict)
    return df


### Download a table of every game in CHN database for the selected season
- output games_df table of every game listed on CHN site
- Clean team names of unwanted characters and create unique Game_ID

In [4]:
## Function to Parse the Current Season Schedule / Results Page
def parse_current_season(url):
    """
    Parses the current season schedule/results page.
    Args:
        url (str): URL of the current season schedule/results page.

    Returns:
        list: Parsed data as a list of rows.
    """
    import requests
    from bs4 import BeautifulSoup

    # Initialize variables
    current_date, current_conference, game_notes = None, None, None
    data = []  # List to store game data

    # Fetch the page
    response = requests.get(url)
    if response.status_code != 200:
        raise ValueError(f"Failed to retrieve data from {url}, status code {response.status_code}")

    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    rows = soup.find_all('tr')

    # Process each row
    for row in rows:
        row_class = row.get('class')
        
        if row_class == ['stats-section']:  # Date row
            current_date = row.find('td').text.strip()
        elif row_class == ['sked-header']:  # Conference row
            current_conference = row.find('td').text.strip()
        elif len(row.find_all('td')) == 2:  # Game notes row
            game_notes = row.find_all('td')[1].text.strip()
        elif row.get('valign') == 'top':  # Game data row
            game_data = extract_game_data(row, current_date, current_conference, game_notes)
            if game_data:
                data.append(game_data)
            game_notes = None  # Reset game notes for the next row

    return data


def extract_game_data(row, current_date, current_conference, game_notes):
    """
    Extracts game data from a table row.
    Args:
        row (Tag): BeautifulSoup row tag.
        current_date (str): Current date of the game.
        current_conference (str): Current conference of the game.
        game_notes (str): Notes for the game.

    Returns:
        list: Extracted game data or None if row is invalid.
    """
    cells = row.find_all('td')
    if len(cells) < 9:
        return None

    # Extract data
    home_team = clean_team_name(cells[3].text.strip())
    home_team_link = cells[3].find('a')['href'] if cells[3].find('a') else None
    home_score = cells[4].text.strip()

    # home_team = clean_team_name(cells[0].text.strip())
    # home_team_link = cells[0].find('a')['href'] if cells[0].find('a') else None
    # home_score = cells[1].text.strip()

    away_team = clean_team_name(cells[0].text.strip())
    away_team_link = cells[0].find('a')['href'] if cells[0].find('a') else None
    away_score = cells[1].text.strip()

    ot = cells[5].text.strip()
    box_link = cells[7].find('a')['href'] if cells[7].find('a') else None
    metrics_link = cells[8].find('a')['href'] if cells[8].find('a') else None

    # Optional game notes
    game_notes_cell = cells[-1].find('small')
    game_notes = game_notes_cell.text.strip() if game_notes_cell else game_notes

    return [
        current_date, current_conference, game_notes,
        away_team, away_team_link, away_score,
        home_team, home_team_link, home_score,
        
        ot, box_link, metrics_link
    ]


def clean_team_name(team_name):
    """
    Cleans the team name by removing unwanted characters.
    Args:
        team_name (str): Team name.

    Returns:
        str: Cleaned team name.
    """
    # Replace unwanted characters with an empty string
    return team_name.replace('-', ' ').replace('.', '').replace("'", '').strip()


## Call the function
parsed_data = parse_current_season(current_year_url)

## Create a DataFrame from the parsed data
columns = [
    'Date', 'Conference', 'Game_Notes', 'Away_Team', 'Away_Team_Link', 'Away_Score',
    'Home_Team', 'Home_Team_Link', 'Home_Score',  'OT',
    'Box_Link', 'Metrics_Link'
]
df = pd.DataFrame(parsed_data, columns=columns)

## Process the DataFrame
# Extract the day of the week and reformat the date
df['Day'] = pd.to_datetime(df['Date']).dt.day_name()
df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')


In [5]:
def generate_game_id(row):
    """
    Generates a unique Game_ID based on the date, home team, and away team.
    Args:
        row (pd.Series): A row of the DataFrame.

    Returns:
        str: A unique Game_ID.
    """
    return f'{row.Date}-{row.Away_Team}-{row.Home_Team}'


def clean_column_data(df):
    """
    Cleans team names and ensures consistent formatting for the DataFrame columns.
    Args:
        df (pd.DataFrame): The DataFrame to clean.

    Returns:
        pd.DataFrame: The cleaned DataFrame.
    """
    # Remove hyphens from team names
    df['Home_Team'] = df['Home_Team'].str.replace('-', ' ', regex=False)
    df['Away_Team'] = df['Away_Team'].str.replace('-', ' ', regex=False)

    # Filter out games that haven't been played yet
    df = df[df['Home_Score'] != '']

    # Replace NaN values in Metrics_Link with an empty string
    df['Metrics_Link'] = df['Metrics_Link'].fillna('')

    return df


# Apply the cleaning function to the DataFrame
df = clean_column_data(df)

# Generate Game_ID column
df['Game_ID'] = df.apply(generate_game_id, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Metrics_Link'] = df['Metrics_Link'].fillna('')


#### Compare game results table to games already in the DB

In [6]:

def filter_and_compare_games(df, conn, recent_days=5):
    """
    Compares the Game_ID in the extracted dataframe to the database, filters exhibition games,
    and prints a summary of game counts. Handles cases where the database is empty or does not exist.

    Args:
        df (pd.DataFrame): Extracted games dataframe.
        conn (sqlite3.Connection): Database connection.
        recent_days (int): Number of recent days to re-scrape games for updates.

    Returns:
        pd.DataFrame: DataFrame of games to scrape.
    """
    import datetime

    # Print initial game count
    total_games = len(df)
    print(f"Total games in the extracted dataset: {total_games}")

    # Filter out exhibition games
    df = df[df['Conference'] != 'Exhibition']
    filtered_games = len(df)
    print(f"Games remaining after filtering exhibition games: {filtered_games}")

    # Check if the database has any data in the game_details table
    try:
        existing_game_ids_query = "SELECT DISTINCT Game_ID FROM game_details"
        existing_game_ids = pd.read_sql(existing_game_ids_query, conn)['Game_ID'].tolist()
    except Exception as e:
        logging.warning(f"Database access failed or no game_details table found: {e}")
        existing_game_ids = []  # Treat as no games in the database

    # Handle empty database: scrape all games
    if not existing_game_ids:
        print("Database is empty or non-existent. Scraping all available games.")
        return df

    # Find games not in the database
    df['In_Database'] = df['Game_ID'].isin(existing_game_ids)
    new_games_df = df[~df['In_Database']]
    games_not_in_dataset = len(new_games_df)
    print(f"Games not in the database: {games_not_in_dataset}")

    # Add recent games to the scrape list (past `recent_days`)
    recent_date_threshold = (datetime.datetime.now() - datetime.timedelta(days=recent_days)).strftime('%Y-%m-%d')
    recent_games_df = df[df['Date'] >= recent_date_threshold]
    games_to_rescrape = len(recent_games_df)
    print(f"Games to re-scrape from the last {recent_days} days: {games_to_rescrape}")

    # Combine new games and recent games for scraping
    games_to_scrape_df = pd.concat([new_games_df, recent_games_df]).drop_duplicates(subset='Game_ID')
    print(f"Total games to scrape: {len(games_to_scrape_df)}")

    return games_to_scrape_df

## ORIG CODE, GETS ERROR IF GIVEN NON EXI FILEPATH
# def filter_and_compare_games(df, conn, recent_days=5):
#     """
#     Compares the Game_ID in the extracted dataframe to the database, filters exhibition games,
#     and prints a summary of game counts.
    
#     Args:
#         df (pd.DataFrame): Extracted games dataframe.
#         conn (sqlite3.Connection): Database connection.
#         recent_days (int): Number of recent days to re-scrape games for updates.

#     Returns:
#         pd.DataFrame: DataFrame of games to scrape.
#     """
#     import datetime

#     # Print initial game count
#     total_games = len(df)
#     print(f"Total games in the extracted dataset: {total_games}")

#     # Filter out exhibition games
#     df = df[df['Conference'] != 'Exhibition']
#     filtered_games = len(df)
#     print(f"Games remaining after filtering exhibition games: {filtered_games}")

#     # Retrieve existing Game_IDs from the database
#     existing_game_ids_query = "SELECT DISTINCT Game_ID FROM game_details"  # Correct table name
#     existing_game_ids = pd.read_sql(existing_game_ids_query, conn)['Game_ID'].tolist()
#     print(f"Total games already in the database: {len(existing_game_ids)}")

#     # Find games not in the database
#     df['In_Database'] = df['Game_ID'].isin(existing_game_ids)
#     new_games_df = df[~df['In_Database']]
#     games_not_in_dataset = len(new_games_df)
#     print(f"Games not in the database: {games_not_in_dataset}")

#     # Add recent games to the scrape list (past `recent_days`)
#     recent_date_threshold = (datetime.datetime.now() - datetime.timedelta(days=recent_days)).strftime('%Y-%m-%d')
#     recent_games_df = df[df['Date'] >= recent_date_threshold]
#     games_to_rescrape = len(recent_games_df)
#     print(f"Games to re-scrape from the last {recent_days} days: {games_to_rescrape}")

#     # Combine new games and recent games for scraping
#     games_to_scrape_df = pd.concat([new_games_df, recent_games_df]).drop_duplicates(subset='Game_ID')
#     print(f"Total games to scrape: {len(games_to_scrape_df)}")

#     return games_to_scrape_df


# Example usage
games_to_scrape = filter_and_compare_games(df, conn)




Total games in the extracted dataset: 533
Games remaining after filtering exhibition games: 497
Database is empty or non-existent. Scraping all available games.


## Scraping Functions
- currently untouched from legacy code

<!-- ##### LEGACY CODE - WORKING FROM G_D_S_1
-  -->

In [7]:
#### PARSE PLAYER STATS TABLE ####
def parse_player_summary(html_content):
    # Initialize BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find the playersums div
    playersums_div = soup.find('div', id='playersums')
    if playersums_div is None:
        return "Player summaries div not found"

    # Initialize list to store player stats
    player_stats = []

    # Loop through each playersum div
    for player_sum in playersums_div.find_all('div', class_='playersum'):
        team = player_sum.find('td').text.strip()
        
        # Loop through table rows
        for row in player_sum.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) > 1:
                player = cols[0].text.strip()
                goals = cols[1].text.strip()
                assists = cols[2].text.strip()
                points = cols[3].text.strip()
                plus_minus = cols[4].text.strip()
                shots = cols[5].text.strip()
                pim = cols[6].text.strip()
                fowl = cols[7].text.strip() if len(cols) > 7 else None
                
                fow, fol = None, None
                win_percentage = None
                
                

                try:
                    if fowl and '‑' in fowl:  # Checking if it contains a hyphen
                        fow, fol = map(int, fowl.split('‑'))
                        total_fo = fow + fol
                        win_percentage = (fow / total_fo) * 100 if total_fo > 0 else 0
                except ValueError:
                    fow, fol, win_percentage = None, None, None

                

                
                player_stat = {
                    'Team': team,
                    'Player': player,
                    'G': goals,
                    'A': assists,
                    'Pt.': points,
                    '+/-': plus_minus,
                    'Sh': shots,
                    'PIM': pim,
                    'FOW': fow,
                    'FOL': fol,
                    'FO%': win_percentage
                }
                player_stats.append(player_stat)

    return pd.DataFrame(player_stats)
    


############# PARSEING SCORING SUMMARY WITH BS4
def parse_scoring_summary(html_content):
    # Initialize BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the scoring div and table
    scoring_div = soup.find('div', id='scoring')
    if scoring_div is None:
        logging.error("Scoring div not found")
        return None

    scoring_table = scoring_div.find('table')
    if scoring_table is None:
        logging.error("Scoring table not found within the scoring div")
        return None

    # Initialize list to store scoring events
    scoring_events = []
    period = None

    # Loop through table rows
    for row in scoring_table.find_all('tr'):
        if 'stats-section' in row.get('class', []):
            td = row.find('td')
            if td:
                period = td.text.strip()
            else:
                logging.warning("Period name not found in 'stats-section' row")
                period = "Unknown"
        else:
            cols = row.find_all('td')
            if len(cols) > 1:
                try:
                    team = cols[0].text.strip()
                    team = abbreviation_to_fullname.get(team, team)  # Replace abbreviation
                    team = clean_team_name(team)  # Clean team name
                    pp = cols[1].text.strip()

                    player_data = cols[3].text.strip()
                    match = re.match(r"(.+)\s\((\d+)\)", player_data)
                    player = match.group(1) if match else player_data
                    goals = int(match.group(2)) if match else None

                    assist_data_raw = cols[4].text.strip()
                    assist_data = assist_data_raw.split(", ") if assist_data_raw else []
                    assist1 = assist_data[0] if len(assist_data) > 0 else None
                    assist2 = assist_data[1] if len(assist_data) > 1 else None

                    time = cols[5].text.strip()

                    scoring_event = {
                        'Period': period,
                        'Team': team,
                        'PP': pp,
                        'Player': player,
                        'Player_Goals': goals,
                        'Assist1': assist1,
                        'Assist2': assist2,
                        'Time': time
                    }
                    scoring_events.append(scoring_event)
                except Exception as e:
                    logging.error(f"An error occurred while parsing a scoring event row: {e}")
            else:
                logging.warning(f"Insufficient columns in scoring row: {len(cols)}")

    return pd.DataFrame(scoring_events)


############# PARSEING PENALTY SUMMARY WITH BS4
def parse_penalty_summary(html_content):
    # Initialize BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the penalties div and table
    penalties_div = soup.find('div', id='penalties')
    if penalties_div is None:
        logging.error("Penalties div not found")
        return None

    penalties_table = penalties_div.find('table')
    if penalties_table is None:
        logging.error("Penalties table not found within the penalties div")
        return None

    # Initialize list to store penalty events
    penalty_events = []
    period = None

    # Loop through table rows
    for row in penalties_table.find_all('tr'):
        if 'stats-section' in row.get('class', []):
            td = row.find('td')
            if td:
                period = td.text.strip()
            else:
                logging.warning("Period name not found in 'stats-section' row")
                period = "Unknown"
        else:
            cols = row.find_all('td')
            if len(cols) > 1:
                team = cols[0].text.strip()
                team = abbreviation_to_fullname.get(team, team)  # Replace abbreviation
                team = clean_team_name(team)  # Clean team name

                player = cols[1].text.strip()
                pen_length = cols[2].text.strip()
                penalty_type = cols[3].text.strip()
                time = cols[4].text.strip()

                penalty_event = {
                    'Period': period,
                    'Team': team,
                    'Player': player,
                    'Pen_Length': pen_length,
                    'Penalty_Type': penalty_type,
                    'Time': time
                }
                penalty_events.append(penalty_event)

    return pd.DataFrame(penalty_events)


############# GOALIE SUMMARY WITH BS4
def parse_goalie_stats(html_content):
    # Initialize BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the goalies div and table
    goalies_div = soup.find('div', id='goalies')
    if goalies_div is None:
        logging.error("Goalies div not found")
        return None

    goalies_table = goalies_div.find('table')
    if goalies_table is None:
        logging.error("Goalies table not found within the goalies div")
        return None

    # Initialize list to store goalie stats
    goalie_stats = []
    team = None

    # Loop through table rows   
    for row in goalies_table.find_all('tr'):
        if 'stats-header' in row.get('class', []):  # Team header rows
            td = row.find('td')
            team = td.text.strip() if td else "Unknown"
            # Replace abbreviation and clean team name
            team = abbreviation_to_fullname.get(team, team)
            team = clean_team_name(team)
        else:  # Data rows
            cols = row.find_all('td')
            if len(cols) > 1:
                goalie = cols[0].text.strip()
                sv = cols[1].text.strip()
                ga = cols[2].text.strip()
                minutes = cols[3].text.strip()

                # Build goalie stat dictionary
                goalie_stat = {
                    'Team': team,
                    'Goalie': goalie,
                    'SV': sv,
                    'GA': ga,
                    'Minutes': minutes
                }
                goalie_stats.append(goalie_stat)

    # Convert list to DataFrame
    return pd.DataFrame(goalie_stats)


#### PARSE THE ADVANCED TEAM METRICS TABLES ####
### RETURNS WHOLE ADVANCED METRICS AS SINGLE TABLE
####################################
# def parse_new_advanced_metrics(html_content):
    # Parse HTML content
    # soup = BeautifulSoup(html_content, 'html.parser')
    
    # # Find all tables with advanced metrics
    # tables = soup.find_all('table', {'class': 'sortable metrics'})
    
    # # List to store all parsed data
    # all_data = []
    
    # for table in tables:
    #     # Extract team name from the table header
    #     team_name = table.find('td').text.strip()
    #     team_name = abbreviation_to_fullname.get(team_name, team_name)  # Replace abbreviation
    #     team_name = clean_team_name(team_name)  # Clean team name
        
    #     # Extract headers (skipping the Player header)
    #     headers = [header.text for header in table.find_all('th')][1:]
        
    #     # Prepare final column headers
    #     col_names = ['Team', 'Player']
    #     for header in headers:
    #         col_names.append(header)
        
    #     # Extract player data
    #     rows = table.find_all('tr')[2:]  # skipping the two header rows
    #     for row in rows:
    #         player_data = [team_name]  # start with team name
    #         cells = row.find_all('td')
    #         player_data.append(cells[0].text.strip())  # player name
    #         for cell in cells[1:]:
    #             player_data.append(cell.text.strip())
    #         all_data.append(player_data)
    
    # # Convert the list of data to a DataFrame
    # df = pd.DataFrame(all_data, columns=col_names)
    # return df

######## NEW TEST ###############  
def parse_advanced_metrics_tables(html_content):
    # Parse HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find all tables with advanced metrics
    tables = soup.find_all('table', {'class': 'sortable metrics'})
    
    # List to store all parsed data
    all_data = []
    
    for table in tables:
        # Extract team name from the table header
        team_name = table.find('td').text.strip()
        team_name = abbreviation_to_fullname.get(team_name, team_name)  # Replace abbreviation
        team_name = clean_team_name(team_name)  # Clean team name
        
        # Extract headers (skipping the Player header)
        headers = [header.text for header in table.find_all('th')][1:]
        
        # Prepare final column headers
        col_names = ['Team', 'Player']
        for header in headers:
            col_names.append(header)
        
        # Extract player data
        rows = table.find_all('tr')[2:]  # skipping the two header rows
        for row in rows:
            player_data = [team_name]  # start with team name
            cells = row.find_all('td')
            player_data.append(cells[0].text.strip())  # player name
            for cell in cells[1:]:
                player_data.append(cell.text.strip())
            all_data.append(player_data)
    
    # Convert the list of data to a DataFrame
    df = pd.DataFrame(all_data, columns=col_names)

    # # Rename columns for clairity
    # new_names = ['Team', 'Player', 'TOTAL_Block', 'TOTAL_Miss', 'TOTAL_Saved', 'TOTAL_Goals', 'TOTAL_Total_Shots',
    #                 'EVEN_Block', 'EVEN_Miss', 'EVEN_Saved', 'EVEN_Goals', 'EVEN_Total_Shots',
    #                 'PP_Block', 'PP_Miss', 'PP_Saved', 'PP_Goals', 'PP_Total_Shots',
    #                 'CLOSE_Block', 'CLOSE_Miss', 'CLOSE_Saved', 'CLOSE_Goals', 'CLOSE_Total_Shots',

    #                 'D_Blocks', 'Faceoffs', 'Game_ID']
    # df.columns = new_names

    ## Fill all NaN values with 0
    df = df.fillna(0)

    return df

# Parsing the line chart information with specific positions for forwards and defensemen.
def parse_line_chart(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    line_chart_div = soup.find('div', id='linechart')

    if line_chart_div is None:
        logging.error("Line chart div not found")
        return pd.DataFrame()

    line_data = []

    for team_div in line_chart_div.find_all('div', recursive=False):
        h3 = team_div.find('h3')
        if h3 is None:
            logging.warning("Team name not found")
            continue
        
        team_name = h3.text.strip()
        # Replace abbreviation and clean team name
        team_name = abbreviation_to_fullname.get(team_name, team_name)
        team_name = clean_team_name(team_name)
        
        for line_type_div in team_div.find_all('div', recursive=False):
            line_type = line_type_div.get('class')[0] if line_type_div.get('class') else None
            if line_type is None:
                logging.warning("Line type not found")
                continue
            
            if line_type == 'f':
                position_types = ['Left Wing', 'Center', 'Right Wing']
            elif line_type == 'd':
                position_types = ['Left D', 'Right D']
            elif line_type == 'x':
                position_types = ['Extra']
            elif line_type == 'g':
                position_types = ['Goalie']
                goalie_count = 1  # Initialize goalie count
            else:
                continue

            players = line_type_div.find_all('div')
            if not players:
                logging.warning(f"No players found for {team_name} in {line_type}")
                continue
            
            for i, player in enumerate(players):
                player_name = player.text.strip()
                if line_type == 'x':
                    player_name = player_name.split(' ')[0]
                if line_type == 'g':
                    line_number = f"Goalie {goalie_count}"
                    goalie_count += 1
                else:
                    line_number = i // len(position_types) + 1

                position = position_types[i % len(position_types)]
                line_data.append({
                    'Team': team_name,
                    'Line': line_number,
                    'Position': position,
                    'Player': player_name
                })

    if not line_data:
        logging.error("No line data was collected")

    df = pd.DataFrame(line_data)
    
    # # Log DataFrame info for debugging
    # if df.empty:
    #     logging.warning("Generated line chart DataFrame is empty.")
    # else:
    #     logging.info(f"Generated line chart DataFrame with columns: {df.columns.tolist()}")

    return df

from sqlalchemy import inspect, text

def ensure_columns_exist(table_name, columns, engine):
    """
    Ensures that the specified columns exist in the given table. Adds them if they are missing.
    
    Args:
        table_name (str): Name of the table.
        columns (list): List of column names to check/add.
        engine (sqlalchemy.engine): SQLAlchemy database engine.
    """
    inspector = inspect(engine)
    existing_columns = [col['name'] for col in inspector.get_columns(table_name)]

    missing_columns = [col for col in columns if col not in existing_columns]

    if missing_columns:
        with engine.connect() as conn:
            for col in missing_columns:
                sql = text(f"ALTER TABLE {table_name} ADD COLUMN {col} INTEGER DEFAULT 0;")
                conn.execute(sql)
                logging.info(f"Added missing column: {col} to table: {table_name}")

### Get the Linescore Elements - Score, shots, ect by period####
def parse_linescore(html_content):
    """
    Parses the linescore tables (Goals, Shots, PP) and dynamically handles periods and schema alignment.

    Args:
        html_content (str): HTML content of the page.

    Returns:
        pd.DataFrame: Linescore data as a DataFrame.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    linescore_data = []
    max_periods = 0

    # Parsing the Goals table
    goals_table = soup.select_one("#goals table")
    if goals_table is None:
        logging.error("Goals table not found")
        return pd.DataFrame()

    rows = goals_table.select('tbody tr')
    if not rows:
        logging.warning("No rows found in Goals table")
        return pd.DataFrame()

    for row in rows:
        team_data = {}
        td = row.select_one('td')
        if td:
            team_data['Team'] = td.text.strip()
        else:
            logging.warning("Team name not found in Goals table")
            continue

        goals = row.select('td')[1:]
        max_periods = max(max_periods, len(goals))  # Update max periods dynamically
        for i, goal in enumerate(goals):
            column_name = f'goals{i + 1}' if i < len(goals) - 1 else 'goalsT'
            try:
                team_data[column_name] = int(goal.text.strip())
            except ValueError:
                team_data[column_name] = None
                logging.warning(f"Invalid goal value in column {column_name}")

        linescore_data.append(team_data)

    # Convert to DataFrame early
    df = pd.DataFrame(linescore_data)

    # Ensure all columns exist dynamically
    expected_columns = [f'goals{i}' for i in range(1, max_periods)] + ['goalsT']
    for col in expected_columns:
        if col not in df.columns:
            df[col] = 0  # Add missing columns with default value 0

    return df


# Function to parse game details table
def parse_game_details(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    meta_div = soup.find('div', {'id': 'meta'})
    if meta_div is None:
        logging.error("Meta div not found")
        return None
    
    game_details_div = meta_div.find_all('div')[-1]
    if game_details_div is None:
        logging.error("Game details div not found")
        return None
    
    try:
        date_str = game_details_div.h4.string
        day_of_week, date = date_str.split(", ", 1)
        
        p_elements = game_details_div.find_all('p')
        
        # Extract conference and location details
        for p in p_elements:
            if "Game" in p.text:  # e.g., "Big Ten Game"
                details_strs = p.get_text(separator='|').split('|')
                conference = details_strs[0]
                location = details_strs[-1].split('at ')[-1]
                break
        else:  # Defaults if not found
            conference, location = None, None
        
        # Extract referees and assistant referees details
        for p in p_elements:
            if "Referees" in p.text:
                refs_str = p.strong.next_sibling if p.strong else None
                asst_refs_str = p.find_all('strong')[1].next_sibling if len(p.find_all('strong')) > 1 else None
                break
        else:  # Defaults if not found
            refs_str, asst_refs_str = None, None
        
        refs = refs_str.split(', ') if refs_str else []
        asst_refs = asst_refs_str.split(', ') if asst_refs_str else []
        refs = [re.sub(r'[^a-zA-Z ]+', '', ref).strip() for ref in refs]
        asst_refs = [re.sub(r'[^a-zA-Z ]+', '', ref).strip() for ref in asst_refs]
        
        # Extract attendance details using regex for better accuracy
        attendance_pattern = r"Attendance:\s?(\d+[\d,]*)"
        attendance_match = re.search(attendance_pattern, html_content)
        attendance = int(attendance_match.group(1).replace(',', '')) if attendance_match else None
        
        # Extract game details (like shootout results)
        details = None
        for p in p_elements:
            if "shootout" in p.text:
                details = p.text
                break
        
        # Clean details if present
        if details and '\n' in details:
            details = details.replace('\n', '').strip()
        if details and '\t' in details:
            details = re.sub('\t', ' ', details)
        
        game_details = {
            'Day': day_of_week,
            'Date': date,
            'Conference': conference,
            'Details': details,
            'Location': location,
            'Ref1': refs[0] if refs else None,
            'Ref2': refs[1] if len(refs) > 1 else None,
            'Asst_Ref1': asst_refs[0] if asst_refs else None,
            'Asst_Ref2': asst_refs[1] if len(asst_refs) > 1 else None,
            'Attendance': attendance
        }
        
        game_details_df = pd.DataFrame([game_details])
        return game_details_df

    except (AttributeError, IndexError, ValueError) as e:
        logging.error(f"Error while parsing game details: {e}")
        return None


# Parse the box score page - player stats table (G, A, Pt, +/-, Sh, PIM)
def parse_box_score(box_score_html):
    # Initialize DataFrames to None
    scoring_summary = penalty_summary = goalie_stats = player_stats = line_chart = linescore = game_details = None
    
    try:
        scoring_summary = parse_scoring_summary(box_score_html)
    except Exception as e:
        print(f"Error in parse_scoring_summary: {e}")
    
    try:
        penalty_summary = parse_penalty_summary(box_score_html)
    except Exception as e:
        print(f"Error in parse_penalty_summary: {e}")
    
    try:
        goalie_stats = parse_goalie_stats(box_score_html)
    except Exception as e:
        print(f"Error in parse_goalie_stats: {e}")
    
    try:
        player_stats = parse_player_summary(box_score_html)
    except Exception as e:
        print(f"Error in parse_player_summary: {e}")
    
    try:
        line_chart = parse_line_chart(box_score_html)
        if line_chart.empty:
            logging.info("Line chart is empty. Skipping the insert for this game.")
        else:
            logging.info(f"Line chart DataFrame structure: {line_chart.dtypes}")

        # Insert into database (make sure this part works as expected)

    except Exception as e:
        logging.error(f"Error in parse_line_chart: {e}")


    try:
        linescore_df = parse_linescore(box_score_html)

        if not linescore_df.empty:
            process_and_save_linescore(linescore_df, engine, table_name="linescore")
        else:
            logging.warning("No linescore data to save for this game.")
    except Exception as e:
        logging.error(f"Error processing linescore: {e}")

    
    try:
        game_details = parse_game_details(box_score_html)
    except Exception as e:
        print(f"Error in parse_game_details: {e}")
    
    # Combine DataFrames into a list
    all_dfs = [game_details, scoring_summary, penalty_summary, goalie_stats, player_stats, line_chart, linescore]
    
    return all_dfs

def rename_duplicate_columns(df):
    """
    Renames duplicate columns in a DataFrame to make them unique.
    Args:
        df (pd.DataFrame): The DataFrame to process.

    Returns:
        pd.DataFrame: DataFrame with unique column names.
    """
    cols = pd.Series(df.columns)
    for dup in df.columns[df.columns.duplicated()].unique(): 
        cols[df.columns.get_loc(dup)] = [f"{dup}_{i}" if i != 0 else dup for i in range(sum(df.columns == dup))]
    df.columns = cols
    return df


##### End Legacy Code

In [8]:
from tqdm import tqdm
import logging
import requests
import pandas as pd
import time
from sqlalchemy import create_engine
from sqlalchemy import inspect, text

def ensure_columns_exist(table_name, columns, engine):
    """
    Ensures that the specified columns exist in the given table. Adds them if missing.

    Args:
        table_name (str): Name of the table.
        columns (list): List of column names to check/add.
        engine (sqlalchemy.engine): SQLAlchemy database engine.
    """
    inspector = inspect(engine)
    existing_columns = [col['name'] for col in inspector.get_columns(table_name)]

    missing_columns = [col for col in columns if col not in existing_columns]

    if missing_columns:
        with engine.connect() as conn:
            for col in missing_columns:
                sql = text(f"ALTER TABLE {table_name} ADD COLUMN {col} INTEGER DEFAULT 0;")
                conn.execute(sql)
                logging.info(f"Added missing column: {col} to table: {table_name}")

def process_and_save_linescore(linescore_df, engine, table_name="linescore"):
    """
    Process and save the linescore DataFrame to the database, ensuring schema alignment.

    Args:
        linescore_df (pd.DataFrame): Linescore DataFrame.
        engine (sqlalchemy.engine): SQLAlchemy database engine.
        table_name (str): Table name in the database.
    """
    # Ensure columns exist dynamically in the database table
    ensure_columns_exist(table_name, linescore_df.columns, engine)

    # Save the DataFrame to the database
    try:
        linescore_df.to_sql(table_name, engine, if_exists='append', index=False)
        logging.info(f"Linescore data saved to table: {table_name}")
    except Exception as e:
        logging.error(f"Error saving linescore to table {table_name}: {e}")





def fetch_and_save_data(row, base_url, game_id, conn):
    """
    Fetches and parses data for a single game, then saves it to the database.
    Args:
        row (pd.Series): Row from the games DataFrame containing game details.
        base_url (str): Base URL for the scraping website.
        game_id (str): Unique game identifier.
        conn (sqlite3.Connection): SQLite connection to the database.

    Returns:
        bool: True if successful, False otherwise.
    """
    try:
        box_score_url = f"{base_url}{row['Box_Link']}"
        metrics_url = f"{base_url}{row['Metrics_Link']}" if row['Metrics_Link'] else None

        # Fetch HTML for box score
        box_score_response = requests.get(box_score_url, timeout=10)
        box_score_response.raise_for_status()
        box_score_html = box_score_response.text

        # Parse box score data
        box_score_dfs = parse_box_score(box_score_html)

        # Fetch and parse advanced metrics if available
        if metrics_url:
            metrics_response = requests.get(metrics_url, timeout=10)
            metrics_response.raise_for_status()
            metrics_html = metrics_response.text
            advanced_metrics_df = parse_advanced_metrics_tables(metrics_html)
        else:
            advanced_metrics_df = pd.DataFrame()

        # Combine all DataFrames
        all_dfs = box_score_dfs + [advanced_metrics_df]

        # Apply Game_ID and remove duplicate columns
        for df in all_dfs:
            if df is not None and not df.empty:
                df['Game_ID'] = game_id
                df = rename_duplicate_columns(df)

        # Save data to database
        table_names = [
            'game_details', 'scoring_summary', 'penalty_summary',
            'goalie_stats', 'player_stats', 'line_chart', 'linescore', 'advanced_metrics'
        ]

        for df, table in zip(all_dfs, table_names):
            if df is not None and not df.empty:
                df.to_sql(table, conn, if_exists='append', index=False)

        logging.info(f"Successfully scraped and stored data for game: {game_id}")
        return True

    except requests.exceptions.RequestException as e:
        logging.error(f"Network error for game {game_id}: {e}")
    except Exception as e:
        logging.error(f"Error processing game {game_id}: {e}")

    return False


def scrape_games_and_store(sampled_games, base_url, conn):
    """
    Main function to scrape and store data for a list of games.
    Args:
        sampled_games (pd.DataFrame): DataFrame of games to scrape.
        base_url (str): Base URL for the scraping website.
        conn (sqlite3.Connection): SQLite connection to the database.

    Returns:
        None
    """
    error_count = 0
    error_games = []

    for _, row in tqdm(sampled_games.iterrows(), total=sampled_games.shape[0], desc="Scraping games"):
        game_id = f"{row['Date']}-{row['Home_Team']}-{row['Away_Team']}"
        retries = 3
        success = False

        while retries > 0 and not success:
            success = fetch_and_save_data(row, base_url, game_id, conn)
            if not success:
                retries -= 1
                time.sleep(5)  # Wait before retrying

        if not success:
            error_count += 1
            error_games.append(game_id)

    logging.info(f"Scraping completed with {error_count} errors.")
    if error_games:
        logging.warning(f"Failed games: {error_games}")



## Call Functions to Perform scrape

In [9]:
## Call the function to scrape and store data from the games
# Example: Scraping games and storing results

# Example Usage
if __name__ == "__main__":
    # Set up database connection
    # db_path = "../TEMP/CHN_Scrape_TEST_7.db"
    # conn = sqlite3.connect(db_path)

    # Set up logging
    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

    
    # Base URL
    base_url = "https://www.collegehockeynews.com"

    # Scrape and store games
    scrape_games_and_store(games_to_scrape, base_url, conn)

    # Close database connection
    # conn.close()



Scraping games:   0%|          | 0/497 [00:00<?, ?it/s]2024-12-09 17:44:56,446 - INFO - Line chart DataFrame structure: Team        object
Line        object
Position    object
Player      object
dtype: object
2024-12-09 17:44:56,512 - ERROR - Error processing linescore: linescore
2024-12-09 17:44:57,066 - INFO - Successfully scraped and stored data for game: 2024-10-04-Lake Superior-Michigan State
Scraping games:   0%|          | 1/497 [00:01<12:40,  1.53s/it]2024-12-09 17:44:57,928 - INFO - Line chart DataFrame structure: Team        object
Line        object
Position    object
Player      object
dtype: object
2024-12-09 17:44:57,996 - ERROR - Error processing linescore: linescore
2024-12-09 17:44:58,529 - INFO - Successfully scraped and stored data for game: 2024-10-04-Michigan-Minnesota State
Scraping games:   0%|          | 2/497 [00:03<12:22,  1.50s/it]2024-12-09 17:44:59,514 - INFO - Line chart DataFrame structure: Team        object
Line        object
Position    object
Player 

#### Add the Roster data from the CSV to the Database

In [10]:
# open the database connection
# conn = sqlite3.connect(db_path)

In [11]:
################## SET THE ROSTER DATAFRAME TO THE CORRECT YEAR ####################
## Load the roster data from CSV
roster_filename = 'roster_2024_current_v3.csv'
# Load to DataFrame
roster_df = pd.read_csv(f'../data/{roster_filename}')

# Set the SeasonYear in the database_roster
season_year_setting = 2024

## MATCH THE DATAFRAME NAMES
df_master_roster = roster_df.copy()

## Season Year Value
season_year = season_year_setting

# Clean up the name formats for joining
# Master roster: Convert "Last Name, First Name" to "First Name Last Name"
# df_master_roster['Clean_Name'] = df_master_roster['Player'].apply(lambda x: ' '.join(reversed(x.split(', '))))

# Rename Player to Clean_Name
df_master_roster.rename(columns={'Player': 'Clean_Name'}, inplace=True)
# Rename School to Team
df_master_roster.rename(columns={'School': 'Team'}, inplace=True)

# Clean up the Team column, remove '-' and replace with ' '
# df_master_roster['School'] = df_master_roster['Team'].apply(lambda x: x.replace('-', ' '))

## If there are an period in the column names, remove them
df_master_roster.columns = df_master_roster.columns.str.replace('.', '')

### Finally add the roster to the database as it's own table

df_master_roster['SeasonYear'] = season_year

# Save the roster data as a new table in the database
roster_table_name = 'master_roster'
df_master_roster.to_sql(roster_table_name, conn, if_exists='replace', index=False)
############################################################

# Verify by listing all the tables in the database again
tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = conn.execute(tables_query).fetchall()
table_names_updated = [table[0] for table in tables]
table_names_updated



['game_details',
 'scoring_summary',
 'penalty_summary',
 'goalie_stats',
 'player_stats',
 'line_chart',
 'advanced_metrics',
 'master_roster']

#### Clean up The Column Names and extra header rows in the Player Stats table

In [12]:
player_stats_df = pd.read_sql_query("SELECT * FROM player_stats", conn)

# Define a dictionary for column renaming
column_renames = {
    'Pt.': 'Pts',
    '+/-': 'plus_minus'
}

# Rename columns based on the dictionary
player_stats_df.rename(columns=column_renames, inplace=True)


# Drop rows where Team name is in the Player column
player_stats_df = player_stats_df[player_stats_df['Team'] != player_stats_df['Player']]

## Change the Column names to be easy to work with
############ 'Pt.' should be 'Pts' and '+/-' should be 'plus_minus'
#################################
player_stats_df = pd.read_sql_query("SELECT * FROM player_stats", conn)

if 'Pt.' in player_stats_df.columns:
    player_stats_df.rename(columns={'Pt.': 'Pts'}, inplace=True)
else:
    print("Column 'Pt.' not found.")

if '+/-' in player_stats_df.columns:
    player_stats_df.rename(columns={'+/-': 'plus_minus'}, inplace=True)
else:
    print("Column '+/-' not found.")

print(len(player_stats_df))

# Drop rows if Team name is in the player column
# If ['Team'] is the same as ['Player'] then drop that row
player_stats_df = player_stats_df[player_stats_df['Team'] != player_stats_df['Player']]

# add the dataframe back to the database
player_stats_df.to_sql('player_stats', conn, if_exists='replace', index=False)


21000


20006

#### CREATE A NEW TABLE WITH AGGRIGATED PLAYER STATS YEAR TO DATE

In [13]:
## Load the player_stats table into df_player_stats

# Query to load the player_stats table
player_stats_query = "SELECT * FROM player_stats"
df_player_stats = pd.read_sql(player_stats_query, conn)

# Address problem where header rows of each game were added to the table
# If Team and Player columns match remove the row
df_player_stats = df_player_stats[df_player_stats['Team'] != 'Team']

# Replace the non-breaking space with a regular space
df_player_stats['Player'] = df_player_stats['Player'].apply(lambda x: x.replace('\xa0', ' '))

# Convert relevant columns to integers for correct aggregation
cols_to_convert = ['G', 'A', 'Pts', 'plus_minus', 'Sh', 'PIM']
for col in cols_to_convert:
    df_player_stats[col] = pd.to_numeric(df_player_stats[col], errors='coerce')

# Aggregate the data for year-to-date stats
# Add a column for counting the number of games each player has played
agg_player_stats_corrected_with_games = df_player_stats.groupby(['Player', 'Team']).agg({
    'G': 'sum',
    'A': 'sum',
    'Pts': 'sum',
    'plus_minus': 'sum',
    'Sh': 'sum',
    'PIM': 'sum',
    'Game_ID': 'count'  # Counting the number of unique Game_IDs for each player
}).reset_index()

# Rename the Game_ID column to Games_Played
agg_player_stats_corrected_with_games.rename(columns={'Game_ID': 'Games_Played'}, inplace=True)

# Save the updated aggregated data back to the database, replacing the existing table
agg_player_stats_corrected_with_games.to_sql('player_stats_ytd', conn, if_exists='replace', index=False)




1696

## Clean up / Rename COlumns in Advanced Stats

In [14]:
## NEW Handling of Advanced Stats
# Create dataframe from SQL query
df = pd.read_sql_query("SELECT * FROM advanced_metrics", conn)

# Rename columns
new_names = ['Team', 'Player', 'TOTAL_Block', 'TOTAL_Miss', 'TOTAL_Saved', 'TOTAL_Goals', 'TOTAL_Total_Shots',
                'EVEN_Block', 'EVEN_Miss', 'EVEN_Saved', 'EVEN_Goals', 'EVEN_Total_Shots',
                'PP_Block', 'PP_Miss', 'PP_Saved', 'PP_Goals', 'PP_Total_Shots',
                'CLOSE_Block', 'CLOSE_Miss', 'CLOSE_Saved', 'CLOSE_Goals', 'CLOSE_Total_Shots',

                'D_Blocks', 'Faceoffs', 'Game_ID']

df.columns = new_names

# Remove all rows where Player = TOTAL
df = df[df['Player'] != 'TOTAL']

# # Apply the matched_dict to the Team column
# df['Team'] = df['Team'].apply(lambda x: matched_dict[x])

## Fill all NaN values with 0
df = df.fillna(0)

# Display the dataframe
df.head()

# Save back to the database
df.to_sql('advanced_metrics', conn, if_exists='replace', index=False)

18927

In [19]:
## TEMP CODE
# Open the database connection
conn = sqlite3.connect(db_path)

In [20]:
# Step 1: Read the game_details table into a DataFrame
df_game_details = pd.read_sql("SELECT * FROM game_details", conn)

# Step 2: Create new columns for Home and Away Teams by parsing Game_ID
df_game_details['Away_Team'] = df_game_details['Game_ID'].apply(lambda x: x.split('-')[4])
df_game_details['Home_Team'] = df_game_details['Game_ID'].apply(lambda x: x.split('-')[3])

# Step 3: Write this updated DataFrame back to the game_details table
df_game_details.to_sql('game_details', conn, if_exists='replace', index=False)

497

In [21]:
## Close the database connection
conn.close()

# shutdown logging
logging.shutdown()

In [16]:
# df.head(20)