# Game Data Scraper & Cleaner
- created 12/9/24 
- refactor of legacy code from Game_Data_Scraper_1 and Game_Data_Cleaner

In [1]:
## Dependencies
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import logging
from tqdm import tqdm
import re
import os 
from sqlalchemy import create_engine
import sqlite3


# Create timestamp string to use in file names
timestamp = time.strftime("%Y%m%d-%H%M%S")

## FILE PATHS and CONSTANTS
#Local Folder paths
temp_folder = "os.path.join('..', 'TEMP')"
data_folder = os.path.join('..', 'data')
db_folder = os.path.join(data_folder, 'db')
log_folder = os.path.join(temp_folder, 'logs')

# Check Paths
if not os.path.exists(temp_folder):
    os.makedirs(temp_folder)
if not os.path.exists(data_folder):
    os.makedirs(data_folder)
if not os.path.exists(db_folder):
    os.makedirs(db_folder)

# Remote URL
base_url = 'https://www.collegehockeynews.com' ## Base usl for box scores and metrics
current_year_url = 'https://www.collegehockeynews.com/schedules/?season=20242025' ## Current year schedule


## Logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
logger.info('Logging Started')

# Database Name
# DB_FILE_NAME = 'NEW_Scrape_NEW_Code.db'
DB_FILE_NAME = '2024_Dec_03_CLEAN.db'



2024-12-09 16:21:26,110 - INFO - Logging Started


### Initiate DB Connection
- backup and open DB file if it exists
- create new DB file with DB_FILE_NAME if none exists

In [2]:
## Database Path
db_path = os.path.join(db_folder, DB_FILE_NAME)
# Create Backup in TEMP folder with datetime stamp
backup_db_path = os.path.join(temp_folder, f"{DB_FILE_NAME}_{timestamp}.backup")
# backup_db_path = os.path.join(temp_folder, f"{DB_FILE_NAME}.backup")

# Check if the database file exists
if os.path.exists(db_path):
    # Backup the existing database
    logger.info(f"Database file found at {db_path}. Backing it up to {backup_db_path}.")
    try:
        # Perform the backup
        with open(db_path, 'rb') as original_db, open(backup_db_path, 'wb') as backup_db:
            backup_db.write(original_db.read())
        logger.info(f"Backup successful: {backup_db_path}")
    except Exception as e:
        logger.error(f"Failed to back up the database: {e}")
else:
    logger.info(f"No database file found at {db_path}. A new database will be created.")

# Create a database connection
try:
    engine = create_engine(f"sqlite:///{db_path}")
    conn = sqlite3.connect(db_path)
    logger.info(f"Database connection established at {db_path}")
except Exception as e:
    logger.error(f"Failed to establish database connection: {e}")
    raise


2024-12-09 16:21:26,115 - INFO - Database file found at ..\data\db\2024_Dec_03_CLEAN.db. Backing it up to os.path.join('..', 'TEMP')\2024_Dec_03_CLEAN.db_20241209-162126.backup.
2024-12-09 16:21:26,128 - INFO - Backup successful: os.path.join('..', 'TEMP')\2024_Dec_03_CLEAN.db_20241209-162126.backup
2024-12-09 16:21:26,135 - INFO - Database connection established at ..\data\db\2024_Dec_03_CLEAN.db


#### Create a Dictionary of Team names and abbreviations
- from arena_school_info table

In [3]:
## Load school infomation from arena_school_info.csv
school_info_df = pd.read_csv(os.path.join(data_folder, 'arena_school_info.csv'))

# Create a dictionary for abbreviations to full team names
abbreviation_to_fullname = school_info_df.set_index('abv')['School'].to_dict()

# Define a function to replace abbreviations in a column with full team names
def replace_abbreviations_with_fullnames(df, column_name, abbreviation_dict):
    """
    Replaces abbreviations in the specified column of a DataFrame with full team names.
    
    Args:
        df (pd.DataFrame): The DataFrame containing the column to process.
        column_name (str): The column name where abbreviations need to be replaced.
        abbreviation_dict (dict): Dictionary mapping abbreviations to full names.
    
    Returns:
        pd.DataFrame: DataFrame with updated column values.
    """
    df[column_name] = df[column_name].replace(abbreviation_dict)
    return df


### Download a table of every game in CHN database for the selected season
- output games_df table of every game listed on CHN site
- Clean team names of unwanted characters and create unique Game_ID

In [4]:
## Function to Parse the Current Season Schedule / Results Page
def parse_current_season(url):
    """
    Parses the current season schedule/results page.
    Args:
        url (str): URL of the current season schedule/results page.

    Returns:
        list: Parsed data as a list of rows.
    """
    import requests
    from bs4 import BeautifulSoup

    # Initialize variables
    current_date, current_conference, game_notes = None, None, None
    data = []  # List to store game data

    # Fetch the page
    response = requests.get(url)
    if response.status_code != 200:
        raise ValueError(f"Failed to retrieve data from {url}, status code {response.status_code}")

    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    rows = soup.find_all('tr')

    # Process each row
    for row in rows:
        row_class = row.get('class')
        
        if row_class == ['stats-section']:  # Date row
            current_date = row.find('td').text.strip()
        elif row_class == ['sked-header']:  # Conference row
            current_conference = row.find('td').text.strip()
        elif len(row.find_all('td')) == 2:  # Game notes row
            game_notes = row.find_all('td')[1].text.strip()
        elif row.get('valign') == 'top':  # Game data row
            game_data = extract_game_data(row, current_date, current_conference, game_notes)
            if game_data:
                data.append(game_data)
            game_notes = None  # Reset game notes for the next row

    return data


def extract_game_data(row, current_date, current_conference, game_notes):
    """
    Extracts game data from a table row.
    Args:
        row (Tag): BeautifulSoup row tag.
        current_date (str): Current date of the game.
        current_conference (str): Current conference of the game.
        game_notes (str): Notes for the game.

    Returns:
        list: Extracted game data or None if row is invalid.
    """
    cells = row.find_all('td')
    if len(cells) < 9:
        return None

    # Extract data
    home_team = clean_team_name(cells[3].text.strip())
    home_team_link = cells[3].find('a')['href'] if cells[3].find('a') else None
    home_score = cells[4].text.strip()

    # home_team = clean_team_name(cells[0].text.strip())
    # home_team_link = cells[0].find('a')['href'] if cells[0].find('a') else None
    # home_score = cells[1].text.strip()

    away_team = clean_team_name(cells[0].text.strip())
    away_team_link = cells[0].find('a')['href'] if cells[0].find('a') else None
    away_score = cells[1].text.strip()

    ot = cells[5].text.strip()
    box_link = cells[7].find('a')['href'] if cells[7].find('a') else None
    metrics_link = cells[8].find('a')['href'] if cells[8].find('a') else None

    # Optional game notes
    game_notes_cell = cells[-1].find('small')
    game_notes = game_notes_cell.text.strip() if game_notes_cell else game_notes

    return [
        current_date, current_conference, game_notes,
        away_team, away_team_link, away_score,
        home_team, home_team_link, home_score,
        
        ot, box_link, metrics_link
    ]


def clean_team_name(team_name):
    """
    Cleans the team name by removing unwanted characters.
    Args:
        team_name (str): Team name.

    Returns:
        str: Cleaned team name.
    """
    # Replace unwanted characters with an empty string
    return team_name.replace('-', ' ').replace('.', '').replace("'", '').strip()


## Call the function
parsed_data = parse_current_season(current_year_url)

## Create a DataFrame from the parsed data
columns = [
    'Date', 'Conference', 'Game_Notes', 'Away_Team', 'Away_Team_Link', 'Away_Score',
    'Home_Team', 'Home_Team_Link', 'Home_Score',  'OT',
    'Box_Link', 'Metrics_Link'
]
df = pd.DataFrame(parsed_data, columns=columns)

## Process the DataFrame
# Extract the day of the week and reformat the date
df['Day'] = pd.to_datetime(df['Date']).dt.day_name()
df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')


In [5]:
def generate_game_id(row):
    """
    Generates a unique Game_ID based on the date, home team, and away team.
    Args:
        row (pd.Series): A row of the DataFrame.

    Returns:
        str: A unique Game_ID.
    """
    return f'{row.Date}-{row.Away_Team}-{row.Home_Team}'


def clean_column_data(df):
    """
    Cleans team names and ensures consistent formatting for the DataFrame columns.
    Args:
        df (pd.DataFrame): The DataFrame to clean.

    Returns:
        pd.DataFrame: The cleaned DataFrame.
    """
    # Remove hyphens from team names
    df['Home_Team'] = df['Home_Team'].str.replace('-', ' ', regex=False)
    df['Away_Team'] = df['Away_Team'].str.replace('-', ' ', regex=False)

    # Filter out games that haven't been played yet
    df = df[df['Home_Score'] != '']

    # Replace NaN values in Metrics_Link with an empty string
    df['Metrics_Link'] = df['Metrics_Link'].fillna('')

    return df


# Apply the cleaning function to the DataFrame
df = clean_column_data(df)

# Generate Game_ID column
df['Game_ID'] = df.apply(generate_game_id, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Metrics_Link'] = df['Metrics_Link'].fillna('')


#### Compare game results table to games already in the DB

In [6]:
def filter_and_compare_games(df, conn, recent_days=5):
    """
    Compares the Game_ID in the extracted dataframe to the database, filters exhibition games,
    and prints a summary of game counts.
    
    Args:
        df (pd.DataFrame): Extracted games dataframe.
        conn (sqlite3.Connection): Database connection.
        recent_days (int): Number of recent days to re-scrape games for updates.

    Returns:
        pd.DataFrame: DataFrame of games to scrape.
    """
    import datetime

    # Print initial game count
    total_games = len(df)
    print(f"Total games in the extracted dataset: {total_games}")

    # Filter out exhibition games
    df = df[df['Conference'] != 'Exhibition']
    filtered_games = len(df)
    print(f"Games remaining after filtering exhibition games: {filtered_games}")

    # Retrieve existing Game_IDs from the database
    existing_game_ids_query = "SELECT DISTINCT Game_ID FROM game_details"  # Correct table name
    existing_game_ids = pd.read_sql(existing_game_ids_query, conn)['Game_ID'].tolist()
    print(f"Total games already in the database: {len(existing_game_ids)}")

    # Find games not in the database
    df['In_Database'] = df['Game_ID'].isin(existing_game_ids)
    new_games_df = df[~df['In_Database']]
    games_not_in_dataset = len(new_games_df)
    print(f"Games not in the database: {games_not_in_dataset}")

    # Add recent games to the scrape list (past `recent_days`)
    recent_date_threshold = (datetime.datetime.now() - datetime.timedelta(days=recent_days)).strftime('%Y-%m-%d')
    recent_games_df = df[df['Date'] >= recent_date_threshold]
    games_to_rescrape = len(recent_games_df)
    print(f"Games to re-scrape from the last {recent_days} days: {games_to_rescrape}")

    # Combine new games and recent games for scraping
    games_to_scrape_df = pd.concat([new_games_df, recent_games_df]).drop_duplicates(subset='Game_ID')
    print(f"Total games to scrape: {len(games_to_scrape_df)}")

    return games_to_scrape_df


# Example usage
games_to_scrape = filter_and_compare_games(df, conn)


Total games in the extracted dataset: 533
Games remaining after filtering exhibition games: 497
Total games already in the database: 438
Games not in the database: 130
Games to re-scrape from the last 5 days: 57
Total games to scrape: 130


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['In_Database'] = df['Game_ID'].isin(existing_game_ids)


## Scraping Functions
- currently untouched from legacy code

##### Newly Refactored Scraping Funcitons

###### Utility Functions

In [7]:

# Utility function to safely find and validate an HTML element
def safe_find(soup, tag, attrs=None, error_message="Element not found"):
    """
    Finds an HTML element and logs an error if it's not found.
    Args:
        soup (BeautifulSoup): The parsed HTML content.
        tag (str): The tag name to find (e.g., 'div', 'table').
        attrs (dict, optional): Attributes to filter the tag (e.g., {'id': 'playersums'}).
        error_message (str): Error message to log if the element is not found.

    Returns:
        element: The found element or None if not found.
    """
    element = soup.find(tag, attrs)
    if element is None:
        logging.error(error_message)
    return element

# Utility function to extract and clean text
def extract_text(element, default=""):
    """
    Extracts and cleans text from an HTML element.
    Args:
        element: The HTML element (e.g., Tag object).
        default (str): Default value to return if the element is None.

    Returns:
        str: Cleaned text or the default value.
    """
    return element.text.strip() if element else default

# Utility function for extracting table rows safely
def extract_table_rows(table, error_message="Table rows not found"):
    """
    Extracts rows from an HTML table.
    Args:
        table (Tag): The table element.
        error_message (str): Error message to log if rows are not found.

    Returns:
        list: List of rows or an empty list if not found.
    """
    rows = table.find_all('tr') if table else []
    if not rows:
        logging.error(error_message)
    return rows

###### Refactored Subfunctions

In [8]:
### NOT WORKING, JUST RETURNING ERROR- div not FOUND
# def parse_player_summary(html_content):
#     """
#     Parses the player summary table from the HTML content.
#     Args:
#         html_content (str): The HTML content of the page.

#     Returns:
#         pd.DataFrame: Player stats as a DataFrame.
#     """
#     soup = BeautifulSoup(html_content, 'html.parser')

#     # Validate playersums div
#     playersums_div = safe_find(soup, 'div', {'id': 'playersums'}, "Player summaries div not found")
#     if playersums_div is None:
#         return pd.DataFrame()

#     player_stats = []
#     for player_sum in playersums_div.find_all('div', class_='playersum'):
#         team = extract_text(player_sum.find('td'), "Unknown Team")
#         for row in extract_table_rows(player_sum, "No rows found in player summary table"):
#             cols = row.find_all('td')
#             if len(cols) > 1:
#                 player = extract_text(cols[0])
#                 goals = extract_text(cols[1])
#                 assists = extract_text(cols[2])
#                 points = extract_text(cols[3])
#                 plus_minus = extract_text(cols[4])
#                 shots = extract_text(cols[5])
#                 pim = extract_text(cols[6])
#                 fowl = extract_text(cols[7]) if len(cols) > 7 else None

#                 fow, fol, win_percentage = None, None, None
#                 try:
#                     if fowl and '‑' in fowl:  # Checking if it contains a hyphen
#                         fow, fol = map(int, fowl.split('‑'))
#                         total_fo = fow + fol
#                         win_percentage = (fow / total_fo) * 100 if total_fo > 0 else 0
#                 except ValueError as e:
#                     logging.warning(f"Error parsing faceoff data for player {player}: {e}")

#                 player_stat = {
#                     'Team': team,
#                     'Player': player,
#                     'G': goals,
#                     'A': assists,
#                     'Pt.': points,
#                     '+/-': plus_minus,
#                     'Sh': shots,
#                     'PIM': pim,
#                     'FOW': fow,
#                     'FOL': fol,
#                     'FO%': win_percentage
#                 }
#                 player_stats.append(player_stat)

#     return pd.DataFrame(player_stats)

# def parse_scoring_summary(html_content):
#     """
#     Parses the scoring summary table from the HTML content.
#     Args:
#         html_content (str): The HTML content of the page.

#     Returns:
#         pd.DataFrame: Scoring events as a DataFrame.
#     """
#     soup = BeautifulSoup(html_content, 'html.parser')

#     scoring_div = safe_find(soup, 'div', {'id': 'scoring'}, "Scoring div not found")
#     if scoring_div is None:
#         return pd.DataFrame()

#     scoring_table = safe_find(scoring_div, 'table', error_message="Scoring table not found within the scoring div")
#     if scoring_table is None:
#         return pd.DataFrame()

#     scoring_events = []
#     period = None
#     for row in extract_table_rows(scoring_table, "No rows found in scoring table"):
#         if 'stats-section' in row.get('class', []):
#             period = extract_text(row.find('td'), "Unknown")
#         else:
#             cols = row.find_all('td')
#             if len(cols) > 1:
#                 team = extract_text(cols[0])
#                 pp = extract_text(cols[1])
#                 player_data = extract_text(cols[3])
#                 match = re.match(r"(.+)\s\((\d+)\)", player_data)
#                 player = match.group(1) if match else player_data
#                 goals = int(match.group(2)) if match else None
#                 assist_data_raw = extract_text(cols[4])
#                 assist_data = assist_data_raw.split(", ") if assist_data_raw else []
#                 assist1 = assist_data[0] if len(assist_data) > 0 else None
#                 assist2 = assist_data[1] if len(assist_data) > 1 else None
#                 time = extract_text(cols[5])

#                 scoring_event = {
#                     'Period': period,
#                     'Team': team,
#                     'PP': pp,
#                     'Player': player,
#                     'Player_Goals': goals,
#                     'Assist1': assist1,
#                     'Assist2': assist2,
#                     'Time': time
#                 }
#                 scoring_events.append(scoring_event)

#     return pd.DataFrame(scoring_events)

# def parse_penalty_summary(html_content):
#     """
#     Parses the penalty summary table from the HTML content.
#     Args:
#         html_content (str): The HTML content of the page.

#     Returns:
#         pd.DataFrame: Penalty events as a DataFrame.
#     """
#     soup = BeautifulSoup(html_content, 'html.parser')

#     # Validate the penalties div
#     penalties_div = safe_find(soup, 'div', {'id': 'penalties'}, "Penalties div not found")
#     if penalties_div is None:
#         return pd.DataFrame()

#     # Validate the penalties table
#     penalties_table = safe_find(penalties_div, 'table', "Penalties table not found within the penalties div")
#     if penalties_table is None:
#         return pd.DataFrame()

#     # Initialize list to store penalty events
#     penalty_events = []
#     period = None

#     # Loop through table rows
#     for row in extract_table_rows(penalties_table, "No rows found in penalties table"):
#         if 'stats-section' in row.get('class', []):  # Header row indicating the period
#             period = extract_text(row.find('td'), "Unknown")
#         else:  # Data rows
#             cols = row.find_all('td')
#             if len(cols) > 1:
#                 penalty_event = {
#                     'Period': period,
#                     'Team': extract_text(cols[0]),
#                     'Player': extract_text(cols[1]),
#                     'Pen_Length': extract_text(cols[2]),
#                     'Penalty_Type': extract_text(cols[3]),
#                     'Time': extract_text(cols[4]),
#                 }
#                 penalty_events.append(penalty_event)

#     return pd.DataFrame(penalty_events)

# def parse_goalie_stats(html_content):
#     """
#     Parses the goalie stats table from the HTML content.
#     Args:
#         html_content (str): The HTML content of the page.

#     Returns:
#         pd.DataFrame: Goalie stats as a DataFrame.
#     """
#     soup = BeautifulSoup(html_content, 'html.parser')

#     # Validate the goalies div
#     goalies_div = safe_find(soup, 'div', {'id': 'goalies'}, "Goalies div not found")
#     if goalies_div is None:
#         return pd.DataFrame()

#     # Validate the goalies table
#     goalies_table = safe_find(goalies_div, 'table', "Goalies table not found within the goalies div")
#     if goalies_table is None:
#         return pd.DataFrame()

#     # Initialize list to store goalie stats
#     goalie_stats = []
#     team = None

#     # Loop through table rows
#     for row in extract_table_rows(goalies_table, "No rows found in goalies table"):
#         if 'stats-header' in row.get('class', []):  # Header row indicating the team
#             team = extract_text(row.find('td'), "Unknown")
#         else:  # Data rows
#             cols = row.find_all('td')
#             if len(cols) > 1:
#                 goalie_stat = {
#                     'Team': team,
#                     'Goalie': extract_text(cols[0]),
#                     'SV': extract_text(cols[1]),
#                     'GA': extract_text(cols[2]),
#                     'Minutes': extract_text(cols[3]),
#                 }
#                 goalie_stats.append(goalie_stat)

#     return pd.DataFrame(goalie_stats)

# def parse_line_chart(html_content):
#     """
#     Parses the line chart information for forwards, defensemen, and goalies.
#     Args:
#         html_content (str): The HTML content of the page.

#     Returns:
#         pd.DataFrame: Line chart data as a DataFrame.
#     """
#     soup = BeautifulSoup(html_content, 'html.parser')

#     # Validate the line chart div
#     line_chart_div = safe_find(soup, 'div', {'id': 'linechart'}, "Line chart div not found")
#     if line_chart_div is None:
#         return pd.DataFrame()

#     line_data = []

#     for team_div in line_chart_div.find_all('div', recursive=False):
#         team_name = extract_text(safe_find(team_div, 'h3', error_message="Team name not found in line chart"))
#         if not team_name:
#             continue

#         for line_type_div in team_div.find_all('div', recursive=False):
#             line_type = line_type_div.get('class', [None])[0]
#             if line_type is None:
#                 logging.warning(f"Line type not found for team {team_name}")
#                 continue

#             # Define position types based on line type
#             if line_type == 'f':
#                 position_types = ['Left Wing', 'Center', 'Right Wing']
#             elif line_type == 'd':
#                 position_types = ['Left D', 'Right D']
#             elif line_type == 'x':
#                 position_types = ['Extra']
#             elif line_type == 'g':
#                 position_types = ['Goalie']
#                 goalie_count = 1  # Initialize goalie count
#             else:
#                 continue

#             # Extract player data
#             players = line_type_div.find_all('div')
#             if not players:
#                 logging.warning(f"No players found for {team_name} in {line_type}")
#                 continue

#             for i, player in enumerate(players):
#                 player_name = extract_text(player)
#                 if line_type == 'x':
#                     player_name = player_name.split(' ')[0]
#                 if line_type == 'g':
#                     line_number = f"Goalie {goalie_count}"
#                     goalie_count += 1
#                 else:
#                     line_number = i // len(position_types) + 1

#                 position = position_types[i % len(position_types)]
#                 line_data.append({
#                     'Team': team_name,
#                     'Line': line_number,
#                     'Position': position,
#                     'Player': player_name
#                 })

#     if not line_data:
#         logging.error("No line data was collected")

#     return pd.DataFrame(line_data)

# def parse_linescore(html_content):
#     """
#     Parses the linescore elements (scores, shots, etc.) by period, dynamically handling extra periods.
#     Args:
#         html_content (str): The HTML content of the page.

#     Returns:
#         pd.DataFrame: Linescore data as a DataFrame.
#     """
#     soup = BeautifulSoup(html_content, 'html.parser')
#     linescore_data = []

#     # Helper function to parse a table and populate linescore data
#     def parse_table(table_selector, data_key_prefix, warning_message):
#         table = safe_find(soup, 'table', {'id': table_selector}, f"{data_key_prefix.capitalize()} table not found")
#         if table is None:
#             return

#         rows = extract_table_rows(table, warning_message)
#         for i, row in enumerate(rows):
#             cells = row.find_all('td')
#             if not cells:
#                 logging.warning(f"No data found for row {i + 1} in {data_key_prefix.capitalize()} table")
#                 continue

#             team_name = extract_text(cells[0])
#             if len(linescore_data) <= i:
#                 linescore_data.append({'Team': team_name})
#             elif linescore_data[i]['Team'] != team_name:
#                 logging.warning(f"Mismatch in team names for row {i + 1}: {team_name} vs {linescore_data[i]['Team']}")
#                 continue

#             for j, cell in enumerate(cells[1:], start=1):
#                 column_name = f"{data_key_prefix}{j}"
#                 try:
#                     linescore_data[i][column_name] = int(extract_text(cell))
#                 except ValueError:
#                     logging.warning(f"Could not convert {data_key_prefix} data to integer for row {i + 1}, column {j}")
#                     linescore_data[i][column_name] = None

#     # Parse Goals table
#     parse_table('goals', 'goals', "No rows found in Goals table")

#     # Parse Shots table
#     parse_table('shots', 'shots', "No rows found in Shots table")

#     # Parse PP table for penalties, power plays, and faceoffs
#     pp_table = safe_find(soup, 'table', {'id': 'pp'}, "PP table not found")
#     if pp_table:
#         rows = extract_table_rows(pp_table, "No rows found in PP table")
#         for i, row in enumerate(rows):
#             cells = row.find_all('td')
#             if len(cells) < 4:
#                 logging.warning(f"Insufficient columns in PP table row {i + 1}")
#                 continue

#             try:
#                 pen_pim = extract_text(cells[1]).split('‑')
#                 linescore_data[i]['Pen'] = int(pen_pim[0])
#                 linescore_data[i]['PIM'] = int(pen_pim[1])

#                 ppg_ppo = extract_text(cells[2]).split('‑')
#                 linescore_data[i]['PPG'] = int(ppg_ppo[0])
#                 linescore_data[i]['PPO'] = int(ppg_ppo[1])

#                 fow_fol = extract_text(cells[3]).split('‑')
#                 linescore_data[i]['FOW'] = int(fow_fol[0])
#                 linescore_data[i]['FOL'] = int(fow_fol[1])
#                 total_fo = linescore_data[i]['FOW'] + linescore_data[i]['FOL']
#                 linescore_data[i]['FOW%'] = (linescore_data[i]['FOW'] / total_fo) * 100 if total_fo > 0 else None
#             except (ValueError, IndexError) as e:
#                 logging.warning(f"Error processing PP data for row {i + 1}: {e}")
#                 continue

#     # Convert linescore data to DataFrame
#     df = pd.DataFrame(linescore_data)

#     # Dynamically ensure columns exist for all periods
#     max_period = max(
#         [int(col[len('goals'):]) for col in df.columns if col.startswith('goals') and col[len('goals'):].isdigit()],
#         default=3  # Default to 3 periods for standard games
#     )
#     expected_goals_columns = [f'goals{i}' for i in range(1, max_period + 1)] + ['goalsT']
#     expected_shots_columns = [f'shots{i}' for i in range(1, max_period + 1)] + ['shotsT']

#     for col in expected_goals_columns + expected_shots_columns:
#         if col not in df.columns:
#             df[col] = 0

#     return df

# def parse_game_details(html_content):
#     """
#     Parses game details from the HTML content.
#     Args:
#         html_content (str): The HTML content of the page.

#     Returns:
#         pd.DataFrame: Game details as a DataFrame.
#     """
#     soup = BeautifulSoup(html_content, 'html.parser')

#     # Validate the meta div
#     meta_div = safe_find(soup, 'div', {'id': 'meta'}, "Meta div not found")
#     if meta_div is None:
#         return pd.DataFrame()

#     # Validate the game details div
#     game_details_div = safe_find(meta_div, 'div', error_message="Game details div not found", attrs=None)
#     if game_details_div is None:
#         return pd.DataFrame()

#     try:
#         # Extract date and day of the week
#         date_str = extract_text(game_details_div.find('h4'))
#         day_of_week, date = date_str.split(", ", 1) if ", " in date_str else (None, None)

#         # Extract conference and location
#         p_elements = game_details_div.find_all('p')
#         conference, location = None, None
#         for p in p_elements:
#             if "Game" in p.get_text():
#                 details_strs = p.get_text(separator='|').split('|')
#                 conference = details_strs[0]
#                 location = details_strs[-1].split('at ')[-1] if 'at ' in details_strs[-1] else None
#                 break

#         # Extract referees and assistant referees
#         refs_str, asst_refs_str = None, None
#         for p in p_elements:
#             if "Referees" in p.get_text():
#                 refs_str = extract_text(p.find('strong'))
#                 asst_refs_str = extract_text(p.find_all('strong')[1]) if len(p.find_all('strong')) > 1 else None
#                 break

#         refs = [re.sub(r'[^a-zA-Z ]+', '', ref).strip() for ref in refs_str.split(', ')] if refs_str else []
#         asst_refs = [re.sub(r'[^a-zA-Z ]+', '', ref).strip() for ref in asst_refs_str.split(', ')] if asst_refs_str else []

#         # Extract attendance using regex
#         attendance_pattern = r"Attendance:\s?(\d+[\d,]*)"
#         attendance_match = re.search(attendance_pattern, html_content)
#         attendance = int(attendance_match.group(1).replace(',', '')) if attendance_match else None

#         # Extract additional game details (e.g., shootout results)
#         details = None
#         for p in p_elements:
#             if "shootout" in p.get_text():
#                 details = p.get_text().replace('\n', ' ').replace('\t', ' ').strip()
#                 break

#         # Build game details dictionary
#         game_details = {
#             'Day': day_of_week,
#             'Date': date,
#             'Conference': conference,
#             'Details': details,
#             'Location': location,
#             'Ref1': refs[0] if refs else None,
#             'Ref2': refs[1] if len(refs) > 1 else None,
#             'Asst_Ref1': asst_refs[0] if asst_refs else None,
#             'Asst_Ref2': asst_refs[1] if len(asst_refs) > 1 else None,
#             'Attendance': attendance
#         }

#         return pd.DataFrame([game_details])

#     except (AttributeError, IndexError, ValueError) as e:
#         logging.error(f"Error while parsing game details: {e}")
#         return pd.DataFrame()

# def parse_new_advanced_metrics(html_content):
#     """
#     Parses advanced team metrics tables and returns data as a single DataFrame.
#     Args:
#         html_content (str): The HTML content of the page.

#     Returns:
#         pd.DataFrame: Advanced metrics data as a DataFrame.
#     """
#     soup = BeautifulSoup(html_content, 'html.parser')

#     # Find all tables with advanced metrics
#     tables = soup.find_all('table', {'class': 'sortable metrics'})
#     if not tables:
#         logging.error("No advanced metrics tables found")
#         return pd.DataFrame()

#     # List to store all parsed data
#     all_data = []

#     for table in tables:
#         # Extract team name
#         team_name_cell = safe_find(table, 'td', error_message="Team name cell not found in metrics table")
#         team_name = extract_text(team_name_cell, "Unknown Team")

#         # Extract headers
#         headers = [
#             extract_text(header) for header in table.find_all('th')
#         ][1:]  # Skip the "Player" header

#         # Prepare column headers
#         col_names = ['Team', 'Player'] + headers

#         # Extract player data
#         rows = extract_table_rows(table, "No rows found in metrics table")[2:]  # Skip two header rows
#         for row in rows:
#             player_data = [team_name]  # Start with team name
#             cells = row.find_all('td')
#             if not cells:
#                 logging.warning(f"No data cells found for a row in table for team {team_name}")
#                 continue

#             player_data.append(extract_text(cells[0]))  # Player name
#             for cell in cells[1:]:
#                 player_data.append(extract_text(cell))  # Add the rest of the cell values

#             all_data.append(player_data)

#     # Convert data to DataFrame
#     try:
#         df = pd.DataFrame(all_data, columns=col_names)
#     except ValueError as e:
#         logging.error(f"Error creating DataFrame: {e}")
#         return pd.DataFrame()

#     return df



###### Function to Scr4ape and Score Games using New Subfunctions

In [9]:
# def scrape_and_store_games(games_to_scrape, base_url, conn):
#     """
#     Scrapes data for a list of games, applies scraping subfunctions, and stores the results.
#     Args:
#         games_to_scrape (pd.DataFrame): DataFrame containing games to scrape.
#         base_url (str): Base URL of the website.
#         conn (sqlite3.Connection): Database connection to store results.

#     Returns:
#         None
#     """
#     import time

#     # Initialize lists to collect data
#     all_game_details = []
#     all_scoring_summaries = []
#     all_penalty_summaries = []
#     all_goalie_stats = []
#     all_player_stats = []
#     all_line_charts = []
#     all_linescores = []
#     all_advanced_metrics = []

#     for _, game in games_to_scrape.iterrows():
#         game_id = game['Game_ID']
#         box_score_url = f"{base_url}/{game['Box_Link']}"
#         metrics_url = f"{base_url}/{game['Metrics_Link']}" if game['Metrics_Link'] else None

#         logging.info(f"Scraping game: {game_id}")

#         try:
#             # Request box score page
#             response = requests.get(box_score_url)
#             response.raise_for_status()
#             box_score_html = response.text

#             # Call scraping subfunctions
#             game_details = parse_game_details(box_score_html)
#             scoring_summary = parse_scoring_summary(box_score_html)
#             penalty_summary = parse_penalty_summary(box_score_html)
#             goalie_stats = parse_goalie_stats(box_score_html)
#             player_stats = parse_player_summary(box_score_html)
#             line_chart = parse_line_chart(box_score_html)
#             linescore = parse_linescore(box_score_html)

#             # Request advanced metrics page (if available)
#             if metrics_url:
#                 response_metrics = requests.get(metrics_url)
#                 response_metrics.raise_for_status()
#                 metrics_html = response_metrics.text
#                 advanced_metrics = parse_new_advanced_metrics(metrics_html)
#             else:
#                 advanced_metrics = pd.DataFrame()

#             # Append results to lists
#             if not game_details.empty:
#                 game_details['Game_ID'] = game_id
#                 all_game_details.append(game_details)

#             if not scoring_summary.empty:
#                 scoring_summary['Game_ID'] = game_id
#                 all_scoring_summaries.append(scoring_summary)

#             if not penalty_summary.empty:
#                 penalty_summary['Game_ID'] = game_id
#                 all_penalty_summaries.append(penalty_summary)

#             if not goalie_stats.empty:
#                 goalie_stats['Game_ID'] = game_id
#                 all_goalie_stats.append(goalie_stats)

#             if not player_stats.empty:
#                 player_stats['Game_ID'] = game_id
#                 all_player_stats.append(player_stats)

#             if not line_chart.empty:
#                 line_chart['Game_ID'] = game_id
#                 all_line_charts.append(line_chart)

#             if not linescore.empty:
#                 linescore['Game_ID'] = game_id
#                 all_linescores.append(linescore)

#             if not advanced_metrics.empty:
#                 advanced_metrics['Game_ID'] = game_id
#                 all_advanced_metrics.append(advanced_metrics)

#             logging.info(f"Successfully scraped game: {game_id}")

#         except Exception as e:
#             logging.error(f"Failed to scrape game {game_id}: {e}")

#         # Sleep to avoid overloading the server
#         time.sleep(1)

#     # Consolidate and store results in the database
#     try:
#         if all_game_details:
#             pd.concat(all_game_details).to_sql('game_details', conn, if_exists='append', index=False)
#         if all_scoring_summaries:
#             pd.concat(all_scoring_summaries).to_sql('scoring_summary', conn, if_exists='append', index=False)
#         if all_penalty_summaries:
#             pd.concat(all_penalty_summaries).to_sql('penalty_summary', conn, if_exists='append', index=False)
#         if all_goalie_stats:
#             pd.concat(all_goalie_stats).to_sql('goalie_stats', conn, if_exists='append', index=False)
#         if all_player_stats:
#             pd.concat(all_player_stats).to_sql('player_stats', conn, if_exists='append', index=False)
#         if all_line_charts:
#             pd.concat(all_line_charts).to_sql('line_chart', conn, if_exists='append', index=False)
#         if all_linescores:
#             pd.concat(all_linescores).to_sql('linescore', conn, if_exists='append', index=False)
#         if all_advanced_metrics:
#             pd.concat(all_advanced_metrics).to_sql('advanced_metrics', conn, if_exists='append', index=False)

#         logging.info("All data successfully stored in the database.")
#     except Exception as e:
#         logging.error(f"Failed to store data in the database: {e}")


In [10]:
# ### Call the Function
# # Example: Scraping games and storing results
# scrape_and_store_games(
#     games_to_scrape=games_to_scrape,  # DataFrame of games to scrape
#     base_url='https://www.collegehockeynews.com',  # Base URL
#     conn=conn  # SQLite database connection
# )


##### END REFACTORED CODE - CURRENTLY NOT WORKING

<!-- ##### LEGACY CODE - WORKING FROM G_D_S_1
-  -->

In [11]:

#### PARSE PLAYER STATS TABLE ####
def parse_player_summary(html_content):
    # Initialize BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find the playersums div
    playersums_div = soup.find('div', id='playersums')
    if playersums_div is None:
        return "Player summaries div not found"

    # Initialize list to store player stats
    player_stats = []

    # Loop through each playersum div
    for player_sum in playersums_div.find_all('div', class_='playersum'):
        team = player_sum.find('td').text.strip()

        # Replace abbreviation and clean team name
        team = abbreviation_to_fullname.get(team, team)
        team = clean_team_name(team)
        
        # Loop through table rows
        for row in player_sum.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) > 1:
                player = cols[0].text.strip()
                goals = cols[1].text.strip()
                assists = cols[2].text.strip()
                points = cols[3].text.strip()
                plus_minus = cols[4].text.strip()
                shots = cols[5].text.strip()
                pim = cols[6].text.strip()
                fowl = cols[7].text.strip() if len(cols) > 7 else None
                
                fow, fol = None, None
                win_percentage = None
                
                

                try:
                    if fowl and '‑' in fowl:  # Checking if it contains a hyphen
                        fow, fol = map(int, fowl.split('‑'))
                        total_fo = fow + fol
                        win_percentage = (fow / total_fo) * 100 if total_fo > 0 else 0
                except ValueError:
                    fow, fol, win_percentage = None, None, None

                

                
                player_stat = {
                    'Team': team,
                    'Player': player,
                    'G': goals,
                    'A': assists,
                    'Pt.': points,
                    '+/-': plus_minus,
                    'Sh': shots,
                    'PIM': pim,
                    'FOW': fow,
                    'FOL': fol,
                    'FO%': win_percentage
                }
                player_stats.append(player_stat)

    return pd.DataFrame(player_stats)


############# PARSEING SCORING SUMMARY WITH BS4
def parse_scoring_summary(html_content):
    # Initialize BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the scoring div and table
    scoring_div = soup.find('div', id='scoring')
    if scoring_div is None:
        logging.error("Scoring div not found")
        return None

    scoring_table = scoring_div.find('table')
    if scoring_table is None:
        logging.error("Scoring table not found within the scoring div")
        return None

    # Initialize list to store scoring events
    scoring_events = []
    period = None

    # Loop through table rows
    for row in scoring_table.find_all('tr'):
        if 'stats-section' in row.get('class', []):
            td = row.find('td')
            if td:
                period = td.text.strip()
            else:
                logging.warning("Period name not found in 'stats-section' row")
                period = "Unknown"
        else:
            cols = row.find_all('td')
            if len(cols) > 1:
                try:
                    team = cols[0].text.strip()
                    team = abbreviation_to_fullname.get(team, team)  # Replace abbreviation
                    team = clean_team_name(team)  # Clean team name
                    pp = cols[1].text.strip()

                    player_data = cols[3].text.strip()
                    match = re.match(r"(.+)\s\((\d+)\)", player_data)
                    player = match.group(1) if match else player_data
                    goals = int(match.group(2)) if match else None

                    assist_data_raw = cols[4].text.strip()
                    assist_data = assist_data_raw.split(", ") if assist_data_raw else []
                    assist1 = assist_data[0] if len(assist_data) > 0 else None
                    assist2 = assist_data[1] if len(assist_data) > 1 else None

                    time = cols[5].text.strip()

                    scoring_event = {
                        'Period': period,
                        'Team': team,
                        'PP': pp,
                        'Player': player,
                        'Player_Goals': goals,
                        'Assist1': assist1,
                        'Assist2': assist2,
                        'Time': time
                    }
                    scoring_events.append(scoring_event)
                except Exception as e:
                    logging.error(f"An error occurred while parsing a scoring event row: {e}")
            else:
                logging.warning(f"Insufficient columns in scoring row: {len(cols)}")

    return pd.DataFrame(scoring_events)


############# PARSEING PENALTY SUMMARY WITH BS4
def parse_penalty_summary(html_content):
    # Initialize BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the penalties div and table
    penalties_div = soup.find('div', id='penalties')
    if penalties_div is None:
        logging.error("Penalties div not found")
        return None

    penalties_table = penalties_div.find('table')
    if penalties_table is None:
        logging.error("Penalties table not found within the penalties div")
        return None

    # Initialize list to store penalty events
    penalty_events = []
    period = None

    # Loop through table rows
    for row in penalties_table.find_all('tr'):
        if 'stats-section' in row.get('class', []):
            td = row.find('td')
            if td:
                period = td.text.strip()
            else:
                logging.warning("Period name not found in 'stats-section' row")
                period = "Unknown"
        else:
            cols = row.find_all('td')
            if len(cols) > 1:
                team = cols[0].text.strip()
                team = abbreviation_to_fullname.get(team, team)  # Replace abbreviation
                team = clean_team_name(team)  # Clean team name

                player = cols[1].text.strip()
                pen_length = cols[2].text.strip()
                penalty_type = cols[3].text.strip()
                time = cols[4].text.strip()

                penalty_event = {
                    'Period': period,
                    'Team': team,
                    'Player': player,
                    'Pen_Length': pen_length,
                    'Penalty_Type': penalty_type,
                    'Time': time
                }
                penalty_events.append(penalty_event)

    return pd.DataFrame(penalty_events)


############# GOALIE SUMMARY WITH BS4
def parse_goalie_stats(html_content):
    # Initialize BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the goalies div and table
    goalies_div = soup.find('div', id='goalies')
    if goalies_div is None:
        logging.error("Goalies div not found")
        return None

    goalies_table = goalies_div.find('table')
    if goalies_table is None:
        logging.error("Goalies table not found within the goalies div")
        return None

    # Initialize list to store goalie stats
    goalie_stats = []
    team = None

    # Loop through table rows   
    for row in goalies_table.find_all('tr'):
        if 'stats-header' in row.get('class', []):  # Team header rows
            td = row.find('td')
            team = td.text.strip() if td else "Unknown"
            # Replace abbreviation and clean team name
            team = abbreviation_to_fullname.get(team, team)
            team = clean_team_name(team)
        else:  # Data rows
            cols = row.find_all('td')
            if len(cols) > 1:
                goalie = cols[0].text.strip()
                sv = cols[1].text.strip()
                ga = cols[2].text.strip()
                minutes = cols[3].text.strip()

                # Build goalie stat dictionary
                goalie_stat = {
                    'Team': team,
                    'Goalie': goalie,
                    'SV': sv,
                    'GA': ga,
                    'Minutes': minutes
                }
                goalie_stats.append(goalie_stat)

    # Convert list to DataFrame
    return pd.DataFrame(goalie_stats)


#### PARSE THE ADVANCED TEAM METRICS TABLES ####
### RETURNS WHOLE ADVANCED METRICS AS SINGLE TABLE
####################################
def parse_new_advanced_metrics(html_content):
    # Parse HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find all tables with advanced metrics
    tables = soup.find_all('table', {'class': 'sortable metrics'})
    
    # List to store all parsed data
    all_data = []
    
    for table in tables:
        # Extract team name from the table header
        team_name = table.find('td').text.strip()
        team_name = abbreviation_to_fullname.get(team_name, team_name)  # Replace abbreviation
        team_name = clean_team_name(team_name)  # Clean team name
        
        # Extract headers (skipping the Player header)
        headers = [header.text for header in table.find_all('th')][1:]
        
        # Prepare final column headers
        col_names = ['Team', 'Player']
        for header in headers:
            col_names.append(header)
        
        # Extract player data
        rows = table.find_all('tr')[2:]  # skipping the two header rows
        for row in rows:
            player_data = [team_name]  # start with team name
            cells = row.find_all('td')
            player_data.append(cells[0].text.strip())  # player name
            for cell in cells[1:]:
                player_data.append(cell.text.strip())
            all_data.append(player_data)
    
    # Convert the list of data to a DataFrame
    df = pd.DataFrame(all_data, columns=col_names)
    return df

######## NEW TEST ###############  
def parse_advanced_metrics_tables(html_content):
    # Parse HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find all tables with advanced metrics
    tables = soup.find_all('table', {'class': 'sortable metrics'})
    
    # List to store all parsed data
    all_data = []
    
    for table in tables:
        # Extract team name from the table header
        team_name = table.find('td').text.strip()
        team_name = abbreviation_to_fullname.get(team_name, team_name)  # Replace abbreviation
        team_name = clean_team_name(team_name)  # Clean team name
        
        # Extract headers (skipping the Player header)
        headers = [header.text for header in table.find_all('th')][1:]
        
        # Prepare final column headers
        col_names = ['Team', 'Player']
        for header in headers:
            col_names.append(header)
        
        # Extract player data
        rows = table.find_all('tr')[2:]  # skipping the two header rows
        for row in rows:
            player_data = [team_name]  # start with team name
            cells = row.find_all('td')
            player_data.append(cells[0].text.strip())  # player name
            for cell in cells[1:]:
                player_data.append(cell.text.strip())
            all_data.append(player_data)
    
    # Convert the list of data to a DataFrame
    df = pd.DataFrame(all_data, columns=col_names)
    return df

# Parsing the line chart information with specific positions for forwards and defensemen.
def parse_line_chart(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    line_chart_div = soup.find('div', id='linechart')

    if line_chart_div is None:
        logging.error("Line chart div not found")
        return pd.DataFrame()

    line_data = []

    for team_div in line_chart_div.find_all('div', recursive=False):
        h3 = team_div.find('h3')
        if h3 is None:
            logging.warning("Team name not found")
            continue
        
        team_name = h3.text.strip()
        # Replace abbreviation and clean team name
        team_name = abbreviation_to_fullname.get(team_name, team_name)
        team_name = clean_team_name(team_name)
        
        for line_type_div in team_div.find_all('div', recursive=False):
            line_type = line_type_div.get('class')[0] if line_type_div.get('class') else None
            if line_type is None:
                logging.warning("Line type not found")
                continue
            
            if line_type == 'f':
                position_types = ['Left Wing', 'Center', 'Right Wing']
            elif line_type == 'd':
                position_types = ['Left D', 'Right D']
            elif line_type == 'x':
                position_types = ['Extra']
            elif line_type == 'g':
                position_types = ['Goalie']
                goalie_count = 1  # Initialize goalie count
            else:
                continue

            players = line_type_div.find_all('div')
            if not players:
                logging.warning(f"No players found for {team_name} in {line_type}")
                continue
            
            for i, player in enumerate(players):
                player_name = player.text.strip()
                if line_type == 'x':
                    player_name = player_name.split(' ')[0]
                if line_type == 'g':
                    line_number = f"Goalie {goalie_count}"
                    goalie_count += 1
                else:
                    line_number = i // len(position_types) + 1

                position = position_types[i % len(position_types)]
                line_data.append({
                    'Team': team_name,
                    'Line': line_number,
                    'Position': position,
                    'Player': player_name
                })

    if not line_data:
        logging.error("No line data was collected")

    df = pd.DataFrame(line_data)
    
    # # Log DataFrame info for debugging
    # if df.empty:
    #     logging.warning("Generated line chart DataFrame is empty.")
    # else:
    #     logging.info(f"Generated line chart DataFrame with columns: {df.columns.tolist()}")

    return df

from sqlalchemy import inspect, text

def ensure_columns_exist(table_name, columns, engine):
    """
    Ensures that the specified columns exist in the given table. Adds them if they are missing.
    
    Args:
        table_name (str): Name of the table.
        columns (list): List of column names to check/add.
        engine (sqlalchemy.engine): SQLAlchemy database engine.
    """
    inspector = inspect(engine)
    existing_columns = [col['name'] for col in inspector.get_columns(table_name)]

    missing_columns = [col for col in columns if col not in existing_columns]

    if missing_columns:
        with engine.connect() as conn:
            for col in missing_columns:
                sql = text(f"ALTER TABLE {table_name} ADD COLUMN {col} INTEGER DEFAULT 0;")
                conn.execute(sql)
                logging.info(f"Added missing column: {col} to table: {table_name}")

### Get the Linescore Elements - Score, shots, ect by period####
### NEEDS UPDATE NOW THAT POSTSEASON MEAND 5th, 6th, ect PERIODS
def parse_linescore(html_content):
    """
    Parses the linescore tables (Goals, Shots, PP) and dynamically handles periods and schema alignment.

    Args:
        html_content (str): HTML content of the page.

    Returns:
        pd.DataFrame: Linescore data as a DataFrame.
    """
    # Initialize BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    linescore_data = []
    max_periods = 0

    # Parsing the Goals table
    goals_table = soup.select_one("#goals table")
    if goals_table is None:
        logging.error("Goals table not found")
        return pd.DataFrame()

    rows = goals_table.select('tbody tr')
    if not rows:
        logging.warning("No rows found in Goals table")
        return pd.DataFrame()

    for row in rows:
        team_data = {}
        td = row.select_one('td')
        if td:
            team_data['Team'] = td.text.strip()
        else:
            logging.warning("Team name not found in Goals table")
            continue

        goals = row.select('td')[1:]
        max_periods = max(max_periods, len(goals) - 1)  # Update max periods dynamically
        for i, goal in enumerate(goals):
            column_name = f'goals{i + 1}' if i < len(goals) - 1 else 'goalsT'
            try:
                team_data[column_name] = int(goal.text.strip())
            except ValueError:
                team_data[column_name] = None
                logging.warning(f"Invalid goal value in column {column_name}")

        linescore_data.append(team_data)

    # Parsing the Shots table
    shots_table = soup.select_one("#shots table")
    if shots_table is None:
        logging.error("Shots table not found")
        return pd.DataFrame()

    rows = shots_table.select('tbody tr')
    if not rows:
        logging.warning("No rows found in Shots table")
        return pd.DataFrame()

    for i, row in enumerate(rows):
        shots = row.select('td')[1:]
        if not shots:
            logging.warning(f"No shot data found for row {i + 1} in Shots table")
            continue

        for j, shot in enumerate(shots):
            column_name = f'shots{j + 1}' if j < len(shots) - 1 else 'shotsT'
            try:
                if len(linescore_data) > i:
                    linescore_data[i][column_name] = int(shot.text.strip())
                else:
                    logging.error(f"Mismatch in rows for shots and goals for row {i + 1}")
            except ValueError:
                if len(linescore_data) > i:
                    linescore_data[i][column_name] = None
                logging.warning(f"Invalid shot value in column {column_name} for row {i + 1}")

    # Convert to DataFrame early
    df = pd.DataFrame(linescore_data)

    # Ensure all columns exist dynamically
    expected_goals_columns = [f'goals{i}' for i in range(1, max_periods + 1)] + ['goalsT']
    expected_shots_columns = [f'shots{i}' for i in range(1, max_periods + 1)] + ['shotsT']

    for col in expected_goals_columns + expected_shots_columns:
        if col not in df.columns:
            df[col] = 0  # Add missing columns with default value 0

    return df



# Function to parse game details table
def parse_game_details(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    meta_div = soup.find('div', {'id': 'meta'})
    if meta_div is None:
        logging.error("Meta div not found")
        return None
    
    game_details_div = meta_div.find_all('div')[-1]
    if game_details_div is None:
        logging.error("Game details div not found")
        return None
    
    try:
        date_str = game_details_div.h4.string
        day_of_week, date = date_str.split(", ", 1)
        
        p_elements = game_details_div.find_all('p')
        
        # Extract conference and location details
        for p in p_elements:
            if "Game" in p.text:  # e.g., "Big Ten Game"
                details_strs = p.get_text(separator='|').split('|')
                conference = details_strs[0]
                location = details_strs[-1].split('at ')[-1]
                break
        else:  # Defaults if not found
            conference, location = None, None
        
        # Extract referees and assistant referees details
        for p in p_elements:
            if "Referees" in p.text:
                refs_str = p.strong.next_sibling if p.strong else None
                asst_refs_str = p.find_all('strong')[1].next_sibling if len(p.find_all('strong')) > 1 else None
                break
        else:  # Defaults if not found
            refs_str, asst_refs_str = None, None
        
        refs = refs_str.split(', ') if refs_str else []
        asst_refs = asst_refs_str.split(', ') if asst_refs_str else []
        refs = [re.sub(r'[^a-zA-Z ]+', '', ref).strip() for ref in refs]
        asst_refs = [re.sub(r'[^a-zA-Z ]+', '', ref).strip() for ref in asst_refs]
        
        # Extract attendance details using regex for better accuracy
        attendance_pattern = r"Attendance:\s?(\d+[\d,]*)"
        attendance_match = re.search(attendance_pattern, html_content)
        attendance = int(attendance_match.group(1).replace(',', '')) if attendance_match else None
        
        # Extract game details (like shootout results)
        details = None
        for p in p_elements:
            if "shootout" in p.text:
                details = p.text
                break
        
        # Clean details if present
        if details and '\n' in details:
            details = details.replace('\n', '').strip()
        if details and '\t' in details:
            details = re.sub('\t', ' ', details)
        
        game_details = {
            'Day': day_of_week,
            'Date': date,
            'Conference': conference,
            'Details': details,
            'Location': location,
            'Ref1': refs[0] if refs else None,
            'Ref2': refs[1] if len(refs) > 1 else None,
            'Asst_Ref1': asst_refs[0] if asst_refs else None,
            'Asst_Ref2': asst_refs[1] if len(asst_refs) > 1 else None,
            'Attendance': attendance
        }
        
        game_details_df = pd.DataFrame([game_details])
        return game_details_df

    except (AttributeError, IndexError, ValueError) as e:
        logging.error(f"Error while parsing game details: {e}")
        return None


# Parse the box score page - player stats table (G, A, Pt, +/-, Sh, PIM)
def parse_box_score(box_score_html):
    # Initialize DataFrames to None
    scoring_summary = penalty_summary = goalie_stats = player_stats = line_chart = linescore = game_details = None
    
    try:
        scoring_summary = parse_scoring_summary(box_score_html)
    except Exception as e:
        print(f"Error in parse_scoring_summary: {e}")
    
    try:
        penalty_summary = parse_penalty_summary(box_score_html)
    except Exception as e:
        print(f"Error in parse_penalty_summary: {e}")
    
    try:
        goalie_stats = parse_goalie_stats(box_score_html)
    except Exception as e:
        print(f"Error in parse_goalie_stats: {e}")
    
    try:
        player_stats = parse_player_summary(box_score_html)
    except Exception as e:
        print(f"Error in parse_player_summary: {e}")
    
    try:
        line_chart = parse_line_chart(box_score_html)
        if line_chart.empty:
            logging.info("Line chart is empty. Skipping the insert for this game.")
        else:
            logging.info(f"Line chart DataFrame structure: {line_chart.dtypes}")

        # Insert into database (make sure this part works as expected)

    except Exception as e:
        logging.error(f"Error in parse_line_chart: {e}")


    try:
        linescore = parse_linescore(box_score_html)

        if not linescore.empty:
            process_and_save_linescore(linescore, engine, table_name="linescore")
        else:
            logging.warning("No linescore data to save for this game.")
    except Exception as e:
        logging.error(f"Error processing linescore: {e}")
    
    # try:
    #     linescore = parse_linescore(box_score_html)
    # except Exception as e:
    #     print(f"Error in parse_linescore: {e}")
    
    try:
        game_details = parse_game_details(box_score_html)
    except Exception as e:
        print(f"Error in parse_game_details: {e}")
    
    # Combine DataFrames into a list
    all_dfs = [game_details, scoring_summary, penalty_summary, goalie_stats, player_stats, line_chart, linescore]
    
    return all_dfs

def rename_duplicate_columns(df):
    """
    Renames duplicate columns in a DataFrame to make them unique.
    Args:
        df (pd.DataFrame): The DataFrame to process.

    Returns:
        pd.DataFrame: DataFrame with unique column names.
    """
    cols = pd.Series(df.columns)
    for dup in df.columns[df.columns.duplicated()].unique(): 
        cols[df.columns.get_loc(dup)] = [f"{dup}_{i}" if i != 0 else dup for i in range(sum(df.columns == dup))]
    df.columns = cols
    return df


##### End Legacy Code

In [12]:
from tqdm import tqdm
import logging
import requests
import pandas as pd
import time
from sqlalchemy import create_engine
from sqlalchemy import inspect, text

def ensure_columns_exist(table_name, columns, engine):
    """
    Ensures that the specified columns exist in the given table. Adds them if missing.

    Args:
        table_name (str): Name of the table.
        columns (list): List of column names to check/add.
        engine (sqlalchemy.engine): SQLAlchemy database engine.
    """
    inspector = inspect(engine)
    existing_columns = [col['name'] for col in inspector.get_columns(table_name)]

    missing_columns = [col for col in columns if col not in existing_columns]

    if missing_columns:
        with engine.connect() as conn:
            for col in missing_columns:
                sql = text(f"ALTER TABLE {table_name} ADD COLUMN {col} INTEGER DEFAULT 0;")
                conn.execute(sql)
                logging.info(f"Added missing column: {col} to table: {table_name}")

def process_and_save_linescore(linescore_df, engine, table_name="linescore"):
    """
    Process and save the linescore DataFrame to the database, ensuring schema alignment.

    Args:
        linescore_df (pd.DataFrame): Linescore DataFrame.
        engine (sqlalchemy.engine): Database engine.
        table_name (str): Table name in the database.
    """
    # Dynamically ensure all columns exist in the database table
    ensure_columns_exist(table_name, linescore_df.columns, engine)

    # Save the DataFrame to the database
    linescore_df.to_sql(table_name, engine, if_exists='append', index=False)
    logging.info(f"Linescore data saved to table: {table_name}")




def fetch_and_save_data(row, base_url, game_id, conn):
    """
    Fetches and parses data for a single game, then saves it to the database.
    Args:
        row (pd.Series): Row from the games DataFrame containing game details.
        base_url (str): Base URL for the scraping website.
        game_id (str): Unique game identifier.
        conn (sqlite3.Connection): SQLite connection to the database.

    Returns:
        bool: True if successful, False otherwise.
    """
    try:
        box_score_url = f"{base_url}{row['Box_Link']}"
        metrics_url = f"{base_url}{row['Metrics_Link']}" if row['Metrics_Link'] else None

        # Fetch HTML for box score
        box_score_response = requests.get(box_score_url, timeout=10)
        box_score_response.raise_for_status()
        box_score_html = box_score_response.text

        # Parse box score data
        box_score_dfs = parse_box_score(box_score_html)

        # Fetch and parse advanced metrics if available
        if metrics_url:
            metrics_response = requests.get(metrics_url, timeout=10)
            metrics_response.raise_for_status()
            metrics_html = metrics_response.text
            advanced_metrics_df = parse_advanced_metrics_tables(metrics_html)
        else:
            advanced_metrics_df = pd.DataFrame()

        # Combine all DataFrames
        all_dfs = box_score_dfs + [advanced_metrics_df]

        # Apply Game_ID and remove duplicate columns
        for df in all_dfs:
            if df is not None and not df.empty:
                df['Game_ID'] = game_id
                df = rename_duplicate_columns(df)

        # Save data to database
        table_names = [
            'game_details', 'scoring_summary', 'penalty_summary',
            'goalie_stats', 'player_stats', 'line_chart', 'linescore', 'advanced_metrics'
        ]

        for df, table in zip(all_dfs, table_names):
            if df is not None and not df.empty:
                df.to_sql(table, conn, if_exists='append', index=False)

        logging.info(f"Successfully scraped and stored data for game: {game_id}")
        return True

    except requests.exceptions.RequestException as e:
        logging.error(f"Network error for game {game_id}: {e}")
    except Exception as e:
        logging.error(f"Error processing game {game_id}: {e}")

    return False


def scrape_games_and_store(sampled_games, base_url, conn):
    """
    Main function to scrape and store data for a list of games.
    Args:
        sampled_games (pd.DataFrame): DataFrame of games to scrape.
        base_url (str): Base URL for the scraping website.
        conn (sqlite3.Connection): SQLite connection to the database.

    Returns:
        None
    """
    error_count = 0
    error_games = []

    for _, row in tqdm(sampled_games.iterrows(), total=sampled_games.shape[0], desc="Scraping games"):
        game_id = f"{row['Date']}-{row['Home_Team']}-{row['Away_Team']}"
        retries = 3
        success = False

        while retries > 0 and not success:
            success = fetch_and_save_data(row, base_url, game_id, conn)
            if not success:
                retries -= 1
                time.sleep(5)  # Wait before retrying

        if not success:
            error_count += 1
            error_games.append(game_id)

    logging.info(f"Scraping completed with {error_count} errors.")
    if error_games:
        logging.warning(f"Failed games: {error_games}")



In [13]:
## Call the function to scrape and store data from the games
# Example: Scraping games and storing results

# Example Usage
if __name__ == "__main__":
    # Set up database connection
    db_path = "../TEMP/CHN_Scrape_TEST_6.db"
    conn = sqlite3.connect(db_path)

    # Set up logging
    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

    
    # Base URL
    base_url = "https://www.collegehockeynews.com"

    # Scrape and store games
    scrape_games_and_store(games_to_scrape, base_url, conn)

    # Close database connection
    conn.close()



Scraping games:   0%|          | 0/130 [00:00<?, ?it/s]2024-12-09 16:21:29,366 - INFO - Line chart DataFrame structure: Team        object
Line        object
Position    object
Player      object
dtype: object
2024-12-09 16:21:29,455 - INFO - Linescore data saved to table: linescore
2024-12-09 16:21:30,046 - INFO - Successfully scraped and stored data for game: 2024-10-05-Maine-American Intl
Scraping games:   1%|          | 1/130 [00:01<03:39,  1.70s/it]2024-12-09 16:21:30,956 - INFO - Line chart DataFrame structure: Team        object
Line        object
Position    object
Player      object
dtype: object
2024-12-09 16:21:31,031 - INFO - Linescore data saved to table: linescore
2024-12-09 16:21:31,616 - ERROR - Error processing game 2024-10-05-St Lawrence-RIT: table linescore has no column named goals4
2024-12-09 16:21:37,618 - INFO - Line chart DataFrame structure: Team        object
Line        object
Position    object
Player      object
dtype: object
2024-12-09 16:21:37,693 - INFO 

In [14]:
## print table list from the database
table_list = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)
print(table_list)

ProgrammingError: Cannot operate on a closed database.

In [None]:
# df.head(20)