## Adapted from Current_season_scrape notebook

In [1]:
## Dependencies
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import logging
from tqdm import tqdm
import re
import os
from sqlalchemy import create_engine


import sqlite3

## Global Variables
### Path to the schedule / results table on College Hockey News
current_year_url = 'https://www.collegehockeynews.com/schedules/?season=20232024'

# Path to database file - using os.path.join to make it OS agnostic
# NOTE(review): an earlier version of this comment pointed at data\db\Current_YTD_Stats.db;
# the file actually opened below is END_FEB_YTD_Stats.db - confirm which one is intended.
db_path = os.path.join('..', 'data', 'db', 'END_FEB_YTD_Stats.db') 

# Path to log file
log_path = '../TEMP/SCRAPE_Current_YTD_Stats.log'

### Check DB path and tables
# Fail fast: sqlite3.connect() would otherwise create a new, empty database file.
if not os.path.exists(db_path):
    raise ValueError(f'Database file not found at {db_path}')

conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    # Print table names in database
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    print(cursor.fetchall())

    # Count and print the number of Games in the database (unique game_id's)
    cursor.execute("SELECT COUNT(DISTINCT game_id) FROM game_details;")
    print(cursor.fetchall())
finally:
    # This connection is only used for the sanity check above; release it so the
    # database file is not held open for the rest of the session.
    conn.close()



[('goalie_stats',), ('line_chart',), ('advanced_metrics',), ('game_details',), ('player_stats',), ('linescore',), ('penalty_summary',), ('scoring_summary',), ('player_stats_ytd',), ('master_roster',)]
[(1006,)]


## Get Schedule / Results Table

In [2]:
# Parses the season schedule page and returns a list of list with all games with links to box scores and metrics
def parse_current_season(url, timeout=30):
    """Download the season schedule page and return one record per listed game.

    Args:
        url: Address of the schedule / results page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of 12-item lists in the order
        [date, conference, game_notes, home_team, home_team_link, home_score,
        away_team, away_team_link, away_score, ot, box_link, metrics_link].
        Links are the raw ``href`` values found on the page (or None).

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    def link_of(cell):
        # href of the first anchor in the cell, or None when there is no anchor
        anchor = cell.find('a')
        return anchor['href'] if anchor else None

    response = requests.get(url, timeout=timeout)
    # Do not silently parse an error page into an empty schedule.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Date and conference appear as header rows and apply to the game rows after them.
    current_date = None
    current_conference = None
    data = []

    for row in soup.find_all('tr'):
        row_class = row.get('class')
        if row_class == ['stats-section']:
            # Date header row
            current_date = row.find('td').text.strip()
            continue
        if row_class == ['sked-header']:
            # Conference header row
            current_conference = row.find('td').text.strip()
            continue
        if row.get('valign') != 'top':
            continue

        cells = row.find_all('td')
        if len(cells) < 9:
            # Not a full game row
            continue

        # Hyphens are stripped from the home team name here; the away team name is
        # stored exactly as scraped.
        home_team = cells[0].text.strip().replace('-', ' ')
        # Game notes are read from a <small> tag in the last cell of the game row.
        game_notes_cell = cells[-1].find('small')
        game_notes = game_notes_cell.text.strip() if game_notes_cell else None

        data.append([
            current_date,
            current_conference,
            game_notes,
            home_team,
            link_of(cells[0]),
            cells[1].text.strip(),
            cells[3].text.strip(),
            link_of(cells[3]),
            cells[4].text.strip(),
            cells[5].text.strip(),
            link_of(cells[7]),
            link_of(cells[8]),
        ])
    return data

## call the function
data = parse_current_season(current_year_url)


# Create a dataframe from the list

columns = ['Date', 'Conference', 'Game_Notes', 'Home_Team', 'Home_Team_Link', 'Home_Score', 'Away_Team', 'Away_Team_Link', 'Away_Score', 'OT', 'Box_Link', 'Metrics_Link']
df = pd.DataFrame(data, columns=columns)

## Split the scraped date into a day-of-week column and a YYYY-MM-DD date string.
# Parse once and reuse the result for both columns.
parsed_dates = pd.to_datetime(df['Date'])
df['Day'] = parsed_dates.dt.day_name()
df['Date'] = parsed_dates.dt.strftime('%Y-%m-%d')

# Remove any hyphens from the team names if any
df['Home_Team'] = df['Home_Team'].str.replace('-', ' ', regex=False)
df['Away_Team'] = df['Away_Team'].str.replace('-', ' ', regex=False)

### Create a new column for the game ID
## Game ID is "<date>-<home team>-<away team>", built from the cleaned columns above.
df['Game_ID'] = [
    f'{game_date}-{home}-{away}'
    for game_date, home, away in zip(df['Date'], df['Home_Team'], df['Away_Team'])
]

## Filter out games that have not been played yet
# .copy() so the assignment below writes to an independent frame, not a slice view.
df = df[df['Home_Score'] != ''].copy()

# Replace Nan values in metrics column with empty string
df['Metrics_Link'] = df['Metrics_Link'].fillna('')

# Print the amount of Total Games Played games
print(f'Total games played so far: {len(df) }')

Total games played so far: 1186


In [3]:
### Filter to only games that have not been scraped yet
# Query the database for the games that have already been scraped
query = 'SELECT DISTINCT Game_ID FROM game_details'

conn = sqlite3.connect(db_path)
try:
    scraped_games = pd.read_sql(query, conn)
finally:
    # Close the connection even if the query fails
    conn.close()

# Print length of scraped games
print(f'Number of games already scraped: {len(scraped_games)}')

# Keep only the games whose Game_ID is not in the database yet, with a fresh 0..n-1 index
df = df[~df['Game_ID'].isin(scraped_games['Game_ID'])].reset_index(drop=True)

# RENAME THE DATAFRAME TO FEED INTO SCRAPER
unscraped_games = df

# Print the amount of games to be scraped
print(f'Number of games to be scraped: {len(unscraped_games)}')


Number of games already scraped: 1006
Number of games to be scraped: 180


## Functions - Unchanged from the Current_season_scrape notebook

In [4]:
## Functions for parsing the box score and metrics pages

# Initialize logging for Error and Warning messages (level=INFO also records info messages).
# The file path is relative to the current working directory.
# NOTE(review): the module-level `log_path` defined earlier is not used here; this call
# writes to a different, hard-coded file - confirm which log file is intended.
logging.basicConfig(filename='../TEMP/current_scrape.log', level=logging.INFO)

#### PARSE PLAYER STATS TABLE ####
def parse_player_summary(html_content):
    """Extract the per-player stat lines from a box score page.

    Args:
        html_content: Box score page HTML.

    Returns:
        DataFrame with columns Team, Player, G, A, Pt., +/-, Sh, PIM, FOW, FOL
        and FO% (one row per parsed table row), or None when the ``playersums``
        div is missing. None matches the other parsers, so callers can rely on
        an ``is not None`` check.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    playersums_div = soup.find('div', id='playersums')
    if playersums_div is None:
        logging.error("Player summaries div not found")
        return None

    player_stats = []

    for player_sum in playersums_div.find_all('div', class_='playersum'):
        # The first <td> inside each playersum block is used as the team name.
        team_cell = player_sum.find('td')
        team = team_cell.text.strip() if team_cell else None

        for row in player_sum.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) <= 1:
                continue
            if len(cols) < 7:
                # Seven cells (player .. PIM) are required; skip short rows instead
                # of failing the whole table with an IndexError.
                logging.warning(f"Skipping player row with only {len(cols)} columns")
                continue

            # Optional eighth cell holds faceoffs as "won‑lost"
            fowl = cols[7].text.strip() if len(cols) > 7 else None

            fow = fol = win_percentage = None
            if fowl and '‑' in fowl:  # Checking if it contains a hyphen
                try:
                    fow, fol = map(int, fowl.split('‑'))
                except ValueError:
                    # Not exactly two integers: leave all faceoff fields empty
                    fow = fol = None
                else:
                    total_fo = fow + fol
                    win_percentage = (fow / total_fo) * 100 if total_fo > 0 else 0

            player_stats.append({
                'Team': team,
                'Player': cols[0].text.strip(),
                'G': cols[1].text.strip(),
                'A': cols[2].text.strip(),
                'Pt.': cols[3].text.strip(),
                '+/-': cols[4].text.strip(),
                'Sh': cols[5].text.strip(),
                'PIM': cols[6].text.strip(),
                'FOW': fow,
                'FOL': fol,
                'FO%': win_percentage
            })

    return pd.DataFrame(player_stats)


############# PARSEING SCORING SUMMARY WITH BS4
def parse_scoring_summary(html_content):
    """Build a table of goals from the ``scoring`` section of a box score page.

    Rows flagged ``stats-section`` set the period for the goal rows that follow.

    Args:
        html_content: Box score page HTML.

    Returns:
        DataFrame with columns Period, Team, PP, Player, Player_Goals, Assist1,
        Assist2 and Time, or None if the scoring div or its table is absent.
    """
    scoring_div = BeautifulSoup(html_content, 'html.parser').find('div', id='scoring')
    if scoring_div is None:
        logging.error("Scoring div not found")
        return None

    scoring_table = scoring_div.find('table')
    if scoring_table is None:
        logging.error("Scoring table not found within the scoring div")
        return None

    events = []
    current_period = None

    for tr in scoring_table.find_all('tr'):
        # Period header row
        if 'stats-section' in tr.get('class', []):
            header_cell = tr.find('td')
            if header_cell:
                current_period = header_cell.text.strip()
            else:
                logging.warning("Period name not found in 'stats-section' row")
                current_period = "Unknown"
            continue

        cells = tr.find_all('td')
        if len(cells) <= 1:
            logging.warning(f"Insufficient columns in scoring row: {len(cells)}")
            continue

        try:
            # Scorer cell looks like "Name (season goal count)"
            scorer_text = cells[3].text.strip()
            scorer_match = re.match(r"(.+)\s\((\d+)\)", scorer_text)
            if scorer_match:
                scorer = scorer_match.group(1)
                scorer_goals = int(scorer_match.group(2))
            else:
                scorer = scorer_text
                scorer_goals = None

            # Up to two assists, comma separated; pad with None when fewer
            assists_text = cells[4].text.strip()
            assists = assists_text.split(", ") if assists_text else []
            first_assist, second_assist = (assists + [None, None])[:2]

            events.append({
                'Period': current_period,
                'Team': cells[0].text.strip(),
                'PP': cells[1].text.strip(),
                'Player': scorer,
                'Player_Goals': scorer_goals,
                'Assist1': first_assist,
                'Assist2': second_assist,
                'Time': cells[5].text.strip()
            })
        except Exception as e:
            logging.error(f"An error occurred while parsing a scoring event row: {e}")

    return pd.DataFrame(events)


def parse_game_details(html_content):
    """Extract game metadata (date, venue, officials, attendance) from a box score page.

    Args:
        html_content: Box score page HTML as a string; it is also searched
            directly with a regex for the attendance figure.

    Returns:
        One-row DataFrame with columns Day, Date, Conference, Details, Location,
        Ref1, Ref2, Asst_Ref1, Asst_Ref2 and Attendance, or None (after logging)
        when the ``meta`` div, its inner div, or the date heading cannot be parsed.
    """
    def split_officials(node):
        # Only text nodes can be split; anything else (None, a tag) yields no names.
        if not isinstance(node, str) or not node:
            return []
        # Keep letters and spaces only, as the stored names have always been cleaned.
        return [re.sub(r'[^a-zA-Z ]+', '', name).strip() for name in node.split(', ')]

    soup = BeautifulSoup(html_content, 'html.parser')
    meta_div = soup.find('div', {'id': 'meta'})
    if meta_div is None:
        logging.error("Meta div not found")
        return None

    # The game details live in the last <div> nested inside the meta div.
    inner_divs = meta_div.find_all('div')
    if not inner_divs:
        logging.error("Game details div not found")
        return None
    game_details_div = inner_divs[-1]

    try:
        # Heading is split on the first ", " into day-of-week and date
        date_str = game_details_div.h4.string
        day_of_week, date = date_str.split(", ", 1)

        p_elements = game_details_div.find_all('p')

        # Extract conference and location details
        for p in p_elements:
            if "Game" in p.text:  # e.g., "Big Ten Game"
                details_strs = p.get_text(separator='|').split('|')
                conference = details_strs[0]
                location = details_strs[-1].split('at ')[-1]
                break
        else:  # Defaults if not found
            conference, location = None, None

        # Extract referees and assistant referees details: the names are the text
        # that follows the first and second <strong> label respectively.
        for p in p_elements:
            if "Referees" in p.text:
                strong_tags = p.find_all('strong')
                refs_str = strong_tags[0].next_sibling if strong_tags else None
                asst_refs_str = strong_tags[1].next_sibling if len(strong_tags) > 1 else None
                break
        else:  # Defaults if not found
            refs_str, asst_refs_str = None, None

        refs = split_officials(refs_str)
        asst_refs = split_officials(asst_refs_str)

        # Extract attendance details using regex for better accuracy
        attendance_pattern = r"Attendance:\s?(\d+[\d,]*)"
        attendance_match = re.search(attendance_pattern, html_content)
        attendance = int(attendance_match.group(1).replace(',', '')) if attendance_match else None

        # Extract game details (like shootout results)
        details = None
        for p in p_elements:
            if "shootout" in p.text:
                details = p.text
                break

        # Clean details if present
        if details and '\n' in details:
            details = details.replace('\n', '').strip()
        if details and '\t' in details:
            details = re.sub('\t', ' ', details)

        game_details = {
            'Day': day_of_week,
            'Date': date,
            'Conference': conference,
            'Details': details,
            'Location': location,
            'Ref1': refs[0] if refs else None,
            'Ref2': refs[1] if len(refs) > 1 else None,
            'Asst_Ref1': asst_refs[0] if asst_refs else None,
            'Asst_Ref2': asst_refs[1] if len(asst_refs) > 1 else None,
            'Attendance': attendance
        }

        return pd.DataFrame([game_details])

    except (AttributeError, IndexError, ValueError) as e:
        logging.error(f"Error while parsing game details: {e}")
        return None

### Get the Linescore Elements - Score, shots, ect by period####
### UPDATED 

def parse_linescore(html_content):
    """Combine the goals, shots and power-play tables into one row per team.

    Rows are matched across the three tables by position: the i-th row of the
    shots and PP tables is merged into the i-th team parsed from the goals table.

    Args:
        html_content: Box score page HTML.

    Returns:
        DataFrame with Team, goals1..goalsN/goalsT, shots1..shotsN/shotsT, Pen,
        PIM, PPG, PPO, FOW, FOL and FOW% columns; goals1-4, goalsT, shots1-4 and
        shotsT are always present (filled with 0 when the page has no such
        column). Returns None when one of the three tables or its rows is missing.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    linescore_data = []

    # Parsing the Goals table
    goals_table = soup.select_one("#goals table")
    if goals_table is None:
        logging.error("Goals table not found")
        return None

    rows = goals_table.select('tbody tr')
    if not rows:
        logging.warning("No rows found in Goals table")
        return None

    for row_number, row in enumerate(rows, start=1):
        cells = row.select('td')
        if not cells:
            logging.warning("Team name not found in Goals table")
            continue

        team_data = {'Team': cells[0].text}
        goal_cells = cells[1:]
        for i, goal in enumerate(goal_cells):
            # The last cell is the total; the ones before it are periods 1..N
            column_name = f'goals{i + 1}' if i < len(goal_cells) - 1 else 'goalsT'
            try:
                team_data[column_name] = int(goal.text)
            except ValueError:
                # Same policy as the shots table: keep the row, blank the bad cell
                logging.warning(f"Could not convert goal data to integer for row {row_number}, column {i + 1}")
                team_data[column_name] = None

        linescore_data.append(team_data)

    # Parsing the Shots table
    shots_table = soup.select_one("#shots table")
    if shots_table is None:
        logging.error("Shots table not found")
        return None

    rows = shots_table.select('tbody tr')
    if not rows:
        logging.warning("No rows found in Shots table")
        return None

    for i, row in enumerate(rows):
        if i >= len(linescore_data):
            # More shot rows than teams parsed from the goals table
            logging.warning(f"No matching team for row {i+1} in Shots table")
            continue

        shots = row.select('td')[1:]
        if not shots:
            logging.warning(f"No shot data found for row {i+1} in Shots table")
            continue

        for j, shot in enumerate(shots):
            column_name = f'shots{j + 1}' if j < len(shots) - 1 else 'shotsT'
            try:
                linescore_data[i][column_name] = int(shot.text.strip())
            except ValueError:
                logging.warning(f"Could not convert shot data to integer for row {i+1}, column {j+1}")
                linescore_data[i][column_name] = None

    # Parsing the PP table
    pp_table = soup.select_one("#pp table")
    if pp_table is None:
        logging.error("PP table not found")
        return None

    rows = pp_table.select('tbody tr')
    if not rows:
        logging.warning("No rows found in PP table")
        return None

    for i, row in enumerate(rows):
        try:
            team_data = linescore_data[i]
            cells = row.select('td')

            # Each cell holds a "left‑right" pair of integers
            pen_pim = cells[1].text.split('‑')
            team_data['Pen'] = int(pen_pim[0])
            team_data['PIM'] = int(pen_pim[1])

            ppg_ppo = cells[2].text.split('‑')
            team_data['PPG'] = int(ppg_ppo[0])
            team_data['PPO'] = int(ppg_ppo[1])

            fow_fol = cells[3].text.split('‑')
            team_data['FOW'] = int(fow_fol[0])
            team_data['FOL'] = int(fow_fol[1])

            # No faceoffs recorded: leave the percentage empty instead of dividing by zero
            total_fo = team_data['FOW'] + team_data['FOL']
            team_data['FOW%'] = (team_data['FOW'] / total_fo) * 100 if total_fo else None

        except (ValueError, IndexError) as e:
            logging.warning(f"Could not process PP data for row {i+1}. Error: {e}")
            continue

    # Convert to DataFrame early
    df = pd.DataFrame(linescore_data)

    # Ensure all columns exist
    expected_goals_columns = [f'goals{i}' for i in range(1, 5)] + ['goalsT']
    expected_shots_columns = [f'shots{i}' for i in range(1, 5)] + ['shotsT']

    for col in expected_goals_columns + expected_shots_columns:
        if col not in df.columns:
            df[col] = 0

    # Return the final DataFrame
    return df

def parse_box_score(box_score_html):
    """Run every box-score section parser on one page.

    A failure in one section never stops the others: the failing section is
    reported and left as None.

    Args:
        box_score_html: Box score page HTML.

    Returns:
        A 7-item list in the order [game_details, scoring_summary,
        penalty_summary, goalie_stats, player_stats, line_chart, linescore].
    """
    def attempt(label, run_parser):
        # Section failures are printed, not raised; the section stays None.
        try:
            return run_parser()
        except Exception as e:
            print(f"Error in {label}: {e}")
            return None

    scoring_summary = attempt('parse_scoring_summary', lambda: parse_scoring_summary(box_score_html))
    penalty_summary = attempt('parse_penalty_summary', lambda: parse_penalty_summary(box_score_html))
    goalie_stats = attempt('parse_goalie_stats', lambda: parse_goalie_stats(box_score_html))
    player_stats = attempt('parse_player_summary', lambda: parse_player_summary(box_score_html))

    # The line chart reports through the log file (not stdout) and also records
    # whether anything was found.
    line_chart = None
    try:
        line_chart = parse_line_chart(box_score_html)
        if not line_chart.empty:
            logging.info(f"Line chart DataFrame structure: {line_chart.dtypes}")
        else:
            logging.info("Line chart is empty. Skipping the insert for this game.")
    except Exception as e:
        logging.error(f"Error in parse_line_chart: {e}")

    linescore = attempt('parse_linescore', lambda: parse_linescore(box_score_html))
    game_details = attempt('parse_game_details', lambda: parse_game_details(box_score_html))

    return [
        game_details,
        scoring_summary,
        penalty_summary,
        goalie_stats,
        player_stats,
        line_chart,
        linescore,
    ]

#### PARSE THE ADVANCED TEAM METRICS TABLES ####
### NEW - IT WORKS IN THE NOTEBOOK BUT NOT IN THE FUNCTION
### RETURNS WHOLE ADVANCED METRICS AS SINGLE TABLE
####################################
def parse_new_advanced_metrics(html_content):
    """Flatten every ``sortable metrics`` table on a metrics page into one DataFrame.

    Args:
        html_content: Advanced metrics page HTML.

    Returns:
        DataFrame whose columns are ``Team``, ``Player`` and the header texts
        (all ``<th>`` cells after the first) of the last table processed. When
        the page has no matching tables an empty DataFrame with just ``Team``
        and ``Player`` is returned instead of raising.

    Raises:
        ValueError: raised by pandas if a data row's cell count does not match
            the header-derived column count.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all tables with advanced metrics
    tables = soup.find_all('table', {'class': 'sortable metrics'})

    # Defined up front so a page without tables still yields a valid (empty) frame
    col_names = ['Team', 'Player']
    all_data = []

    for table in tables:
        # The first <td> of the table is used as the team name
        first_cell = table.find('td')
        if first_cell is None:
            logging.warning("Metrics table without any data cells; skipping")
            continue
        team_name = first_cell.text.strip()

        # Extract headers (skipping the Player header)
        headers = [header.text for header in table.find_all('th')][1:]
        col_names = ['Team', 'Player', *headers]

        # Extract player data, skipping the two header rows
        for row in table.find_all('tr')[2:]:
            cells = row.find_all('td')
            if not cells:
                # Row with no data cells
                continue
            # team, then player name and every metric cell in page order
            all_data.append([team_name, *(cell.text.strip() for cell in cells)])

    return pd.DataFrame(all_data, columns=col_names)

### May Be Outdated unnecessary
def parse_advanced_metrics_tables(html_content):
    """Legacy entry point for parsing the advanced metrics page.

    This function used to be a line-for-line copy of
    ``parse_new_advanced_metrics``; it now delegates to it so the two cannot
    drift apart.

    Args:
        html_content: Advanced metrics page HTML.

    Returns:
        Whatever ``parse_new_advanced_metrics(html_content)`` returns.
    """
    return parse_new_advanced_metrics(html_content)

# Complete code for parsing the line chart information with specific positions for forwards and defensemen.
def parse_line_chart(html_content):
    """Read the ``linechart`` section into one row per listed player.

    Each direct child div of the line chart is treated as a team (named by its
    ``<h3>``); each of its direct child divs is a group whose first CSS class
    selects the position layout: ``f`` forwards, ``d`` defense, ``x`` extras,
    ``g`` goalies. Other groups are ignored.

    Args:
        html_content: Box score page HTML.

    Returns:
        DataFrame with columns Team, Line, Position and Player. An empty
        DataFrame is returned when the line chart div is missing or holds no
        players.
    """
    positions_by_type = {
        'f': ['Left Wing', 'Center', 'Right Wing'],
        'd': ['Left D', 'Right D'],
        'x': ['Extra'],
        'g': ['Goalie'],
    }

    chart_root = BeautifulSoup(html_content, 'html.parser').find('div', id='linechart')
    if chart_root is None:
        logging.error("Line chart div not found")
        return pd.DataFrame()

    records = []

    for team_block in chart_root.find_all('div', recursive=False):
        heading = team_block.find('h3')
        if heading is None:
            logging.warning("Team name not found")
            continue
        team_name = heading.text.strip()

        for group in team_block.find_all('div', recursive=False):
            css_classes = group.get('class')
            if not css_classes:
                logging.warning("Line type not found")
                continue

            line_type = css_classes[0]
            slots = positions_by_type.get(line_type)
            if slots is None:
                continue

            entries = group.find_all('div')
            if not entries:
                logging.warning(f"No players found for {team_name} in {line_type}")
                continue

            for idx, entry in enumerate(entries):
                name = entry.text.strip()
                if line_type == 'x':
                    # Extras: keep only the first space-separated token
                    name = name.split(' ')[0]

                if line_type == 'g':
                    # Goalies are numbered in listing order
                    line_label = f"Goalie {idx + 1}"
                else:
                    # Players fill one line's slots before the next line starts
                    line_label = idx // len(slots) + 1

                records.append({
                    'Team': team_name,
                    'Line': line_label,
                    'Position': slots[idx % len(slots)],
                    'Player': name
                })

    if not records:
        logging.error("No line data was collected")

    return pd.DataFrame(records)


############# PARSEING PENALTY SUMMARY WITH BS4

def parse_penalty_summary(html_content):
    """Build a table of penalties from the ``penalties`` section of a box score page.

    Rows flagged ``stats-section`` set the period for the penalty rows that follow.

    Args:
        html_content: Box score page HTML.

    Returns:
        DataFrame with columns Period, Team, Player, Pen_Length, Penalty_Type
        and Time, or None if the penalties div or its table is absent.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the penalties div and table
    penalties_div = soup.find('div', id='penalties')
    if penalties_div is None:
        logging.error("Penalties div not found")
        return None

    penalties_table = penalties_div.find('table')
    if penalties_table is None:
        logging.error("Penalties table not found within the penalties div")
        return None

    penalty_events = []
    period = None

    for row in penalties_table.find_all('tr'):
        if 'stats-section' in row.get('class', []):
            td = row.find('td')
            if td:
                period = td.text.strip()
            else:
                logging.warning("Period name not found in 'stats-section' row")
                period = "Unknown"
            continue

        cols = row.find_all('td')
        if len(cols) <= 1:
            continue
        if len(cols) < 5:
            # Five cells are read below; skip a short row rather than letting an
            # IndexError discard every penalty already parsed for this game.
            logging.warning(f"Insufficient columns in penalty row: {len(cols)}")
            continue

        penalty_events.append({
            'Period': period,
            'Team': cols[0].text.strip(),
            'Player': cols[1].text.strip(),
            'Pen_Length': cols[2].text.strip(),
            'Penalty_Type': cols[3].text.strip(),
            'Time': cols[4].text.strip()
        })

    return pd.DataFrame(penalty_events)


############# GOALIE SUMMARY WITH BS4
def parse_goalie_stats(html_content):
    """Build a table of goalie lines from the ``goalies`` section of a box score page.

    Rows flagged ``stats-header`` set the team for the goalie rows that follow.

    Args:
        html_content: Box score page HTML.

    Returns:
        DataFrame with columns Team, Goalie, SV, GA and Minutes (values kept as
        the page text), or None if the goalies div or its table is absent.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the goalies div and table
    goalies_div = soup.find('div', id='goalies')
    if goalies_div is None:
        logging.error("Goalies div not found")
        return None

    goalies_table = goalies_div.find('table')
    if goalies_table is None:
        logging.error("Goalies table not found within the goalies div")
        return None

    goalie_stats = []
    team = None

    for row in goalies_table.find_all('tr'):
        if 'stats-header' in row.get('class', []):
            td = row.find('td')
            if td:
                team = td.text.strip()
            else:
                logging.warning("Team name not found in 'stats-header' row")
                team = "Unknown"
            continue

        cols = row.find_all('td')
        if len(cols) <= 1:
            continue
        if len(cols) < 4:
            # Four cells are read below; skip a short row rather than letting an
            # IndexError discard every goalie already parsed for this game.
            logging.warning(f"Insufficient columns in goalie row: {len(cols)}")
            continue

        goalie_stats.append({
            'Team': team,
            'Goalie': cols[0].text.strip(),
            'SV': cols[1].text.strip(),
            'GA': cols[2].text.strip(),
            'Minutes': cols[3].text.strip()
        })

    return pd.DataFrame(goalie_stats)



def rename_duplicate_columns(df):
    """Make column labels unique by suffixing repeats with ``_1``, ``_2``, ...

    The first occurrence of a label is kept as is; later occurrences become
    ``<label>_<k>`` where k counts the earlier occurrences. A single pass over
    the labels is used so that adjacent and scattered duplicates are handled
    the same way.

    Args:
        df: DataFrame whose columns are renamed in place.

    Returns:
        The same DataFrame object, for chaining.
    """
    occurrences = {}
    unique_labels = []
    for label in df.columns:
        seen_before = occurrences.get(label, 0)
        unique_labels.append(label if seen_before == 0 else f"{label}_{seen_before}")
        occurrences[label] = seen_before + 1
    df.columns = unique_labels
    return df


### Run the unscraped games through all of the individual parsers and store the results in the database

In [5]:
# Create a function to call all of the game scraping sub functions

def scrape_game(game_id, box_link, metrics_link, base_url='https://www.collegehockeynews.com', timeout=30):
    """Fetch and parse the box score and advanced metrics pages for one game.

    The links scraped from the schedule page are site-relative
    (e.g. ``/box/final/...``), so each one is resolved against ``base_url``
    before the request; links that are already absolute are left unchanged.

    Args:
        game_id: Identifier used in log messages.
        box_link: Box score link (relative or absolute); falsy to skip.
        metrics_link: Advanced metrics link (relative or absolute); falsy to skip.
        base_url: Site root used to resolve relative links.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        None if either requested page cannot be fetched (error is logged).
        Otherwise an 8-tuple (game_details, scoring_summary, penalty_summary,
        goalie_stats, player_stats, line_chart, linescore, advanced_metrics);
        any item may be None. Callers must unpack eight values.
    """
    from urllib.parse import urljoin

    box_score_html = None
    metrics_html = None

    # Get the box score page
    if box_link:
        try:
            box_score_response = requests.get(urljoin(base_url, box_link), timeout=timeout)
            if box_score_response.status_code == 200 and box_score_response.text:
                box_score_html = box_score_response.text
            else:
                logging.error(f"Box score page returned status code {box_score_response.status_code} or is empty for game {game_id}")
                return None
        except Exception as e:
            # Best effort: any fetch problem skips this game instead of stopping the run
            logging.error(f"Failed to fetch box score page for game {game_id}: {e}")
            return None

    # Get the metrics page
    if metrics_link:
        try:
            metrics_response = requests.get(urljoin(base_url, metrics_link), timeout=timeout)
            if metrics_response.status_code == 200:
                metrics_html = metrics_response.text
            else:
                logging.error(f"Metrics page returned status code {metrics_response.status_code}")
                return None
        except Exception as e:
            logging.error(f"Failed to fetch advanced metrics page for game {game_id}: {e}")
            return None

    # Parse the box score page (nothing to parse when no box link was given)
    if box_score_html is not None:
        game_details, *section_frames = parse_box_score(box_score_html)
    else:
        game_details, section_frames = None, [None] * 6

    # Rename duplicate columns in every section except game_details
    scoring_summary, penalty_summary, goalie_stats, player_stats, line_chart, linescore = [
        rename_duplicate_columns(frame) if frame is not None else None
        for frame in section_frames
    ]

    # Parse the advanced metrics page
    advanced_metrics = parse_new_advanced_metrics(metrics_html) if metrics_html else None

    return game_details, scoring_summary, penalty_summary, goalie_stats, player_stats, line_chart, linescore, advanced_metrics


In [9]:
# Notebook display only: show the last rows of the games still waiting to be scraped.
unscraped_games.tail()

Unnamed: 0,Date,Conference,Game_Notes,Home_Team,Home_Team_Link,Home_Score,Away_Team,Away_Team_Link,Away_Score,OT,Box_Link,Metrics_Link,Day,Game_ID
175,2024-03-16,NCHC Tournament,NCHC Quarterfinal,Omaha,/reports/team/Omaha/37,3,Colorado College,/reports/team/Colorado-College/16,1,,/box/final/20240316/uno/cc_/,/box/metrics.php?gd=97733,Saturday,2024-03-16-Omaha-Colorado College
176,2024-03-16,NCHC Tournament,NCHC Quarterfinal,Minnesota Duluth,/reports/team/Minnesota-Duluth/36,2,Denver,/reports/team/Denver/20,5,,/box/final/20240316/mnd/den/,/box/metrics.php?gd=97729,Saturday,2024-03-16-Minnesota Duluth-Denver
177,2024-03-17,AHA Tournament,Atlantic Hockey Semifinal,American Int'l,/reports/team/American-Intl/5,3,Holy Cross,/reports/team/Holy-Cross/23,1,,/box/final/20240317/aic/hcr/,/box/metrics.php?gd=97737,Sunday,2024-03-17-American Int'l-Holy Cross
178,2024-03-17,NCHC Tournament,NCHC Quarterfinal,Western Michigan,/reports/team/Western-Michigan/57,1,St. Cloud State,/reports/team/St-Cloud-State/52,5,,/box/final/20240317/wmu/stc/,/box/metrics.php?gd=97736,Sunday,2024-03-17-Western Michigan-St. Cloud State
179,2024-03-17,NCHC Tournament,NCHC Quarterfinal,Omaha,/reports/team/Omaha/37,2,Colorado College,/reports/team/Colorado-College/16,1,,/box/final/20240317/uno/cc_/,/box/metrics.php?gd=97738,Sunday,2024-03-17-Omaha-Colorado College


In [6]:
import sqlite3
import pandas as pd
import logging

# Prerequisites: the `unscraped_games` DataFrame and the `scrape_game` function
# must already be defined by earlier cells before this cell is run.

def store_in_database(df, table_name, conn):
    """
    Append the rows of ``df`` to the table ``table_name`` through the open
    connection ``conn``, then commit.

    Any exception raised while writing or committing is reported with
    ``logging.error`` and swallowed, so a failed insert never propagates to
    the caller. Nothing is returned in either case.
    """
    try:
        # 'append' adds rows instead of replacing the table; the DataFrame
        # index is not written out as a column.
        df.to_sql(name=table_name, con=conn, if_exists='append', index=False)
        conn.commit()
    except Exception as err:
        # Best-effort: log the failure and let the caller carry on.
        logging.error(f"Error inserting data into {table_name}: {err}")

def scrape_and_store_games(unscraped_games, db_path):
    """
    Scrape every game listed in ``unscraped_games`` and append the resulting
    DataFrames to the SQLite database at ``db_path``.

    For each row the 'Game_ID', 'Box_Link' and 'Metrics_Link' columns are read
    and passed to ``scrape_game``. A falsy result is logged as a failure and
    the game is skipped. Otherwise every non-None DataFrame in the result is
    written to its table with ``store_in_database``.

    The connection is opened once for the whole run and is always closed,
    even if scraping raises part-way through. Returns None.
    """
    # Destination tables, listed in the order the scraper returns its
    # DataFrames (game_details first, advanced_metrics last).
    table_names = (
        'game_details',
        'scoring_summary',
        'penalty_summary',
        'goalie_stats',
        'player_stats',
        'line_chart',
        'linescore',
        'advanced_metrics',
    )

    conn = sqlite3.connect(db_path)
    try:
        for _, game in unscraped_games.iterrows():
            game_id = game['Game_ID']

            # Scrape the game data
            game_data = scrape_game(game_id, game['Box_Link'], game['Metrics_Link'])
            if not game_data:
                logging.error(f"Failed to scrape game {game_id}")
                continue

            # Pair each returned DataFrame with its table by position. zip stops
            # at the shorter sequence, so a result without the trailing
            # advanced_metrics entry is stored without an unpacking error.
            for table_name, frame in zip(table_names, game_data):
                if frame is not None:
                    store_in_database(frame, table_name, conn)
    finally:
        # Release the database handle no matter how the loop ended.
        conn.close()

# Run the scrape-and-store pass over unscraped_games, writing to the database at db_path
scrape_and_store_games(unscraped_games=unscraped_games, db_path=db_path)


In [7]:
# # Scrape the games from the filtered list above and store the data in the database
# # Initialize an empty list to store the DataFrames
# game_details_list = []
# scoring_summary_list = []
# penalty_summary_list = []
# goalie_stats_list = []
# player_stats_list = []
# line_chart_list = []
# linescore_list = []

# # Loop through the unscraped games
# for row in unscraped_games.itertuples():
#     game_id = row.Game_ID
#     box_link = row.Box_Link
#     metrics_link = row.Metrics_Link

#     # Scrape the game if the box score links are valid, otherwise skip
#     if box_link:
#         # Check if the metrics link is valid
#         if metrics_link:
#             game_details, scoring_summary, penalty_summary, goalie_stats, player_stats, line_chart, linescore = scrape_game(game_id, box_link, metrics_link)
    
    

#     # game_details, scoring_summary, penalty_summary, goalie_stats, player_stats, line_chart, linescore = scrape_game(game_id, box_link, metrics_link)

#     # Append the DataFrames to the lists
#     game_details_list.append(game_details)
#     scoring_summary_list.append(scoring_summary)
#     penalty_summary_list.append(penalty_summary)
#     goalie_stats_list.append(goalie_stats)
#     player_stats_list.append(player_stats)
#     line_chart_list.append(line_chart)
#     linescore_list.append(linescore)

# # Connect to the database
# conn = sqlite3.connect(db_path)

# # Loop through the DataFrames and insert the data into the database
# for game_details, scoring_summary, penalty_summary, goalie_stats, player_stats, line_chart, linescore in zip(game_details_list, scoring_summary_list, penalty_summary_list, goalie_stats_list, player_stats_list, line_chart_list, linescore_list):
#     # Insert the game details
#     if game_details is not None:
#         game_details.to_sql('game_details', conn, if_exists='append', index=False)
    
#     # Insert the scoring summary
#     if scoring_summary is not None:
#         scoring_summary.to_sql('scoring_summary', conn, if_exists='append', index=False)
    
#     # Insert the penalty summary
#     if penalty_summary is not None:
#         penalty_summary.to_sql('penalty_summary', conn, if_exists='append', index=False)
    
#     # Insert the goalie stats
#     if goalie_stats is not None:
#         goalie_stats.to_sql('goalie_stats', conn, if_exists='append', index=False)
    
#     # Insert the player stats
#     if player_stats is not None:
#         player_stats.to_sql('player_stats', conn, if_exists='append', index=False)
    
#     # Insert the line chart
#     if line_chart is not None:
#         line_chart.to_sql('line_chart', conn, if_exists='append', index=False)
    
#     # Insert the linescore
#     if linescore is not None:
#         linescore.to_sql('linescore', conn, if_exists='append', index=False)
# # Close the connection
# conn.close()

# # Print the number of games scraped
# print(f'Number of games scraped: {len(unscraped_games)}')

# # Log the number of games scraped
