In [131]:
### This workbook is used to scrape the data from the College Hockey News and explore the data## 

## Notes - the site's robots.txt file sets these limits for crawlers
# Crawl-delay: 10 (seconds)
# Request-rate: 1/5 (5 requests every minute)

# Dependencies
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import json

# Set the URL to scrape
url = 'https://www.collegehockeynews.com/schedules/?season=20222023' # Link to the 2022-2023 season with all results


In [132]:
## explore page structure

# Get the page with requests
# response = requests.get(url)

# # try reading with pandas # Returns odd table structure - going to try BeautifulSoup
# tables = pd.read_html(url)

# tables[0]

# # output as csv
# tables[0].to_csv('../TEMP/2022-2023_season.csv')

In [133]:
## explore page structure

# Get the page with requests
response = requests.get(url)

# Create a BeautifulSoup object
soup = BeautifulSoup(response.text, 'html.parser')

# select the table or tables
tables = soup.find_all('table')

# tables[0] # This appears to be the game results table

In [134]:
# Initialize variables
current_date = None
current_conference = None
game_notes = None

# Initialize an empty list to hold the data
data = []

# Parse the table with BeautifulSoup

rows = soup.find_all('tr')

# Loop through each row to find relevant information
for row in rows:
    # Check for date row
    if row.get('class') == ['stats-section']:
        current_date = row.find('td').text.strip()
    # Check for conference row
    elif row.get('class') == ['sked-header']:
        current_conference = row.find('td').text.strip()
    # Check for game notes
    elif len(row.find_all('td')) == 2:
        game_notes = row.find_all('td')[1].text.strip()
    # Process rows with game data
    elif row.get('valign') == 'top':
        cells = row.find_all('td')
        if len(cells) >= 9:
            home_team = cells[0].text.strip()
            home_team_link = cells[0].find('a')['href'] if cells[0].find('a') else None
            home_score = cells[1].text.strip()
            away_team = cells[3].text.strip()
            away_team_link = cells[3].find('a')['href'] if cells[3].find('a') else None
            away_score = cells[4].text.strip()
            ot = cells[5].text.strip()
            box_link = cells[7].find('a')['href'] if cells[7].find('a') else None
            metrics_link = cells[8].find('a')['href'] if cells[8].find('a') else None
             # Capture Game Notes
            game_notes_cell = cells[-1].find('small')
            game_notes = game_notes_cell.text.strip() if game_notes_cell else None

            # Append data to the list
            data.append([current_date, current_conference, game_notes, home_team, home_team_link, home_score, away_team, away_team_link, away_score, ot, box_link, metrics_link])
            game_notes = None  # Reset game notes for the next row
            

# Create a DataFrame
columns = ['Date', 'Conference', 'Game_Notes', 'Home_Team', 'Home_Team_Link', 'Home_Score', 'Away_Team', 'Away_Team_Link', 'Away_Score', 'OT', 'Box_Link', 'Metrics_Link']
df = pd.DataFrame(data, columns=columns)


In [135]:
df.head(20)

# output csv in temp folder
df.to_csv('../TEMP/2022-2023_season.csv')

## EMPTY DATAFRAME
df = pd.DataFrame()


In [136]:
# df.head(20)

# game notes value counts
# df['Game_Notes'].value_counts()

In [137]:
#### mOVING ON TO THE BOX SCORES ####

# Example box score link
url_box = 'https://www.collegehockeynews.com/box/final/20230305/msu/ndm/'

# Example metrics link from same game
url_metrics = 'https://www.collegehockeynews.com/box/metrics.php?gd=96398'

# # output the entire html into a text file for review
# with open('../TEMP/box_score_example.txt', 'w') as file:
#     file.write(response.text)

# try to read with pandas
tables = pd.read_html(url_box)

tables[0] # score summary by period
tables[1] # shots by period
tables[2] # summary of penalties, power play results and faceoffs
tables[3] # scoring summary - period by period - periods are deliniated with a row with the "1st Period", "2nd Period", etc. text - if there is no row with this text, then there is no scoring in that period
tables[4] # penalty summary - period by period - periods are deliniated with a row with the "1st Period", "2nd Period", etc. text - if there is no row with this text, then there are no penalties in that period
tables[5] # goalie stats
tables[6] # visiting team stats
tables[7] # home team stats


Unnamed: 0,Michigan State,G,A,Pt.,+/-,Sh,PIM,FOW‑L
0,Matt Basgall,0,0,0,1,2,0,
1,Tiernan Shoudy,1,0,1,1,4,0,11‑5
2,Daniel Russell,0,0,0,0,5,2,
3,Viktor Hurtig,0,0,0,0,0,0,
4,Karsen Dorwart,0,0,0,1,0,0,12‑10
5,Tanner Kelly,0,0,0,1,1,0,
6,Erik Middendorf,0,1,1,2,0,0,
7,Jesse Tucker,0,0,0,0,0,0,
8,Jeremy Davidson,1,1,2,3,3,0,
9,David Gucciardi,0,0,0,1,3,0,


In [138]:
# tables[7].head()

In [139]:
## Split Face of Win loss column into two columns and calculate the faceoff percentage
def process_fowl_data(df):
    # Make a copy of the DataFrame to avoid modifying the original
    df_copy = df.copy()
    
    # Replace non-standard dash with standard dash
    df_copy['FOW‑L'] = df_copy['FOW‑L'].str.replace('‑', '-', regex=False)
    
    # Fill NaN values
    df_copy['FOW‑L'].fillna('0-0', inplace=True)
    
    # Split the 'FOW‑L' column into two columns
    df_copy[['FOW', 'FOL']] = df_copy['FOW‑L'].str.split('-', expand=True)
    
    # Drop the 'FOW‑L' column
    df_copy.drop(columns=['FOW‑L'], inplace=True)
    
    # Calculate the FOW percentage
    df_copy['FOW_PCT'] = df_copy['FOW'].astype(float) / (df_copy['FOW'].astype(float) + df_copy['FOL'].astype(float))
    
    return df_copy

# Usage example
df = tables[7]
processed_df = process_fowl_data(df)

processed_df.head()



Unnamed: 0,Michigan State,G,A,Pt.,+/-,Sh,PIM,FOW,FOL,FOW_PCT
0,Matt Basgall,0,0,0,1,2,0,0,0,
1,Tiernan Shoudy,1,0,1,1,4,0,11,5,0.6875
2,Daniel Russell,0,0,0,0,5,2,0,0,
3,Viktor Hurtig,0,0,0,0,0,0,0,0,
4,Karsen Dorwart,0,0,0,1,0,0,12,10,0.545455


In [140]:
import pandas as pd
###################### NOT WORKING RIGHT NOW COME BACK TO LATER ##############################
# def clean_penalty_summary(parsed_data):
#     """
#     Clean and restructure the penalty_summary DataFrame.
    
#     Parameters:
#         df (DataFrame): The penalty_summary DataFrame.
        
#     Returns:
#         list: A list of dictionaries, each representing a penalty event.
#     """
#     # Flatten multi-level columns
#     df.columns = [' '.join(col).strip() for col in df.columns.values]
    
#     # Drop columns that are entirely NaN
#     df.dropna(axis=1, how='all', inplace=True)
    
#     # Initialize an empty list to store penalty events
#     penalties = []
    
#     # Loop through the DataFrame to create penalty event dictionaries
#     period = None
#     for i, row in df.iterrows():
#         if 'Period' in row.values:
#             period = row.values[0].split(' ')[0]
#         else:
#             penalty_event = {
#                 'Team': row['Penalties Team'],
#                 'Period': period,
#                 'Time': row['Penalties Time'],
#                 'Player': row['Penalties Player'],
#                 'Type': row['Penalties Type'],
#                 'Length': row['Penalties Length']
#             }
#             penalties.append(penalty_event)
    
#     return penalties
############################################

def parse_box_score(url_box):
    """
    Parse a college hockey game's box score and return structured data.
    
    Parameters:
        url_box (str): The URL containing the box score.
    
    Returns:
        dict: A dictionary containing the parsed data from the box score.
    """
    # Read the tables using pandas
    tables = pd.read_html(url_box)
    
    # Initialize an empty dictionary to store parsed data
    parsed_data = {}
    
    # Parse each table and store in the dictionary
    parsed_data['score_by_period'] = tables[0]
    parsed_data['shots_by_period'] = tables[1]
    parsed_data['penalties_summary'] = tables[2]
    # Clean and restructure the penalty_summary table
    # parsed_data['penalty_summary'] = clean_penalty_summary(tables[4]) # NOT WORKING RIGHT NOW COME BACK TO LATER
    ########################
    parsed_data['scoring_summary'] = tables[3]
    parsed_data['penalty_summary'] = tables[4]
    parsed_data['goalie_stats'] = tables[5]
    parsed_data['visiting_team_stats'] = tables[6]
    
    # Transform the home_team_stats table using process_fowl_data function
    parsed_data['home_team_stats'] = process_fowl_data(tables[7])
    
    return parsed_data


# Example usage
# Note: The function assumes that the URL is valid and accessible. In a real-world scenario, you'd add error handling for network issues.
url_box = 'https://www.collegehockeynews.com/box/final/20230303/msu/ndm/'
parsed_data = parse_box_score(url_box)
# print(parsed_data)  # To display the score summary by period

## save parsed data into raw text filew for review
with open('../TEMP/parsed_data.txt', 'w') as file:
    file.write(str(parsed_data))



In [141]:
### PARSE THE BOX SCORE META DATA WITH LOCATION, REFEREES, ATTENDANCE, ETC. ###

from bs4 import BeautifulSoup
import requests

soup_2 = BeautifulSoup(response.text, 'html.parser')
# Collects the game meta data from the standard box score page 
# - location, referees, attendance, etc.
def parse_game_meta(soup):
    """
    Final updated function to parse the game metadata section and return a dictionary containing the parsed information.
    
    Parameters:
        soup (BeautifulSoup): The BeautifulSoup object containing the HTML content.
    
    Returns:
        dict: A dictionary containing the parsed game metadata.
    """
    # Initialize an empty dictionary to store parsed data
    meta_data = {}
    
    # Locate the 'meta' div
    meta_div = soup.find('div', id='meta')
    
    if meta_div is None:
        return {"error": "No meta div found"}
    
    # Parse visitor and home team information
    team_divs = meta_div.find_all('div', class_='team')
    
    # Visitor team
    visitor_div = team_divs[0]
    meta_data['visitor_logo'] = visitor_div.find('img', class_='logo')['src']
    meta_data['visitor_team'] = visitor_div.find('h4').text.strip().replace('\xa0', ' ')
    meta_data['visitor_score'] = visitor_div.find('h2').text.strip()
    meta_data['visitor_record'] = visitor_div.find_all('h4')[1].text.strip()
    
    # Home team
    home_div = team_divs[1]
    meta_data['home_logo'] = home_div.find('img', class_='logo')['src']
    meta_data['home_team'] = home_div.find('h4').text.strip().replace('\xa0', ' ')
    meta_data['home_score'] = home_div.find('h2').text.strip()
    meta_data['home_record'] = home_div.find_all('h4')[1].text.strip()
    
    # Parse additional game metadata
    game_info_div = meta_div.find_all('div')[2]
    
    meta_data['game_date'] = game_info_div.find('h4').text.strip()
    
    # Split game notes into multiple parts
    game_notes = game_info_div.find_all('p')[0].text.strip().split('\n')
    
    # If the location is specified, it's usually the last note
    if game_notes:
        location = game_notes[-1].split('at ')[-1].strip()
        meta_data['location'] = location
        game_notes[-1] = game_notes[-1].replace(f"\t\tat {location}", "")
    
    for i, note in enumerate(game_notes, 1):
        meta_data[f'game_note{i}'] = note.strip()
    
    referees_info = game_info_div.find_all('p')[1].text.strip().split('\n')
    
    # Split referees and assistant referees by comma and store in individual columns
    meta_data['referee_1'], meta_data['referee_2'] = referees_info[0].split(":")[1].strip().split(', ')
    meta_data['asst_referee_1'], meta_data['asst_referee_2'] = referees_info[1].split(":")[1].strip().split(', ')
    
    meta_data['attendance'] = referees_info[2].split(":")[1].strip()
    
    return meta_data

# Example usage would be the same as before, just use `parse_game_meta_final_v2` instead of the previous versions.


# Example usage
# Note: This function assumes that the BeautifulSoup object contains the relevant 'meta' div.
soup_2 = BeautifulSoup(response.text, 'html.parser')
game_meta_data = parse_game_meta(soup_2)
game_meta_data


{'error': 'No meta div found'}

In [142]:
def transform_parsed_data(parsed_data):
    """
    Transform the parsed data into a single unified dictionary.
    
    Parameters:
        parsed_data (dict): A dictionary containing parsed tables from the box score.
        
    Returns:
        dict: A dictionary containing the transformed data.
    """
    transformed_data = {}
    
    # Loop through each key in the parsed_data dictionary
    for key, table in parsed_data.items():
        
        # Convert the DataFrame to a list of dictionaries
        table_list = table.to_dict(orient='records')
        
        # Add this list to the transformed_data dictionary
        transformed_data[key] = table_list
    
    return transformed_data

# Example usage:
# Assuming parsed_data is the dictionary returned by your parse_box_score function
player_dict = transform_parsed_data(parsed_data)

# print(transformed_data)  # To display the score summary by period
# player_dict


In [143]:
# df_test = pd.DataFrame(master_data)

# Flatten the multi-level columns and rename them
df_test.columns = [f"{col[0]}_{col[1]}" for col in player_dict.columns]
# rename the defensive block column and srop the face of column beacuse we already have the faceoff data in the earlier table
df_test.rename(columns={'Unnamed: 21_level_0_BLKs': 'Defensive_Blocks'}, inplace=True)

# df_test.columns
# Drop the 'Unnamed: 0_level_0' column
# df_test.drop(columns=['Unnamed: 0_level_0'], inplace=True)

# index location of the last column
# last_col = df_test.columns.get_loc('Unnamed: 21_level_0')

df_test



AttributeError: 'dict' object has no attribute 'columns'

In [None]:
# parsed_data


In [None]:
def integrate_data(meta_data, transformed_data):
    """
    Integrate meta_data and transformed_data into a single dictionary.
    
    Parameters:
        meta_data (dict): Dictionary containing game meta data.
        transformed_data (dict): Dictionary containing transformed box score data.
    
    Returns:
        dict: A dictionary containing both meta and box score data.
    """
    # Create a master dictionary and update it with meta_data and transformed_data
    master_data = {}
    master_data.update(meta_data)
    master_data.update(transformed_data)
    
    return master_data

# Example usage:
# meta_data is the dictionary returned by your meta data scraping function
# transformed_data is the dictionary returned by transform_parsed_data function
master_data = integrate_data(game_meta_data, player_dict)

# display(master_data)  # To display the score summary by period

# # output JSON file
# with open('../TEMP/box_score_example.json', 'w') as file:
#     json.dump(master_data, file)

In [None]:
master_data

In [None]:
## display the dictionary
master_data
