# Notebook to scrape all available game results from college hockey news

In [3]:
### This notebook is used to collect all season results available on College Hockey News

# There Records begin with the 1901-1902 season and end with the most recent completed season 2022-2023
## Seasons begining in 2002-2003 have box score links in the results table - seasons from then on can 
## use the code develope4d in the other notbook, years before that will need some adjusted code

## Dependencies
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time


## Example URL: https://www.collegehockeynews.com/schedules/?season=19851986

base_url = 'https://www.collegehockeynews.com/schedules/?season='

# Construct the url for each season and store in a list
mod_seasons = []

for i in range(2002, 2024):
    mod_seasons.append(base_url + str(i) + str(i+1))

print(len(mod_seasons))

old_seasons = []

for i in range(1901, 2002):
    old_seasons.append(base_url + str(i) + str(i+1))

print(len(old_seasons))



22
101


In [4]:
### HELPER FUNCTION TO CLEAN UP THE TEAM NAMES

def clean_team_name(team_name):
    """
    Cleans the team name by removing unwanted characters.
    Args:
        team_name (str): Team name.

    Returns:
        str: Cleaned team name.
    """
    # Replace unwanted characters with an empty string
    return team_name.replace('-', ' ').replace('.', '').replace("'", '').strip()


In [5]:
### Scraping code from initial scrape_and_explore notebook

############### Test with one season ######################
# url = 'https://www.collegehockeynews.com/schedules/?season=20012002'


def parse_modern_season(url):
    
    # Get the page with requests
    response = requests.get(url)

    # Create a BeautifulSoup object
    soup = BeautifulSoup(response.text, 'html.parser')

    # select the table or tables
    tables = soup.find_all('table')

    # Initialize variables
    current_date = None
    current_conference = None
    game_notes = None

    # Initialize an empty list to hold the data
    # data = []

    # Parse the table with BeautifulSoup

    rows = soup.find_all('tr')

    # Loop through each row to find relevant information
    for row in rows:
        # Check for date row
        if row.get('class') == ['stats-section']:
            current_date = row.find('td').text.strip()
        # Check for conference row
        elif row.get('class') == ['sked-header']:
            current_conference = row.find('td').text.strip()
        # Check for game notes
        elif len(row.find_all('td')) == 2:
            game_notes = row.find_all('td')[1].text.strip()
        # Process rows with game data
        elif row.get('valign') == 'top':
            cells = row.find_all('td')
            if len(cells) >= 9:
                home_team = clean_team_name(cells[3].text.strip())
                home_team_link = cells[3].find('a')['href'] if cells[3].find('a') else None
                home_score = cells[4].text.strip()
                away_team = clean_team_name(cells[0].text.strip())
                away_team_link = cells[0].find('a')['href'] if cells[0].find('a') else None
                away_score = cells[1].text.strip()
                ot = cells[5].text.strip()
                box_link = cells[7].find('a')['href'] if cells[7].find('a') else None
                metrics_link = cells[8].find('a')['href'] if cells[8].find('a') else None
                # Capture Game Notes
                game_notes_cell = cells[-1].find('small')
                game_notes = game_notes_cell.text.strip() if game_notes_cell else None

                # Append data to the list
                data.append([current_date, current_conference, game_notes, home_team, home_team_link, home_score, away_team, away_team_link, away_score, ot, box_link, metrics_link])
                game_notes = None  # Reset game notes for the next row
            

## Try running the function on the list of urls and create dataframe after looping through all seasons

# Initialize an empty list to hold the data
data = []

for url in mod_seasons:
    parse_modern_season(url)
    # wait 2 seconds between requests
    time.sleep(2)


# Create a DataFrame
columns = ['Date', 'Conference', 'Game_Notes',  'Home_Team', 'Home_Team_Link', 'Home_Score', 'Away_Team', 'Away_Team_Link', 'Away_Score', 'OT', 'Box_Link', 'Metrics_Link']
df = pd.DataFrame(data, columns=columns)


# ## Extract the day of the week from the date and save in new column
df['Day'] = pd.to_datetime(df['Date']).dt.day_name()
# # remove day of the week from date
# # format data column as YYYY-MM-DD
df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')
## Create a new column for the game ID
## Game ID will be a combination of the date and abbreviated team names

# Function to abbreviate the team names
for row in df.itertuples():
    home_team = row.Home_Team
    away_team = row.Away_Team
    home_team_abbr = home_team.split(' ')[-1]
    away_team_abbr = away_team.split(' ')[-1]
    game_id = f'{row.Date}_{home_team_abbr}_{away_team_abbr}'
    df.loc[row.Index, 'Game_ID'] = game_id

# Create a new column for the game ID
df['Game_ID'] = df['Game_ID'].str.replace(',', '')

# Apply the function to the DataFrame
df['Game_ID'] = df.apply(lambda row: f'{row.Date}_{row.Home_Team}_{row.Away_Team}', axis=1)




In [6]:
# Store the df in Memory
modern_results_df = df.copy()

## Save the DataFrame to a CSV file
modern_results_df.to_csv('../TEMP/2002-present_college_hockey_results.csv', index=False)

#### Pre 2001-02 Season scrape

In [7]:
## Parse for all games pre-2002

### Code for seasons 2001-2002 and earlier

# example_url = 'https://www.collegehockeynews.com/schedules/?season=19811982'

## list of old seasons is stored in old_seasons


# Update the function to include game notes and overtime information
def parse_pre_2002_season(url):
    # Get the page with requests
    response = requests.get(url)
    # Create a BeautifulSoup object
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Initialize variables
    current_date = None
    current_conference = None

    # Initialize an empty list to hold the data
    # data = []

    # Parse the table with BeautifulSoup
    rows = soup.find_all('tr')

    # Loop through each row to find relevant information
    for row in rows:
        # Check for date row
        if row.get('class') == ['stats-section']:
            current_date = row.find('td').text.strip()
        # Check for conference row
        elif row.get('class') == ['sked-header']:
            current_conference = row.find('td').text.strip()
        # Process rows with game data
        elif row.get('valign') == 'top':
            cells = row.find_all('td')
            if len(cells) >= 5:
                home_team = clean_team_name(cells[3].text.strip())
                home_team_link = cells[3].find('a')['href'] if cells[3].find('a') else None
                home_score = cells[4].text.strip()
                away_team = clean_team_name(cells[0].text.strip())
                away_team_link = cells[0].find('a')['href'] if cells[0].find('a') else None
                away_score = cells[1].text.strip()
                
                # Extract overtime information
                ot = cells[5].text.strip() if cells[5].text.strip() else None
                
                # Extract game notes
                game_notes_cell = cells[-1].find('small')
                game_notes = game_notes_cell.text.strip() if game_notes_cell else None

                # Append data to the list
                data.append([current_date, current_conference, away_team, away_team_link, away_score,home_team, home_team_link, home_score, ot, game_notes])

    


## Loop through the list of urls and run the function on each one
data = []

for url in old_seasons:
    parse_pre_2002_season(url)
    # wait 2 seconds between requests
    time.sleep(2)


# create a dataframe from the data
columns = ['Date', 'Conference',  'Away_Team', 'Away_Team_Link', 'Away_Score','Home_Team', 'Home_Team_Link', 'Home_Score', 'OT', 'Game_Notes']
df = pd.DataFrame(data, columns=columns)

## Create a new column for the game ID
## Game ID will be a combination of the date and abbreviated team names

# Function to abbreviate the team names
for row in df.itertuples():
    home_team = row.Home_Team
    away_team = row.Away_Team
    home_team_abbr = home_team.split(' ')[-1]
    away_team_abbr = away_team.split(' ')[-1]
    game_id = f'{row.Date}_{home_team_abbr}_{away_team_abbr}'
    df.loc[row.Index, 'Game_ID'] = game_id

# Create a new column for the game ID
df['Game_ID'] = df['Game_ID'].str.replace(',', '')

# Apply the function to the DataFrame
df['Game_ID'] = df.apply(lambda row: f'{row.Date}_{row.Home_Team}_{row.Away_Team}', axis=1)


# ## Extract the day of the week from the date and save in new column
df['Day'] = pd.to_datetime(df['Date']).dt.day_name()
# # remove day of the week from date
# # format data column as YYYY-MM-DD
df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')

print(len(df))
df.head(20)



32613


Unnamed: 0,Date,Conference,Away_Team,Away_Team_Link,Away_Score,Home_Team,Home_Team_Link,Home_Score,OT,Game_Notes,Game_ID,Day
0,1901-12-05,Non-Conference,Yale,/reports/team/Yale/59,4,St Nicks,,2,,,"Thursday, December 5, 1901_St Nicks_Yale",Thursday
1,1901-12-14,Non-Conference,Yale,/reports/team/Yale/59,1,New York AC,,4,,,"Saturday, December 14, 1901_New York AC_Yale",Saturday
2,1902-01-02,Non-Conference,Yale,/reports/team/Yale/59,2,Pittsburgh AC,,5,,,"Thursday, January 2, 1902_Pittsburgh AC_Yale",Thursday
3,1902-01-03,Non-Conference,Yale,/reports/team/Yale/59,2,Pitt Keystones,,6,,,"Friday, January 3, 1902_Pitt Keystones_Yale",Friday
4,1902-01-04,Non-Conference,Yale,/reports/team/Yale/59,2,All Scholastic,,0,,,"Saturday, January 4, 1902_All Scholastic_Yale",Saturday
5,1902-01-04,Non-Conference,Yale,/reports/team/Yale/59,2,Bankers (Pitt),,3,,,"Saturday, January 4, 1902_Bankers (Pitt)_Yale",Saturday
6,1902-01-15,Intercollegiate Leag,Princeton,/reports/team/Princeton/45,0,Yale,/reports/team/Yale/59,7,,"at St. Nicholas Rink, New York","Wednesday, January 15, 1902_Yale_Princeton",Wednesday
7,1902-01-18,Intercollegiate Leag,Harvard,/reports/team/Harvard/22,4,Columbia,,3,,,"Saturday, January 18, 1902_Columbia_Harvard",Saturday
8,1902-01-25,Non-Conference,Brown,/reports/team/Brown/12,1,Yale,/reports/team/Yale/59,11,,"at St. Nicholas Rink, New York","Saturday, January 25, 1902_Yale_Brown",Saturday
9,1902-01-29,Non-Conference,Yale Alumni,,3,Yale,/reports/team/Yale/59,4,,"at St. Nicholas Rink, New York","Wednesday, January 29, 1902_Yale_Yale Alumni",Wednesday


In [8]:
# Store the df in Memory
pre_2002_results_df = df.copy()

## Save data to CSV File
pre_2002_results_df.to_csv('../TEMP/pre_2002_college_hockey_results.csv', index=False)

In [9]:
# Combine the two dataframes into a single dataframe
combined_results_df = pd.concat([pre_2002_results_df, modern_results_df], ignore_index=True)

# Save the combined dataframe to a CSV file
combined_results_df.to_csv('../TEMP/all_time_college_hockey_results.csv', index=False)