In [21]:
### This notebook scrapes game data from College Hockey News and explores it

## Notes - the site's robots.txt file sets these limits for crawlers
# Crawl-delay: 10 (wait 10 seconds between requests)
# Request-rate: 1/5 (at most 1 request every 5 seconds)

# Dependencies
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import json

# Season to scrape, written the way the site's `season` query parameter expects it
# (start year followed by end year, e.g. 2022-2023 -> '20222023')
SEASON = '20222023'

# Set the URL to scrape: the schedule page for SEASON, which lists all results
url = f'https://www.collegehockeynews.com/schedules/?season={SEASON}'


In [22]:
## explore page structure

# Get the page with requests
# response = requests.get(url)

# # Try reading the page with pandas. Result: it returned an odd table structure, so BeautifulSoup is used instead.
# tables = pd.read_html(url)

# tables[0]

# # output as csv
# tables[0].to_csv('../TEMP/2022-2023_season.csv')

In [23]:
## Download the schedule page and parse it for exploration

# Get the page with requests. The timeout stops the cell from hanging forever
# if the server stalls.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error (4xx/5xx) instead of silently parsing an error
# page, which would otherwise produce an empty result further down.
response.raise_for_status()

# Create a BeautifulSoup object
soup = BeautifulSoup(response.text, 'html.parser')

# select the table or tables
tables = soup.find_all('table')

# tables[0] # This appears to be the game results table

In [24]:
# Running context carried from header rows down to the game rows that follow them
current_date = None
current_conference = None
game_notes = None

# One list per game, ordered to match `columns` further down
data = []


def _link_href(cell):
    """Return the href of the first <a> inside `cell`, or None when the cell has no link."""
    anchor = cell.find('a')
    return anchor['href'] if anchor else None


# Every <tr> in the parsed page, in document order
rows = soup.find_all('tr')

for row in rows:
    row_class = row.get('class')

    # Date header row: remember its text for the games that follow
    if row_class == ['stats-section']:
        current_date = row.find('td').text.strip()
        continue

    # Conference header row: remember its text for the games that follow
    if row_class == ['sked-header']:
        current_conference = row.find('td').text.strip()
        continue

    cells = row.find_all('td')

    # A row with exactly two cells is treated as a game-notes row.
    # NOTE(review): this value is replaced further down before any game is
    # recorded, so it never reaches `data` - confirm whether these notes were
    # meant to be attached to a game.
    if len(cells) == 2:
        game_notes = cells[1].text.strip()
        continue

    # Only rows marked valign="top" with at least nine cells are game rows
    if row.get('valign') != 'top' or len(cells) < 9:
        continue

    # Team name/link and score come in two groups: cells 0-1 and cells 3-4
    home_team = cells[0].text.strip()
    home_team_link = _link_href(cells[0])
    home_score = cells[1].text.strip()

    away_team = cells[3].text.strip()
    away_team_link = _link_href(cells[3])
    away_score = cells[4].text.strip()

    ot = cells[5].text.strip()
    box_link = _link_href(cells[7])
    metrics_link = _link_href(cells[8])

    # Notes for this game are read from a <small> tag in the row's last cell
    game_notes_cell = cells[-1].find('small')
    if game_notes_cell:
        game_notes = game_notes_cell.text.strip()
    else:
        game_notes = None

    # Record the game, then clear the notes so they do not carry over
    data.append([
        current_date,
        current_conference,
        game_notes,
        home_team,
        home_team_link,
        home_score,
        away_team,
        away_team_link,
        away_score,
        ot,
        box_link,
        metrics_link,
    ])
    game_notes = None


# Build the DataFrame; column order mirrors the order of each appended list
columns = [
    'Date', 'Conference', 'Game_Notes',
    'Home_Team', 'Home_Team_Link', 'Home_Score',
    'Away_Team', 'Away_Team_Link', 'Away_Score',
    'OT', 'Box_Link', 'Metrics_Link',
]
df = pd.DataFrame(data, columns=columns)


In [25]:
## Derive the day of the week and normalise the date format
# Parse the scraped date strings once and reuse the result for both columns
parsed_dates = pd.to_datetime(df['Date'])

# Day of the week (e.g. 'Saturday') goes in its own column
df['Day'] = parsed_dates.dt.day_name()
# The date itself is stored as a YYYY-MM-DD string
df['Date'] = parsed_dates.dt.strftime('%Y-%m-%d')

### Create a new column for the game ID
## Game ID is "<Date>_<Home_Team>_<Away_Team>", using the full team names.
# An earlier pass built IDs from the last word of each team name, but that
# result was overwritten by this full-name ID before it was ever used, so
# only the ID that actually ends up in the column is computed here.
# Building the list with zip also works when the DataFrame has no rows.
df['Game_ID'] = [
    f'{game_date}_{home_team}_{away_team}'
    for game_date, home_team, away_team in zip(df['Date'], df['Home_Team'], df['Away_Team'])
]


Unnamed: 0,Date,Conference,Game_Notes,Home_Team,Home_Team_Link,Home_Score,Away_Team,Away_Team_Link,Away_Score,OT,Box_Link,Metrics_Link,Day
0,2022-10-01,Exhibition,,Western Ontario,,2,Providence,/reports/team/Providence/46,5,,/box/final/20221001/won/prv/,,Saturday
1,2022-10-01,Exhibition,,Lake Superior,/reports/team/Lake-Superior/24,2,Michigan Tech,/reports/team/Michigan-Tech/33,5,,/box/final/20221001/lss/mtu/,,Saturday
2,2022-10-01,Exhibition,,Toronto,,2,Quinnipiac,/reports/team/Quinnipiac/47,4,,/box/final/20221001/tor/qui/,,Saturday
3,2022-10-01,Exhibition,,Colgate,/reports/team/Colgate/15,4,Rensselaer,/reports/team/Rensselaer/48,2,,/box/final/20221001/clg/ren/,,Saturday
4,2022-10-01,Exhibition,,Sacred Heart,/reports/team/Sacred-Heart/51,3,Massachusetts,/reports/team/Massachusetts/27,2,,/box/final/20221001/sac/uma/,,Saturday


In [27]:
# Preview the first 20 games.
# NOTE(review): this expression is not the last statement in the cell, so the
# notebook presumably does not render it - confirm whether a display was intended.
df.head(20)

# Write the season results to a CSV in the temp folder
season_csv_path = '../TEMP/2022-2023_season.csv'
df.to_csv(season_csv_path)

## Rebind df to an empty DataFrame now that the results are saved
df = pd.DataFrame()
