# NHL DATA SCRAPER v.2.0

### Step 1
Import all required modules

In [52]:
import os
import time
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

### Step 2
1. Global variables
2. Set up chrome options for automatic download

In [78]:
# --- Filesystem locations ---------------------------------------------------
# Folder the browser downloads into; the same value is handed to Chrome as its
# default download directory. Built from the current user's home directory so
# the notebook is not tied to one specific Windows account.
DOWNLOADS_PATH = os.path.join(os.path.expanduser('~'), 'Downloads')
# Working directory at the moment this cell runs; the CSV paths below are
# anchored to it.
SCRIPT_PATH = os.getcwd()
# Output files, all inside the 'generated' sub-folder of SCRIPT_PATH.
NHL_STATS_CSV_PATH = os.path.join(SCRIPT_PATH, './generated/nhl_stats_data.csv')
NHL_GOALIE_CSV_PATH = os.path.join(SCRIPT_PATH, './generated/nhl_goalie_data.csv')
COMBINED_CSV_PATH = os.path.join(SCRIPT_PATH, './generated/final_nhl_stats.csv')

# --- NHL.com report URLs ----------------------------------------------------
# Each URL has a single '{}' placeholder in its 'page=' parameter, to be filled
# with str.format(page_number); every page holds up to 100 rows (pageSize=100).
# Both reports span seasonFrom=20212022 to seasonTo=20242025 with homeRoad=H.
BASE_STATS_URL = 'https://www.nhl.com/stats/teams?aggregate=0&report=daysbetweengames&reportType=game&seasonFrom=20212022&seasonTo=20242025&dateFromSeason&gameType=2&homeRoad=H&sort=a_gameDate&page={}&pageSize=100'
BASE_GOALIE_URL = 'https://www.nhl.com/stats/goalies?aggregate=0&reportType=game&seasonFrom=20212022&seasonTo=20242025&dateFromSeason&gameType=2&homeRoad=H&sort=a_gameDate&page={}&pageSize=100'

In [54]:
# Preferences handed to Chrome through the experimental 'prefs' option.
# They point Chrome's default download directory at DOWNLOADS_PATH and switch
# off the per-download prompt.
download_prefs = {
    'download.default_directory': DOWNLOADS_PATH,
    'download.prompt_for_download': False,
    'download.directory_upgrade': True,
    'safebrowsing.enabled': True,
}

chrome_options = Options()
chrome_options.add_experimental_option('prefs', download_prefs)

### Step 4
Start scraping stats
- This will open every page from 0 to 52
- It will download the Excel export (`Days between Games.xlsx`) to your downloads folder, read it, and concatenate it onto the `combined_data` pandas df
- Finally, remove the downloaded file
- Repeat with all pages

In [55]:
# Driver setup
# Start a Chrome session through the local chromedriver binary, applying the
# download preferences held in chrome_options, then maximize the window.
# NOTE(review): 'chromedriver.exe' is a relative path - presumably resolved
# against the notebook's working directory; confirm.
# NOTE(review): the only driver.quit() calls visible in this notebook sit in
# the commented-out scraping cells, so running this cell on its own leaves the
# browser session open.

service = Service('chromedriver.exe')
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.maximize_window()

In [56]:
# try:
#     # Create csv directory if doesn't exist (will be used at the end)
#     if not os.path.exists(NHL_STATS_CSV_PATH):
#         os.makedirs('generated', exist_ok=True)  # create 'generated' folder if needed
        
#         with open(NHL_STATS_CSV_PATH, 'w') as f:
#             f.write("")
#             print("Created file:", NHL_STATS_CSV_PATH)

#     combined_data = pd.DataFrame()

#     for page in range(0, 53):
#         url = BASE_STATS_URL.format(page)
#         driver.get(url)

#         # Wait for the export link to be present and clickable
#         export_link = WebDriverWait(driver, 20).until(
#             EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/main/div/div/div/div[2]/div/div[2]/div/main/div[2]/h4/a'))
#         )

#         # Scroll to the element
#         driver.execute_script("arguments[0].scrollIntoView(true);", export_link)
#         export_link.click()
#         time.sleep(5)  # Wait for the file to download

#         # Process the downloaded file
#         downloaded_file = os.path.join(DOWNLOADS_PATH, 'Days between Games.xlsx')
#         if os.path.exists(downloaded_file):
#             data = pd.read_excel(downloaded_file)
#             combined_data = pd.concat([combined_data, data], ignore_index=True)
#             os.remove(downloaded_file)  # Remove the downloaded file after processing

#     # Save the combined data as a CSV file
#     if os.path.exists('Days between Games.xlsx'):
#         os.remove('Days between Games.xlsx')

#     combined_data.to_csv(NHL_STATS_CSV_PATH, index=False)
#     print(f"Combined CSV saved at: {NHL_STATS_CSV_PATH}")

# finally:
#     driver.quit()

### Step 5
Scrape goalie data
* This will scrape all pages from 0 to 55 (`range(0, 56)`)
* Like before, will download an Excel export (`Summary.xlsx`) and read it into a pandas df
* combine all pages into one csv

In [57]:
# try:
#     # Create csv directory if doesn't exist (will be used at the end)
#     if not os.path.exists(NHL_GOALIE_CSV_PATH):
#         if not os.path.exists('generated'):
#             os.makedirs('generated', exist_ok=True)  # create 'generated' folder if needed
        
#         with open(NHL_GOALIE_CSV_PATH, 'w') as f:
#             f.write("")
#             print("Created file:", NHL_GOALIE_CSV_PATH)

#     combined_data = pd.DataFrame()

#     for page in range(0, 56):
#         url = BASE_GOALIE_URL.format(page)
#         driver.get(url)

#         # Wait for the export link to be present and clickable
#         export_link = WebDriverWait(driver, 20).until(
#             EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/main/div/div/div/div[2]/div/div[2]/div/main/div[2]/h4/a'))
#         )

#         # Scroll to the element
#         driver.execute_script("arguments[0].scrollIntoView(true);", export_link)
#         export_link.click()
#         time.sleep(5)  # Wait for the file to download

#         # Process the downloaded file
#         downloaded_file = os.path.join(DOWNLOADS_PATH, 'Summary.xlsx')
#         if os.path.exists(downloaded_file):
#             data = pd.read_excel(downloaded_file)
#             combined_data = pd.concat([combined_data, data], ignore_index=True)
#             os.remove(downloaded_file)  # Remove the downloaded file after processing

#     # Save the combined data as a CSV file
#     if os.path.exists('Summary.xlsx'):
#         os.remove('Summary.xlsx')

#     combined_data.to_csv(NHL_GOALIE_CSV_PATH, index=False)
#     print(f"Combined CSV saved at: {NHL_GOALIE_CSV_PATH}")

# finally:
#     driver.quit()

### Step 6
Filter goalie data
* loop through every combination of date + team
* If result has 2 or more rows, reduce to one row
    * to do this, apply this example formula to calculate save %:
        * team_save_percentage = (saves_ullmark + saves_forsberg) / (shots_against_ullmark + shots_against_forsberg)
    * to decide on the name that stays, compare ice times
* **In the end, number of rows in goalie data should match number of rows in stats data**

In [69]:
# Load the scraped goalie game log. The shared path constant is used so this
# read targets exactly the file that the later to_csv calls overwrite, even if
# the working directory has changed since the constants were defined.
goalie_df = pd.read_csv(NHL_GOALIE_CSV_PATH)

In [70]:
def toi_to_seconds(toi_str):
    """Convert a time-on-ice string of the form 'MM:SS' into total seconds.

    Args:
        toi_str: Text with exactly one ':' separating whole minutes from
            whole seconds, e.g. '58:31'.

    Returns:
        The duration as an integer number of seconds.

    Raises:
        ValueError: If the text does not split into exactly two integer parts.
    """
    minutes_text, seconds_text = toi_str.split(':')
    return int(minutes_text) * 60 + int(seconds_text)

# Add a numeric ice-time column (seconds) so goalies can be compared by TOI.
toi_column = goalie_df['TOI']
goalie_df['TOI_seconds'] = toi_column.map(toi_to_seconds)

In [None]:
# Collapse games in which a team used several goalies into one row per
# (Game Date, Team): the goalie with the most ice time is kept and his Sv% is
# replaced by the team-wide value, total saves / total shots against.
# Assumes goalie_df has unique index labels (as produced by pd.read_csv).
grouped = goalie_df.groupby(['Game Date', 'Team'])

rows_to_drop = []  # index labels of the rows that are removed
team_sv_pct = {}   # index label of the kept row -> combined save percentage

for _, group in grouped:
    if len(group) < 2:
        continue  # single-goalie game: nothing to merge

    total_saves = group['Svs'].sum()
    total_shots = group['SA'].sum()

    # idxmax gives the label of the first row with the largest TOI, so exactly
    # one row per game survives even when ice times tie or a row is duplicated.
    keeper_idx = group['TOI_seconds'].idxmax()
    rows_to_drop.extend(group.index[group.index != keeper_idx])

    # No shots against means the save percentage is undefined.
    team_sv_pct[keeper_idx] = (
        round(total_saves / total_shots, 3) if total_shots else float('nan')
    )

# Apply everything once after the loop: a single drop avoids copying the whole
# frame for every multi-goalie game, and nothing is mutated while the groupby
# is still being iterated.
goalie_df = goalie_df.drop(index=rows_to_drop)
for keeper_idx, sv_pct in team_sv_pct.items():
    goalie_df.loc[keeper_idx, 'Sv%'] = sv_pct

# Save the modified DataFrame back over the goalie CSV file
goalie_df.to_csv(NHL_GOALIE_CSV_PATH, index=False)

#### Step 6.1
Expand team name in goalie data to be easily combined with team name (home) in stat data

In [73]:
# Maps the team abbreviations used in the goalie report to full team names so
# the goalie rows can be matched against the team names in the stats report.
nhl_team_abbr = {
    'ANA': 'Anaheim Ducks',
    # 'ARI' resolves to the Utah name, not 'Arizona Coyotes'. A dict literal can
    # hold only one value per key, so a second 'ARI' entry would silently
    # override an earlier one; it is therefore listed exactly once.
    # NOTE(review): confirm the stats report labels Arizona-era games as
    # 'Utah Hockey Club'; if it says 'Arizona Coyotes', the merge on team name
    # will find no goalie for those games.
    'ARI': 'Utah Hockey Club',
    'BOS': 'Boston Bruins',
    'BUF': 'Buffalo Sabres',
    'CAR': 'Carolina Hurricanes',
    'CBJ': 'Columbus Blue Jackets',
    'CGY': 'Calgary Flames',
    'CHI': 'Chicago Blackhawks',
    'COL': 'Colorado Avalanche',
    'DAL': 'Dallas Stars',
    'DET': 'Detroit Red Wings',
    'EDM': 'Edmonton Oilers',
    'FLA': 'Florida Panthers',
    'LAK': 'Los Angeles Kings',
    'MIN': 'Minnesota Wild',
    'MTL': 'Montréal Canadiens',
    'NJD': 'New Jersey Devils',
    'NSH': 'Nashville Predators',
    'NYI': 'New York Islanders',
    'NYR': 'New York Rangers',
    'OTT': 'Ottawa Senators',
    'PHI': 'Philadelphia Flyers',
    'PIT': 'Pittsburgh Penguins',
    'SJS': 'San Jose Sharks',
    'SEA': 'Seattle Kraken',
    'STL': 'St. Louis Blues',
    'TBL': 'Tampa Bay Lightning',
    'TOR': 'Toronto Maple Leafs',
    'UTA': 'Utah Hockey Club',
    'VAN': 'Vancouver Canucks',
    'VGK': 'Vegas Golden Knights',
    'WPG': 'Winnipeg Jets',
    'WSH': 'Washington Capitals',
}

In [74]:
# Reload the goalie data from disk. The shared path constant is used so this
# read hits the same file the to_csv calls in this notebook write to.
goalie_df = pd.read_csv(NHL_GOALIE_CSV_PATH)

In [75]:
# Swap every team abbreviation for its full name; values that are not keys of
# nhl_team_abbr are left exactly as they are.
full_team_names = goalie_df['Team'].map(lambda code: nhl_team_abbr.get(code, code))
goalie_df['Team'] = full_team_names
goalie_df.to_csv(NHL_GOALIE_CSV_PATH, index=False)

### Step 7
Combine **goalie data (name, sv%)** with **stats data** based on date and home team name
* Create a final csv file called 'final_nhl_stats.csv'

In [79]:
# Load both inputs of the final merge through the shared path constants, so
# the files read here are the same ones the earlier cells wrote.
stats_df = pd.read_csv(NHL_STATS_CSV_PATH)
goalie_df = pd.read_csv(NHL_GOALIE_CSV_PATH)

In [80]:
# Attach the goalie name and save percentage to every stats row.
merge_keys = ['Game Date', 'Team']

# Only the join keys plus the two goalie columns are carried over.
goalie_subset = goalie_df[merge_keys + ['Player', 'Sv%']]

# Left join: every stats row is kept; rows without a matching goalie entry get
# missing values in the goalie columns.
merged_stats = stats_df.merge(goalie_subset, how='left', on=merge_keys)
final_nhl_stats_df = merged_stats.rename(columns={'Player': 'Goalie'})

final_nhl_stats_df.to_csv(COMBINED_CSV_PATH, index=False)