In [1]:
import os
import time
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
DOWNLOADS_PATH = 'C:\\Users\\riger\\Downloads'
SCRIPT_PATH = os.getcwd()
NHL_CSV_2023_PATH = os.path.join(SCRIPT_PATH, 'nhl_data_2023.csv')
NHL_CSV_2024_PATH = os.path.join(SCRIPT_PATH, 'nhl_data_2024.csv')
COMBINED_CSV_PATH = os.path.join(SCRIPT_PATH, 'combined.csv')
GOALIE_2023_CSV_PATH = os.path.join(SCRIPT_PATH, 'goalie_2023_data.csv')
GOALIE_2024_CSV_PATH = os.path.join(SCRIPT_PATH, 'goalie_2024_data.csv')
BASE_URL_2023 = 'https://www.nhl.com/stats/teams?aggregate=0&report=daysbetweengames&reportType=game&dateFrom=2023-10-10&dateTo=2024-04-18&gameType=2&sort=a_gameDate&page={}&pageSize=100'
BASE_URL_2024 = 'https://www.nhl.com/stats/teams?aggregate=0&report=daysbetweengames&reportType=game&dateFrom=2024-10-04&dateTo=2025-01-11&gameType=2&sort=a_gameDate&page={}&pageSize=100'
GOALIE_URL_2023 = 'https://www.nhl.com/stats/goalies?aggregate=0&reportType=game&dateFrom=2023-10-10&dateTo=2024-04-18&gameType=2&sort=a_gameDate&page={}&pageSize=100'
GOALIE_URL_2024 = 'https://www.nhl.com/stats/goalies?aggregate=0&reportType=game&dateFrom=2024-10-04&dateTo=2025-01-11&gameType=2&sort=a_gameDate&page={}&pageSize=100'

In [3]:
# Setup Chrome options for automatic download
chrome_options = Options()
chrome_options.add_experimental_option('prefs', {
    'download.default_directory': DOWNLOADS_PATH,
    'download.prompt_for_download': False,
    'download.directory_upgrade': True,
    'safebrowsing.enabled': True
})

In [4]:
service = Service('chromedriver.exe')
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.maximize_window()

try:
    combined_data = pd.DataFrame()

    for page in range(0, 27):
        url = BASE_URL_2023.format(page)
        driver.get(url)

        # Wait for the export link to be present and clickable
        export_link = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/main/div/div/div/div[2]/div/div[2]/div/main/div[2]/h4/a'))
        )

        # Scroll to the element
        driver.execute_script("arguments[0].scrollIntoView(true);", export_link)
        export_link.click()
        time.sleep(5)  # Wait for the file to download

        # Process the downloaded file
        downloaded_file = os.path.join(DOWNLOADS_PATH, 'Days between Games.xlsx')
        if os.path.exists(downloaded_file):
            data = pd.read_excel(downloaded_file)
            combined_data = pd.concat([combined_data, data], ignore_index=True)
            os.remove(downloaded_file)  # Remove the downloaded file after processing

    # Save the combined data as a CSV file
    if os.path.exists('Days between Games.xlsx'):
        os.remove('Days between Games.xlsx')
    combined_data.to_csv(NHL_CSV_2023_PATH, index=False)
    print(f"Combined CSV saved at: {NHL_CSV_2023_PATH}")

finally:
    driver.quit()

Combined CSV saved at: c:\Users\riger\Desktop\dataScraper\NHL_data_scraper\nhl_data_2023.csv


In [5]:
service = Service('chromedriver.exe')
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.maximize_window()

try:
    combined_data = pd.DataFrame()

    for page in range(0, 14):
        url = BASE_URL_2024.format(page)
        driver.get(url)

        # Wait for the export link to be present and clickable
        export_link = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/main/div/div/div/div[2]/div/div[2]/div/main/div[2]/h4/a'))
        )

        # Scroll to the element
        driver.execute_script("arguments[0].scrollIntoView(true);", export_link)
        export_link.click()
        time.sleep(5)  # Wait for the file to download

        # Process the downloaded file
        downloaded_file = os.path.join(DOWNLOADS_PATH, 'Days between Games.xlsx')
        if os.path.exists(downloaded_file):
            data = pd.read_excel(downloaded_file)
            combined_data = pd.concat([combined_data, data], ignore_index=True)
            os.remove(downloaded_file)  # Remove the downloaded file after processing

    # Save the combined data as a CSV file
    if os.path.exists('Days between Games.xlsx'):
        os.remove('Days between Games.xlsx')
    combined_data.to_csv(NHL_CSV_2024_PATH, index=False)
    print(f"Combined CSV saved at: {NHL_CSV_2024_PATH}")

finally:
    driver.quit()

Combined CSV saved at: c:\Users\riger\Desktop\dataScraper\NHL_data_scraper\nhl_data_2024.csv


In [6]:
service = Service('chromedriver.exe')
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.maximize_window()

try:
    combined_data = pd.DataFrame()

    for page in range(0, 28):
        url = GOALIE_URL_2023.format(page)
        driver.get(url)

        # Wait for the export link to be present and clickable
        export_link = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/main/div/div/div/div[2]/div/div[2]/div/main/div[2]/h4/a'))
        )

        # Scroll to the element
        driver.execute_script("arguments[0].scrollIntoView(true);", export_link)
        export_link.click()
        time.sleep(5)  # Wait for the file to download

        # Process the downloaded file
        downloaded_file = os.path.join(DOWNLOADS_PATH, 'Summary.xlsx')
        if os.path.exists(downloaded_file):
            data = pd.read_excel(downloaded_file)
            combined_data = pd.concat([combined_data, data], ignore_index=True)
            os.remove(downloaded_file)  # Remove the downloaded file after processing

    # Save the combined data as a CSV file
    if os.path.exists('Summary.xlsx'):
        os.remove('Summary.xlsx')
    combined_data.to_csv(GOALIE_2023_CSV_PATH, index=False)
    print(f"Combined CSV saved at: {GOALIE_2023_CSV_PATH}")

finally:
    driver.quit()

Combined CSV saved at: c:\Users\riger\Desktop\dataScraper\NHL_data_scraper\goalie_2023_data.csv


In [7]:
service = Service('chromedriver.exe')
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.maximize_window()

try:
    combined_data = pd.DataFrame()

    for page in range(0, 15):
        url = GOALIE_URL_2024.format(page)
        driver.get(url)

        # Wait for the export link to be present and clickable
        export_link = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/main/div/div/div/div[2]/div/div[2]/div/main/div[2]/h4/a'))
        )

        # Scroll to the element
        driver.execute_script("arguments[0].scrollIntoView(true);", export_link)
        export_link.click()
        time.sleep(5)  # Wait for the file to download

        # Process the downloaded file
        downloaded_file = os.path.join(DOWNLOADS_PATH, 'Summary.xlsx')
        if os.path.exists(downloaded_file):
            data = pd.read_excel(downloaded_file)
            combined_data = pd.concat([combined_data, data], ignore_index=True)
            os.remove(downloaded_file)  # Remove the downloaded file after processing

    # Save the combined data as a CSV file
    if os.path.exists('Summary.xlsx'):
        os.remove('Summary.xlsx')
    combined_data.to_csv(GOALIE_2024_CSV_PATH, index=False)
    print(f"Combined CSV saved at: {GOALIE_2024_CSV_PATH}")

finally:
    driver.quit()

Combined CSV saved at: c:\Users\riger\Desktop\dataScraper\NHL_data_scraper\goalie_2024_data.csv


In [8]:
hockey_ref_url_2023 = 'https://www.hockey-reference.com/leagues/NHL_2024_games.html'
hockey_ref_dfs = pd.read_html(hockey_ref_url_2023)
hockey_ref_dfs[0].to_csv('nhl_ref_data_2023.csv', index=False)

hockey_ref_url_2024 = 'https://www.hockey-reference.com/leagues/NHL_2025_games.html'
hockey_ref_dfs = pd.read_html(hockey_ref_url_2024)
hockey_ref_dfs[0].to_csv('nhl_ref_data_2024.csv', index=False)

In [9]:
nhl_ref_data_2023 = pd.read_csv('nhl_ref_data_2023.csv')
nhl_ref_data_2024 = pd.read_csv('nhl_ref_data_2024.csv')

In [10]:
# Changing all mentions of Arizona Coyotes to Utah Hockey Club in 2023 reference data
nhl_ref_data_2023 = nhl_ref_data_2023.replace('Arizona Coyotes', 'Utah Hockey Club')

In [11]:
nhl_data_2023 = pd.read_csv('nhl_data_2023.csv')
nhl_data_2023 = nhl_data_2023.sort_values(by="Game Date")

# Changing all Montréal Canadiens to Montreal Canadiens in 2023 data
nhl_data_2023 = nhl_data_2023.replace('Montréal Canadiens', 'Montreal Canadiens')
nhl_data_2023 = nhl_data_2023.replace('Arizona Coyotes', 'Utah Hockey Club')

def add_home_column(final_df, temp_df):
    # step 1: loop thorugh the temp df
    # step 2: get display a second df from the main where the dates match the temp df
    # step 3: loop through the second df until 'Team' matches 'Home' and 'Date' matches 'Date' in the temp df
    # step 4: add the 'Home' column to the main df for that row with the value of 1
    for i in range(len(temp_df)):  # Iterate over rows in temp_df
        temp_date = temp_df['Date'][i]
        new_df = final_df[final_df['Game Date'] == temp_date]
        
        for j in range(len(new_df)):
            if new_df['Team'].iloc[j] == temp_df['Home'].iloc[i]:
                if 'Venue' not in final_df.columns:
                    final_df['Venue'] = 'Away'
                
                # Update the specific row in the original df
                final_df.loc[new_df.index[j], 'Venue'] = 'Home'
    
    return final_df

In [12]:
nhl_data_2024 = pd.read_csv('nhl_data_2024.csv')
nhl_data_2024 = nhl_data_2024.sort_values(by="Game Date")

# Changing all Montréal Canadiens to Montreal Canadiens in 2024 data
nhl_data_2024 = nhl_data_2024.replace('Montréal Canadiens', 'Montreal Canadiens')

def add_home_column(final_df, temp_df):
    # step 1: loop thorugh the temp df
    # step 2: get display a second df from the main where the dates match the temp df
    # step 3: loop through the second df until 'Team' matches 'Home' and 'Date' matches 'Date' in the temp df
    # step 4: add the 'Home' column to the main df for that row with the value of 1
    for i in range(len(temp_df)):  # Iterate over rows in temp_df
        temp_date = temp_df['Date'][i]
        new_df = final_df[final_df['Game Date'] == temp_date]
        
        for j in range(len(new_df)):
            if new_df['Team'].iloc[j] == temp_df['Home'].iloc[i]:
                if 'Venue' not in final_df.columns:
                    final_df['Venue'] = 'Away'
                
                # Update the specific row in the original df
                final_df.loc[new_df.index[j], 'Venue'] = 'Home'
    
    return final_df

In [13]:
final_nhl_data_2023 = add_home_column(nhl_data_2023, nhl_ref_data_2023)
final_nhl_data_2023 = final_nhl_data_2023.dropna()

final_nhl_data_2024 = add_home_column(nhl_data_2024, nhl_ref_data_2024)
final_nhl_data_2024 = final_nhl_data_2024.dropna()

In [14]:
def add_time_column(final_df, temp_df):
    # Ensure 'Time' column exists in final_df
    if 'Time' not in final_df.columns:
        final_df['Time'] = None

    # Iterate over rows in temp_df
    for i in range(len(temp_df)):
        temp_time = temp_df['Time'].iloc[i]
        temp_date = temp_df['Date'].iloc[i]
        temp_home = temp_df['Home'].iloc[i]
        temp_visitor = temp_df['Visitor'].iloc[i]

        # Filter final_df for matching date
        matching_rows = final_df[final_df['Game Date'] == temp_date]

        for j in range(len(matching_rows)):
            team = matching_rows['Team'].iloc[j]

            # Check if the team matches either Home or Visitor in temp_df
            if team == temp_home or team == temp_visitor:
                # Update the 'Time' column in the original final_df
                final_df.loc[matching_rows.index[j], 'Time'] = temp_time

    return final_df

In [15]:
final_nhl_data_2023 = add_time_column(nhl_data_2023, nhl_ref_data_2023)
final_nhl_data_2024 = add_time_column(nhl_data_2024, nhl_ref_data_2024)

In [16]:
final_nhl_data_2023['Time'] = final_nhl_data_2023['Time'].apply(
    lambda x: pd.to_datetime(x, format='%I:%M %p').strftime('%H:%M') if pd.notnull(x) else None
)

final_nhl_data_2024['Time'] = final_nhl_data_2024['Time'].apply(
    lambda x: pd.to_datetime(x, format='%I:%M %p').strftime('%H:%M') if pd.notnull(x) else None
)

In [17]:
final_nhl_data_2023 = final_nhl_data_2023.sort_values(by="Game Date")
final_nhl_data_2024 = final_nhl_data_2024.sort_values(by="Game Date")

In [18]:
final_nhl_data_2023.rename(columns={'OT':'OT Losses','GD/GP':'Net Goals','Shots/GP':'Shots For','SA/GP':'Shots Against','SD/GP':'Shot Diff'}, inplace=True)
final_nhl_data_2023['Result'] = final_nhl_data_2023.apply(lambda row: 'W' if row['Net Goals'] > 0 else 'L', axis=1)

final_nhl_data_2024.rename(columns={'OT':'OT Losses','GD/GP':'Net Goals','Shots/GP':'Shots For','SA/GP':'Shots Against','SD/GP':'Shot Diff'}, inplace=True)
final_nhl_data_2024['Result'] = final_nhl_data_2023.apply(lambda row: 'W' if row['Net Goals'] > 0 else 'L', axis=1)

In [19]:
final_nhl_data_2023.head()

Unnamed: 0,Team,Opp Team,Game Date,Days Btwn Games,GP,W,L,T,OT Losses,P,...,Shots Against,Shot Diff,PP Opp/GP,TS/GP,PP%,PK%,FOW%,Venue,Time,Result
0,Pittsburgh Penguins,CHI,2023-10-10,4,1,0,1,--,0,0,...,36,5,2,4,0.0,100.0,67.8,Home,20:00,L
1,Tampa Bay Lightning,NSH,2023-10-10,4,1,1,0,--,0,2,...,31,3,5,4,40.0,75.0,56.7,Home,17:30,W
2,Chicago Blackhawks,PIT,2023-10-10,4,1,1,0,--,0,2,...,41,-5,4,2,0.0,100.0,32.2,Away,20:00,W
3,Nashville Predators,TBL,2023-10-10,4,1,0,1,--,0,0,...,34,-3,4,5,25.0,60.0,43.3,Away,17:30,L
4,Vegas Golden Knights,SEA,2023-10-10,4,1,1,0,--,0,2,...,33,-5,4,4,0.0,100.0,59.2,Home,22:30,W


In [20]:
final_nhl_data_2024.head()

Unnamed: 0,Team,Opp Team,Game Date,Days Btwn Games,GP,W,L,T,OT Losses,P,...,Shots Against,Shot Diff,PP Opp/GP,TS/GP,PP%,PK%,FOW%,Venue,Time,Result
0,New Jersey Devils,BUF,2024-10-04,4,1,1,0,--,0,2,...,31,-8,2,4,0.0,100.0,42.4,Away,13:00,L
1,Buffalo Sabres,NJD,2024-10-04,4,1,0,1,--,0,0,...,23,8,4,2,0.0,100.0,57.6,Home,13:00,W
2,New Jersey Devils,BUF,2024-10-05,0,1,1,0,--,0,2,...,18,19,4,2,25.0,100.0,64.7,Home,10:00,W
3,Buffalo Sabres,NJD,2024-10-05,0,1,0,1,--,0,0,...,37,-19,2,4,0.0,75.0,35.3,Away,10:00,L
4,Boston Bruins,FLA,2024-10-08,4,1,0,1,--,0,0,...,35,-7,4,6,25.0,100.0,48.4,Away,19:00,W


In [21]:
nhl_team_abbr = {
    'ANA' : 'Anaheim Ducks',
    'BOS' : 'Boston Bruins',
    'BUF' : 'Buffalo Sabres',
    'CAR' : 'Carolina Hurricanes',
    'CBJ' : 'Columbus Blue Jackets',
    'CGY' : 'Calgary Flames',
    'CHI' : 'Chicago Blackhawks',
    'COL' : 'Colorado Avalanche',
    'DAL' : 'Dallas Stars',
    'DET' : 'Detroit Red Wings',
    'EDM' : 'Edmonton Oilers',
    'FLA' : 'Florida Panthers',
    'LAK' : 'Los Angeles Kings',
    'MIN' : 'Minnesota Wild',
    'MTL' : 'Montreal Canadiens',
    'NJD' : 'New Jersey Devils',
    'NSH' : 'Nashville Predators',
    'NYI' : 'New York Islanders',
    'NYR' : 'New York Rangers',
    'OTT' : 'Ottawa Senators',
    'PHI' : 'Philadelphia Flyers',
    'PIT' : 'Pittsburgh Penguins',
    'SJS' : 'San Jose Sharks',
    'SEA' : 'Seattle Kraken',
    'STL' : 'St. Louis Blues',
    'TBL' : 'Tampa Bay Lightning',
    'TOR' : 'Toronto Maple Leafs',
    'UTA' : 'Utah Hockey Club',
    'VAN' : 'Vancouver Canucks',
    'VGK' : 'Vegas Golden Knights',
    'WPG' : 'Winnipeg Jets',
    'WSH' : 'Washington Capitals',
    'ARI' : 'Utah Hockey Club'
}

In [22]:
goalie_df_2023 = pd.read_csv('goalie_2023_data.csv')
goalie_df_2024 = pd.read_csv('goalie_2024_data.csv')

In [23]:
goalie_df_2023['Team'] = goalie_df_2023['Team'].apply(lambda abbr: nhl_team_abbr.get(abbr, abbr))

In [24]:
goalie_df_2024['Team'] = goalie_df_2024['Team'].apply(lambda abbr: nhl_team_abbr.get(abbr, abbr))

In [25]:
final_nhl_data_2023 = final_nhl_data_2023.merge(goalie_df_2023[['Team', 'Game Date', 'Player', 'Sv%']], 
                                      left_on=['Team', 'Game Date'], 
                                      right_on=['Team', 'Game Date'], 
                                      how='left')

final_nhl_data_2023['goalie'] = final_nhl_data_2023['Player']
final_nhl_data_2023.drop(columns=['Player'], inplace=True)
# final_nhl_data_2023.drop(columns=['Team'], inplace=True)
# final_nhl_data_2023.drop(columns=['Game Date'], inplace=True)

In [26]:
final_nhl_data_2024 = final_nhl_data_2024.merge(goalie_df_2024[['Team', 'Game Date', 'Player', 'Sv%']], 
                                      left_on=['Team', 'Game Date'], 
                                      right_on=['Team', 'Game Date'], 
                                      how='left')

final_nhl_data_2024['goalie'] = final_nhl_data_2024['Player']
final_nhl_data_2024.drop(columns=['Player'], inplace=True)
# final_nhl_data_2024.drop(columns=['Team'], inplace=True)
# final_nhl_data_2024.drop(columns=['Game Date'], inplace=True)

In [27]:
final_nhl_data_2024.head()

Unnamed: 0,Team,Opp Team,Game Date,Days Btwn Games,GP,W,L,T,OT Losses,P,...,PP Opp/GP,TS/GP,PP%,PK%,FOW%,Venue,Time,Result,Sv%,goalie
0,New Jersey Devils,BUF,2024-10-04,4,1,1,0,--,0,2,...,2,4,0.0,100.0,42.4,Away,13:00,L,0.968,Jacob Markstrom
1,Buffalo Sabres,NJD,2024-10-04,4,1,0,1,--,0,0,...,4,2,0.0,100.0,57.6,Home,13:00,W,0.864,Ukko-Pekka Luukkonen
2,New Jersey Devils,BUF,2024-10-05,0,1,1,0,--,0,2,...,4,2,25.0,100.0,64.7,Home,10:00,W,0.944,Jake Allen
3,Buffalo Sabres,NJD,2024-10-05,0,1,0,1,--,0,0,...,2,4,0.0,75.0,35.3,Away,10:00,L,0.919,Devon Levi
4,Boston Bruins,FLA,2024-10-08,4,1,0,1,--,0,0,...,4,6,25.0,100.0,48.4,Away,19:00,W,0.829,Joonas Korpisalo


In [28]:
final_nhl_data_2023['Opp Team'] = final_nhl_data_2023['Opp Team'].apply(lambda abbr: nhl_team_abbr.get(abbr, abbr))
final_nhl_data_2024['Opp Team'] = final_nhl_data_2024['Opp Team'].apply(lambda abbr: nhl_team_abbr.get(abbr, abbr))

In [29]:
final_nhl_data_2023.drop(columns=['W', 'L', 'T', 'GP', 'OT Losses', 'P', 'P%'], inplace=True)
final_nhl_data_2024.drop(columns=['W', 'L', 'T', 'GP', 'OT Losses', 'P', 'P%'], inplace=True)

In [30]:
# Step 4: Remove the other temp csv files
# os.remove('nhl_ref_data.csv')
# os.remove('combined.csv')
# os.remove('goalie_data.csv')

In [31]:
final_nhl_data_2023.columns = [x.lower() for x in final_nhl_data_2023.columns]
final_nhl_data_2024.columns = [x.lower() for x in final_nhl_data_2024.columns]

In [32]:
final_nhl_data_2023.head()

Unnamed: 0,team,opp team,game date,days btwn games,gf/gp,ga/gp,net goals,shots for,shots against,shot diff,pp opp/gp,ts/gp,pp%,pk%,fow%,venue,time,result,sv%,goalie
0,Pittsburgh Penguins,Chicago Blackhawks,2023-10-10,4,2,4,-2,41,36,5,2,4,0.0,100.0,67.8,Home,20:00,L,0.914,Tristan Jarry
1,Tampa Bay Lightning,Nashville Predators,2023-10-10,4,5,3,2,34,31,3,5,4,40.0,75.0,56.7,Home,17:30,W,0.903,Jonas Johansson
2,Chicago Blackhawks,Pittsburgh Penguins,2023-10-10,4,4,2,2,36,41,-5,4,2,0.0,100.0,32.2,Away,20:00,W,0.951,Petr Mrazek
3,Nashville Predators,Tampa Bay Lightning,2023-10-10,4,3,5,-2,31,34,-3,4,5,25.0,60.0,43.3,Away,17:30,L,0.879,Juuse Saros
4,Vegas Golden Knights,Seattle Kraken,2023-10-10,4,4,1,3,28,33,-5,4,4,0.0,100.0,59.2,Home,22:30,W,0.97,Adin Hill


In [33]:
final_nhl_data_2024.head()

Unnamed: 0,team,opp team,game date,days btwn games,gf/gp,ga/gp,net goals,shots for,shots against,shot diff,pp opp/gp,ts/gp,pp%,pk%,fow%,venue,time,result,sv%,goalie
0,New Jersey Devils,Buffalo Sabres,2024-10-04,4,4,1,3,23,31,-8,2,4,0.0,100.0,42.4,Away,13:00,L,0.968,Jacob Markstrom
1,Buffalo Sabres,New Jersey Devils,2024-10-04,4,1,4,-3,31,23,8,4,2,0.0,100.0,57.6,Home,13:00,W,0.864,Ukko-Pekka Luukkonen
2,New Jersey Devils,Buffalo Sabres,2024-10-05,0,3,1,2,37,18,19,4,2,25.0,100.0,64.7,Home,10:00,W,0.944,Jake Allen
3,Buffalo Sabres,New Jersey Devils,2024-10-05,0,1,3,-2,18,37,-19,2,4,0.0,75.0,35.3,Away,10:00,L,0.919,Devon Levi
4,Boston Bruins,Florida Panthers,2024-10-08,4,4,6,-2,28,35,-7,4,6,25.0,100.0,48.4,Away,19:00,W,0.829,Joonas Korpisalo


In [34]:
final_nhl_data_2023 = final_nhl_data_2023.rename(columns={"game date": "date", "opp team": "opponent", "days btwn games": "rest days"})
final_nhl_data_2024 = final_nhl_data_2024.rename(columns={"game date": "date", "opp team": "opponent", "days btwn games": "rest days"})

In [35]:
final_nhl_data_2023.head()

Unnamed: 0,team,opponent,date,rest days,gf/gp,ga/gp,net goals,shots for,shots against,shot diff,pp opp/gp,ts/gp,pp%,pk%,fow%,venue,time,result,sv%,goalie
0,Pittsburgh Penguins,Chicago Blackhawks,2023-10-10,4,2,4,-2,41,36,5,2,4,0.0,100.0,67.8,Home,20:00,L,0.914,Tristan Jarry
1,Tampa Bay Lightning,Nashville Predators,2023-10-10,4,5,3,2,34,31,3,5,4,40.0,75.0,56.7,Home,17:30,W,0.903,Jonas Johansson
2,Chicago Blackhawks,Pittsburgh Penguins,2023-10-10,4,4,2,2,36,41,-5,4,2,0.0,100.0,32.2,Away,20:00,W,0.951,Petr Mrazek
3,Nashville Predators,Tampa Bay Lightning,2023-10-10,4,3,5,-2,31,34,-3,4,5,25.0,60.0,43.3,Away,17:30,L,0.879,Juuse Saros
4,Vegas Golden Knights,Seattle Kraken,2023-10-10,4,4,1,3,28,33,-5,4,4,0.0,100.0,59.2,Home,22:30,W,0.97,Adin Hill


In [36]:
final_nhl_data_2024.head()

Unnamed: 0,team,opponent,date,rest days,gf/gp,ga/gp,net goals,shots for,shots against,shot diff,pp opp/gp,ts/gp,pp%,pk%,fow%,venue,time,result,sv%,goalie
0,New Jersey Devils,Buffalo Sabres,2024-10-04,4,4,1,3,23,31,-8,2,4,0.0,100.0,42.4,Away,13:00,L,0.968,Jacob Markstrom
1,Buffalo Sabres,New Jersey Devils,2024-10-04,4,1,4,-3,31,23,8,4,2,0.0,100.0,57.6,Home,13:00,W,0.864,Ukko-Pekka Luukkonen
2,New Jersey Devils,Buffalo Sabres,2024-10-05,0,3,1,2,37,18,19,4,2,25.0,100.0,64.7,Home,10:00,W,0.944,Jake Allen
3,Buffalo Sabres,New Jersey Devils,2024-10-05,0,1,3,-2,18,37,-19,2,4,0.0,75.0,35.3,Away,10:00,L,0.919,Devon Levi
4,Boston Bruins,Florida Panthers,2024-10-08,4,4,6,-2,28,35,-7,4,6,25.0,100.0,48.4,Away,19:00,W,0.829,Joonas Korpisalo


In [37]:
final_nhl_data_2023.columns

Index(['team', 'opponent', 'date', 'rest days', 'gf/gp', 'ga/gp', 'net goals',
       'shots for', 'shots against', 'shot diff', 'pp opp/gp', 'ts/gp', 'pp%',
       'pk%', 'fow%', 'venue', 'time', 'result', 'sv%', 'goalie'],
      dtype='object')

In [38]:
final_nhl_data_2024.columns

Index(['team', 'opponent', 'date', 'rest days', 'gf/gp', 'ga/gp', 'net goals',
       'shots for', 'shots against', 'shot diff', 'pp opp/gp', 'ts/gp', 'pp%',
       'pk%', 'fow%', 'venue', 'time', 'result', 'sv%', 'goalie'],
      dtype='object')

In [39]:
new_order = ['date', 'time', 'venue', 'rest days', 'result', 'gf/gp', 'ga/gp', 'net goals', 'opponent', 'shots for', 'shots against', 'shot diff', 'pp opp/gp', 'ts/gp', 'pp%',
       'pk%', 'fow%', 'team', 'goalie', 'sv%']

In [40]:
final_nhl_data_2023 = final_nhl_data_2023[new_order]
final_nhl_data_2024 = final_nhl_data_2024[new_order]

In [41]:
final_nhl_data_2023

Unnamed: 0,date,time,venue,rest days,result,gf/gp,ga/gp,net goals,opponent,shots for,shots against,shot diff,pp opp/gp,ts/gp,pp%,pk%,fow%,team,goalie,sv%
0,2023-10-10,20:00,Home,4,L,2,4,-2,Chicago Blackhawks,41,36,5,2,4,0.0,100.0,67.8,Pittsburgh Penguins,Tristan Jarry,0.914
1,2023-10-10,17:30,Home,4,W,5,3,2,Nashville Predators,34,31,3,5,4,40.0,75.0,56.7,Tampa Bay Lightning,Jonas Johansson,0.903
2,2023-10-10,20:00,Away,4,W,4,2,2,Pittsburgh Penguins,36,41,-5,4,2,0.0,100.0,32.2,Chicago Blackhawks,Petr Mrazek,0.951
3,2023-10-10,17:30,Away,4,L,3,5,-2,Tampa Bay Lightning,31,34,-3,4,5,25.0,60.0,43.3,Nashville Predators,Juuse Saros,0.879
4,2023-10-10,22:30,Home,4,W,4,1,3,Seattle Kraken,28,33,-5,4,4,0.0,100.0,59.2,Vegas Golden Knights,Adin Hill,0.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2792,2024-04-18,22:30,Home,2,W,5,4,1,Chicago Blackhawks,35,13,22,4,1,50.0,0.0,63.8,Los Angeles Kings,Cam Talbot,0.692
2793,2024-04-18,21:00,Away,2,L,1,5,-4,Calgary Flames,17,37,-20,2,1,50.0,100.0,58.1,San Jose Sharks,Devin Cooley,0.783
2794,2024-04-18,21:00,Away,2,L,1,5,-4,Calgary Flames,17,37,-20,2,1,50.0,100.0,58.1,San Jose Sharks,Georgi Romanov,1
2795,2024-04-18,19:00,Home,2,L,3,4,-1,Seattle Kraken,24,27,-3,4,3,50.0,66.7,41.2,Minnesota Wild,Marc-Andre Fleury,0.885


In [42]:
final_nhl_data_2024

Unnamed: 0,date,time,venue,rest days,result,gf/gp,ga/gp,net goals,opponent,shots for,shots against,shot diff,pp opp/gp,ts/gp,pp%,pk%,fow%,team,goalie,sv%
0,2024-10-04,13:00,Away,4,L,4,1,3,Buffalo Sabres,23,31,-8,2,4,0.0,100.0,42.4,New Jersey Devils,Jacob Markstrom,0.968
1,2024-10-04,13:00,Home,4,W,1,4,-3,New Jersey Devils,31,23,8,4,2,0.0,100.0,57.6,Buffalo Sabres,Ukko-Pekka Luukkonen,0.864
2,2024-10-05,10:00,Home,0,W,3,1,2,Buffalo Sabres,37,18,19,4,2,25.0,100.0,64.7,New Jersey Devils,Jake Allen,0.944
3,2024-10-05,10:00,Away,0,L,1,3,-2,New Jersey Devils,18,37,-19,2,4,0.0,75.0,35.3,Buffalo Sabres,Devon Levi,0.919
4,2024-10-08,19:00,Away,4,W,4,6,-2,Florida Panthers,28,35,-7,4,6,25.0,100.0,48.4,Boston Bruins,Joonas Korpisalo,0.829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1430,2025-01-11,19:00,Away,1,L,0,3,-3,Winnipeg Jets,22,29,-7,2,0,0,--,39.1,Colorado Avalanche,Mackenzie Blackwood,0.929
1431,2025-01-11,22:00,Home,3,L,2,1,1,Los Angeles Kings,18,32,-14,4,3,25,100,35.3,Calgary Flames,Dustin Wolf,0.969
1432,2025-01-11,19:00,Home,0,L,3,4,-1,Edmonton Oilers,22,34,-12,1,1,0,0,56.3,Chicago Blackhawks,Arvid Soderblom,0.882
1433,2025-01-11,20:00,Home,3,L,1,4,-3,Washington Capitals,33,16,17,3,2,33.3,50,51.0,Nashville Predators,Juuse Saros,0.8


In [43]:
final_nhl_data_2023.to_csv('final_nhl_data_2023.csv', index=False)
final_nhl_data_2024.to_csv('final_nhl_data_2024.csv', index=False)