### Import Modules

In [1]:
import psycopg2
from sqlalchemy import MetaData, Table, Column, String, DateTime, Integer, Float, inspect, create_engine
import pandas as pd
import numpy as np
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from bs4 import BeautifulSoup
import os
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from datetime import datetime, timedelta

### Cleanse Original Raw Data

In [2]:
def cleanse_nhl_raw_data(table_name):
    """Load a raw NHL team-game statistics table from PostgreSQL and cleanse it.

    Steps performed on the loaded frame:
      * normalise two team names ('Montréal Canadiens', 'St. Louis Blues');
      * drop 'GF/GP' and 'GA/GP';
      * replace "--" placeholders in the PP/PK percentage columns and 'N/A' with NaN;
      * coerce the statistic columns to numeric;
      * derive 'Save %', 'Shooting %' and 'Corsi%';
      * split the opponent marker ("@ XXX" / "vs XXX") out of 'Game Date' into
        'Against Team', 'Home Team' and 'Away Team', then parse 'Game Date' as datetime;
      * derive 'Home Team W' (1 when the home team won the game).

    Parameters
    ----------
    table_name : str
        Name of the table in the ``nhl_master_data`` database. It is interpolated
        directly into the SQL text (identifiers cannot be bound parameters), so
        only pass trusted names.

    Returns
    -------
    pandas.DataFrame
        The cleansed frame; derived columns are appended in the order
        'Save %', 'Shooting %', 'Corsi%', 'Against Team', 'Home Team',
        'Away Team', 'Home Team W'.
    """
    # Define the database connection string
    db_string = "postgresql://User_1:postgres@192.168.1.246:5432/nhl_master_data"

    # Read both tables through one connection; the context manager and the
    # finally-block release the connection and its pool even if a query fails
    engine = create_engine(db_string)
    try:
        with engine.connect() as connection:
            df = pd.read_sql(f"SELECT * FROM {table_name}", connection)
            abbreviations_df = pd.read_sql("SELECT * FROM teamname_abbreviations", connection)
    finally:
        engine.dispose()

    # Normalise team names: drop the accent and the period
    df['Team Name'] = df['Team Name'].replace('Montréal Canadiens', 'Montreal Canadiens')
    df['Team Name'] = df['Team Name'].replace('St. Louis Blues', 'St Louis Blues')

    # Drop the per-game-played goal columns
    df = df.drop(columns=['GF/GP', 'GA/GP'])

    # "--" in the penalty-kill columns is treated as 100
    df["PK%"] = df["PK%"].replace("--", 100)
    df["Net PK%"] = df["Net PK%"].replace("--", 100)

    # "--" in the power-play columns is treated as 0
    df["PP%"] = df["PP%"].replace("--", 0)
    df["Net PP%"] = df["Net PP%"].replace("--", 0)

    # Replace "N/A" with NaNs
    df = df.replace('N/A', np.nan)

    # Convert selected columns to numeric type (unparseable values become NaN)
    numeric_columns = ['GP', 'W', 'L', 'P', 'P%', 'RW', 'ROW', 'SO_win', 'GF', 'GA', 'PP%', 'PK%',
                       'Net PP%', 'Net PK%', 'Shots/GP', 'SA/GP', 'FOW%']
    df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

    # Calculate "Save %" and "Shooting %"
    df['Save %'] = (df['SA/GP'] - df['GA']) / df['SA/GP']
    df['Shooting %'] = df['GF'] / df['Shots/GP']

    # Calculate Corsi % (share of shots taken by this team)
    df['Corsi%'] = (df['Shots/GP'] / (df['Shots/GP'] + df['SA/GP'])) * 100

    # Extract the opponent abbreviation: group 0 matches "@ XXX", group 1 matches "vs XXX"
    opponent_abbr = df['Game Date'].str.extract(r'@ (\w{3})|vs (\w{3})')
    df['Against Team'] = opponent_abbr[0].fillna(opponent_abbr[1])

    # Convert the opponent abbreviation to the full team name
    abbr_to_fullname = dict(zip(abbreviations_df['abbreviation'], abbreviations_df['Full Name']))
    df['Against Team'] = df['Against Team'].map(abbr_to_fullname)

    # "@" marks a road game for 'Team Name'; "vs" marks a home game. Rows with
    # neither marker get NaN for both columns instead of breaking the assignment.
    plays_away = df['Game Date'].str.contains('@', regex=False, na=False)
    plays_home = ~plays_away & df['Game Date'].str.contains('vs', regex=False, na=False)
    df['Home Team'] = df['Against Team'].where(plays_away, df['Team Name'].where(plays_home))
    df['Away Team'] = df['Team Name'].where(plays_away, df['Against Team'].where(plays_home))

    # Remove the opponent marker from the Game Date column and parse it as datetime
    df['Game Date'] = df['Game Date'].str.replace(r'@ \w{3}|vs \w{3}', '', regex=True).str.strip()
    df["Game Date"] = pd.to_datetime(df["Game Date"])

    # 'Home Team W': a home row copies its own W; an away row contributes 1 only
    # when it lost (W == 0); every other row is 0
    wins = df['W']
    is_home_row = df['Team Name'] == df['Home Team']
    is_away_row = df['Team Name'] == df['Away Team']
    home_team_w = pd.Series(
        np.where(is_home_row, wins, np.where(is_away_row & (wins == 0), 1, 0)),
        index=df.index,
    )
    # Keep the integer dtype unless a missing W forces floats
    if not home_team_w.isna().any():
        home_team_w = home_team_w.astype('int64')
    df['Home Team W'] = home_team_w

    return df

In [3]:
# Load and cleanse the table "original_raw_data"
df = cleanse_nhl_raw_data("original_raw_data")

In [4]:
# Display the cleansed frame (the rendered output below is truncated with "...")
df

Unnamed: 0,Team Name,Game Date,GP,W,L,T,OT,P,P%,RW,...,FOW%,Season,Type,Save %,Shooting %,Corsi%,Against Team,Home Team,Away Team,Home Team W
0,Boston Bruins,2018-10-03,1,0,1,--,0,0,0.0,0,...,68.3,20182019,Regular Season,0.810811,0.000000,40.322581,Washington Capitals,Washington Capitals,Boston Bruins,1
1,Montreal Canadiens,2018-10-03,1,0,0,--,1,1,0.5,0,...,41.3,20182019,Regular Season,0.884615,0.055556,58.064516,Toronto Maple Leafs,Toronto Maple Leafs,Montreal Canadiens,1
2,Toronto Maple Leafs,2018-10-03,1,1,0,--,0,2,1.0,0,...,58.7,20182019,Regular Season,0.944444,0.115385,41.935484,Montreal Canadiens,Toronto Maple Leafs,Montreal Canadiens,1
3,Washington Capitals,2018-10-03,1,1,0,--,0,2,1.0,1,...,31.7,20182019,Regular Season,1.000000,0.189189,59.677419,Boston Bruins,Washington Capitals,Boston Bruins,1
4,Calgary Flames,2018-10-03,1,0,1,--,0,0,0.0,0,...,57.6,20182019,Regular Season,0.782609,0.057143,60.344828,Vancouver Canucks,Vancouver Canucks,Calgary Flames,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12641,Vegas Golden Knights,2023-06-08,1,0,1,--,,0,0.0,0,...,52.3,20222023,Playoff,0.869565,0.074074,54.000000,Florida Panthers,Florida Panthers,Vegas Golden Knights,1
12642,Florida Panthers,2023-06-10,1,0,1,--,,0,0.0,0,...,53.7,20222023,Playoff,0.903226,0.064516,50.000000,Vegas Golden Knights,Florida Panthers,Vegas Golden Knights,0
12643,Vegas Golden Knights,2023-06-10,1,1,0,--,,2,1.0,1,...,46.3,20222023,Playoff,0.935484,0.096774,50.000000,Florida Panthers,Florida Panthers,Vegas Golden Knights,0
12644,Florida Panthers,2023-06-13,1,0,1,--,,0,0.0,0,...,44.8,20222023,Playoff,0.718750,0.085714,52.238806,Vegas Golden Knights,Vegas Golden Knights,Florida Panthers,1


### Create Rolling Data From Original Data

In [5]:
# Create a database in PostgreSQL to store the rolling team data.
# The cell is safe to re-run: the database is only created when it does not exist yet.

# Connect to PostgreSQL server
conn = psycopg2.connect(
    dbname="postgres",       # Connect to the default PostgreSQL database
    user="User_1",           # Replace with your username
    password="postgres",     # Replace with your password
    host="192.168.1.246",    # Replace with your host
    port="5432"              # Replace with your port
)

try:
    # CREATE DATABASE cannot run inside a transaction block, so autocommit is required
    conn.autocommit = True

    # Create a cursor object
    cur = conn.cursor()
    try:
        # Only create the database when it is not already listed in the catalog
        cur.execute("SELECT 1 FROM pg_database WHERE datname = %s", ("nhl_team_data",))
        if cur.fetchone() is None:
            cur.execute("CREATE DATABASE nhl_team_data")
    finally:
        # Close the cursor even if a statement failed
        cur.close()
finally:
    # Close the connection even if a statement failed
    conn.close()

In [6]:
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, Float, String
import pandas as pd

def calculate_n_days_rolling_stats(group_df, n_values, columns_to_avg=('P%', 'Corsi%')):
    """Append shifted rolling-mean columns to a team's game frame.

    For every window size ``n`` and every column in ``columns_to_avg`` a column
    named ``'<column>_NDays_Rolling_Avg_<n>'`` is added. Each value is the mean of
    up to ``n`` preceding rows (``min_periods=1``), shifted by one row so the
    current row's own value is excluded; the first row is therefore NaN. The
    window counts rows, so the frame should already be in chronological order.

    Parameters
    ----------
    group_df : pandas.DataFrame
        Rows for one team; must contain every column in ``columns_to_avg``.
    n_values : iterable of int
        Window sizes, in rows.
    columns_to_avg : iterable of str, optional
        Columns to average. Defaults to ``('P%', 'Corsi%')``.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns followed by the rolling columns.
        Rolling columns already present in ``group_df`` are replaced instead of
        duplicated, so the function can safely be re-applied to its own output.
    """
    rolling_columns = {}
    for n in n_values:
        for column in columns_to_avg:
            rolling_columns[f'{column}_NDays_Rolling_Avg_{n}'] = (
                group_df[column].rolling(window=n, min_periods=1).mean().shift(1)
            )

    # Drop stale copies first; concatenating onto them would yield duplicate labels
    base_df = group_df.drop(columns=[name for name in rolling_columns if name in group_df.columns])

    if not rolling_columns:
        # Nothing to compute (empty n_values or columns_to_avg)
        return base_df

    return pd.concat([base_df, pd.DataFrame(rolling_columns)], axis=1)

def create_team_table(team_name, df, db_engine=None):
    """Create a PostgreSQL table for one NHL team, typed from the frame's dtypes.

    The table is named ``team_<team name lower-cased, spaces replaced by underscores>``.
    Float columns map to ``Float``, integer columns to ``Integer``, datetime
    columns to ``DateTime`` and everything else to ``String``.

    Parameters
    ----------
    team_name : str
        Team name used to build the table name.
    df : pandas.DataFrame
        Frame whose columns and dtypes define the table schema.
    db_engine : sqlalchemy.engine.Engine, optional
        Engine used to create the table. When omitted, the notebook-level
        ``engine`` variable is used, which must already be defined.

    Returns
    -------
    sqlalchemy.Table
        The table object registered on a fresh ``MetaData``.
    """
    def _sql_type(series):
        # Map a pandas dtype to the matching SQLAlchemy column type
        if pd.api.types.is_float_dtype(series):
            return Float
        if pd.api.types.is_integer_dtype(series):
            return Integer
        if pd.api.types.is_datetime64_any_dtype(series):
            return DateTime
        return String

    metadata = MetaData()
    columns = [Column(col, _sql_type(df[col])) for col in df.columns]
    table_name = f"team_{team_name.replace(' ', '_').lower()}"
    team_table = Table(table_name, metadata, *columns)

    # Fall back to the notebook-level engine to stay compatible with existing callers
    target_engine = engine if db_engine is None else db_engine
    metadata.create_all(target_engine)
    return team_table

In [7]:
# Connection string for the database that holds one table per team
db_string = "postgresql://User_1:postgres@192.168.1.246:5432/nhl_team_data"

# Engine used by create_team_table() (as a notebook-level name) and by to_sql() below
engine = create_engine(db_string)

# One group of game rows per team
grouped = df.groupby('Team Name')

# Build the rolling statistics and (re)create the table for every team
for team, group_df in grouped:
    # Order the games by date first, then append the rolling averages
    group_df = calculate_n_days_rolling_stats(group_df.sort_values(by='Game Date'), [35,40,50])

    # Create a table for the team
    team_table = create_team_table(team, group_df)

    # Write the rows, replacing the table if it already exists
    group_df.to_sql(
        f"team_{team.replace(' ', '_').lower()}",
        engine,
        if_exists='replace',
        index=False,
    )
    print(f"Table created and data inserted for {team}")

Table created and data inserted for Anaheim Ducks
Table created and data inserted for Arizona Coyotes
Table created and data inserted for Boston Bruins
Table created and data inserted for Buffalo Sabres
Table created and data inserted for Calgary Flames
Table created and data inserted for Carolina Hurricanes
Table created and data inserted for Chicago Blackhawks
Table created and data inserted for Colorado Avalanche
Table created and data inserted for Columbus Blue Jackets
Table created and data inserted for Dallas Stars
Table created and data inserted for Detroit Red Wings
Table created and data inserted for Edmonton Oilers
Table created and data inserted for Florida Panthers
Table created and data inserted for Los Angeles Kings
Table created and data inserted for Minnesota Wild
Table created and data inserted for Montreal Canadiens
Table created and data inserted for Nashville Predators
Table created and data inserted for New Jersey Devils
Table created and data inserted for New York

### Merge Rolling Teams Tables with Original Data to Add the Rolling Sums and Averages to the Data

In [8]:
def merge_rolling_data(df):
    """Build the model frame of home-minus-away rolling-statistic differences.

    Every table of the ``nhl_team_data`` database is read and concatenated, and
    its columns whose name contains "Rolling" are joined onto the home-team rows
    and the away-team rows of ``df`` (on 'Team Name' and 'Game Date'). The two
    sides are then joined per game and one ``Difference_<rolling column>`` column
    (home value minus away value) is computed per rolling column.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleansed game rows containing 'Team Name', 'Game Date', 'Home Team',
        'Away Team' and 'Home Team W'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Game Date', 'Home Team', 'Away Team', 'Home Team W' followed by
        the numeric ``Difference_*`` columns; rows with a missing difference are
        removed; sorted by 'Game Date' then 'Home Team' with a fresh index.

    Raises
    ------
    ValueError
        If the team database contains no tables.
    """
    game_keys = ['Game Date', 'Home Team', 'Away Team', 'Home Team W']

    # Keep only the identifying columns, selected by name rather than by position
    games = df[['Team Name'] + game_keys]

    # Split the rows into the home side and the away side of each game
    home_teams = games[games['Team Name'] == games['Home Team']]
    away_teams = games[games['Team Name'] == games['Away Team']]

    # Define the PostgreSQL database connection string
    db_string = "postgresql://User_1:postgres@192.168.1.246:5432/nhl_team_data"

    # Read every table; dispose of the engine even when a query fails
    engine = create_engine(db_string)
    try:
        table_dfs = [
            pd.read_sql(f"SELECT * FROM {table_name};", con=engine)
            for table_name in inspect(engine).get_table_names()
        ]
    finally:
        engine.dispose()

    if not table_dfs:
        raise ValueError("No team tables found in the nhl_team_data database")

    # Concatenate all team tables and make 'Game Date' comparable with df's datetimes
    combined_team_df = pd.concat(table_dfs, ignore_index=True)
    combined_team_df['Game Date'] = pd.to_datetime(combined_team_df['Game Date'])

    # Keep the join keys plus every rolling-statistic column
    rolling_columns = [col for col in combined_team_df.columns if 'Rolling' in col]
    combined_team_df = combined_team_df[['Team Name', 'Game Date'] + rolling_columns]

    def _attach_rolling(side_teams, prefix):
        # Join the rolling stats onto one side's rows and prefix the stat columns
        side_df = pd.merge(combined_team_df, side_teams, on=['Team Name', 'Game Date'], how='inner')
        side_df = side_df.rename(columns={col: prefix + col for col in rolling_columns})
        side_df = side_df.drop(columns=["Team Name"])
        return side_df[game_keys + [col for col in side_df.columns if col not in game_keys]]

    home_merged_df = _attach_rolling(home_teams, 'Home_')
    away_merged_df = _attach_rolling(away_teams, 'Away_')

    # One row per game carrying both the home and the away rolling stats
    rolling_df = pd.merge(home_merged_df, away_merged_df, on=game_keys, how="inner")

    # Home value minus away value for every rolling statistic
    difference_df = pd.DataFrame(
        {
            'Difference_' + col: rolling_df['Home_' + col] - rolling_df['Away_' + col]
            for col in rolling_columns
        },
        index=rolling_df.index,
    )
    difference_columns = list(difference_df.columns)

    merged_df = pd.concat([rolling_df, difference_df], axis=1)

    # Remove games where any difference is missing (e.g. a team's first game)
    merged_df = merged_df[merged_df[difference_columns].notna().all(axis=1)]

    # Keep only the columns the model needs; copy so the assignment below does
    # not write into a slice of the wider frame
    merged_df = merged_df[game_keys + difference_columns].copy()

    # Ensure every difference column is numeric
    for column in difference_columns:
        merged_df[column] = pd.to_numeric(merged_df[column], errors='coerce')

    # Sort by 'Game Date' and then by 'Home Team'
    return merged_df.sort_values(by=['Game Date', 'Home Team']).reset_index(drop=True)

In [9]:
# Build the modelling frame of rolling-statistic differences from the cleansed data
merged_df = merge_rolling_data(df)

In [10]:
# Display the merged frame
merged_df

Unnamed: 0,Game Date,Home Team,Away Team,Home Team W,Difference_P%_NDays_Rolling_Avg_35,Difference_Corsi%_NDays_Rolling_Avg_35,Difference_P%_NDays_Rolling_Avg_40,Difference_Corsi%_NDays_Rolling_Avg_40,Difference_P%_NDays_Rolling_Avg_50,Difference_Corsi%_NDays_Rolling_Avg_50
0,2018-10-05,Columbus Blue Jackets,Carolina Hurricanes,0,0.500000,-3.595275,0.5000,-3.595275,0.50,-3.595275
1,2018-10-06,Arizona Coyotes,Anaheim Ducks,0,-1.000000,23.295455,-1.0000,23.295455,-1.00,23.295455
2,2018-10-06,Buffalo Sabres,New York Rangers,1,0.000000,2.998501,0.0000,2.998501,0.00,2.998501
3,2018-10-06,Calgary Flames,Vancouver Canucks,1,-1.000000,20.689655,-1.0000,20.689655,-1.00,20.689655
4,2018-10-06,Colorado Avalanche,Philadelphia Flyers,1,0.000000,15.573770,0.0000,15.573770,0.00,15.573770
...,...,...,...,...,...,...,...,...,...,...
6300,2023-06-03,Vegas Golden Knights,Florida Panthers,1,0.042857,-3.678704,0.0625,-5.188466,0.09,-4.093859
6301,2023-06-05,Vegas Golden Knights,Florida Panthers,1,0.071429,-3.132973,0.0875,-5.018701,0.11,-3.894222
6302,2023-06-08,Florida Panthers,Vegas Golden Knights,1,-0.100000,2.776998,-0.1125,4.652316,-0.13,4.204792
6303,2023-06-10,Florida Panthers,Vegas Golden Knights,0,-0.071429,2.098666,-0.0625,3.822915,-0.09,4.136202


In [11]:
# Define the database connection string (master database)
db_string = "postgresql://User_1:postgres@192.168.1.246:5432/nhl_master_data"

# Create SQLAlchemy engine
# NOTE: this rebinds the notebook-level `engine`, which earlier pointed at nhl_team_data
engine = create_engine(db_string)

# Save the merged DataFrame as table "original_model_data", replacing any existing table
merged_df.to_sql("original_model_data", engine, index=False, if_exists="replace")

305

### Create Machine Learning Model Using Original Data

In [12]:
# Features: selected rolling-average difference columns; target: the home-win flag
feature_columns = [
    'Difference_Corsi%_NDays_Rolling_Avg_35',
    'Difference_Corsi%_NDays_Rolling_Avg_40',
    'Difference_Corsi%_NDays_Rolling_Avg_50',
    'Difference_P%_NDays_Rolling_Avg_50',
]
X = merged_df[feature_columns]
y = merged_df['Home Team W']

# Hold out 20% of the games for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear-kernel SVM classifier trained on the scaled training data
model = svm.SVC(kernel='linear', random_state=42)
model.fit(X_train_scaled, y_train)

# Predicted labels for the held-out games
y_pred = model.predict(X_test_scaled)

In [13]:
# Share of held-out games whose outcome was predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion matrix of true labels against predicted labels
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", cm, sep="\n")

Accuracy: 0.5963521015067407
Confusion Matrix:
[[281 309]
 [200 471]]


## Update Model with 2023-2024 Season Data

### Gather Up-To-Date Data

In [14]:
# Column names of the scraped report, in table order, starting at the second cell of each row
_NHL_TEAM_STATS_FIELDS = [
    "Team Name", "Game Date", "GP", "W", "L", "T", "OT", "P", "P%", "RW", "ROW",
    "SO_win", "GF", "GA", "GF/GP", "GA/GP", "PP%", "PK%", "Net PP%", "Net PK%",
    "Shots/GP", "SA/GP", "FOW%",
]


def _nhl_team_stats_url(season, gameType, page):
    """Build the NHL.com per-game team report URL for one result page."""
    return f"https://www.nhl.com/stats/teams?aggregate=0&reportType=game&seasonFrom={season}&seasonTo={season}&dateFromSeason&gameType={gameType}&sort=a_gameDate&page={page}&pageSize=100"


def _load_rendered_page(driver, url):
    """Open url, scroll to the bottom, wait for content to load and return the HTML."""
    driver.get(url)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait for a brief moment to allow content to load
    time.sleep(5)
    return driver.page_source


def _parse_team_stat_rows(html_content, fields):
    """Turn every complete 'rt-tr-group' row of the page into a {field: text} dict."""
    soup = BeautifulSoup(html_content, 'html.parser')
    records = []
    for row in soup.find_all('div', class_='rt-tr-group'):
        cells = [cell.text.strip() for cell in row.find_all('div', class_='rt-td')]
        # The first cell is not stored; the named fields start at the second cell
        values = cells[1:1 + len(fields)]
        # Skip rows that are too short or carry no team name
        # (presumably blank padding rows on the last page - TODO confirm)
        if len(values) < len(fields) or not values[0]:
            continue
        records.append(dict(zip(fields, values)))
    return records


def find_season_data(season_year, gameType="2"):
    """Scrape the NHL.com per-game team statistics for one season.

    Parameters
    ----------
    season_year : str or int
        Season identifier used in the URL, e.g. "20232024".
    gameType : str or int, optional
        "2" for Regular Season (default), "3" for Playoffs.

    Returns
    -------
    pandas.DataFrame
        One row per scraped table row, every value as text. The 'OT' column is
        present for regular-season data and for seasons "20192020"/"20202021";
        other playoff data has no 'OT' column. Any other game type yields an
        empty frame.
    """
    season = str(season_year)
    gameType = str(gameType)

    # Decide the column layout once: playoff reports (outside the two listed
    # seasons) have no OT column, which shifts every later cell left by one
    if gameType == "2" or season in ["20192020", "20202021"]:
        fields = _NHL_TEAM_STATS_FIELDS
    elif gameType == "3":
        fields = [name for name in _NHL_TEAM_STATS_FIELDS if name != "OT"]
    else:
        fields = None

    # Set up Chrome WebDriver with headless option
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)

    data = []  # One dict per scraped row
    try:
        first_page_html = _load_rendered_page(driver, _nhl_team_stats_url(season, gameType, 0))

        # The pager's max="N" attribute gives the number of result pages
        max_value_match = re.search(r'max="(\d+)"', first_page_html)
        if max_value_match:
            num_pages = int(max_value_match.group(1))
        else:
            # Without a pager, fall back to the page that was already loaded
            print("Max value not found in the HTML; assuming a single page.")
            num_pages = 1

        for page in range(num_pages):
            # Page 0 is already loaded; fetch only the remaining pages
            if page == 0:
                html_content = first_page_html
            else:
                html_content = _load_rendered_page(driver, _nhl_team_stats_url(season, gameType, page))

            if fields is not None:
                data.extend(_parse_team_stat_rows(html_content, fields))
    finally:
        # Close the browser even if loading or parsing failed
        driver.quit()

    # Convert the list of dictionaries into a DataFrame
    return pd.DataFrame(data)

In [15]:
# Scraped frames keyed by "<season> Regular Season" / "<season> Playoff Season"
season_data_dict = {}

# Scrape every requested season for both game types ("2" regular season, "3" playoffs)
for year in ["20232024"]:
    for game_Type in ["2", "3"]:
        if game_Type == '2':
            key = f"{year} Regular Season"
        else:
            key = f"{year} Playoff Season"
        season_data_dict[key] = find_season_data(year, game_Type)

In [16]:
# Helper that tags a scraped frame with its season and game type
def add_season_and_type(df, season_year, gameType):
    """Add 'Season' and 'Type' columns to df in place and return the same frame.

    'Type' is 'Regular Season' when gameType is "2", otherwise 'Playoff'.
    """
    if gameType == "2":
        type_label = 'Regular Season'
    else:
        type_label = 'Playoff'

    df['Season'] = season_year
    df['Type'] = type_label
    return df

# Initialize an empty list to store DataFrames
dfs = []

# Loop through the scraped frames. The loop variable is deliberately not called
# `df`, so the cleansed `df` built earlier in the notebook is not overwritten.
for key, season_df in season_data_dict.items():
    # A game type with no rows scraped yields a frame without columns;
    # skip it instead of failing on the missing 'P' column below
    if season_df.empty:
        continue

    # Frames without an "OT" column get one, filled with "N/A", placed before 'P'
    if "OT" not in season_df.columns:
        season_df.insert(season_df.columns.get_loc('P'), 'OT', 'N/A')

    # Extract season year and game type from the key
    season_year, gameType = key.split()[0], "2" if "Regular" in key else "3"

    # Add season and type columns and collect the frame
    dfs.append(add_season_and_type(season_df, season_year, gameType))

# Concatenate all DataFrames into one
combined_df = pd.concat(dfs, ignore_index=True)

In [17]:
# Read the clock once so "today" and "yesterday" cannot straddle midnight
today_date = datetime.now()
today_date_str = today_date.strftime("%Y/%m/%d")

# Get yesterday's date
yesterday_date = today_date - timedelta(days=1)
yesterday_date_str = yesterday_date.strftime("%Y/%m/%d")

# Remove Games from Yesterday and Today from Update Data (We will stream this data instead)
# The date strings are matched literally (regex=False); rows without a date are kept (na=False).
# NOTE(review): assumes the scraped 'Game Date' text uses the YYYY/MM/DD form - confirm.
game_dates = combined_df['Game Date']
is_recent_game = (
    game_dates.str.contains(yesterday_date_str, regex=False, na=False)
    | game_dates.str.contains(today_date_str, regex=False, na=False)
)
combined_df = combined_df[~is_recent_game]

In [18]:
# Define the database connection string (master database)
db_string = "postgresql://User_1:postgres@192.168.1.246:5432/nhl_master_data"

# Create SQLAlchemy engine (rebinds the notebook-level `engine`)
engine = create_engine(db_string)

# Save the combined DataFrame as table "update_raw_data", replacing any existing table
combined_df.to_sql("update_raw_data", engine, index=False, if_exists="replace")

716

In [19]:
# Save the combined frame to a CSV back-up file
# NOTE(review): absolute, machine-specific path - this cell fails on any other machine
directory_file_path = r'C:\Users\KLBue\OneDrive\Documents\MS Data Science\Capstone Project\Git Hub Submission\Back-Up Data\update_raw_data.csv'

# Write without the index column
combined_df.to_csv(directory_file_path, index=False)

print("DataFrame saved successfully as CSV files.")

DataFrame saved successfully as CSV files.


### Cleanse Up-To-Date Data

In [20]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np

In [21]:
# Load and cleanse the freshly scraped table "update_raw_data"
update_df = cleanse_nhl_raw_data("update_raw_data")

In [22]:
# Display the cleansed update frame
update_df

Unnamed: 0,Team Name,Game Date,GP,W,L,T,OT,P,P%,RW,...,FOW%,Season,Type,Save %,Shooting %,Corsi%,Against Team,Home Team,Away Team,Home Team W
0,Pittsburgh Penguins,2023-10-10,1,0,1,--,0,0,0.0,0,...,67.8,20232024,Regular Season,0.888889,0.048780,53.246753,Chicago Blackhawks,Pittsburgh Penguins,Chicago Blackhawks,0
1,Tampa Bay Lightning,2023-10-10,1,1,0,--,0,2,1.0,1,...,56.7,20232024,Regular Season,0.903226,0.147059,52.307692,Nashville Predators,Tampa Bay Lightning,Nashville Predators,1
2,Chicago Blackhawks,2023-10-10,1,1,0,--,0,2,1.0,1,...,32.2,20232024,Regular Season,0.951220,0.111111,46.753247,Pittsburgh Penguins,Pittsburgh Penguins,Chicago Blackhawks,0
3,Nashville Predators,2023-10-10,1,0,1,--,0,0,0.0,0,...,43.3,20232024,Regular Season,0.852941,0.096774,47.692308,Tampa Bay Lightning,Tampa Bay Lightning,Nashville Predators,1
4,Vegas Golden Knights,2023-10-10,1,1,0,--,0,2,1.0,1,...,59.2,20232024,Regular Season,0.969697,0.142857,45.901639,Seattle Kraken,Vegas Golden Knights,Seattle Kraken,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2711,Carolina Hurricanes,2024-05-05,1,0,1,--,,0,0.0,0,...,53.1,20232024,Playoff,0.826087,0.120000,52.083333,New York Rangers,New York Rangers,Carolina Hurricanes,1
2712,Dallas Stars,2024-05-05,1,1,0,--,,2,1.0,1,...,49.1,20232024,Playoff,0.956522,0.083333,51.063830,Vegas Golden Knights,Dallas Stars,Vegas Golden Knights,1
2713,Vegas Golden Knights,2024-05-05,1,0,1,--,,0,0.0,0,...,50.9,20232024,Playoff,0.916667,0.043478,48.936170,Dallas Stars,Dallas Stars,Vegas Golden Knights,1
2714,Boston Bruins,2024-05-06,1,1,0,--,,2,1.0,1,...,41.3,20232024,Playoff,0.974359,0.172414,42.647059,Florida Panthers,Florida Panthers,Boston Bruins,0


In [23]:
# Drop rows that are exact duplicates across all columns
update_df = update_df.drop_duplicates()

### Update Teams' Tables

In [24]:
import pandas as pd
from sqlalchemy import create_engine

def _drop_repeated_games(games):
    """Keep one row per 'Game Date', preferring the row that was appended last.

    Re-running the update would otherwise store the same game twice, and the
    rolling statistics would then be computed over duplicated rows.

    Parameters
    ----------
    games : pandas.DataFrame
        Rows for a single team (stored rows followed by incoming rows).

    Returns
    -------
    pandas.DataFrame
        `games` unchanged when it has no 'Game Date' column; otherwise a frame
        with a fresh 0..n-1 index in which each non-missing game date occurs once.
    """
    if 'Game Date' not in games.columns:
        return games

    def _day(value):
        # Strings are compared exactly as stored; date-like objects are compared
        # by the "YYYY-MM-DD" prefix of their text form, so a timestamp read back
        # from the database matches a plain date for the same day.
        return value if isinstance(value, str) else str(value)[:10]

    day_key = games['Game Date'].map(_day)
    # Rows without a date are never treated as repeats of one another.
    repeated = games['Game Date'].notna() & day_key.duplicated(keep='last')
    return games.loc[~repeated].reset_index(drop=True)


def update_teams_tables(grouped, engine):
    """Update Teams' Tables in PostgreSQL with New Data and Rolling Averages/Sums

    For every team the stored table is read, the new rows are appended, games
    that are already stored are collapsed to a single row (the incoming row
    wins), rolling statistics are recomputed, and the table is replaced.

    Parameters
    ----------
    grouped : iterable of (team, DataFrame) pairs
        For example ``df.groupby('Team Name')``. ``team`` is used to build the
        table name ``team_<name lowercased, spaces replaced by underscores>``.
    engine : SQLAlchemy engine or connection
        Passed to ``pandas.read_sql`` and ``DataFrame.to_sql``.

    Returns
    -------
    None
        Progress is printed once per team.
    """
    for team, group_df in grouped:
        # Build the table name once so the read and the write cannot drift apart
        table_name = f"team_{team.replace(' ', '_').lower()}"

        # Query the existing data from the team's table
        existing_data = pd.read_sql(f"SELECT * FROM {table_name}", engine)

        # Merge existing data with new data; drop games that were already stored
        merged_data = pd.concat([existing_data, group_df], ignore_index=True)
        merged_data = _drop_repeated_games(merged_data)

        # Calculate rolling stats for the merged data (35, 40 and 50 are the
        # window sizes handed to calculate_n_days_rolling_stats)
        merged_data = calculate_n_days_rolling_stats(merged_data, [35, 40, 50])

        # Update the team's table with the merged and updated data
        merged_data.to_sql(table_name, engine, if_exists='replace', index=False)
        print(f"Table updated for {team}")

In [25]:
# Split the cleansed update rows into one sub-frame per team.
grouped = update_df.groupby(by='Team Name')

# Open an engine on the database that holds the per-team tables.
# NOTE(review): the credentials are embedded in this URL; consider reading it
# from the environment instead.
db_string = "postgresql://User_1:postgres@192.168.1.246:5432/nhl_team_data"
engine = create_engine(db_string)

# Append each team's new games to its table and recompute the rolling stats.
update_teams_tables(grouped=grouped, engine=engine)

Table updated for Anaheim Ducks
Table updated for Arizona Coyotes
Table updated for Boston Bruins
Table updated for Buffalo Sabres
Table updated for Calgary Flames
Table updated for Carolina Hurricanes
Table updated for Chicago Blackhawks
Table updated for Colorado Avalanche
Table updated for Columbus Blue Jackets
Table updated for Dallas Stars
Table updated for Detroit Red Wings
Table updated for Edmonton Oilers
Table updated for Florida Panthers
Table updated for Los Angeles Kings
Table updated for Minnesota Wild
Table updated for Montreal Canadiens
Table updated for Nashville Predators
Table updated for New Jersey Devils
Table updated for New York Islanders
Table updated for New York Rangers
Table updated for Ottawa Senators
Table updated for Philadelphia Flyers
Table updated for Pittsburgh Penguins
Table updated for San Jose Sharks
Table updated for Seattle Kraken
Table updated for St Louis Blues
Table updated for Tampa Bay Lightning
Table updated for Toronto Maple Leafs
Table upda

### Create Merged Updated Data

In [26]:
# Build the model-input table from the de-duplicated update rows; the result is
# later stored as "update_model_data".
update_merged_df = merge_rolling_data(update_df)

In [27]:
# Display the merged frame to check its columns before it is written to the database.
update_merged_df

Unnamed: 0,Game Date,Home Team,Away Team,Home Team W,Difference_P%_NDays_Rolling_Avg_35,Difference_Corsi%_NDays_Rolling_Avg_35,Difference_P%_NDays_Rolling_Avg_40,Difference_Corsi%_NDays_Rolling_Avg_40,Difference_P%_NDays_Rolling_Avg_50,Difference_Corsi%_NDays_Rolling_Avg_50
0,2023-10-10,Pittsburgh Penguins,Chicago Blackhawks,0,0.142857,8.143969,0.1500,8.641812,0.10,7.453362
1,2023-10-10,Tampa Bay Lightning,Nashville Predators,1,-0.142857,4.305100,-0.1375,4.291642,-0.06,3.524612
2,2023-10-10,Vegas Golden Knights,Seattle Kraken,1,0.185714,-5.201885,0.2125,-5.616687,0.20,-5.620983
3,2023-10-11,Boston Bruins,Chicago Blackhawks,1,0.385714,5.609144,0.3625,6.454469,0.34,6.656505
4,2023-10-11,Calgary Flames,Winnipeg Jets,1,0.114286,6.979842,0.1250,7.148473,0.09,7.227845
...,...,...,...,...,...,...,...,...,...,...
1353,2024-05-03,Vegas Golden Knights,Dallas Stars,1,-0.214286,-4.843570,-0.1875,-5.399066,-0.17,-4.403001
1354,2024-05-04,Boston Bruins,Toronto Maple Leafs,1,-0.014286,-3.855497,-0.0500,-4.223606,0.02,-4.103734
1355,2024-05-05,Dallas Stars,Vegas Golden Knights,1,0.157143,4.766114,0.1625,4.354830,0.15,4.130628
1356,2024-05-05,New York Rangers,Carolina Hurricanes,1,0.014286,-5.694638,0.0250,-5.375052,-0.04,-4.611114


In [28]:
# Connect to the master database that stores the model-ready tables.
db_string = "postgresql://User_1:postgres@192.168.1.246:5432/nhl_master_data"
engine = create_engine(db_string)

# Persist the merged update features, overwriting any earlier "update_model_data"
# table. Kept as the cell's final expression so the value returned by to_sql is
# still displayed.
update_merged_df.to_sql(
    name="update_model_data",
    con=engine,
    if_exists="replace",
    index=False,
)

358

### Update Model with Up-To-Date Data

In [29]:
# Define the database connection string
db_string = "postgresql://User_1:postgres@192.168.1.246:5432/nhl_master_data"

# Create SQLAlchemy engine
engine = create_engine(db_string)

# SQL queries selecting all rows of original_model_data and update_model_data
orignal_query = "SELECT * FROM original_model_data"
update_query = "SELECT * FROM update_model_data"

# Load both tables into DataFrames. The context manager closes the connection
# even when one of the reads raises, so no connection is left checked out.
with engine.connect() as connection:
    original_model_df = pd.read_sql(orignal_query, connection)
    update_model_df = pd.read_sql(update_query, connection)

# Select x and y columns
x_columns = ['Difference_Corsi%_NDays_Rolling_Avg_35', 'Difference_Corsi%_NDays_Rolling_Avg_40', 'Difference_Corsi%_NDays_Rolling_Avg_50', 'Difference_P%_NDays_Rolling_Avg_50']
y_column = ['Home Team W']

# Features and target from the original data
X_original = original_model_df[x_columns]
y_original = original_model_df[y_column]

# Select features and target variable from Update Data
X_update = update_model_df[x_columns]
y_update = update_model_df[y_column]

# Concatenate the new data with the existing data. Both feature frames were
# selected with x_columns, so the result already has exactly those columns.
combined_X = pd.concat([X_original, X_update])
# y_column is a one-element list, so the concat yields a one-column DataFrame;
# take that column as a Series, which is the 1-D target the classifier expects.
combined_y = pd.concat([y_original, y_update])[y_column[0]]

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(combined_X, combined_y, test_size=0.2, random_state=42)

# Initialize SVM classifier with a linear kernel
model = svm.SVC(kernel='linear', random_state=42)

# Feature scaling: fit on the training split only, then apply the same
# transformation to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
model.fit(X_train_scaled, y_train)

# Predict labels for test set
y_pred = model.predict(X_test_scaled)

In [30]:
# Fraction of test-set games whose label was predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy!s}")

# Confusion matrix of the true test labels (y_test) against the predictions (y_pred)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", cm, sep="\n")

Accuracy: 0.5968688845401174
Confusion Matrix:
[[334 388]
 [230 581]]
