## Create nhl_master_data Database in PostgreSQL

In [None]:
import psycopg2
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd
from bs4 import BeautifulSoup
import os
from sqlalchemy import create_engine

In [None]:
# Connect to the PostgreSQL server through its default "postgres" database;
# the target database does not exist yet, so it cannot be connected to directly.
conn = psycopg2.connect(
    dbname="postgres",       # Connect to the default PostgreSQL database
    user="User_1",           # Replace with your username
    password="postgres",     # Replace with your password
    host="ip_address",       # Replace with IP Address
    port="5432"              # Replace with your port
)

try:
    # CREATE DATABASE cannot run inside a transaction block, so every
    # statement has to be committed on its own.
    conn.autocommit = True

    # The cursor is closed automatically when the with-block exits
    with conn.cursor() as cur:
        # Only create the database when it is missing, so this cell can be
        # re-run without failing on an already existing database.
        cur.execute(
            "SELECT 1 FROM pg_database WHERE datname = %s",
            ("nhl_master_data",),
        )
        if cur.fetchone() is None:
            cur.execute("CREATE DATABASE nhl_master_data")
finally:
    # Always release the connection, even when a statement above fails
    conn.close()

## Create Original Raw Data Table in PostgreSQL

In [None]:
def _fetch_stats_page(driver, season, gameType, page):
    """Load one page of the NHL per-game team report and return its rendered HTML."""
    url = f"https://www.nhl.com/stats/teams?aggregate=0&reportType=game&seasonFrom={season}&seasonTo={season}&dateFromSeason&gameType={gameType}&sort=a_gameDate&page={page}&pageSize=100"

    # Open the webpage
    driver.get(url)

    # Scroll to the bottom of the page
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait for a brief moment to allow content to load
    time.sleep(5)

    # Retrieve the HTML content after the page has had time to load
    return driver.page_source


def _parse_stat_rows(html_content, fields):
    """Turn the table rows found in html_content into dicts keyed by fields."""
    soup = BeautifulSoup(html_content, 'html.parser')

    parsed_rows = []
    for row in soup.find_all('div', class_='rt-tr-group'):
        columns = row.find_all('div', class_='rt-td')

        # Cell 0 is not used and cells 1..len(fields) hold the values, so a row
        # needs more cells than there are fields; shorter rows are skipped
        # instead of raising IndexError.
        if len(columns) <= len(fields):
            continue

        # zip stops after the last field, so extra trailing cells are ignored
        parsed_rows.append(
            {name: cell.text.strip() for name, cell in zip(fields, columns[1:])}
        )

    return parsed_rows


def find_season_data(season_year, gameType="2"):
    """Scrape per-game NHL team stats for one season and return them as a DataFrame.

    Args:
        season_year: Season identifier as used in the nhl.com URL,
            e.g. "20182019".
        gameType: "2" for Regular Season, "3" for Playoffs.

    Returns:
        A DataFrame with one row per scraped table row. All values are the
        stripped cell texts (strings). Regular-season data, and any data for
        seasons "20192020" and "20202021", include an "OT" column; other
        playoff data do not. Any other gameType yields an empty DataFrame.

    Raises:
        RuntimeError: If the number of result pages cannot be found in the
            first page's HTML.
    """
    # Column labels in the order their cells appear, starting at cell index 1
    leading_fields = ["Team Name", "Game Date", "GP", "W", "L", "T"]
    trailing_fields = [
        "P", "P%", "RW", "ROW", "SO_win", "GF", "GA", "GF/GP", "GA/GP",
        "PP%", "PK%", "Net PP%", "Net PK%", "Shots/GP", "SA/GP", "FOW%",
    ]

    if gameType == "2" or season_year in ["20192020", "20202021"]:
        fields = leading_fields + ["OT"] + trailing_fields
    elif gameType == "3":
        # This layout has no OT cell, so every later column sits one cell earlier
        fields = leading_fields + trailing_fields
    else:
        # No column layout is defined for other game types, so no rows would
        # be recorded; return the empty result without opening a browser.
        return pd.DataFrame([])

    # Set up Chrome WebDriver with headless option
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)

    data = []  # One dict per scraped table row

    try:
        first_page_html = _fetch_stats_page(driver, season_year, gameType, 0)

        # The page count is read from the first max="<digits>" attribute in the
        # rendered HTML (presumably the page-number input; verify against the site).
        max_value_match = re.search(r'max="(\d+)"', first_page_html)
        if max_value_match is None:
            raise RuntimeError(
                f"Max value not found in the HTML for season {season_year}, "
                f"gameType {gameType}."
            )
        num_pages = int(max_value_match.group(1))

        # Loop through each page; page 0 is already loaded, so reuse its HTML
        for page in range(num_pages):
            if page == 0:
                html_content = first_page_html
            else:
                html_content = _fetch_stats_page(driver, season_year, gameType, page)

            data.extend(_parse_stat_rows(html_content, fields))
    finally:
        # Close the browser even when loading or parsing fails
        driver.quit()

    # Convert the list of dictionaries into a DataFrame
    return pd.DataFrame(data)

# Scraped DataFrames, keyed by "<season> <Regular Season|Playoff Season>"
season_data_dict = {}

# Seasons to scrape, and the key label used for each game type
seasons = ["20182019", "20192020", "20202021", "20212022", "20222023"]
game_type_labels = {"2": "Regular Season", "3": "Playoff Season"}

# Scrape every season for both game types and file each result under its key
for year in seasons:
    for game_Type, label in game_type_labels.items():
        season_data_dict[f"{year} {label}"] = find_season_data(year, game_Type)



In [None]:
# Tag a DataFrame with the season it belongs to and its game type
def add_season_and_type(df, season_year, gameType):
    """Add 'Season' and 'Type' columns to df in place and return the same df.

    'Type' is 'Regular Season' when gameType is "2" and 'Playoff' for any
    other value.
    """
    if gameType == "2":
        type_label = 'Regular Season'
    else:
        type_label = 'Playoff'

    df['Season'] = season_year
    df['Type'] = type_label
    return df

# Per-season DataFrames that will be stacked into one table
dfs = []

# Loop through the dataframes in the dictionary
for key, df in season_data_dict.items():
    # A scrape that produced no rows yields a DataFrame without any columns,
    # so there is no 'P' column to position 'OT' against; leave it out.
    if df.empty:
        print(f"No data scraped for {key}; skipping.")
        continue

    # Frames scraped without an "OT" column get one, filled with "N/A" and
    # placed directly before "P", so all frames share the same column order.
    if "OT" not in df.columns:
        df.insert(df.columns.get_loc('P'), 'OT', 'N/A')

    # Recover season year and game type from the "<season> <label>" key
    season_year, gameType = key.split()[0], "2" if "Regular" in key else "3"

    # Add season and type columns to the DataFrame (modifies df in place)
    df = add_season_and_type(df, season_year, gameType)

    # Append the DataFrame to the list
    dfs.append(df)

# Concatenate all DataFrames into one
combined_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Define the database connection string
# (replace ip_address with the server's IP address, and the credentials with your own)
db_string = "postgresql://User_1:postgres@ip_address:5432/nhl_master_data"

# Create SQLAlchemy engine
engine = create_engine(db_string)

try:
    # Save the combined DataFrame as a table in PostgreSQL database;
    # if_exists="replace" drops and recreates the table when it already exists
    combined_df.to_sql("original_raw_data", engine, index=False, if_exists="replace")
finally:
    # Release the engine's pooled connections whether or not the write succeeded
    engine.dispose()