In [1]:
import seaborn as sns
from scipy.stats import pearsonr, ttest_ind
import pandas as pd
from selenium import webdriver  
from selenium.webdriver.chrome.service import Service  
from selenium.webdriver.common.by import By  
from selenium.webdriver.chrome.options import Options  
from webdriver_manager.chrome import ChromeDriverManager  
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
import time  
import random  
import matplotlib.pyplot as plt
from selenium.common.exceptions import ElementClickInterceptedException
import matplotlib.ticker as mtick
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from scipy.stats import f_oneway
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error



In [None]:
# Chrome configuration; webdriver_manager downloads/locates the ChromeDriver binary
chrome_options = Options()
service = Service(ChromeDriverManager().install())

# Browser session shared by the scraping code further down
browser = webdriver.Chrome(service=service, options=chrome_options)

# NHL team stats report, one row per team-season
# (query string requests seasons 20112012 through 20232024, 50 rows per page)
url = 'https://www.nhl.com/stats/teams?aggregate=0&reportType=season&seasonFrom=20112012&seasonTo=20232024&gameType=2&sort=a_seasonId&page=0&pageSize=50'
browser.get(url)

# Use the full screen so the table and pagination controls are visible
browser.maximize_window()

# Pause for a random 3-7 seconds before the first scrape
time.sleep(random.uniform(3, 7))

# One independent accumulator list per scraped column
(team, season, games_played, wins, losses,
 overtime_losses, points, point_perc, gf, ga) = ([] for _ in range(10))


def scrape_page():
    """Append the stats of every table row on the current page to the module-level lists.

    Reads the rows of the ``rt-tbody`` table through the module-level ``browser``
    and appends the text of selected cells to ``team``, ``season``,
    ``games_played``, ``wins``, ``losses``, ``overtime_losses``, ``points``,
    ``point_perc``, ``gf`` and ``ga``.

    A row that lacks any of the required cells is skipped as a whole: all cell
    texts are read before anything is appended, so the accumulator lists always
    stay the same length (a partially appended row would make the later
    ``pd.DataFrame`` construction fail or misalign).
    """
    # (destination list, zero-based cell index) for each value taken from a row
    targets = (
        (team, 1),              # Second column: Team
        (season, 2),            # Third column: Season
        (games_played, 3),      # Fourth column: GP
        (wins, 4),              # Fifth column: Wins
        (losses, 5),            # Sixth column: Losses
        (overtime_losses, 7),   # Eighth column: OT Losses
        (points, 8),            # Ninth column: Points
        (point_perc, 9),        # Tenth column: Point Percentage
        (gf, 13),               # Fourteenth column: Goals For
        (ga, 14),               # Fifteenth column: Goals Against
    )

    rows = browser.find_elements(By.XPATH, '//tbody[@class="rt-tbody"]/tr')
    for i, row in enumerate(rows):
        cols = row.find_elements(By.XPATH, './/td')
        try:
            # Read every required cell first; nothing is stored until all succeed
            values = [cols[index].text for _, index in targets]
        except IndexError:
            print(f"Row {i} does not have enough columns. Skipping.")
            continue

        for (destination, _), value in zip(targets, values):
            destination.append(value)

# Scrape page after page until the Next button can no longer be clicked.
# The whole loop is wrapped in try/finally so the browser (and its driver
# process) is closed even when scraping aborts with an unexpected error.
try:
    while True:
        scrape_page()  # Scrape the current page
        try:
            # Wait up to 10 seconds for the Next button to become clickable
            next_button = WebDriverWait(browser, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="season-tabpanel"]/span/nav/button[2]'))
            )

            # Bring the button into the middle of the viewport before clicking
            browser.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)

            try:
                next_button.click()
            except ElementClickInterceptedException:
                # Something overlaps the button: scroll further and click via JavaScript
                print("ElementClickInterceptedException: Retrying click after scrolling")
                time.sleep(1)  # Wait
                browser.execute_script("window.scrollBy(0, 300);")  # Scroll down
                browser.execute_script("arguments[0].click();", next_button)

            # Wait for the next page to load after EVERY successful click, not
            # only after the JavaScript fallback; otherwise the next iteration
            # would read the table before it has been replaced.
            time.sleep(random.uniform(3, 7))
        except TimeoutException:
            print("Timeout waiting for Next")
            break
        except NoSuchElementException:
            print("No more pages to scrape")
            break
finally:
    browser.quit()  # Close the browser after scraping
# Assemble the scraped column lists into one table
data = {
    'Team': team,
    'Season': season,
    'Games Played': games_played,
    'Wins': wins,
    'Losses': losses,
    'Overtime Losses': overtime_losses,
    'Points': points,
    'Point Percentage': point_perc,
    'Goals For': gf,
    'Goals Against': ga
}
stats = pd.DataFrame(data)

# Every column except the two labels is cast to float64 in a single call
columns_to_convert = [name for name in data if name not in ('Team', 'Season')]
stats = stats.astype(dict.fromkeys(columns_to_convert, 'float64'))

# Season stays textual; keep only the part before the first hyphen
stats['Season'] = stats['Season'].astype(str).str.split('-').str[0]

# Show the first rows as a sanity check, then write the table to disk
print(stats.head())

stats.to_csv('nhl_stats.csv', index=False)