In [3]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd

def scrape_acha_dynamic_stats():
    """Scrape every page of the ACHA goalie-stats table with headless Firefox.

    Loads the hard-coded stats URL, parses the first ``<table>`` of each page
    with BeautifulSoup, and clicks the "Next" link until it can no longer be
    located or clicked.

    Returns:
        list[list[str]]: One entry per table row after the first row of each
        page. Each entry holds the stripped text of the row's ``<td>`` cells,
        omitting the first, the third, and the last cell. Rows that contain
        no ``<td>`` cells contribute an empty list.

    Raises:
        selenium.common.exceptions.TimeoutException: If no ``<table>`` element
            is present within 10 seconds of a page load.
    """
    # Local import keeps this function self-contained; selenium is already a
    # dependency of this file.
    from selenium.common.exceptions import WebDriverException

    firefox_options = Options()
    firefox_options.add_argument("--headless")

    geckodriver_path = r"C:\Program Files\GeckoDriver\geckodriver.exe"
    service = Service(executable_path=geckodriver_path)

    all_data = []

    # The context manager quits the browser even if scraping raises.
    with webdriver.Firefox(service=service, options=firefox_options) as driver:
        url = "https://www.achahockey.org/stats/goalie-stats/all-teams/46?qualified=all&conference=10&division=-1&playertype=goalie&rookie=no&sort=wins&statstype=standard&page=1&league=1"
        driver.get(url)

        while True:
            # Wait for the table to be present before parsing the page.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table')
            if table is not None:
                # Skip the first row of the table.
                for row in table.find_all('tr')[1:]:
                    cells = row.find_all('td')
                    last = len(cells) - 1
                    # Drop the first, third and last cell of every row.
                    all_data.append([
                        cell.text.strip()
                        for idx, cell in enumerate(cells)
                        if idx not in (0, 2, last)
                    ])

            try:
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//a[contains(text(), 'Next')]"))
                )
                next_button.click()
                # Fixed pause to give the next page time to load.
                time.sleep(2)
            except WebDriverException:
                # No clickable "Next" link (timeout) or the click failed:
                # treat this as the last page. Non-Selenium exceptions such
                # as KeyboardInterrupt are deliberately not swallowed.
                break

    return all_data

# Run the scrape once; everything below works on the returned rows.
acha_dynamic_stats = scrape_acha_dynamic_stats()

column_names = ['Jersey#', 'Name', 'Division', 'Team', 'GP', 'W', 'L', 'OTL', 'SOL', 'T', 'GA', 'GAA', 'SVS', 'SV%', 'SO', 'Min', 'SOW']

df = pd.DataFrame(acha_dynamic_stats, columns=column_names)

# Keep every other row starting with the first (positions 0, 2, 4, ...),
# then drop rows that have no 'GP' value.
df = df.iloc[::2]
# .copy() makes the filtered result an independent frame, so the column
# assignments below cannot trigger SettingWithCopyWarning.
df = df[df['GP'].notna()].copy()

# Scraped cells are text; values that fail to parse as numbers become NaN.
numeric_columns = ['Jersey#', 'GP', 'W', 'L', 'OTL', 'SOL', 'T', 'GA', 'GAA', 'SVS', 'SV%', 'SO', 'Min', 'SOW']
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

string_columns = ['Name', 'Division', 'Team']
for column in string_columns:
    df[column] = df[column].astype(str)

print(df)

print("\nDataFrame Info:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line.
df.info()

df.to_csv('acha_md3_goalies.csv', index=False)

     Jersey#              Name                     Division  \
0       70.0  Michael Gilerman  MD3 NECHA Associate Members   
2        1.0    Lucas Chalfoun              MD3 Independent   
4        6.0     Connor Nugent        MD3 MACHA Silver East   
6       44.0      Matthew Page                      MD3 CHE   
8       30.0    Maclane Schick            MD3 MCHC(MI) West   
..       ...               ...                          ...   
646     52.0  Leonidas Assimes                     MD3 WCHC   
648     31.0     Aaron Johnson                      MD3 CHE   
650     77.0     Blaise Becker                     MD3 WCHC   
652      1.0     Cameron Griep            MD3 MCHC(MI) West   
654     31.0      Chris Stamos            MD3 MCHC(MI) West   

                                        Team  GP   W  L  OTL  SOL  T  GA  \
0            MD3 University of Massachusetts  14  11  3    0    0  0  31   
2               MD3 University of Notre Dame  10  10  0    0    0  0  23   
4             M