In [1]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd

def scrape_acha_dynamic_stats():
    firefox_options = Options()
    firefox_options.add_argument("--headless")  

    geckodriver_path = r"C:\Program Files\GeckoDriver\geckodriver.exe"
    service = Service(executable_path=geckodriver_path)

    with webdriver.Firefox(service=service, options=firefox_options) as driver:
        url = "https://www.achahockey.org/stats/goalie-stats/all-teams/46?qualified=all&conference=9&division=-1&playertype=goalie&rookie=no&sort=wins&statstype=standard&page=1&league=1"
        driver.get(url)

        all_data = []
        
        while True:
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "table")))
            
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            
            table = soup.find('table')
            if table:
                rows = table.find_all('tr')
                for i, row in enumerate(rows[1:], start=1):  
                    cols = row.find_all('td')
                    cols = [ele.text.strip() for i, ele in enumerate(cols) if i != 0 and i != 2 and i != len(cols) - 1]
                    all_data.append(cols)

            try:
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//a[contains(text(), 'Next')]"))
                )
                next_button.click()
                time.sleep(2)
            except:
                break

    return all_data

acha_dynamic_stats = scrape_acha_dynamic_stats()

column_names = ['Jersey#', 'Name', 'Pos', 'Division', 'Team', 'GP', 'W', 'OTL', 'SOL', 'T', 'GA', 'GAA', 'SVS', 'SV%', 'SO', 'Min', 'SOW']

df = pd.DataFrame(acha_dynamic_stats, columns=column_names)

# Skip every other row, keeping only odd-indexed rows (1, 3, 5, etc.)
df = df.iloc[::2]
df = df[df['GP'].notna()]

numeric_columns = ['Jersey#', 'GP', 'W', 'OTL', 'SOL', 'T', 'GA', 'GAA', 'SVS', 'SV%', 'SO', 'Min', 'SOW']
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')  

string_columns = ['Name', 'Pos', 'Division', 'Team']
for column in string_columns:
    df[column] = df[column].astype(str)
    


print(df)

print("\nDataFrame Info:")
print(df.info())

df.to_csv('acha_md2_goalies.csv', index=False)

      Jersey#                  Name                Pos  \
0        80.0          Aidan Comeau            MD2 PAC   
2        54.0        Jorgen Johnson          MD2 MWCHL   
4        74.0        Austin Bartell     MD2 NCHL Black   
6        30.0        Jesse Reinhard           MD2 TCHC   
8         1.0       Dean Saccomanno  MD2 NECHA Patriot   
...       ...                   ...                ...   
1176     33.0           Easton Kent          MD2 MWCHL   
1178     35.0  Mitchell Norkevicius          MD2 TSCHL   
1180     35.0            Brock Burm  MD2 WCCHA Midwest   
1182     30.0     Kaleb Bartholomew           MD2 BMHC   
1184     22.0         Julian Arenal          MD2 MWCHL   

                                   Division Team  GP  W  OTL  SOL  T  GA  \
0     MD2 University of California-Berkeley   15  14  0    0    0  0  14   
2              MD2 Montana State University   17  14  1    0    2  0  27   
4     MD2 Concordia University of Wisconsin   18  12  5    1    0  0  48   