In [1]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd

def scrape_acha_dynamic_stats(url=None, geckodriver_path=None, wait_seconds=10):
    """Scrape every page of a paginated ACHA stats table with headless Firefox.

    The page is loaded, the first ``<table>`` in the rendered HTML is parsed,
    and the "Next" link is clicked until it can no longer be found or clicked.

    Args:
        url: Page to scrape. Defaults to the goalie-stats listing this
            notebook was written for.
        geckodriver_path: Path to the geckodriver executable. Defaults to the
            Windows install location used by the original notebook.
        wait_seconds: Maximum seconds to wait for the table and for the
            "Next" link on each page.

    Returns:
        A list with one entry per ``<tr>`` after the first row of each page.
        Each entry is a list of the stripped text of that row's ``<td>``
        cells, excluding the cells at index 0, index 2 and the last index.
        Rows that contain no ``<td>`` cells yield an empty list; they are
        kept so that row positions stay as callers expect.

    Raises:
        selenium.common.exceptions.TimeoutException: If no ``<table>``
            element appears within ``wait_seconds`` on any page.
    """
    # Local import: keeps this function self-contained without touching the
    # file-level import block.
    from selenium.common.exceptions import WebDriverException

    if url is None:
        url = "https://www.achahockey.org/stats/goalie-stats/all-teams/45?qualified=all&conference=12&playertype=goalie&rookie=no&sort=wins&statstype=standard&page=1&league=1&division=-1"
    if geckodriver_path is None:
        geckodriver_path = r"C:\Program Files\GeckoDriver\geckodriver.exe"

    firefox_options = Options()
    firefox_options.add_argument("--headless")
    service = Service(executable_path=geckodriver_path)

    all_data = []

    with webdriver.Firefox(service=service, options=firefox_options) as driver:
        driver.get(url)

        while True:
            WebDriverWait(driver, wait_seconds).until(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table')
            if table:
                # The first <tr> is skipped (treated as the header row).
                for row in table.find_all('tr')[1:]:
                    cells = row.find_all('td')
                    last_index = len(cells) - 1
                    all_data.append([
                        cell.text.strip()
                        for index, cell in enumerate(cells)
                        if index not in (0, 2, last_index)
                    ])

            try:
                next_button = WebDriverWait(driver, wait_seconds).until(
                    EC.element_to_be_clickable((By.XPATH, "//a[contains(text(), 'Next')]"))
                )
                next_button.click()
            except WebDriverException:
                # No clickable "Next" link (timeout) or the click failed:
                # treat this as the last page. Non-Selenium errors such as
                # KeyboardInterrupt are deliberately allowed to propagate.
                break

            # Fixed pause to give the page time to load the next result set.
            time.sleep(2)

    return all_data

# Scrape all pages, then build, clean and export the goalie table.
acha_dynamic_stats = scrape_acha_dynamic_stats()

# NOTE(review): in the captured output of this cell, 'Pos' holds values such
# as "WD2 DVCHC" and 'Team' holds small integers, which suggests these labels
# are shifted relative to the scraped columns. Confirm against the site's
# table header before relying on the column names.
column_names = ['Jersey#', 'Name', 'Pos', 'Division', 'Team', 'GP', 'W', 'OTL', 'SOL', 'T', 'GA', 'GAA', 'SVS', 'SV%', 'SO', 'Min', 'SOW']

df = pd.DataFrame(acha_dynamic_stats, columns=column_names)

# Keep every other row starting with the first (positions 0, 2, 4, ...),
# then drop any remaining rows that have no 'GP' value.
df = df.iloc[::2]
# .copy() gives an independent frame so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
df = df[df['GP'].notna()].copy()

# Values that cannot be parsed as numbers become NaN (errors='coerce').
numeric_columns = ['Jersey#', 'GP', 'W', 'OTL', 'SOL', 'T', 'GA', 'GAA', 'SVS', 'SV%', 'SO', 'Min', 'SOW']
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

string_columns = ['Name', 'Pos', 'Division', 'Team']
for column in string_columns:
    df[column] = df[column].astype(str)

print(df)

print("\nDataFrame Info:")
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly rather than wrapped in print().
df.info()

df.to_csv('acha_wd2_goalies.csv', index=False)

     Jersey#               Name              Pos  \
0       30.0       Abby Hodsden        WD2 DVCHC   
2       30.0      Allison Moore  WD2 Independent   
4       30.0        Rachel Hart        WD2 CCWHA   
6       30.0      Payton Murray  WD2 Independent   
8        8.0       Nadia Balduf        WD2 DVCHC   
..       ...                ...              ...   
292      1.0          Amy Scali        WD2 IWCHL   
294      1.0      Bailey Arnold  WD2 Independent   
296      1.0  Molly Murgittroyd        WD2 CCWHA   
298      3.0      Hayley Minter        WD2 ACCHL   
300      1.0    Veronica Booker        WD2 CCWHA   

                                  Division Team  GP  W  OTL  SOL  T  GA  \
0                   WD2 Liberty University   12  11  1    0    0  0   8   
2                WD2 Iowa State University   12   8  4    0    0  0  28   
4    WD2 Lawrence Technological University   14   7  4    1    0  2  39   
6                  WD2 Assiniboine College    8   7  1    0    0  0   5   
