In [1]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd

def scrape_acha_dynamic_stats(geckodriver_path=None):
    """Scrape every page of the ACHA goalie-stats table with headless Firefox.

    The stats page is loaded in a headless Firefox session. On each page the
    first ``<table>`` is parsed and, for every row after the header row, the
    stripped text of its ``<td>`` cells is collected. The "Next" link is then
    clicked until it can no longer be found or clicked.

    Args:
        geckodriver_path: Optional path to the geckodriver executable, for
            example r"C:\\Program Files\\GeckoDriver\\geckodriver.exe". When
            omitted, Selenium is left to locate the driver itself (PATH lookup
            or Selenium Manager, depending on the installed Selenium version).

    Returns:
        A list with one list of strings per table row, in page order. The
        first, third and last cell of every row are left out. Rows without
        ``<td>`` cells yield an empty list; they are kept on purpose so that
        row positions stay exactly as they appear on the page.

    Raises:
        selenium.common.exceptions.TimeoutException: If no ``<table>`` element
            shows up within 10 seconds of a page load.
    """
    # Function-scope import so the fix does not depend on the file's import block.
    from selenium.common.exceptions import WebDriverException

    firefox_options = Options()
    firefox_options.add_argument("--headless")

    if geckodriver_path:
        service = Service(executable_path=geckodriver_path)
    else:
        service = Service()

    url = "https://www.achahockey.org/stats/goalie-stats/all-teams/46?qualified=all&conference=8&division=-1&playertype=goalie&rookie=no&sort=wins&statstype=standard&page=1&league=1"

    all_data = []

    with webdriver.Firefox(service=service, options=firefox_options) as driver:
        driver.get(url)

        while True:
            # The table is rendered dynamically; wait until it is in the DOM.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table')
            if table is not None:
                # rows[0] is treated as the header row and skipped.
                for row in table.find_all('tr')[1:]:
                    cells = row.find_all('td')
                    # Positions 0, 2 and the last one are dropped; presumably
                    # they hold no stat value -- TODO confirm against the page.
                    skipped = {0, 2, len(cells) - 1}
                    all_data.append(
                        [
                            cell.text.strip()
                            for idx, cell in enumerate(cells)
                            if idx not in skipped
                        ]
                    )

            try:
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//a[contains(text(), 'Next')]"))
                )
                next_button.click()
            except WebDriverException:
                # No clickable "Next" link (timeout) or the click failed:
                # treat this as the last page. Unlike a bare ``except``, this
                # lets KeyboardInterrupt and programming errors propagate.
                break

            # Give the page time to swap in the next set of rows.
            time.sleep(2)

    return all_data

# Scrape the goalie table and load the raw rows into a DataFrame.
acha_dynamic_stats = scrape_acha_dynamic_stats()

column_names = ['Jersey#', 'Name', 'Division', 'Team', 'GP', 'W', 'L', 'OTL', 'SOL', 'T', 'GA', 'GAA', 'SVS', 'SV%', 'SO', 'Min', 'SOW']

df = pd.DataFrame(acha_dynamic_stats, columns=column_names)

# Keep every other row, starting with the first (positions 0, 2, 4, ...),
# then drop rows that have no GP value.
df = df.iloc[::2]
# .copy() makes the filtered frame independent, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
df = df[df['GP'].notna()].copy()

# Jersey numbers become integers; empty, missing or non-numeric values
# become 0 instead of raising.
df['Jersey#'] = pd.to_numeric(df['Jersey#'], errors='coerce').fillna(0).astype(int)

# Reset the index to re-index the rows
df = df.reset_index(drop=True)

# Values that cannot be parsed as numbers become NaN (errors='coerce').
numeric_columns = ['Jersey#', 'GP', 'W', 'L', 'OTL', 'SOL', 'T', 'GA', 'GAA', 'SVS', 'SV%', 'SO', 'Min', 'SOW']
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

string_columns = ['Name', 'Division', 'Team']
for column in string_columns:
    df[column] = df[column].astype(str)

print(df)

print("\nDataFrame Info:")
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit a stray "None" line).
df.info()

df.to_csv('acha_md1_goalies.csv', index=False)

     Jersey#            Name         Division  \
0          1   Brandon Weare          MD1 MCH   
1         40   Mason Matthew        MD1 GLCHL   
2         31       Adam Yost          MD1 GL6   
3         39  Peyton Trzaska          MD1 GL6   
4         30  Jozef Kuchaslo  MD1 Independent   
..       ...             ...              ...   
237       33   Matthew Loney          MD1 MCH   
238       33   Brady Vickery         MD1 CHMA   
239       35   Sean Knudtson         MD1 ECHA   
240       29  Connor Maloney         MD1 WHAC   
241       50    Jacob Zomick        MD1 GLCHL   

                                   Team  GP   W  L  OTL  SOL  T  GA    GAA  \
0           MD1 University of Jamestown  21  16  5    0    0  0  43   2.05   
1              MD1 Northwood University  16  14  1    0    0  0  21   1.35   
2                 MD1 Calvin University  14  12  2    0    0  0  21   1.53   
3                    MD1 Adrian College  15  12  1    1    1  0  26   1.71   
4                  MD