In [1]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd

def scrape_acha_dynamic_stats(geckodriver_path=None):
    """Scrape every page of the ACHA skater-stats table using headless Firefox.

    The stats page is loaded, the first ``<table>`` on the page is parsed,
    and the "Next" link is followed until it can no longer be found or
    clicked.

    Args:
        geckodriver_path: Optional path to the geckodriver executable.
            When omitted, Selenium's default driver lookup is used.

    Returns:
        A list with one entry per table row after the first row, in page
        order. Each entry is a list of stripped cell texts with the first,
        third and last ``<td>`` cells dropped. Rows that contain no ``<td>``
        cells contribute an empty list, so row positions are preserved for
        the caller.

    Raises:
        selenium.common.exceptions.TimeoutException: If no ``<table>``
            appears within 10 seconds of a page load.
    """
    # Imported locally so the file-level import block stays untouched.
    from selenium.common.exceptions import WebDriverException

    firefox_options = Options()
    firefox_options.add_argument("--headless")

    # Example path: r"C:\Program Files\GeckoDriver\geckodriver.exe"
    if geckodriver_path:
        service = Service(executable_path=geckodriver_path)
    else:
        service = Service()

    url = "https://www.achahockey.org/stats/player-stats/all-teams/46?conference=9&division=-1&playertype=skater&position=skaters&rookie=no&sort=points&statstype=standard&page=1&league=1"
    all_data = []

    with webdriver.Firefox(service=service, options=firefox_options) as driver:
        driver.get(url)

        while True:
            # The table is rendered dynamically; wait until it is in the DOM.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table')
            if table is not None:
                # Skip the first row of the table.
                for row in table.find_all('tr')[1:]:
                    cells = row.find_all('td')
                    last_index = len(cells) - 1
                    # Drop the first, third and last cells of every row.
                    all_data.append([
                        cell.text.strip()
                        for index, cell in enumerate(cells)
                        if index not in (0, 2, last_index)
                    ])

            try:
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//a[contains(text(), 'Next')]"))
                )
                next_button.click()
                # Give the next page time to start rendering.
                time.sleep(2)
            except WebDriverException:
                # Timeout or click failure: treat as the end of pagination
                # and return what has been collected so far.
                break

    return all_data

# Run the scraper; each entry holds the cell texts of one table row.
acha_dynamic_stats = scrape_acha_dynamic_stats()

column_names = ['Jersey#', 'Name', 'Pos', 'Division', 'Team', 'GP', 'G', 'A', 'PTS', 'Pt/G', 'PPG', 'SHG', 'GWG', 'SOGW', 'PIM', 'SH%']

df = pd.DataFrame(acha_dynamic_stats, columns=column_names)

# Keep every other row starting with the first (positions 0, 2, 4, ...),
# then drop rows that have no GP value. The explicit copy makes the result
# an independent frame, so the column assignments below do not trigger
# SettingWithCopyWarning.
df = df.iloc[::2]
df = df[df['GP'].notna()].copy()

# Reset the index to re-index the rows
df = df.reset_index(drop=True)

# Jersey#, SHG and PIM: empty values stay NA; everything else becomes a
# nullable integer (Int64).
for col in ['Jersey#', 'SHG', 'PIM']:
    df[col] = df[col].apply(lambda x: int(float(x)) if x else pd.NA).astype('Int64')

# Remaining stat columns: anything that does not parse as a number becomes NaN.
numeric_columns = ['GP', 'G', 'A', 'PTS', 'Pt/G', 'PPG', 'GWG', 'SOGW', 'SH%']
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

string_columns = ['Name', 'Pos', 'Division', 'Team']
for column in string_columns:
    df[column] = df[column].astype(str)

print(df)

print("\nDataFrame Info:")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print().
df.info()

df.to_csv('acha_md2_skaters.csv', index=False)

# Null values in the output mirror cells that are blank on the website.

      Jersey#             Name Pos           Division  \
0          12    Gavin Peduzzi   F  MD2 ACCHL Premier   
1           9    Jacob Pierson   F    MD2 Independent   
2          48     Adam Bricker   F  MD2 WCCHA Midwest   
3          41   Kameron Khazai   F  MD2 ACCHL Premier   
4          91      Tripp Miley   F           MD2 TCHC   
...       ...              ...  ..                ...   
5016       13  Scott Naukatsik   D  MD2 WCCHA Midwest   
5017       30   Jesse Reinhard   G           MD2 TCHC   
5018       98       Ej Halleck   D     MD2 NCHL Black   
5019       28     Evan Neitzel   F     MD2 NCHL Black   
5020       21      Robby White   F     MD2 NCHL Black   

                                       Team  GP   G   A  PTS  Pt/G  PPG  SHG  \
0                         MD2 Virginia Tech  19  19  38   57  3.00    4    0   
1         MD2 North Dakota State University  19  24  25   49  2.58    8    0   
2                     MD2 Bethel University  16  19  27   46  2.88    2    