In [1]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd

# Standings page scraped by default (same URL the original hard-coded).
_ACHA_STANDINGS_URL = (
    "https://www.achahockey.org/stats/standings?season=46&conference=9&division=-1&standingstype=division&context=overall&specialteams=false&sortkey=points&league=1"
)


def scrape_acha_dynamic_stats(geckodriver_path=None, url=_ACHA_STANDINGS_URL, timeout=10):
    """Scrape every page of the standings table with headless Firefox.

    For each page, the first ``<table>`` in the rendered HTML is parsed and
    every ``<tr>`` after the first one contributes one entry: the list of its
    ``<td>`` texts, stripped of surrounding whitespace. Rows that contain no
    ``<td>`` cells therefore contribute an empty list.

    Args:
        geckodriver_path: Path to the geckodriver executable, for example
            r"C:\\Program Files\\GeckoDriver\\geckodriver.exe". When omitted
            (or empty), Selenium is left to locate the driver itself.
        url: Page to load; defaults to the ACHA standings URL.
        timeout: Seconds to wait for the table and for the "Next" link.

    Returns:
        A list of rows, each row being a list of cell strings.

    Raises:
        selenium.common.exceptions.TimeoutException: If no ``<table>`` shows
            up on a page within ``timeout`` seconds.
    """
    # Local import keeps this function self-contained with respect to the
    # file's import block; selenium is already a dependency of the file.
    from selenium.common.exceptions import WebDriverException

    firefox_options = Options()
    firefox_options.add_argument("--headless")

    # Only pass an explicit path when the caller supplied one.
    if geckodriver_path:
        service = Service(executable_path=geckodriver_path)
    else:
        service = Service()

    all_data = []

    with webdriver.Firefox(service=service, options=firefox_options) as driver:
        driver.get(url)

        while True:
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table')
            if table:
                for row in table.find_all('tr')[1:]:  # Skip header row
                    all_data.append([cell.text.strip() for cell in row.find_all('td')])

            try:
                next_button = WebDriverWait(driver, timeout).until(
                    EC.element_to_be_clickable((By.XPATH, "//a[contains(text(), 'Next')]"))
                )
                next_button.click()
            except WebDriverException:
                # TimeoutException (no clickable "Next" link) is a subclass of
                # WebDriverException; any Selenium failure here ends paging,
                # while KeyboardInterrupt/SystemExit are no longer swallowed.
                # NOTE(review): presumably the link stops being clickable on
                # the last page - confirm against the live site.
                break

            # Give the page time to render the next batch before re-reading.
            time.sleep(2)

    return all_data

# --- Post-processing: turn the scraped rows into a typed DataFrame and save it ---


def _as_int(series):
    """Parse a column as numbers, replace unparsable values with 0, cast to int."""
    return pd.to_numeric(series, errors='coerce').fillna(0).astype(int)


def _as_float(series):
    """Parse a column as numbers, leaving unparsable values as NaN."""
    return pd.to_numeric(series, errors='coerce')


def _as_text(series):
    """Cast a column to ``str`` (missing values become the text 'None')."""
    return series.astype(str)


acha_dynamic_stats = scrape_acha_dynamic_stats()

column_names = [
    'Team', 'GP', 'W', 'L', 'OTL', 'T', 'SOW', 'SOL', 'PTS', 'OTW', 'GF', 'GA',
    'DIFF', 'PCT', 'PCT2', 'PIM', 'RW', 'ROW', 'STK', 'IN-DIV', 'GPCT', 'P10',
    'Nickname',
]

# Drop the first two scraped cells of every row; the remainder is matched
# positionally against column_names.
data_without_first_two = []
for scraped_row in acha_dynamic_stats:
    data_without_first_two.append(scraped_row[2:])

df = pd.DataFrame(data_without_first_two, columns=column_names)

int_columns = ['RW', 'ROW', 'GP', 'W', 'L', 'OTL', 'T', 'SOW', 'SOL', 'PTS', 'OTW', 'GF', 'GA', 'DIFF', 'PIM']
float_columns = ['PCT', 'GPCT', 'PCT2']
string_columns = ['Team', 'STK', 'IN-DIV', 'P10', 'Nickname']

# Each column group is converted one column at a time with its own converter.
for converter, group in (
    (_as_int, int_columns),
    (_as_float, float_columns),
    (_as_text, string_columns),
):
    for column in group:
        df[column] = converter(df[column])

# After the str cast a missing team reads 'None'; discard those rows, then
# drop any row whose Team is still null.
has_team = df['Team'] != 'None'
df = df.loc[has_team].dropna(subset=['Team'])

print(df)

print("\nDataFrame Info:")
# DataFrame.info() writes its report itself and returns None, so this also
# prints a trailing "None" line, exactly as before.
info_result = df.info()
print(info_result)

df.to_csv('acha_md2_team.csv', index=False)

      Team  GP   W   L  OTL  T  SOW  SOL  PTS  OTW  ...    PCT  PCT2  PIM  RW  \
0    M2HPU  14  12   2    0  0    0    0   24    0  ...  0.857   0.0  328  12   
1    M2GTU  15  12   3    0  0    1    0   24    0  ...  0.800   0.0  141  11   
2    M2SJU  18  11   6    1  0    0    0   23    0  ...  0.639   0.0  246  11   
3     DENU  20  10  10    0  0    1    0   20    0  ...  0.500   0.0  174   9   
4    M2WVU  19   8   9    1  0    2    1   18    1  ...  0.474   0.0  142   5   
..     ...  ..  ..  ..  ... ..  ...  ...  ...  ...  ...    ...   ...  ...  ..   
195  M2UST  21  14   6    1  0    0    0   29    1  ...  0.690   0.0  206  13   
196  M2WEC  20  13   6    0  0    1    1   27    0  ...  0.675   0.0  187  12   
197  M2UMD  14   4   9    1  0    0    0    9    0  ...  0.321   0.0  111   4   
198  M2MSM  10   3   5    1  0    1    1    8    0  ...  0.400   0.0  168   2   
199  M2GAC  12   2  10    0  0    0    0    4    0  ...  0.167   0.0  192   2   

     ROW      STK   IN-DIV 