In [3]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd

def scrape_acha_dynamic_stats():
    """Scrape the ACHA standings table, following "Next" pagination links.

    Launches headless Firefox through the geckodriver executable at the
    hard-coded Windows path below, loads the standings URL, and on each page
    parses the first ``<table>`` in the rendered HTML with BeautifulSoup.

    Returns:
        list[list[str]]: One entry per ``<tr>`` after the first row of each
        page's table. Each entry holds the stripped text of that row's
        ``<td>`` cells; a row without ``<td>`` cells yields an empty list.

    Raises:
        selenium.common.exceptions.TimeoutException: If no ``<table>``
            element appears within 10 seconds of a page load.
    """
    # Imported locally so this function carries its own dependency.
    from selenium.common.exceptions import WebDriverException

    firefox_options = Options()
    firefox_options.add_argument("--headless")

    geckodriver_path = r"C:\Program Files\GeckoDriver\geckodriver.exe"
    service = Service(executable_path=geckodriver_path)

    # The context manager quits the browser even if scraping raises.
    with webdriver.Firefox(service=service, options=firefox_options) as driver:
        url = "https://www.achahockey.org/stats/standings?season=45&conference=11&division=-1&standingstype=division&context=overall&specialteams=false&sortkey=points&league=1"
        driver.get(url)

        all_data = []

        while True:
            # The table is rendered dynamically; wait until it exists in the DOM.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            table = soup.find('table')
            if table:
                rows = table.find_all('tr')
                for row in rows[1:]:  # Skip header row
                    cells = [cell.text.strip() for cell in row.find_all('td')]
                    all_data.append(cells)

            try:
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//a[contains(text(), 'Next')]"))
                )
                next_button.click()
            except WebDriverException:
                # No clickable "Next" link (wait timed out) or the click
                # failed: treat this as the last page. Only Selenium errors
                # are caught, so Ctrl-C and genuine programming errors still
                # propagate instead of being silently swallowed.
                break

            # Fixed pause after a successful click, presumably to give the
            # next page time to render before it is read.
            time.sleep(2)

    return all_data

acha_dynamic_stats = scrape_acha_dynamic_stats()

# Names for the cells that remain once the first two scraped cells of every
# row are discarded; the first remaining cell is labelled 'Team'.
column_names = ['Team', 'GP', 'W', 'L', 'OTL', 'T', 'SOW', 'SOL', 'PTS', 'OTW', 'GF', 'GA', 'DIFF', 'PCT', 'PCT2', 'PIM', 'RW', 'ROW', 'STK', 'IN-DIV', 'GPCT', 'P10', 'Nickname']

# Remove first two columns from all_data before creating DataFrame
data_without_first_two = [row[2:] for row in acha_dynamic_stats]

df = pd.DataFrame(data_without_first_two, columns=column_names)

# Remember which rows really have a team *before* astype(str) below turns
# missing values into text. Comparing against the string 'None' afterwards
# would miss NaN (which becomes 'nan') and would wrongly drop a team whose
# text is literally "None".
has_team = df['Team'].notna()

# Convert numeric columns where possible; unparseable cells become NaN.
numeric_columns = ['GP', 'W', 'L', 'OTL', 'T', 'SOW', 'SOL', 'PTS', 'OTW', 'GF', 'GA', 'DIFF', 'PCT', 'RW', 'ROW', 'GPCT']
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Ensure string columns are treated as strings
string_columns = ['Team', 'STK', 'IN-DIV', 'P10', 'PCT2', 'PIM']
for column in string_columns:
    df[column] = df[column].astype(str)

# Drop placeholder rows that carried no Team cell.
df = df[has_team]

print(df)

print("\nDataFrame Info:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line.
df.info()

df.to_csv('acha_wd1_team.csv', index=False)

     Team    GP     W     L  OTL    T  SOW  SOL   PTS  OTW  ...    PCT   PCT2  \
0      AC  22.0  17.0   3.0  0.0  2.0  0.0  0.0  36.0  3.0  ...  0.818  0.000   
1    ITW1  18.0  13.0   3.0  1.0  1.0  0.0  0.0  28.0  3.0  ...  0.778  0.000   
2   MichS  12.0   9.0   2.0  1.0  0.0  0.0  0.0  19.0  1.0  ...  0.792  0.000   
3    Mich  14.0   8.0   3.0  0.0  3.0  0.0  0.0  19.0  1.0  ...  0.679  0.000   
4   MichD  17.0   8.0   7.0  2.0  0.0  0.0  0.0  18.0  0.0  ...  0.529  0.000   
5   Miami  15.0   7.0   5.0  2.0  1.0  0.0  0.0  17.0  0.0  ...  0.567  0.000   
6    LSSU  16.0   8.0   7.0  1.0  0.0  0.0  0.0  17.0  2.0  ...  0.531  0.000   
7     PSU  10.0   5.0   4.0  1.0  0.0  0.0  0.0  11.0  0.0  ...  0.550  0.000   
8    CUAA  12.0   3.0   9.0  0.0  0.0  0.0  0.0   6.0  0.0  ...  0.250  0.000   
9    GVSU  15.0   1.0  13.0  1.0  0.0  0.0  0.0   3.0  0.0  ...  0.100  0.000   
10     DU  12.0   0.0  11.0  0.0  1.0  0.0  0.0   1.0  0.0  ...  0.042  0.000   
11     AQ   8.0   0.0   8.0 