In [2]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd

def scrape_acha_dynamic_stats(
    url="https://www.achahockey.org/stats/standings?season=46&conference=10&division=-1&standingstype=division&context=overall&specialteams=false&sortkey=points&league=1",
    *,
    geckodriver_path=r"C:\Program Files\GeckoDriver\geckodriver.exe",
    timeout=10,
    page_delay=2,
):
    """Scrape every page of a paginated HTML table with headless Firefox.

    The page at ``url`` is loaded, the first ``<table>`` in the rendered
    page source is parsed, and a link whose text contains ``Next`` is
    clicked repeatedly until it can no longer be found or clicked.

    Args:
        url: Address of the page to load.
        geckodriver_path: Filesystem path of the geckodriver executable.
        timeout: Seconds to wait for the table, and for the "Next" link to
            become clickable, on each page.
        page_delay: Seconds to sleep after clicking "Next" before the next
            page is read.

    Returns:
        A list with one entry per ``<tr>`` after the first row of the table
        on each visited page. Each entry is the list of stripped ``<td>``
        texts of that row; a row without ``<td>`` cells yields an empty list.

    Raises:
        selenium.common.exceptions.TimeoutException: If no ``<table>``
            element appears within ``timeout`` seconds on a page.
    """
    # Imported locally so this function does not rely on a file-level import
    # that the rest of the file does not already have.
    from selenium.common.exceptions import WebDriverException

    firefox_options = Options()
    firefox_options.add_argument("--headless")
    service = Service(executable_path=geckodriver_path)

    all_data = []

    with webdriver.Firefox(service=service, options=firefox_options) as driver:
        driver.get(url)

        while True:
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table')
            if table:
                # The first <tr> is skipped as the header row.
                for row in table.find_all('tr')[1:]:
                    all_data.append(
                        [cell.text.strip() for cell in row.find_all('td')]
                    )

            try:
                next_button = WebDriverWait(driver, timeout).until(
                    EC.element_to_be_clickable(
                        (By.XPATH, "//a[contains(text(), 'Next')]")
                    )
                )
                next_button.click()
            except WebDriverException:
                # Covers the wait timeout as well as click failures: either
                # way there is no further page to move to. Unrelated errors
                # (KeyboardInterrupt, programming mistakes) now propagate
                # instead of being silently treated as "last page".
                break

            # Fixed pause to give the next page time to render before it is read.
            time.sleep(page_delay)

    return all_data

acha_dynamic_stats = scrape_acha_dynamic_stats()

# Labels for the cells that remain once the first two cells of each scraped
# row are dropped; the first remaining cell is labelled 'Team'.
column_names = ['Team', 'GP', 'W', 'L', 'OTL', 'T', 'SOW', 'SOL', 'PTS', 'OTW', 'GF', 'GA', 'DIFF', 'PCT', 'PCT2', 'PIM', 'RW', 'ROW', 'STK', 'IN-DIV', 'GPCT', 'P10', 'Nickname']

# Drop the first two cells of every row. Rows with two or fewer cells (for
# example rows without any <td>) have no team value, so they are skipped here
# rather than being stringified to 'None' and filtered out afterwards.
data_without_first_two = [row[2:] for row in acha_dynamic_stats if len(row) > 2]

df = pd.DataFrame(data_without_first_two, columns=column_names)

# Convert numeric columns where possible; unparseable values become NaN.
numeric_columns = ['GP', 'W', 'L', 'OTL', 'T', 'SOW', 'SOL', 'PTS', 'OTW', 'GF', 'GA', 'DIFF', 'PCT', 'RW', 'ROW', 'GPCT']
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Ensure string columns are treated as strings
string_columns = ['Team', 'STK', 'IN-DIV', 'P10', 'PCT2', 'PIM']
for column in string_columns:
    df[column] = df[column].astype(str)

print(df)

print("\nDataFrame Info:")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()

df.to_csv('acha_md3_team.csv', index=False)

      Team    GP    W     L  OTL    T  SOW  SOL   PTS  OTW  ...    PCT   PCT2  \
0    M3WCU  10.0  4.0   6.0  0.0  0.0  0.0  0.0   8.0  0.0  ...  0.400  0.000   
1     YCPA  13.0  4.0   9.0  0.0  0.0  0.0  0.0   8.0  0.0  ...  0.308  0.000   
2    GMUM3   6.0  1.0   5.0  0.0  0.0  0.0  0.0   2.0  0.0  ...  0.167  0.000   
3     STVU   6.0  1.0   5.0  0.0  0.0  0.0  0.0   2.0  0.0  ...  0.167  0.000   
4    MILLU   0.0  0.0   0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...  0.000  0.000   
..     ...   ...  ...   ...  ...  ...  ...  ...   ...  ...  ...    ...    ...   
121    UCD   9.0  5.0   4.0  0.0  0.0  0.0  0.0  10.0  0.0  ...  0.556  0.000   
122   M3PU  15.0  2.0  11.0  1.0  0.0  0.0  1.0   6.0  0.0  ...  0.200  0.000   
123  M3USD   9.0  1.0   8.0  0.0  0.0  0.0  0.0   2.0  0.0  ...  0.111  0.000   
124   UCSC   5.0  0.0   5.0  0.0  0.0  0.0  0.0   0.0  0.0  ...  0.000  0.000   
125   CALU  10.0  0.0  10.0  0.0  0.0  0.0  0.0   0.0  0.0  ...  0.000  0.000   

     PIM   RW  ROW       ST