In [32]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd

def _extract_player_rows(page_source):
    """Return the player rows found in the first ``<table>`` of *page_source*.

    The first ``<tr>`` is skipped and then every second row is kept
    (rows 1, 3, 5, ... of the table). For each kept row the text of every
    ``<td>`` is stripped, and the cells at index 0, index 2 and the last
    index are dropped. A row without ``<td>`` cells yields an empty list.

    Returns an empty list when the page contains no ``<table>``.
    """
    soup = BeautifulSoup(page_source, 'html.parser')
    table = soup.find('table')
    if table is None:
        return []

    player_rows = []
    # NOTE(review): the first row is presumably the header and the skipped
    # alternating rows presumably carry no player data -- confirm against
    # the live page markup.
    for row in table.find_all('tr')[1::2]:
        cells = row.find_all('td')
        dropped = {0, 2, len(cells) - 1}
        player_rows.append(
            [cell.text.strip() for idx, cell in enumerate(cells) if idx not in dropped]
        )
    return player_rows


def scrape_acha_dynamic_stats(url=None, geckodriver_path=None, timeout=10):
    """Scrape every page of an ACHA player-stats table with headless Firefox.

    The page is loaded in a headless Firefox session; after each page's table
    is present its rows are collected, then the "Next" link is clicked until
    it can no longer be found or clicked.

    Args:
        url: Stats page to start from. Defaults to the skater-stats URL this
            script was written for.
        geckodriver_path: Path to the geckodriver executable. Defaults to the
            Windows install location this script was written for.
        timeout: Seconds to wait for the table and for the "Next" link.

    Returns:
        A list with one list of cell strings per scraped table row.

    Raises:
        selenium.common.exceptions.TimeoutException: if no ``<table>``
            appears on a page within *timeout* seconds.
    """
    # Imported here so the function does not depend on an extra file-level import.
    from selenium.common.exceptions import WebDriverException

    if url is None:
        url = "https://www.achahockey.org/stats/player-stats/all-teams/46?conference=8&division=-1&playertype=skater&position=skaters&rookie=no&sort=points&statstype=standard&page=1&league=1"
    if geckodriver_path is None:
        geckodriver_path = r"C:\Program Files\GeckoDriver\geckodriver.exe"

    firefox_options = Options()
    firefox_options.add_argument("--headless")
    service = Service(executable_path=geckodriver_path)

    all_data = []
    with webdriver.Firefox(service=service, options=firefox_options) as driver:
        driver.get(url)

        while True:
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )
            all_data.extend(_extract_player_rows(driver.page_source))

            try:
                next_button = WebDriverWait(driver, timeout).until(
                    EC.element_to_be_clickable((By.XPATH, "//a[contains(text(), 'Next')]"))
                )
                next_button.click()
            except WebDriverException:
                # No clickable "Next" link (the wait timed out) or the click
                # failed: treat this as the last page. Only Selenium errors
                # end the loop, so Ctrl-C and unrelated bugs still surface.
                break

            # Fixed pause to let the next page's table render before re-reading it.
            time.sleep(2)

    return all_data

# Run the scraper; each element is one table row as a list of cell strings.
acha_dynamic_stats = scrape_acha_dynamic_stats()


# Names for the cells the scraper keeps, in the order they appear in each row.
column_names = ['PlayerNumber', 'Name', 'Pos', 'Division', 'Team', 'GP', 'G', 'A', 'PTS', 'Pt/G', 'PPG', 'SHG', 'GWG', 'SOGW', 'PIM', 'SH%']


df = pd.DataFrame(acha_dynamic_stats, columns=column_names)


# Convert the stat columns to numbers; 'coerce' turns non-numeric text into NaN.
numeric_columns = ['PlayerNumber', 'GP', 'G', 'A', 'PTS', 'Pt/G', 'PPG', 'SHG', 'GWG', 'SOGW', 'PIM', 'SH%']
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')


# Force the descriptive columns to plain strings.
string_columns = ['Name', 'Pos', 'Division', 'Team']
df[string_columns] = df[string_columns].astype(str)


# Drop rows whose GP is missing or did not parse as a number.
# NOTE(review): this does not remove rows with GP == 0; it presumably relies on
# the site leaving GP blank for players without games -- confirm.
df = df[df['GP'].notna()]


print(df)


print("\nDataFrame Info:")
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the report.
df.info()




#Option to Save the DataFrame to CSV
#df.to_csv('acha_d1_skaters.csv', index=False)

      PlayerNumber               Name Pos   Division  \
0             28.0     Dylan Tammadge   F   MD1 WHAC   
1              6.0          Wes Smith   F   MD1 WHAC   
2             71.0       Steven Ickes   F    MD1 GL6   
3              9.0       Bryce Garber   F  MD1 GLCHL   
4             11.0  Krzysztof Petryla   F    MD1 GL6   
...            ...                ...  ..        ...   
2127          17.0         Ian Rogers   D   MD1 WHAC   
2128          33.0     Nolan Woodring   G    MD1 MCH   
2129          39.0         Drew Adams   G   MD1 WHAC   
2130           1.0      Brandon Weare   G    MD1 MCH   
2131          20.0        Ethan Koval   D    MD1 MCH   

                                     Team    GP     G     A   PTS  Pt/G  PPG  \
0                   MD1 Cleary University  24.0  21.0  18.0  39.0  1.63  6.0   
1                   MD1 Cleary University  23.0  17.0  22.0  39.0  1.70  2.0   
2                   MD1 Calvin University  20.0   6.0  33.0  39.0  1.95  1.0   
3      