downloaded datasets from https://www.sports-reference.com/cbb/schools/connecticut/men/2025.html#all_roster with two csv's per team, one of roster for year (upperclassmen) and one of per game for starters, with team totals in same table having info on the field goals, free throws, etc aka common bbal stats

In [36]:
#getting from downloaded html file

import os
from bs4 import BeautifulSoup
import pandas as pd

# Directory containing the HTML files
html_folder = '../data/html_sportsreference'
output_folder = '../'  # Directory to save CSV files

# List of table div IDs to extract
table_div_ids = ['all_roster', 'all_advanced_players']

# Loop through each HTML file in the folder
for filename in os.listdir(html_folder):
    if filename.endswith(".html"):
        print(f"Processing file: {filename}")
        file_path = os.path.join(html_folder, filename)

        # Open and parse the HTML file
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')

        # Extract team name from filename (without .html)
        team_name = os.path.splitext(filename)[0]

        # Loop through each table ID
        for div_id in table_div_ids:
            print(f"Processing table with div ID: {div_id} in team {team_name}")
            div = soup.find('div', id=div_id)
            if div:
                table = div.find('table')  # Locate the table inside the div
                if table:
                    # Extract headers from <thead>
                    thead = table.find('thead')
                    headers = [header.text.strip() for header in thead.find_all('th')] if thead else []
                    print("Headers:", headers)

                    # Extract rows from <tbody>
                    tbody = table.find('tbody')
                    rows = []
                    if tbody:
                        for row in tbody.find_all('tr'):
                            cells = row.find_all('td')
                            row_data = [cell.text.strip() for cell in cells]
                            # Check if the row length matches headers
                            if len(row_data) == len(headers):
                                rows.append(row_data)
                            else:
                                #print(f"Row length mismatch in table '{div_id}':", row_data)
                                # Pad the row to match headers
                                row_data.extend([''] * (len(headers) - len(row_data)))
                                rows.append(row_data)

                    # Convert to DataFrame
                    if rows:
                        df = pd.DataFrame(rows, columns=headers)

                        # Save to CSV with team name and table ID
                        csv_filename = f"{team_name}_{div_id}.csv"
                        csv_path = os.path.join(output_folder, csv_filename)
                        df.to_csv(csv_path, index=False)
                        print(f"Table '{div_id}' for team '{team_name}' saved to {csv_path}")
                    else:
                        print(f"No valid rows found for table '{div_id}' in team '{team_name}'")
                else:
                    print(f"No table found with ID '{div_id}' for team '{team_name}'")
            else:
                print(f"No div found with ID '{div_id}' for team '{team_name}'")


Processing file: 2024-25 uconn.html
Processing table with div ID: all_roster in team 2024-25 uconn
Headers: ['Player', '#', 'Class', 'Pos', 'Height', 'Weight', 'Hometown', 'High School', 'RSCI Top 100', 'Summary']
Table 'all_roster' for team '2024-25 uconn' saved to ../2024-25 uconn_all_roster.csv
Processing table with div ID: all_advanced_players in team 2024-25 uconn
Headers: ['Rk', 'Player', 'G', 'GS', 'MP', 'PER', 'TS%', 'eFG%', '3PAr', 'FTr', 'PProd', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '', 'OWS', 'DWS', 'WS', 'WS/40', '', 'OBPM', 'DBPM', 'BPM']
Table 'all_advanced_players' for team '2024-25 uconn' saved to ../2024-25 uconn_all_advanced_players.csv


In [None]:

#same as above, but using direct url instead 

import requests
import time #request the server too much and error 429
from bs4 import BeautifulSoup
import pandas as pd
import os

# List of URLs to scrape
urls = [
    'https://www.sports-reference.com/cbb/schools/fairleigh-dickinson/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/texas-am-corpus-christi/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/texas-southern/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/miami-fl/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/louisiana-lafayette/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/texas-am/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/baylor/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/auburn/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/arkansas/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/arizona-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/arizona/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/alabama/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/duke/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/drake/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/creighton/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/connecticut/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/college-of-charleston/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/colgate/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/boise-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/illinois/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/howard/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/houston/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/grand-canyon/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/gonzaga/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/furman/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/florida-atlantic/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/kentucky/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/kent-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/kennesaw-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/kansas-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/kansas/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/iowa-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/iowa/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/iona/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/indiana/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/nevada/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/north-carolina-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/montana-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/missouri/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/mississippi-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/michigan-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/memphis/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/maryland/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/marquette/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/saint-marys-ca/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/purdue/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/providence/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/princeton/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/pittsburgh/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/penn-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/oral-roberts/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/northwestern/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/northern-kentucky/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/texas/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/tennessee/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/texas-christian/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/southern-california/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/southeast-missouri-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/san-diego-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/xavier/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/west-virginia/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/virginia-commonwealth/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/virginia/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/vermont/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/utah-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/north-carolina-asheville/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/ucla/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/california-santa-barbara/men/2023.html',

    'https://www.sports-reference.com/cbb/schools/clemson/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/brigham-young/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/boise-state/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/baylor/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/auburn/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/arizona/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/alabama/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/akron/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/drake/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/dayton/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/creighton/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/connecticut/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/colorado-state/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/colorado/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/college-of-charleston/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/colgate/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/illinois/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/howard/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/houston/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/grand-canyon/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/grambling/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/gonzaga/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/florida-atlantic/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/florida/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/duquesne/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/duke/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/new-mexico/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/nevada/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/nebraska/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/north-carolina-state/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/morehead-state/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/montana-state/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/mississippi-state/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/michigan-state/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/mcneese-state/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/marquette/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/longwood/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/long-beach-state/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/kentucky/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/kansas/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/james-madison/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/iowa-state/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/tennessee/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/texas-christian/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/stetson/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/south-dakota-state/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/south-carolina/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/san-diego-state/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/samford/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/saint-peters/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/saint-marys-ca/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/purdue/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/oregon/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/oakland/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/northwestern/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/north-carolina/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/yale/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/wisconsin/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/western-kentucky/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/washington-state/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/wagner/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/virginia/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/vermont/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/utah-state/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/alabama-birmingham/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/texas-tech/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/texas-am/men/2024.html',
    'https://www.sports-reference.com/cbb/schools/texas/men/2024.html',

    'https://www.sports-reference.com/cbb/schools/yale/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/purdue/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/bryant/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/auburn/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/wyoming/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/wright-state/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/wisconsin/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/virginia-tech/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/vermont/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/ucla/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/alabama-birmingham/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/texas-tech/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/texas-southern/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/texas-am-corpus-christi/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/texas/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/tennessee/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/texas-christian/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/southern-california/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/south-dakota-state/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/seton-hall/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/san-francisco/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/san-diego-state/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/saint-peters/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/saint-marys-ca/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/rutgers/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/richmond/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/providence/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/ohio-state/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/notre-dame/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/north-carolina/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/new-mexico-state/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/norfolk-state/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/murray-state/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/montana-state/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/michigan/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/michigan-state/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/miami-fl/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/memphis/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/marquette/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/loyola-il/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/louisiana-state/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/longwood/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/kentucky/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/kansas/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/jacksonville-state/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/iowa-state/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/iowa/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/indiana/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/illinois/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/houston/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/gonzaga/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/georgia-state/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/duke/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/delaware/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/davidson/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/creighton/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/connecticut/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/colorado-state/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/colgate/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/chattanooga/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/cal-state-fullerton/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/cal-state-fullerton/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/boise-state/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/baylor/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/arkansas/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/arizona/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/alabama/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/akron/men/2022.html',
    'https://www.sports-reference.com/cbb/schools/villanova/men/2022.html'
]


# Directory to save CSV files
output_folder = '../data/upperclassmen'

# List of table div IDs to extract
table_div_ids = ['all_roster']

#init res df with the upperclassmen and team stats
d = {'team' : [], 'year': [], 'upperclassmen': [], 'team_stats': []}
res = pd.DataFrame(data=d)
res.set_index('team', inplace=True)


# Loop through URLs
for url in urls:
    #print(f"Processing URL: {url}")
    response = requests.get(url)
    time.sleep(5)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract team name from the URL
        team_name = url.split('/')[-3]  # Extract team name from URL path
        print(f"Processing team name: {team_name}")
        team_name = team_name.strip().lower()

        if team_name == "alabama-birmingham": 
            team_name = "uab"
        if team_name == "louisiana-lafayette": 
            team_name = "louisiana"
        if team_name == "loyola-il":
            team_name = "loyola (il)"
        if team_name == "fairleigh-dickinson":
            team_name = "fdu"
        if team_name == "miami-fl":
            team_name = "miami (fl)"
        if team_name == "north-carolina-asheville":
            team_name = "unc asheville"
        if team_name == "north-carolina-state":
            team_name = "nc state"
        if team_name == "saint-marys-ca":
            team_name = "saint mary's (ca)"
        if team_name == "saint-peters":
            team_name = "saint peter's"
        if team_name == "texas-am":
            team_name = "texas a&m"
        if team_name == "texas-am-corpus-christi": 
            team_name = "texas a&m-corpus christi"
        if team_name == "texas-christian":
            team_name = "tcu"
        if team_name == "california-santa-barbara":
            team_name = "uc santa barbara"

        # Loop through each table ID
        for div_id in table_div_ids:
            #print(f"Processing table with div ID: {div_id} for team {team_name}")
            div = soup.find('div', id=div_id)
            if div:
                table = div.find('table')  # Locate the table inside the div
                if table:
                    # Extract headers from <thead>
                    thead = table.find('thead')
                    headers = [header.text.strip() for header in thead.find_all('th')] if thead else []
                    
                    # Extract rows from <tbody>
                    tbody = table.find('tbody')
                    rows = []
                    if tbody:
                        for row in tbody.find_all('tr'):
                            cells = row.find_all('td')
                            row_data = [cell.text.strip() for cell in cells]
                            # Check if the row length matches headers
                            if len(row_data) == len(headers):
                                rows.append(row_data)
                            else:
                                #print(f"Row length mismatch in table '{div_id}':", row_data)
                                # Pad the row to match headers
                                row_data.extend([''] * (len(headers) - len(row_data)))
                                rows.append(row_data)

                    # Convert to DataFrame
                    if rows:
                        df = pd.DataFrame(rows, columns=headers)

                        # Save to CSV with team name and table ID
                        csv_filename = f"{team_name}_{div_id}.csv"
                        csv_path = os.path.join(output_folder, csv_filename)
                        df.to_csv(csv_path, index=False)
                        print(f"Table '{div_id}' for team '{team_name}' saved to {csv_path}")
                    else:
                        print(f"No valid rows found for table '{div_id}' for team '{team_name}'")
                    
                    #getting out number of upperclassmen per team
                    
                    df = pd.read_csv(csv_path)
                
                    df['Points Scored'] = df['RSCI Top 100'].str.extract(r'(\d+)').dropna()
                    # Convert extracted numbers to numeric type and sort descending
                    df['Points Scored'] = pd.to_numeric(df['Points Scored'])
                    sorted_df = df.sort_values(by='Points Scored', ascending=False)
                    sorted_df.dropna(subset = "Points Scored", inplace=True)
                    sorted_df = sorted_df.head(7)
                    jr_count = sorted_df['#'].str.count('JR').sum()
                    sr_count = sorted_df['#'].str.count('SR').sum()
                    upperclassmen = jr_count + sr_count
                    print(upperclassmen)
                    year = url.split('/')[-1].strip('.html')
                    new = pd.DataFrame({'team': [team_name], 'upperclassmen': [upperclassmen], 'year': [year]})
                    res = pd.concat([res, new])
                    print(res)
                else:
                    print(f"No table found with ID '{div_id}' for team '{team_name}'")
                    
            else:
                print(f"No div found with ID '{div_id}' for team '{team_name}'")
    else:
        print(f"Failed to fetch URL: {url}, Status code: {response.status_code}")


Processing team name: fairleigh-dickinson
Table 'all_roster' for team 'fdu' saved to ../data/upperclassmen/fdu_all_roster.csv
5
   year  upperclassmen  team_stats team
0  2023            5.0         NaN  fdu


  res = pd.concat([res, new])


Processing team name: texas-am-corpus-christi
Table 'all_roster' for team 'texas a&m-corpus christi' saved to ../data/upperclassmen/texas a&m-corpus christi_all_roster.csv
6
   year  upperclassmen  team_stats                      team
0  2023            5.0         NaN                       fdu
0  2023            6.0         NaN  texas a&m-corpus christi
Processing team name: texas-southern
Table 'all_roster' for team 'texas-southern' saved to ../data/upperclassmen/texas-southern_all_roster.csv
3
   year  upperclassmen  team_stats                      team
0  2023            5.0         NaN                       fdu
0  2023            6.0         NaN  texas a&m-corpus christi
0  2023            3.0         NaN            texas-southern
Processing team name: miami-fl
Table 'all_roster' for team 'miami (fl)' saved to ../data/upperclassmen/miami (fl)_all_roster.csv
5
   year  upperclassmen  team_stats                      team
0  2023            5.0         NaN                       fdu
0

### basic school stats

In [38]:
import os
from bs4 import BeautifulSoup
import pandas as pd

# Path to the directory containing HTML files
directory_path = '../data/html_teamranking/'

d = {'Team' : [], 'SRS': [], 'SOS': [], 'Tm': [], 'Opp': [], 'MP': [], 'FG': [], 'FGA': [], 'FG%': [], '3P': [], '3PA': [], '3P%': [], 'FT': [],
                   'FTA': [], 'FT%': [], 'ORB': [], 'TRB': [], 'AST': [], 'STL': [], 'BLK': [], 'TOV': [],'PF': []}
    
stats = pd.DataFrame(data=d)

# Iterate through each file in the directory
for file_name in os.listdir(directory_path):
    if file_name.endswith('season.html'):  # Process only HTML files
        file_path = os.path.join(directory_path, file_name)
        year = file_name.split()[0]

        # Open and parse the HTML file
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')

        # Locate the table
        div_id = 'div_basic_school_stats'
        table = soup.find('div', id=div_id).find('table', id='basic_school_stats')

        if table:
            # Extract headers from the table's <thead>
            headers = [header.text.strip() for header in table.find('thead').find_all('th')]

            # Extract rows from the table's <tbody>
            rows = []
            for row in table.find('tbody').find_all('tr'):
                cells = row.find_all(['td', 'th'])
                row_data = [cell.text.strip() for cell in cells]
                # Ensure the row has the same number of columns as headers
                if len(row_data) < len(headers):
                    row_data.extend([''] * (len(headers) - len(row_data)))  # Pad missing cells with blanks
                rows.append(row_data[:len(headers)])  # Trim any extra cells

            # Convert to DataFrame
            if rows:
                df = pd.DataFrame(rows, columns=headers)
                # Save each CSV file with a unique name based on the HTML filename
                csv_filename = os.path.join('../processed_csvs', f"{file_name.split('.')[0]}_basic_school_stats.csv")
                os.makedirs(os.path.dirname(csv_filename), exist_ok=True)
                df.to_csv(csv_filename, index=False)
                print(f"Table 'basic_school_stats' saved to {csv_filename}")

                # Process the DataFrame further
                schoolDF = pd.read_csv(csv_filename)
                # FILTERING OUT AND DOING STUFF TO DF
                filtered_df = schoolDF[schoolDF['Overall'].str.contains('NCAA', na=False)].copy()
                # Create a new column 'team_name_without_nccaa' with 'NCAA' removed
                filtered_df['Overall'] = filtered_df['Overall'].str.strip().str.replace('NCAA', '')

                filtered_df = filtered_df[['Overall', 'Unnamed: 6', 'Away', 'SRS', 'SOS', 'W.1', 'L.1', 'Unnamed: 23', 'W.2', 'L.2', 'Unnamed: 26', 'W.3', 'L.3',
                   'Unnamed: 29', 'Tm.', 'Opp.', 'Unnamed: 32', 'MP', 'FG', 'FGA', 'FG%','3P']]
                # Rename columns
                filtered_df.rename(columns={'Overall' : 'Team', 'Unnamed: 6': 'SRS', 'Away': 'SOS', 'SRS': 'Tm', 'SOS': 'Opp', 'W.1': 'MP', 'L.1': 'FG', 'Unnamed: 23': 'FGA', 'W.2': 'FG%', 'L.2': '3P', 'Unnamed: 26': '3PA', 'W.3': '3P%', 'L.3': 'FT',
                   'Unnamed: 29': 'FTA', 'Tm.': 'FT%', 'Opp.': 'ORB', 'Unnamed: 32': 'TRB', 'MP': 'AST', 'FG': 'STL', 'FGA': 'BLK', 'FG%': 'TOV','3P': 'PF'}, inplace=True)
                filtered_df["Year"] = year

                stats = pd.concat([stats, filtered_df])
                
            else:
                print(f"No valid rows found in the table for {file_name}.")
        else:
            print(f"Table 'basic_school_stats' not found in {file_name}.")


Table 'basic_school_stats' saved to ../processed_csvs/2024 season_basic_school_stats.csv
Table 'basic_school_stats' saved to ../processed_csvs/2022 season_basic_school_stats.csv
Table 'basic_school_stats' saved to ../processed_csvs/2023 season_basic_school_stats.csv


# #LEFT JOIN WITH JULIA

joining first upperclassmen and school data -> joining with julia


In [39]:
#change



In [52]:
#res = upperclassmn, filtered_df = school stats 
#res = res.iloc[:, [3, 0, 1, 2]] 
res['team'] = res['team'].str.lower().str.strip()
stats['Team'] = stats['Team'].str.lower().str.strip()
res["team"] = res["team"].str.replace("-", " ")
res = res.replace("texas a&m corpus christi", "texas a&m-corpus christi")
res = res.replace("saint peters", "saint peter's")
res = res.replace("abu", "uab")





res.sort_values('team', inplace=True)
stats.sort_values('Team', inplace=True)


merged_df = pd.merge(res, stats, left_on=['team', 'year'], right_on=['Team', 'Year'])

In [41]:
res.head

<bound method NDFrame.head of     year  upperclassmen  team_stats          team
0   2022            7.0         NaN           abu
0   2024            6.0         NaN           abu
0   2024            7.0         NaN         akron
0   2022            3.0         NaN         akron
0   2024            5.0         NaN       alabama
..   ...            ...         ...           ...
0   2022            4.0         NaN  wright state
0   2022            3.0         NaN       wyoming
0   2023            6.0         NaN        xavier
0   2024            4.0         NaN          yale
0   2022            5.0         NaN          yale

[205 rows x 4 columns]>

In [42]:
stats.head

<bound method NDFrame.head of              Team    SRS    SOS    Tm   Opp    MP    FG   FGA   FG%   3P  ...  \
2           akron   2.77  -2.08  2577  2316  1405   915  2018  .453  280  ...   
2           akron   0.91  -3.93  2402  2130  1365   827  1799  .460  265  ...   
3         alabama  14.62  11.59  2623  2523  1325   911  2072  .440  305  ...   
3         alabama  23.19   9.65  3027  2526  1510  1023  2314  .442  366  ...   
3         alabama  20.69  11.80  3335  3006  1490  1136  2385  .476  413  ...   
..            ...    ...    ...   ...   ...   ...   ...   ...   ...  ...  ...   
387  wright state  -1.94  -5.11  2729  2585  1440   987  2140  .461  233  ...   
388       wyoming   9.36   3.60  2466  2227  1380   853  1865  .457  275  ...   
396        xavier  16.03   9.16  2995  2741  1485  1109  2256  .492  274  ...   
396          yale   4.63   0.41  2465  2226  1340   919  1969  .467  241  ...   
390          yale  -0.10  -1.34  2226  2143  1245   795  1800  .442  212  ...  

In [53]:
temp = merged_df.copy()
temp.drop('team_stats', axis=1, inplace=True)
temp.drop('team', axis=1, inplace=True)
first_column = temp.pop('Team')
temp.insert(0, 'Team', first_column) 

temp.to_csv('../data/upperclass_allStats.csv')

In [44]:
#left merge on name and YEAR 
temp


Unnamed: 0,Team,year,upperclassmen,SRS,SOS,Tm,Opp,MP,FG,FGA,...,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF,Year
0,,2022,7.0,,,,,,,,...,,,,,,,,,,
1,,2024,6.0,,,,,,,,...,,,,,,,,,,
2,akron,2024,7.0,2.77,-2.08,2577,2316,1405,915,2018,...,642,.727,363,1278,455,197,100,394,583,2024
3,akron,2022,3.0,0.91,-3.93,2402,2130,1365,827,1799,...,701,.689,334,1188,396,199,110,388,536,2022
4,alabama,2024,5.0,20.69,11.80,3335,3006,1490,1136,2385,...,842,.772,472,1467,587,256,162,438,734,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,wright state,2022,4.0,-1.94,-5.11,2729,2585,1440,987,2140,...,680,.768,377,1255,506,212,107,440,523,2022
201,wyoming,2022,3.0,9.36,3.60,2466,2227,1380,853,1865,...,669,.725,280,1212,381,133,67,385,516,2022
202,xavier,2023,6.0,16.03,9.16,2995,2741,1485,1109,2256,...,708,.710,380,1396,705,236,120,459,593,2023
203,yale,2024,4.0,4.63,0.41,2465,2226,1340,919,1969,...,546,.707,326,1205,495,204,105,316,499,2024
