Downloaded datasets from https://www.sports-reference.com/cbb/schools/connecticut/men/2025.html#all_roster, with two CSVs per team: one with the roster for the year (upperclassmen) and one with per-game stats for the starters. Team totals are in the same table and cover field goals, free throws, etc. (i.e., common basketball stats).

In [None]:
#getting from downloaded html file

import os
from bs4 import BeautifulSoup
import pandas as pd

# Directory containing the HTML files
html_folder = '../data/html_sportsreference'
output_folder = '../'  # Directory to save CSV files

# List of table div IDs to extract
table_div_ids = ['all_roster', 'all_advanced_players']


def _extract_table(table):
    """Return ``(headers, rows)`` for a BeautifulSoup ``<table>`` element.

    ``headers`` holds the stripped text of the ``<th>`` cells in the last
    row of ``<thead>`` (or of ``<thead>`` itself when it has no ``<tr>``);
    it is empty when the table has no ``<thead>``.

    ``rows`` holds one list of stripped cell texts per ``<tbody>`` row,
    padded with ``''`` or trimmed so every row is exactly as wide as
    ``headers``. It is empty when there is no ``<tbody>`` or no headers.
    """
    thead = table.find('thead')
    headers = []
    if thead is not None:
        header_rows = thead.find_all('tr')
        # Use only the last header row so that any grouping row above it
        # does not add extra column names.
        header_source = header_rows[-1] if header_rows else thead
        headers = [th.text.strip() for th in header_source.find_all('th')]

    rows = []
    tbody = table.find('tbody')
    if tbody is None or not headers:
        return headers, rows

    width = len(headers)
    for tr in tbody.find_all('tr'):
        # A row without any <td> carries no data cells (presumably a
        # repeated header row -- confirm against the page markup).
        if tr.find('td') is None:
            continue
        # Read <th> as well as <td>: a row-label cell marked up as <th>
        # would otherwise be dropped, shifting every value one column left.
        values = [cell.text.strip() for cell in tr.find_all(['th', 'td'])]
        values.extend([''] * (width - len(values)))  # pad short rows
        rows.append(values[:width])  # trim long rows
    return headers, rows


# Loop through each HTML file in the folder (sorted for a stable order)
for filename in sorted(os.listdir(html_folder)):
    if not filename.endswith(".html"):
        continue

    print(f"Processing file: {filename}")
    file_path = os.path.join(html_folder, filename)

    # Open and parse the HTML file
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # Extract team name from filename (without .html)
    team_name = os.path.splitext(filename)[0]

    # Loop through each table ID
    for div_id in table_div_ids:
        print(f"Processing table with div ID: {div_id} in team {team_name}")

        div = soup.find('div', id=div_id)
        if div is None:
            print(f"No div found with ID '{div_id}' for team '{team_name}'")
            continue

        table = div.find('table')  # Locate the table inside the div
        if table is None:
            print(f"No table found with ID '{div_id}' for team '{team_name}'")
            continue

        headers, rows = _extract_table(table)
        print("Headers:", headers)

        if not rows:
            print(f"No valid rows found for table '{div_id}' in team '{team_name}'")
            continue

        # Convert to DataFrame and save to CSV with team name and table ID
        df = pd.DataFrame(rows, columns=headers)
        csv_filename = f"{team_name}_{div_id}.csv"
        csv_path = os.path.join(output_folder, csv_filename)
        df.to_csv(csv_path, index=False)
        print(f"Table '{div_id}' for team '{team_name}' saved to {csv_path}")


Processing file: 2024-25 uconn.html
Processing table with div ID: all_roster in team 2024-25 uconn
Headers: ['Player', '#', 'Class', 'Pos', 'Height', 'Weight', 'Hometown', 'High School', 'RSCI Top 100', 'Summary']
Table 'all_roster' for team '2024-25 uconn' saved to ../2024-25 uconn_all_roster.csv
Processing table with div ID: all_advanced_players in team 2024-25 uconn
Headers: ['Rk', 'Player', 'G', 'GS', 'MP', 'PER', 'TS%', 'eFG%', '3PAr', 'FTr', 'PProd', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '', 'OWS', 'DWS', 'WS', 'WS/40', '', 'OBPM', 'DBPM', 'BPM']
Table 'all_advanced_players' for team '2024-25 uconn' saved to ../2024-25 uconn_all_advanced_players.csv


In [None]:

#same as above, but using direct url instead 

import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

# List of URLs to scrape
urls = [
    'https://www.sports-reference.com/cbb/schools/connecticut/men/2023.html',
    # Add more URLs here
]

# Directory to save CSV files
output_folder = '../'

# List of table div IDs to extract
table_div_ids = ['all_roster', 'all_advanced_players']

# Seconds to wait for the server before giving up on a URL
request_timeout = 30


def _extract_table(table):
    """Return ``(headers, rows)`` for a BeautifulSoup ``<table>`` element.

    ``headers`` holds the stripped text of the ``<th>`` cells in the last
    row of ``<thead>`` (or of ``<thead>`` itself when it has no ``<tr>``);
    it is empty when the table has no ``<thead>``.

    ``rows`` holds one list of stripped cell texts per ``<tbody>`` row,
    padded with ``''`` or trimmed so every row is exactly as wide as
    ``headers``. It is empty when there is no ``<tbody>`` or no headers.
    """
    thead = table.find('thead')
    headers = []
    if thead is not None:
        header_rows = thead.find_all('tr')
        # Use only the last header row so that any grouping row above it
        # does not add extra column names.
        header_source = header_rows[-1] if header_rows else thead
        headers = [th.text.strip() for th in header_source.find_all('th')]

    rows = []
    tbody = table.find('tbody')
    if tbody is None or not headers:
        return headers, rows

    width = len(headers)
    for tr in tbody.find_all('tr'):
        # A row without any <td> carries no data cells (presumably a
        # repeated header row -- confirm against the page markup).
        if tr.find('td') is None:
            continue
        # Read <th> as well as <td>: a row-label cell marked up as <th>
        # would otherwise be dropped, shifting every value one column left.
        values = [cell.text.strip() for cell in tr.find_all(['th', 'td'])]
        values.extend([''] * (width - len(values)))  # pad short rows
        rows.append(values[:width])  # trim long rows
    return headers, rows


# Loop through URLs
for url in urls:
    # A network failure or timeout on one URL must not abort the others.
    try:
        response = requests.get(url, timeout=request_timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch URL: {url}, Error: {exc}")
        continue

    if response.status_code != 200:
        print(f"Failed to fetch URL: {url}, Status code: {response.status_code}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract team name from the URL path (third segment from the end)
    # NOTE(review): the season segment of the URL is not part of the file
    # name, so two seasons of the same school write to the same CSV and the
    # later one overwrites the earlier -- confirm whether that is intended.
    team_name = url.split('/')[-3]
    print(f"Processing team name: {team_name}")

    # Loop through each table ID
    for div_id in table_div_ids:
        div = soup.find('div', id=div_id)
        if div is None:
            print(f"No div found with ID '{div_id}' for team '{team_name}'")
            continue

        table = div.find('table')  # Locate the table inside the div
        if table is None:
            print(f"No table found with ID '{div_id}' for team '{team_name}'")
            continue

        headers, rows = _extract_table(table)
        if not rows:
            print(f"No valid rows found for table '{div_id}' for team '{team_name}'")
            continue

        # Convert to DataFrame and save to CSV with team name and table ID
        df = pd.DataFrame(rows, columns=headers)
        csv_filename = f"{team_name}_{div_id}.csv"
        csv_path = os.path.join(output_folder, csv_filename)
        df.to_csv(csv_path, index=False)
        print(f"Table '{div_id}' for team '{team_name}' saved to {csv_path}")


Processing team name: connecticut
../connecticut_all_roster.csv
Table 'all_roster' for team 'connecticut' saved to ../connecticut_all_roster.csv
../connecticut_all_advanced_players.csv
Table 'all_advanced_players' for team 'connecticut' saved to ../connecticut_all_advanced_players.csv


In [None]:
#all school stats for regular bbal stuff

from bs4 import BeautifulSoup
import pandas as pd

# Path to the local HTML file
file_path = '../data/html_teamranking/all schools 2022 season.html'

# Open and parse the HTML file
with open(file_path, 'r', encoding='utf-8') as file:
    soup = BeautifulSoup(file, 'html.parser')

# Locate the table; a missing wrapper div is reported as "not found" below
# instead of raising AttributeError on the chained lookup.
div_id = 'div_basic_school_stats'
container = soup.find('div', id=div_id)
table = None
if container is not None:
    table = container.find('table', id='basic_school_stats')

if table is not None:
    # Extract headers from the last row of <thead> only, so that any
    # grouping row above the real column names does not add extra headers.
    thead = table.find('thead')
    header_rows = thead.find_all('tr') if thead is not None else []
    if header_rows:
        header_cells = header_rows[-1].find_all('th')
    elif thead is not None:
        header_cells = thead.find_all('th')
    else:
        header_cells = []
    headers = [header.text.strip() for header in header_cells]

    # Extract rows from the table's <tbody>
    rows = []
    tbody = table.find('tbody')
    if tbody is not None and headers:
        for row in tbody.find_all('tr'):
            # A row without any <td> carries no data cells (presumably a
            # repeated header row -- confirm against the page markup).
            if row.find('td') is None:
                continue
            row_data = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
            # Pad missing cells with blanks, then trim any extra cells, so
            # every row is exactly as wide as the headers.
            row_data.extend([''] * (len(headers) - len(row_data)))
            rows.append(row_data[:len(headers)])

    # Convert to DataFrame
    if rows:
        df = pd.DataFrame(rows, columns=headers)
        csv_filename = '../basic_school_stats.csv'
        df.to_csv(csv_filename, index=False)
        print(f"Table 'basic_school_stats' saved to {csv_filename}")
    else:
        print("No valid rows found in the table.")
else:
    print("Table 'basic_school_stats' not found.")


Table 'basic_school_stats' saved to ../basic_school_stats.csv
