downloaded datasets from https://www.sports-reference.com/cbb/schools/connecticut/men/2025.html#all_roster with two csv's per team, one of roster for year (upperclassmen) and one of per game for starters, with team totals in same table having info on the field goals, free throws, etc aka common bbal stats

In [1]:
#getting from downloaded html file

import os
from bs4 import BeautifulSoup
import pandas as pd

# Directory containing the HTML files
html_folder = '../data/html_sportsreference'
output_folder = '../'  # Directory to save CSV files

# List of table div IDs to extract
table_div_ids = ['all_roster', 'all_advanced_players']

# Loop through each HTML file in the folder
for filename in os.listdir(html_folder):
    if filename.endswith(".html"):
        print(f"Processing file: {filename}")
        file_path = os.path.join(html_folder, filename)

        # Open and parse the HTML file
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')

        # Extract team name from filename (without .html)
        team_name = os.path.splitext(filename)[0]

        # Loop through each table ID
        for div_id in table_div_ids:
            print(f"Processing table with div ID: {div_id} in team {team_name}")
            div = soup.find('div', id=div_id)
            if div:
                table = div.find('table')  # Locate the table inside the div
                if table:
                    # Extract headers from <thead>
                    thead = table.find('thead')
                    headers = [header.text.strip() for header in thead.find_all('th')] if thead else []
                    print("Headers:", headers)

                    # Extract rows from <tbody>
                    tbody = table.find('tbody')
                    rows = []
                    if tbody:
                        for row in tbody.find_all('tr'):
                            cells = row.find_all('td')
                            row_data = [cell.text.strip() for cell in cells]
                            # Check if the row length matches headers
                            if len(row_data) == len(headers):
                                rows.append(row_data)
                            else:
                                #print(f"Row length mismatch in table '{div_id}':", row_data)
                                # Pad the row to match headers
                                row_data.extend([''] * (len(headers) - len(row_data)))
                                rows.append(row_data)

                    # Convert to DataFrame
                    if rows:
                        df = pd.DataFrame(rows, columns=headers)

                        # Save to CSV with team name and table ID
                        csv_filename = f"{team_name}_{div_id}.csv"
                        csv_path = os.path.join(output_folder, csv_filename)
                        df.to_csv(csv_path, index=False)
                        print(f"Table '{div_id}' for team '{team_name}' saved to {csv_path}")
                    else:
                        print(f"No valid rows found for table '{div_id}' in team '{team_name}'")
                else:
                    print(f"No table found with ID '{div_id}' for team '{team_name}'")
            else:
                print(f"No div found with ID '{div_id}' for team '{team_name}'")


Processing file: 2024-25 uconn.html
Processing table with div ID: all_roster in team 2024-25 uconn
Headers: ['Player', '#', 'Class', 'Pos', 'Height', 'Weight', 'Hometown', 'High School', 'RSCI Top 100', 'Summary']
Table 'all_roster' for team '2024-25 uconn' saved to ../2024-25 uconn_all_roster.csv
Processing table with div ID: all_advanced_players in team 2024-25 uconn
Headers: ['Rk', 'Player', 'G', 'GS', 'MP', 'PER', 'TS%', 'eFG%', '3PAr', 'FTr', 'PProd', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '', 'OWS', 'DWS', 'WS', 'WS/40', '', 'OBPM', 'DBPM', 'BPM']
Table 'all_advanced_players' for team '2024-25 uconn' saved to ../2024-25 uconn_all_advanced_players.csv


In [None]:

#same as above, but using direct url instead 

import requests
import time #request the server too much and error 429
from bs4 import BeautifulSoup
import pandas as pd
import os

# List of URLs to scrape
urls = [
    'https://www.sports-reference.com/cbb/schools/fairleigh-dickinson/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/texas-am-corpus-christi/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/texas-southern/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/miami-fl/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/louisiana-lafayette/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/texas-am/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/baylor/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/auburn/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/arkansas/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/arizona-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/arizona/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/alabama/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/duke/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/drake/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/creighton/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/connecticut/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/college-of-charleston/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/colgate/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/boise-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/illinois/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/howard/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/houston/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/grand-canyon/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/gonzaga/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/furman/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/florida-atlantic/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/kentucky/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/kent-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/kennesaw-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/kansas-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/kansas/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/iowa-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/iowa/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/iona/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/indiana/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/nevada/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/north-carolina-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/montana-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/missouri/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/mississippi-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/michigan-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/memphis/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/maryland/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/marquette/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/saint-marys-ca/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/purdue/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/providence/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/princeton/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/pittsburgh/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/penn-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/oral-roberts/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/northwestern/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/northern-kentucky/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/texas/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/tennessee/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/texas-christian/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/southern-california/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/southeast-missouri-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/san-diego-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/xavier/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/west-virginia/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/virginia-commonwealth/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/virginia/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/vermont/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/utah-state/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/north-carolina-asheville/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/ucla/men/2023.html',
    'https://www.sports-reference.com/cbb/schools/california-santa-barbara/men/2023.html'
]


# Directory to save CSV files
output_folder = '../data/upperclassmen'

# List of table div IDs to extract
table_div_ids = ['all_roster']

#init res df with the upperclassmen and team stats
d = {'team' : [], 'year': [], 'upperclassmen': [], 'team_stats': []}
res = pd.DataFrame(data=d)
res.set_index('team', inplace=True)


# Loop through URLs
for url in urls:
    #print(f"Processing URL: {url}")
    response = requests.get(url)
    time.sleep(5)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract team name from the URL
        team_name = url.split('/')[-3]  # Extract team name from URL path
        print(f"Processing team name: {team_name}")
        team_name = team_name.strip().lower()
        #team_name = team_name.str.replace("-", " ")

        if team_name == "louisiana-lafayette": 
            team_name = "louisiana"
        if team_name == "fairleigh-dickinson":
            team_name = "fdu"
        if team_name == "miami-fl":
            team_name = "miami (fl)"
        if team_name == "north-carolina-asheville":
            team_name = "unc asheville"
        if team_name == "north-carolina-state":
            team_name = "nc state"
        if team_name == "saint-marys-ca":
            team_name = "saint mary's (ca)"
        if team_name == "texas-am":
            team_name = "texas a&m"
        if team_name == "texas-am-corpus-christi":
            team_name = "texas a&m-corpus christi"
        if team_name == "texas-christian":
            team_name = "tcu"
        if team_name == "california-santa-barbara":
            team_name = "uc santa barbara"

        # Loop through each table ID
        for div_id in table_div_ids:
            #print(f"Processing table with div ID: {div_id} for team {team_name}")
            div = soup.find('div', id=div_id)
            if div:
                table = div.find('table')  # Locate the table inside the div
                if table:
                    # Extract headers from <thead>
                    thead = table.find('thead')
                    headers = [header.text.strip() for header in thead.find_all('th')] if thead else []
                    
                    # Extract rows from <tbody>
                    tbody = table.find('tbody')
                    rows = []
                    if tbody:
                        for row in tbody.find_all('tr'):
                            cells = row.find_all('td')
                            row_data = [cell.text.strip() for cell in cells]
                            # Check if the row length matches headers
                            if len(row_data) == len(headers):
                                rows.append(row_data)
                            else:
                                #print(f"Row length mismatch in table '{div_id}':", row_data)
                                # Pad the row to match headers
                                row_data.extend([''] * (len(headers) - len(row_data)))
                                rows.append(row_data)

                    # Convert to DataFrame
                    if rows:
                        df = pd.DataFrame(rows, columns=headers)

                        # Save to CSV with team name and table ID
                        csv_filename = f"{team_name}_{div_id}.csv"
                        csv_path = os.path.join(output_folder, csv_filename)
                        df.to_csv(csv_path, index=False)
                        print(f"Table '{div_id}' for team '{team_name}' saved to {csv_path}")
                    else:
                        print(f"No valid rows found for table '{div_id}' for team '{team_name}'")
                    
                    #getting out number of upperclassmen per team
                    
                    df = pd.read_csv(csv_path)
                
                    df['Points Scored'] = df['RSCI Top 100'].str.extract(r'(\d+)').dropna()
                    # Convert extracted numbers to numeric type and sort descending
                    df['Points Scored'] = pd.to_numeric(df['Points Scored'])
                    sorted_df = df.sort_values(by='Points Scored', ascending=False)
                    sorted_df.dropna(subset = "Points Scored", inplace=True)
                    sorted_df = sorted_df.head(7)
                    jr_count = sorted_df['#'].str.count('JR').sum()
                    sr_count = sorted_df['#'].str.count('SR').sum()
                    upperclassmen = jr_count + sr_count
                    print(upperclassmen)
                    year = url.split('/')[-1].strip('.html')
                    new = pd.DataFrame({'team': [team_name], 'upperclassmen': [upperclassmen], 'year': [year]})
                    res = pd.concat([res, new])
                    print(res)
                        

                    
                else:
                    print(f"No table found with ID '{div_id}' for team '{team_name}'")
                    
            else:
                print(f"No div found with ID '{div_id}' for team '{team_name}'")
    else:
        print(f"Failed to fetch URL: {url}, Status code: {response.status_code}")


Processing team name: fairleigh-dickinson
Table 'all_roster' for team 'fdu' saved to ../data/upperclassmen/fdu_all_roster.csv
5
   year  upperclassmen  team_stats team
0  2023            5.0         NaN  fdu


  res = pd.concat([res, new])


Processing team name: texas-am-corpus-christi
Table 'all_roster' for team 'texas a&m-corpus christi' saved to ../data/upperclassmen/texas a&m-corpus christi_all_roster.csv
6
   year  upperclassmen  team_stats                      team
0  2023            5.0         NaN                       fdu
0  2023            6.0         NaN  texas a&m-corpus christi
Processing team name: texas-southern
Table 'all_roster' for team 'texas-southern' saved to ../data/upperclassmen/texas-southern_all_roster.csv
3
   year  upperclassmen  team_stats                      team
0  2023            5.0         NaN                       fdu
0  2023            6.0         NaN  texas a&m-corpus christi
0  2023            3.0         NaN            texas-southern
Processing team name: miami-fl
Table 'all_roster' for team 'miami (fl)' saved to ../data/upperclassmen/miami (fl)_all_roster.csv
5
   year  upperclassmen  team_stats                      team
0  2023            5.0         NaN                       fdu
0

### basic school stats

In [3]:
#all school stats for regular bbal stuff

from bs4 import BeautifulSoup
import pandas as pd

# Path to the local HTML file
file_path = '../data/html_teamranking/all schools 2022 season.html'

# Open and parse the HTML file
with open(file_path, 'r', encoding='utf-8') as file:
    soup = BeautifulSoup(file, 'html.parser')

# Locate the table
div_id = 'div_basic_school_stats'
table = soup.find('div', id=div_id).find('table', id='basic_school_stats')

if table:
    # Extract headers from the table's <thead>
    headers = [header.text.strip() for header in table.find('thead').find_all('th')]

    # Extract rows from the table's <tbody>
    rows = []
    for row in table.find('tbody').find_all('tr'):
        cells = row.find_all(['td', 'th'])
        row_data = [cell.text.strip() for cell in cells]
        # Ensure the row has the same number of columns as headers
        if len(row_data) < len(headers):
            row_data.extend([''] * (len(headers) - len(row_data)))  # Pad missing cells with blanks
        rows.append(row_data[:len(headers)])  # Trim any extra cells

    # Convert to DataFrame
    if rows:
        df = pd.DataFrame(rows, columns=headers)
        csv_filename = '../basic_school_stats.csv'
        df.to_csv(csv_filename, index=False)
        print(f"Table 'basic_school_stats' saved to {csv_filename}")
    else:
        print("No valid rows found in the table.")

    schoolDF = pd.read_csv(csv_filename)
    #FILTERING OUT AND DOING STUFF TO DF
    filtered_df = schoolDF[schoolDF['Overall'].str.contains('NCAA', na=False)].copy()
    # Create a new column 'team_name_without_nccaa' with 'NCAA' removed
    filtered_df['Overall'] = filtered_df['Overall'].str.strip().str.replace('NCAA', '')

    filtered_df = filtered_df[['Overall', 'Unnamed: 6', 'Away', 'SRS', 'SOS', 'W.1', 'L.1', 'Unnamed: 23', 'W.2', 'L.2', 'Unnamed: 26', 'W.3', 'L.3',
       'Unnamed: 29', 'Tm.', 'Opp.', 'Unnamed: 32', 'MP', 'FG', 'FGA', 'FG%','3P']]
    #team name, SRS, SOS, Tm, Opp, MP	FG	FGA	FG%	3P	3PA	3P%	FT	FTA	FT%	ORB	TRB	AST	STL	BLK	TOV	PF
    filtered_df.rename(columns={'Overall' : 'Team', 'Unnamed: 6': 'SRS', 'Away': 'SOS', 'SRS': 'Tm', 'SOS': 'Opp', 'W.1': 'MP', 'L.1': 'FG', 'Unnamed: 23': 'FGA', 'W.2': 'FG%', 'L.2': '3P', 'Unnamed: 26': '3PA', 'W.3': '3P%', 'L.3': 'FT',
       'Unnamed: 29': 'FTA', 'Tm.': 'FT%', 'Opp.': 'ORB', 'Unnamed: 32': 'TRB', 'MP': 'AST', 'FG': 'STL', 'FGA': 'BLK', 'FG%': 'TOV','3P': 'PF'}, inplace=True)

    
else:
    print("Table 'basic_school_stats' not found.")


Table 'basic_school_stats' saved to ../basic_school_stats.csv


# #LEFT JOIN WITH JULIA

joining first upperclassmen and school data -> joining with julia


In [4]:
#change



In [21]:
#res = upperclassmn, filtered_df = school stats 
#res = res.iloc[:, [3, 0, 1, 2]] 
res['team'] = res['team'].str.lower().str.strip()
filtered_df['Team'] = filtered_df['Team'].str.lower().str.strip()
res["team"] = res["team"].str.replace("-", " ")

res.sort_values('team', inplace=True)
filtered_df.sort_values('Team', inplace=True)




merged_df = pd.merge(res, filtered_df, left_on='team', right_on='Team')

In [6]:
res.head

<bound method NDFrame.head of     year  upperclassmen  team_stats                   team
0   2023            2.0         NaN                alabama
0   2023            6.0         NaN                arizona
0   2023            4.0         NaN          arizona state
0   2023            3.0         NaN               arkansas
0   2023            7.0         NaN                 auburn
..   ...            ...         ...                    ...
0   2023            6.0         NaN                vermont
0   2023            6.0         NaN               virginia
0   2023            3.0         NaN  virginia commonwealth
0   2023            6.0         NaN          west virginia
0   2023            6.0         NaN                 xavier

[68 rows x 4 columns]>

In [22]:
temp = merged_df.copy()
temp.drop('team_stats', axis=1, inplace=True)
temp.drop('team', axis=1, inplace=True)
first_column = temp.pop('Team')
temp.insert(0, 'Team', first_column) 

temp.to_csv('../data/upperclass_allStats.csv')

In [17]:
#left merge on name and YEAR 
temp


Unnamed: 0,Team,year,upperclassmen,SRS,SOS,Tm,Opp,MP,FG,FGA,...,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF
0,alabama,2023,2.0,23.19,9.65,3027,2526,1510,1023,2314,...,615,848,.725,484,1652,555,224,189,512,691
1,arizona,2023,6.0,19.08,8.34,2866,2490,1400,1023,2072,...,525,742,.708,356,1376,662,213,110,468,593
2,arizona state,2023,4.0,11.29,8.18,2559,2447,1455,910,2163,...,465,675,.689,395,1315,515,255,170,422,668
3,arkansas,2023,3.0,15.99,9.87,2666,2446,1445,961,2063,...,565,810,.698,373,1272,462,302,182,458,694
4,auburn,2023,7.0,14.35,9.29,2474,2302,1365,875,1991,...,503,723,.696,396,1231,479,263,172,414,655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,vermont,2023,6.0,1.63,-3.13,2471,2276,1377,901,1901,...,377,541,.697,220,1065,465,200,98,318,504
63,virginia,2023,6.0,13.28,5.98,2237,1996,1325,793,1765,...,431,613,.703,273,1064,519,225,142,282,471
64,virginia commonwealth,2023,3.0,9.89,1.95,2480,2202,1400,864,1867,...,540,776,.696,324,1161,456,311,160,468,575
65,west virginia,2023,6.0,15.95,10.89,2583,2411,1365,882,1953,...,571,769,.743,383,1152,442,227,104,440,632
