In [8]:
import requests
from bs4 import BeautifulSoup 
import sys
import pandas as pd
import re
import numpy as np

In [9]:
# Notebook-wide pandas display settings: render DataFrames without truncation.
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', 0)  # No column width truncation
pd.set_option('display.max_colwidth', None)  # Show full content of each column

# Importing `display` from IPython.core.display is deprecated; IPython.display
# is the public location and exposes both names used later in this notebook.
from IPython.display import display, HTML


Download historical team data from warrennolan.com and tournament seeds from Wikipedia

In [10]:
def fetch_berths(year):
    """Scrape tournament seeds from Wikipedia's NCAA Division I men's tournament page.

    Only tables whose caption contains 'egional' (so both 'Regional' and
    'regional' match) are read; tables without a caption are skipped.

    Args:
        year: Tournament year, as a string or anything ``str()`` can render
            (e.g. ``'2024'`` or ``2024``).

    Returns:
        A list of ``(team_name, seed, overall_seed)`` tuples, with ``seed`` and
        ``overall_seed`` as ints, in page order.

    Raises:
        IndexError, ValueError: If a row matches neither the full layout nor the
            shared-seed layout described below.
    """
    berths = []
    link = "https://en.wikipedia.org/wiki/" + str(year) + "_NCAA_Division_I_men's_basketball_tournament"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(link, timeout=30)
    soup = BeautifulSoup(r.content, 'html.parser')
    tables = soup.find_all("table", class_="wikitable sortable plainrowheaders")
    for table in tables:
        regional_table_title = table.find("caption")
        if regional_table_title is None or 'egional' not in regional_table_title.text:
            continue
        teams = table.find_all("tr")[1:]
        # Rows are shared on wikipedia for teams in the play-in: the seed cell
        # (marked with a trailing '*') appears only on the first of those rows.
        # Kept as a string so it flows through the same parsing as a scraped
        # seed; '0' means "no starred seed seen yet in this table".
        most_recent_seed = '0'
        for team in teams:
            team_stats = team.find_all("td")
            try:
                # Full row: seed is the 1st <td>, overall seed the 4th.
                seed, overall_seed = team_stats[0].text.strip(), int(team_stats[3].text.strip())
            except (IndexError, ValueError):
                # Shared row: the seed cell is missing, so the remaining cells
                # shift left by one and the overall seed is the 3rd <td>.
                seed, overall_seed = most_recent_seed, int(team_stats[2].text.strip())
            if seed.endswith('*'):
                seed = seed[:-1]
                most_recent_seed = seed
            seed = int(seed)
            team_name = team.find("th").text.strip()
            berths.append((team_name, seed, overall_seed))
    return berths

#x = soup.find_all("table", class_="wikitable sortable plainrowheaders jquery-tablesorter")


In [11]:
def get_team_mappngs():
    """Return a fresh dict translating alternate team spellings.

    Each key is a name to look up and each value is the spelling it should be
    replaced with. A new dict is built on every call, so callers may mutate
    the result freely.
    """
    name_pairs = (
        ("UConn", "Connecticut"),
        ("St. Bonaventure", "Saint Bonaventure"),
        ("UNC Greensboro", "UNCG"),
        ("Mount St. Mary's", "Mount Saint Mary's"),
        ("Loyola Chicago", "Loyola-Chicago"),
        ("Saint Mary's", "Saint Mary's College"),
        # The key uses an en dash; the value uses a plain hyphen.
        ("Texas A&M–Corpus Christi", "Texas A&M-Corpus Christi"),
        ("Southeast Missouri State", "Southeast Missouri"),
        ("Florida Atlantic", "FAU"),
        ("NC State", "North Carolina State"),
    )
    return dict(name_pairs)


In [12]:
def get_stats(team, year):
    """Scrape one team's team-sheet metrics and quadrant records from warrennolan.com.

    Args:
        team: Team name; it is reduced to letters and single hyphens to form
            the ``team=`` query value (e.g. 'Texas A&M' -> 'Texas-AM').
        year: Season year, as a string or int (e.g. ``'2025'``).

    Returns:
        A 10-element list: ``[kpi, sor, wab, bpi, kenpom, trank]`` followed by
        one record string per quadrant (all but the last ``ts-data-center``
        block on the page). Before 2025 the page has no third resume metric,
        so ``wab`` is ``np.nan``. If fetching or parsing fails for any reason
        the error is printed and a same-shaped placeholder is returned: six
        ``0`` metrics followed by four ``'0-0'`` records.
    """
    # Shape of the placeholder row; must match the success path so one failed
    # team does not change the number of values handed back to the caller.
    metric_count = 6    # kpi, sor, wab, bpi, kenpom, trank
    quadrant_count = 4  # q1..q4

    print(team)
    updated_team = ''
    for character in team:
        if character.isalpha():
            updated_team += character
        elif character in (' ', '-') and updated_team and not updated_team.endswith('-'):
            # Skip leading and consecutive -'s, i.e. William & Mary
            updated_team += '-'

    # Built before the try so the error report below can always print it.
    link = "https://www.warrennolan.com/basketball/" + str(year) + "/team-net-sheet?team=" + updated_team
    try:
        r = requests.get(link, timeout=30)
        soup = BeautifulSoup(r.content, 'html.parser')
        main_table = soup.find("div", class_='main-body-row-flex-left-wrap')
        metrics = main_table.find_all("div", class_="ts-data-left")
        # First block holds the resume metrics, second the predictive metrics;
        # each is one integer per line.
        resume_metrics = [int(value.strip()) for value in metrics[0].text.strip().split('\n')]
        pred_metrics = [int(value.strip()) for value in metrics[1].text.strip().split('\n')]

        team_records = []
        quadrants = soup.find("div", class_="ts-flex-size-2").find_all("div", class_="ts-data-center")
        for quadrant in quadrants[:-1]:
            # Drop the first non-empty line, then keep the first record after it.
            records = [record.strip() for record in quadrant.text.strip().split('\n') if len(record) > 0][1:]
            # overall, home, away, neutral, non-conference
            team_records.append(records[0])

        # wins against bubble only getting tracked as of 2025
        has_wab = int(year) >= 2025
        expected_resume = 3 if has_wab else 2
        if len(resume_metrics) != expected_resume:
            raise ValueError("expected %d resume metrics, got %d" % (expected_resume, len(resume_metrics)))
        if not has_wab:
            resume_metrics.append(np.nan)
        if len(pred_metrics) != 3:
            raise ValueError("expected 3 predictive metrics, got %d" % len(pred_metrics))
        return resume_metrics + pred_metrics + team_records
    except Exception as e:
        # Best effort: report the failure and keep the caller's loop going.
        print(e)
        print("ERROR for team: " + team)
        print("Link is " + link)
        return [0] * metric_count + ['0-0'] * quadrant_count

In [13]:
def _fetch_ranking_table(url):
    """Fetch ``url`` and turn every ``<tr>`` on the page into a DataFrame row of strings.

    The first ``<tr>`` supplies the column names. If the last header contains
    'Delta', that column is dropped and each body row is cut to the header length.
    """
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.content, 'html.parser')
    row_values = [[t for t in row.text.strip().split('\n') if len(t) > 0]
                  for row in soup.find_all("tr")]
    header, body = row_values[0], row_values[1:]

    # Don't need 'Delta' columns
    if 'Delta' in header[-1]:
        header = header[:-1]
        body = [row[:len(header)] for row in body]

    return pd.DataFrame(body, columns=header)


def _split_win_loss(df, record_columns):
    """Replace each 'W-L' string column with integer ``<col>_wins`` / ``<col>_losses`` columns."""
    for win_loss_column in record_columns:
        win = win_loss_column + '_wins'
        loss = win_loss_column + '_losses'
        df[[win, loss]] = df[win_loss_column].str.split('-', expand=True)
        df[[win, loss]] = df[[win, loss]].astype(int)
        df = df.drop(columns=[win_loss_column])
    return df


def _mark_berths(df, berths, team_maps):
    """Write seed, overall seed and berth=1 into ``df`` for each ``(team, seed, overall_seed)``.

    A team not found under its own name is retried under ``team_maps[name]``.
    Teams that still cannot be matched are reported and skipped instead of
    aborting the run.
    """
    for team_name, seed, overall_seed in berths:
        if not (df['Team'] == team_name).any():
            mapped_name = team_maps.get(team_name)
            if mapped_name is None:
                print("Couldn't insert for " + team_name)
                continue
            team_name = mapped_name
        is_team = df['Team'] == team_name
        if not is_team.any():
            print("Couldn't insert for " + team_name)
            continue
        df.loc[is_team, ['seed', 'overall_seed', 'berth']] = seed, overall_seed, 1


def download_data(league, years=None):
    """Scrape warrennolan.com rankings per season and save one CSV per year.

    For each year this merges the 'net' and 'sos-net' ranking pages on 'Team',
    drops teams whose Record is '0-0', adds per-team metrics from
    ``get_stats``, splits the quadrant records into win/loss integer columns,
    flags auto-bid teams, adds tournament seeds from ``fetch_berths`` (skipped
    for 2025), displays the frame, and writes ``<year>_<league>.csv`` to the
    current directory.

    Args:
        league: ``'men'`` or ``'women'``. Any other value returns ``None``
            without doing anything.
        years: Optional iterable of years (strings or ints), or a single year
            string. Defaults to ``['2025']``.

    Returns:
        None.
    """
    if league == 'men':
        link = 'https://www.warrennolan.com/basketball/'
    elif league == 'women':
        link = 'https://www.warrennolan.com/basketballw/'
    else:
        return

    if years is None:
        years = ['2025']
    elif isinstance(years, str):
        years = [years]
    years = [str(year) for year in years]

    pages = ['net', 'sos-net']
    record_columns = ['q1', 'q2', 'q3', 'q4']
    nitty_gritty_columns = ['kpi', 'sor', 'wab', 'bpi', 'kenpom', 'trank'] + record_columns

    for year in years:
        print("Downloading data for " + league + " year: " + year)
        final_df = pd.DataFrame()
        for page in pages:
            df = _fetch_ranking_table(link + year + '/' + page)
            if final_df.empty:
                final_df = df
            else:
                final_df = final_df.merge(df, on='Team', how="left")

        # Drop teams who didn't play a game (like Ivy in 2020-21 COVID).
        # .copy() so the column assignments below act on an independent frame.
        final_df = final_df.loc[final_df["Record"] != "0-0"].copy()

        # Fetch the NET predictive and resume stats
        final_df[nitty_gritty_columns] = final_df['Team'].apply(lambda x: pd.Series(get_stats(x, year)))

        final_df = final_df.drop(columns=['Record'])
        final_df = _split_win_loss(final_df, record_columns)

        display(HTML(final_df.to_html(notebook=True)))

        final_df['berth'] = 0
        final_df['seed'] = 0
        final_df['overall_seed'] = 0
        final_df['autobid'] = 0

        # Add in the auto-bids
        autobid_link = link + year + '/autobids'
        r = requests.get(autobid_link, timeout=30)
        soup = BeautifulSoup(r.content, 'html.parser')
        teams = soup.find_all("a", "blue-black", href=re.compile("schedule"))
        teams = [team.text for team in teams]
        final_df.loc[final_df['Team'].isin(teams), 'autobid'] = 1

        # Add in the seeds & tournament status. 2025 keeps the 0 defaults, so
        # its berths are not fetched at all.
        # NOTE(review): fetch_berths receives only the year, not the league,
        # so the berth source is the same for 'men' and 'women' - confirm.
        if year != '2025':
            # Disconnect between wikipedia/warrennolan formatting
            _mark_berths(final_df, fetch_berths(year), get_team_mappngs())

        final_df['year'] = year
        file_name = year + '_' + league + '.csv'
        final_df.to_csv(file_name)
        print("Saved " + file_name)

        

In [14]:
# Run the scrape for the men's league; the women's run is left disabled.
download_data(league='men')
# download_data(league='women')

Downloading data for men year: 2025
Duke
Auburn
Houston
Florida
Tennessee
Alabama
Texas Tech
Gonzaga
Iowa State
Maryland
Michigan State
Arizona
Saint John's
Wisconsin
Kentucky
Missouri
Illinois
Texas A&M
Purdue
Kansas
Saint Mary's College
Clemson
Louisville
BYU
Michigan
Marquette
UCLA
Ole Miss
Oregon
Baylor
VCU
Connecticut
Georgia
Mississippi State
UC San Diego
North Carolina
Utah State
Creighton
Texas
Arkansas
Ohio State
New Mexico
Oklahoma
Boise State
Xavier
SMU
Colorado State
Vanderbilt
Cincinnati
Memphis
West Virginia
San Diego State
Northwestern
Indiana
Villanova
Drake
Santa Clara
McNeese
Nebraska
Liberty
Pittsburgh
UC Irvine
Iowa
San Francisco
North Texas
Penn State
Dayton
George Mason
Wake Forest
USC
UCF
Yale
Utah
Arizona State
Kansas State
Saint Joseph's
Rutgers
TCU
Nevada
Bradley
Stanford
High Point
Butler
Lipscomb
Oregon State
Colorado
South Carolina
Georgetown
LSU
Florida State
Akron
Grand Canyon
Northern Iowa
Arkansas State
Minnesota
Oklahoma State
Saint Bonaventure
UNLV
Tr

Unnamed: 0,Team,NET Rank,SOS Rank,Non-Conf SOS Rank,kpi,sor,wab,bpi,kenpom,trank,q1_wins,q1_losses,q2_wins,q2_losses,q3_wins,q3_losses,q4_wins,q4_losses
0,Duke,1,57,12,6,4,6,1,1,2,9,3,7,0,10,0,5,0
1,Auburn,2,2,8,1,1,1,3,4,3,16,5,6,0,2,0,4,0
2,Houston,3,24,74,5,2,2,2,3,1,14,3,6,1,5,0,5,0
3,Florida,4,23,262,3,3,3,4,2,4,11,4,9,0,4,0,6,0
4,Tennessee,5,8,129,4,5,5,5,5,6,11,7,5,0,5,0,6,0
5,Alabama,6,1,9,2,6,4,6,6,5,11,8,8,0,4,0,2,0
6,Texas Tech,7,47,296,21,10,15,8,7,7,10,5,3,3,6,0,6,0
7,Gonzaga,8,73,30,26,39,35,10,9,11,5,5,5,3,5,0,10,0
8,Iowa State,9,36,94,23,13,18,7,10,8,10,7,6,2,3,0,5,0
9,Maryland,10,52,327,17,18,19,13,13,10,8,7,6,1,3,0,8,0


Saved 2025_men.csv
