web scraper using beautiful soup to get the table from teamrankings.com


In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import csv
import os

# Directory where your HTML files are stored
directory_path = '../data/html_teamranking/'

# Loop through each file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".html"):
        # Construct full file path
        file_path = os.path.join(directory_path, filename)
        
        # Open and parse each HTML file
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
        
        # Locate the table
        table = soup.find('table')
        if not table:
            print(f"No table found in {filename}")
            continue  # Skip to next file if no table found

        # Extract headers
        headers = [header.text.strip() for header in table.find_all('th')]

        # Extract rows
        rows = []
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            row_data = [cell.text.strip() for cell in cells]
            if row_data:
                rows.append(row_data)
        
        # Save data to a new CSV file
        csv_filename = filename.replace('.html', '.csv')
        csv_path = os.path.join(directory_path, csv_filename)
        
        with open(csv_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(headers)  # Write headers
            writer.writerows(rows)    # Write data rows
        print(f"Data from {filename} has been saved to {csv_filename}")

print("All files processed.")


Data from afterLoss_winLoss.html has been saved to afterLoss_winLoss.csv
Data from afterWin_winLoss.html has been saved to afterWin_winLoss.csv
No table found in all_bbalcolleges.html
Data from asUnderdog_winLoss.html has been saved to asUnderdog_winLoss.csv
Data from away_winLoss.html has been saved to away_winLoss.csv
Data from equalRest_winLoss.html has been saved to equalRest_winLoss.csv
Data from home_winLoss.html has been saved to home_winLoss.csv
Data from neutralSite_winLoss.html has been saved to neutralSite_winLoss.csv
Data from overallWinPercentage.html has been saved to overallWinPercentage.csv
Data from restAdvantage_winLoss.html has been saved to restAdvantage_winLoss.csv
All files processed.


In [2]:
import pandas as pd
import os

#initializing so there's no yellow lines lol
afterLoss_winLoss = 0
afterWin_winLoss = 0
asUnderdog_winLoss = 0
restAdvantage_winLoss = 0
equalRest_winLoss = 0
away_winLoss = 0
home_winLoss = 0
neutralSite_winLoss = 0
overallWinPercentage=0

# Directory where your CSV files are stored
directory_path = '../data/html_teamranking/'

# Loop through each CSV file in the directory and create DataFrames
for filename in os.listdir(directory_path):
    if filename.endswith(".csv"):
        # Construct full file path
        csv_path = os.path.join(directory_path, filename)
        
        # Read CSV into a DataFrame
        df = pd.read_csv(csv_path)
        
        # Use filename as the name of the DataFrame (without ".csv")
        exec(f"{filename.replace('.csv', '')} = df")
        
        # Print to confirm
        print(f"DataFrame created for {filename}")

team_expenses = pd.read_csv("../data/Sport_Data_2003_2004_2005_2006_2007_2008_2009_2010_2011_2012_2013_2014_2015_2016_2017_2018_2019_2020_2021_2022.csv")
print(f"DataFrame created for Sport_Data_2003_2004_2005_2006_2007_2008_2009_2010_2011_2012_2013_2014_2015_2016_2017_2018_2019_2020_2021_2022")


DataFrame created for afterLoss_winLoss.csv
DataFrame created for afterWin_winLoss.csv
DataFrame created for asUnderdog_winLoss.csv
DataFrame created for away_winLoss.csv
DataFrame created for equalRest_winLoss.csv
DataFrame created for home_winLoss.csv
DataFrame created for neutralSite_winLoss.csv
DataFrame created for overallWinPercentage.csv
DataFrame created for restAdvantage_winLoss.csv
DataFrame created for Sport_Data_2003_2004_2005_2006_2007_2008_2009_2010_2011_2012_2013_2014_2015_2016_2017_2018_2019_2020_2021_2022


In [3]:
#----------these datasets range from the 2003-2004 season to 2024-2025 season (current)

#previous game was a loss -> is the next game a win/loss?
afterLoss_winLoss

#previous game was a win -> is the next game a win/loss?
afterWin_winLoss

#games won/lost as the underdog aka lower team ranking/seeding
asUnderdog_winLoss

#games won/lost WITH a rest advantage, or having 1+ days of rest compared to the other team
restAdvantage_winLoss

#games won/lost with equal days rest compared to the other team (baseline for restAdvantange)
equalRest_winLoss

#games won/lost as the away team
away_winLoss

#games won/lost as the home team
home_winLoss

#games won/lost playing on a neutral site (might be useful to compare as baseline agasint home/away)
neutralSite_winLoss

#total wins and losses of every game
overallWinPercentage

#total team expenses 
bball_expenses = team_expenses[["Survey Year", "UNITID", "OPE ID", "Institution Name", "State CD", "Expenses Men's Team"]]

In [4]:
!pip install "fuzzywuzzy[speedup]"

Collecting fuzzywuzzy[speedup]
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-levenshtein>=0.12 (from fuzzywuzzy[speedup])
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading levenshtein-0.26.1-cp310-cp310-win_amd64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading rapidfuzz-3.10.1-cp310-cp310-win_amd64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp310-cp310-win_amd64.whl (98 kB)
   ---------------------------------------- 98.1/98.1 kB 2.7 MB/s eta 0:00:00
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Downloading rapidfuzz-3.10.1-cp310-cp310-win_amd64.whl (1.6 MB)
   ---------------------------------------- 1.6/1.6 MB 12.8 MB/s eta 0:00:00
Installin


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: C:\Users\14253\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [5]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def custom_score(str1, str2):
    """
    Custom scoring function that penalizes missing words less.
    Uses fuzz.partial_ratio for lenient matching, with extra weight for fewer missing words.
    """
    # First get the basic partial match score
    partial_score = fuzz.partial_ratio(str1, str2)
    
    # Adjust the score based on the length of the two strings (i.e., fewer words missing is better)
    word_diff_penalty = abs(len(str1.split()) - len(str2.split()))
    
    # Optionally, add a small penalty for missing words
    adjusted_score = partial_score - (word_diff_penalty * 2)  # Adjust the factor for leniency
    
    # Ensure the score stays within 0-100
    return max(min(adjusted_score, 100), 0)

def find_best_match_between_dfs(df1, column1, df2, column2, threshold=80):
    """
    Finds the best fuzzy match for each item in `column1` from `df1` within the entire `column2` from `df2`.
    Returns a new DataFrame with original names, best matches, and scores using custom scoring for partial matching.
    """
    matches = []
    
    for item in df1[column1]:
        # Extract the best match and custom score
        match_data = process.extractOne(item, df2[column2], scorer=lambda str1, str2: custom_score(str1, str2))
        if match_data:
            best_match = match_data[0]  # Best match string
            score = match_data[1]       # Similarity score
        else:
            best_match = None
            score = None
        
        # Append results if they meet the threshold
        matches.append({
            column1: item,
            f'Best_Match_in_{column2}': best_match if score and score >= threshold else None,
            f'Match_Score_in_{column2}': score if score and score >= threshold else None
        })
    
    # Create and return a new DataFrame with matches and scores
    return pd.DataFrame(matches)

result_df = find_best_match_between_dfs(bball_expenses, 'Institution Name', afterLoss_winLoss, 'Team')


In [6]:
print(result_df)
team_names = result_df

#this team_names dataset is relating the "Institution Name" from the ball_expenses dataframe to the 
# teamranking ones ie afterLoss_winLoss "Team" names

#---------------THIS TEAM_NAMES IS ALSO NOT PERFECT SO THERE ARE WRONG RELATIONS AND THINGS BC I AUTOMATED, PLEASE HELP CHECK AND EDIT

                         Institution Name Best_Match_in_Team  \
0            Abilene Christian University      Hsn Christian   
1                Alabama A & M University            Alabama   
2                Alabama State University         Alabama St   
3                 Alcorn State University          Alcorn St   
4                     American University           American   
...                                   ...                ...   
6970                  Winthrop University           Winthrop   
6971  Wright State University-Main Campus          Wright St   
6972                    Xavier University             Xavier   
6973                      Yale University               Yale   
6974          Youngstown State University           NC State   

      Match_Score_in_Team  
0                    83.0  
1                    94.0  
2                    98.0  
3                    98.0  
4                    98.0  
...                   ...  
6970                 98.0  
6971   

In [7]:
dfs = {
    "afterLoss_winLoss": afterLoss_winLoss,
    "afterWin_winLoss": afterWin_winLoss,
    "asUnderdog_winLoss": asUnderdog_winLoss,
    "restAdvantage_winLoss": restAdvantage_winLoss,
    "equalRest_winLoss": equalRest_winLoss,
    "away_winLoss": away_winLoss,
    "home_winLoss": home_winLoss,
    "neutralSite_winLoss": neutralSite_winLoss,
    "overallWinPercentage": overallWinPercentage
}

# Rename the win percentage column for each DataFrame
for name, df in dfs.items():
    # Replace 'Win %' with the formatted name, e.g., 'afterLoss_winLoss_win_percent'
    df.rename(columns={"Win %": f"{name}_Win_percent"}, inplace=True)
    df.rename(columns={"MOV": f"{name}_MOV"}, inplace=True)
    df.rename(columns={"ATS +/-": f"{name}_ATS"}, inplace=True)
    df.drop('Win-Loss Record', axis=1, inplace=True)

In [13]:
display(afterLoss_winLoss.head())

Unnamed: 0,Team,afterLoss_winLoss_Win_percent,afterLoss_winLoss_MOV,afterLoss_winLoss_ATS
0,Mercyhurst,100.0%,5.0,12.5
1,Kansas,85.2%,12.5,2.3
2,Gonzaga,82.1%,14.9,2.0
3,Duke,78.9%,13.0,0.8
4,Kentucky,72.7%,9.3,0.6


In [8]:
#merge win % datasets

from functools import reduce

merged_win_loss_stats = reduce(lambda left, right: pd.merge(left, right, on="Team", how="outer"), dfs.values())

In [9]:
# Create a new DataFrame that only includes the columns with 'Win_percent' in their name
win_percent_columns = [col for col in merged_win_loss_stats.columns if 'Win_percent' in col]

win_percent_df = merged_win_loss_stats[['Team'] + win_percent_columns]
print(win_percent_df)

            Team afterLoss_winLoss_Win_percent afterWin_winLoss_Win_percent  \
0     Mercyhurst                        100.0%                          NaN   
1         Kansas                         85.2%                        79.6%   
2        Gonzaga                         82.1%                        82.7%   
3           Duke                         78.9%                        81.5%   
4       Kentucky                         72.7%                        76.1%   
..           ...                           ...                          ...   
359  San Jose St                         26.1%                        37.5%   
360  Ark Pine Bl                         25.1%                        41.6%   
361  Miss Val St                         24.0%                        51.2%   
362   Chicago St                         22.3%                        31.5%   
363    W Georgia                          0.0%                          NaN   

    asUnderdog_winLoss_Win_percent restAdvantage_wi

In [15]:
bball_expenses_2022 = bball_expenses[bball_expenses['Survey Year'] == 2022]
print(bball_expenses_2022.columns)

Index(['Survey Year', 'UNITID', 'OPE ID', 'Institution Name', 'State CD',
       'Expenses Men's Team'],
      dtype='object')


In [11]:
# Step 1: Merge `bball_expenses` with `result_df` on "Institution Name"
merged_df = pd.merge(win_percent_df, team_names, left_on="Team", right_on="Best_Match_in_Team", how="left")
merged_df = merged_df.drop_duplicates(subset=["Institution Name"])
combined_df = pd.merge(merged_df, bball_expenses_2022, left_on="Team", right_on="Institution Name", how="outer")
combined_df = combined_df.drop_duplicates(subset=["Team"])

print(combined_df)

            Team afterLoss_winLoss_Win_percent afterWin_winLoss_Win_percent  \
0     Mercyhurst                        100.0%                          NaN   
1         Kansas                         85.2%                        79.6%   
7        Gonzaga                         82.1%                        82.7%   
8           Duke                         78.9%                        81.5%   
9       Kentucky                         72.7%                        76.1%   
..           ...                           ...                          ...   
374  Delaware St                         28.5%                        49.8%   
375  Kennesaw St                         27.2%                        41.3%   
376  San Jose St                         26.1%                        37.5%   
377   Chicago St                         22.3%                        31.5%   
380          NaN                           NaN                          NaN   

    asUnderdog_winLoss_Win_percent restAdvantage_wi

In [24]:
display(merged_win_loss_stats)

Unnamed: 0,Team,afterLoss_winLoss_Win_percent,afterLoss_winLoss_MOV,afterLoss_winLoss_ATS,afterWin_winLoss_Win_percent,afterWin_winLoss_MOV,afterWin_winLoss_ATS,asUnderdog_winLoss_Win_percent,asUnderdog_winLoss_MOV,asUnderdog_winLoss_ATS,...,away_winLoss_ATS,home_winLoss_Win_percent,home_winLoss_MOV,home_winLoss_ATS,neutralSite_winLoss_Win_percent,neutralSite_winLoss_MOV,neutralSite_winLoss_ATS,overallWinPercentage_Win_percent,overallWinPercentage_MOV,overallWinPercentage_ATS
0,Mercyhurst,100.0%,5.0,12.5,,,,50.0%,-6.0,7.3,...,7.3,,,,0.0%,-22.0,--,25.0%,-12.8,7.3
1,Kansas,85.2%,12.5,2.3,79.6%,11.4,0.4,42.7%,-3.8,-0.2,...,-0.3,93.3%,18.0,1.3,74.9%,9.2,+0.8,80.7%,11.8,0.7
2,Gonzaga,82.1%,14.9,2.0,82.7%,13.8,1.0,40.5%,-2.7,2.1,...,1.3,93.2%,21.9,1.5,73.4%,7.3,+0.5,82.8%,14.2,1.2
3,Duke,78.9%,13.0,0.8,81.5%,13.5,0.3,43.3%,-1.1,2.9,...,0.2,91.7%,20.3,1.2,80.2%,9.7,-0.3,81.5%,13.9,0.5
4,Kentucky,72.7%,9.3,0.6,76.1%,10.0,0.4,40.0%,-2.4,2.0,...,1.0,88.0%,15.7,0.0,69.2%,7.0,+0.4,75.5%,10.1,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,San Jose St,26.1%,-8.1,-1.3,37.5%,-3.5,-0.1,17.5%,-11.1,-0.7,...,-0.7,44.0%,-1.0,-1.2,28.4%,-5.9,-1.0,30.7%,-6.3,-0.9
360,Ark Pine Bl,25.1%,-11.6,-0.8,41.6%,-3.9,-0.9,12.8%,-15.9,-1.4,...,-0.7,47.9%,0.8,0.4,39.6%,-6.3,-1.8,29.2%,-9.9,-0.6
361,Miss Val St,24.0%,-11.9,-0.5,51.2%,0.2,-1.8,9.4%,-19.5,-1.2,...,-2.3,56.2%,2.6,1.2,31.3%,-7.4,0.0,31.7%,-8.8,-1.2
362,Chicago St,22.3%,-12.0,-1.8,31.5%,-7.0,0.0,12.3%,-18.0,-1.4,...,-2.0,44.1%,-1.1,-1.6,32.2%,-5.2,+0.3,24.8%,-10.8,-1.6


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

for col in merged_win_loss_stats.columns:
    if merged_win_loss_stats[col].dtype == object and merged_win_loss_stats[col].str.contains('%').any():
        merged_win_loss_stats[col] = merged_win_loss_stats[col].apply(lambda x: float(x.strip('%')) / 100 if isinstance(x, str) else x)

X = merged_win_loss_stats.drop(columns=['overallWinPercentage_Win_percent'])
y = merged_df['overallWinPercentage_Win_percent']

data = pd.concat([X, y], axis = 1)
data.dropna(inplace=True)

X= data.drop(columns=['overallWinPercentage_Win_percent', 'Team'])
y= data['overallWinPercentage_Win_percent']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

ValueError: could not convert string to float: 'Buffalo'