Web scraper using Beautiful Soup to extract the table from each saved teamrankings.com HTML page


In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import csv
import os

# Directory where your HTML files are stored
directory_path = '../data/html_teamranking/'

# Convert the first <table> of every saved HTML page into a CSV with the same base name.
# sorted() makes the processing (and log) order reproducible; os.listdir order is arbitrary.
for filename in sorted(os.listdir(directory_path)):
    if not filename.endswith(".html"):
        continue

    # Construct full file path
    file_path = os.path.join(directory_path, filename)

    # Open and parse each HTML file
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # Locate the first table in the document
    table = soup.find('table')
    if table is None:
        print(f"No table found in {filename}")
        continue  # Skip to next file if no table found

    # Header cells (<th>) become the CSV header row
    headers = [header.text.strip() for header in table.find_all('th')]

    # Each <tr> that contains at least one <td> becomes a data row;
    # rows made only of <th> cells produce an empty list and are dropped.
    rows = []
    for row in table.find_all('tr'):
        row_data = [cell.text.strip() for cell in row.find_all('td')]
        if row_data:
            rows.append(row_data)

    # Swap only the trailing ".html" for ".csv" (str.replace would also rewrite
    # any earlier ".html" occurring inside the name).
    csv_filename = filename[:-len('.html')] + '.csv'
    csv_path = os.path.join(directory_path, csv_filename)

    # Write with the same encoding the HTML was read with, so non-ASCII text
    # round-trips regardless of the platform's default encoding.
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)  # Write headers
        writer.writerows(rows)    # Write data rows
    print(f"Data from {filename} has been saved to {csv_filename}")

print("All files processed.")


Data from away_winLoss.html has been saved to away_winLoss.csv
No table found in all_bbalcolleges.html
Data from overallWinPercentage.html has been saved to overallWinPercentage.csv
Data from asUnderdog_winLoss.html has been saved to asUnderdog_winLoss.csv
Data from equalRest_winLoss.html has been saved to equalRest_winLoss.csv
Data from afterLoss_winLoss.html has been saved to afterLoss_winLoss.csv
Data from restAdvantage_winLoss.html has been saved to restAdvantage_winLoss.csv
Data from home_winLoss.html has been saved to home_winLoss.csv
Data from neutralSite_winLoss.html has been saved to neutralSite_winLoss.csv
Data from afterWin_winLoss.html has been saved to afterWin_winLoss.csv
All files processed.


In [2]:
import pandas as pd
import os

# Placeholders so static analysis knows these names exist; each one is rebound
# to a DataFrame below when the matching CSV is found in the directory.
afterLoss_winLoss = 0
afterWin_winLoss = 0
asUnderdog_winLoss = 0
restAdvantage_winLoss = 0
equalRest_winLoss = 0
away_winLoss = 0
home_winLoss = 0
neutralSite_winLoss = 0
overallWinPercentage = 0

# Directory where your CSV files are stored
directory_path = '../data/html_teamranking/'

# Load every CSV in the directory into a module-level DataFrame named after the file.
# sorted() makes the load (and log) order reproducible; os.listdir order is arbitrary.
for filename in sorted(os.listdir(directory_path)):
    if not filename.endswith(".csv"):
        continue

    # Construct full file path
    csv_path = os.path.join(directory_path, filename)

    # Variable name = file name without the trailing ".csv"
    frame_name = filename[:-len('.csv')]

    # The name is bound through globals() instead of exec(): exec would run
    # whatever code a file name happens to spell out. Names that are not valid
    # identifiers could not be referenced later anyway, so they are skipped.
    if not frame_name.isidentifier():
        print(f"Skipping {filename}: '{frame_name}' is not a valid variable name")
        continue

    # Read CSV into a DataFrame
    df = pd.read_csv(csv_path)
    globals()[frame_name] = df

    # Print to confirm
    print(f"DataFrame created for {filename}")

team_expenses = pd.read_csv("../data/Sport_Data_2003_2004_2005_2006_2007_2008_2009_2010_2011_2012_2013_2014_2015_2016_2017_2018_2019_2020_2021_2022.csv")
print("DataFrame created for Sport_Data_2003_2004_2005_2006_2007_2008_2009_2010_2011_2012_2013_2014_2015_2016_2017_2018_2019_2020_2021_2022")


DataFrame created for afterLoss_winLoss.csv
DataFrame created for asUnderdog_winLoss.csv
DataFrame created for restAdvantage_winLoss.csv
DataFrame created for overallWinPercentage.csv
DataFrame created for neutralSite_winLoss.csv
DataFrame created for equalRest_winLoss.csv
DataFrame created for away_winLoss.csv
DataFrame created for afterWin_winLoss.csv
DataFrame created for home_winLoss.csv
DataFrame created for Sport_Data_2003_2004_2005_2006_2007_2008_2009_2010_2011_2012_2013_2014_2015_2016_2017_2018_2019_2020_2021_2022


In [3]:
# ---------- These datasets range from the 2003-2004 season to the 2024-2025 season (current).
# NOTE: the bare names below are evaluated but not displayed, because none of them is the
# cell's last expression; they appear to act as an annotated list of the loaded DataFrames.

# Previous game was a loss -> is the next game a win or a loss?
afterLoss_winLoss

# Previous game was a win -> is the next game a win or a loss?
afterWin_winLoss

# Games won/lost as the underdog, i.e. the team with the lower ranking/seeding
asUnderdog_winLoss

# Games won/lost WITH a rest advantage, i.e. having 1+ more days of rest than the other team
restAdvantage_winLoss

# Games won/lost with equal days of rest compared to the other team (baseline for restAdvantage)
equalRest_winLoss

# Games won/lost as the away team
away_winLoss

# Games won/lost as the home team
home_winLoss

# Games won/lost playing on a neutral site (might be useful as a baseline against home/away)
neutralSite_winLoss

# Total wins and losses over every game
overallWinPercentage

# Subset of team_expenses: the identifier columns plus the "Expenses Men's Team" column
bball_expenses = team_expenses[["Survey Year", "UNITID", "OPE ID", "Institution Name", "State CD", "Expenses Men's Team"]]


In [4]:
!pip install fuzzywuzzy[speedup]



In [5]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def custom_score(str1, str2):
    """
    Score two strings with ``fuzz.partial_ratio`` minus a word-count penalty.

    The penalty is 2 points for every word of difference between the
    whitespace-split word counts of ``str1`` and ``str2``. The final value is
    clamped to the 0-100 range.
    """
    # Lenient substring-style similarity comes first
    base = fuzz.partial_ratio(str1, str2)

    # How many more words one string has than the other
    words_first = len(str1.split())
    words_second = len(str2.split())
    missing_words = abs(words_first - words_second)

    # Two points off per extra/missing word
    penalised = base - 2 * missing_words

    # Clamp into [0, 100]
    if penalised > 100:
        return 100
    if penalised < 0:
        return 0
    return penalised

def find_best_match_between_dfs(df1, column1, df2, column2, threshold=80):
    """
    Find the best fuzzy match for each item in `column1` of `df1` among the values of `column2` in `df2`.

    Parameters
    ----------
    df1 : DataFrame holding the names to look up.
    column1 : name of the column in `df1` to iterate over.
    df2 : DataFrame holding the candidate names.
    column2 : name of the column in `df2` to match against.
    threshold : minimum `custom_score` a match needs in order to be reported (default 80).

    Returns
    -------
    DataFrame with one row per row of `df1[column1]` (duplicates kept, order kept) and
    the columns `column1`, `Best_Match_in_<column2>` and `Match_Score_in_<column2>`.
    The last two are None when no candidate reaches `threshold`, when there are no
    candidates, or when the item is not a string (e.g. NaN).
    """
    best_col = f'Best_Match_in_{column2}'
    score_col = f'Match_Score_in_{column2}'

    # Unique, non-null candidates in first-seen order. Keeping first-seen order
    # preserves which candidate wins a tie; dropping NaN avoids feeding
    # non-strings to the scorer.
    choices = df2[column2].dropna().unique().tolist()

    # Names in df1 may repeat many times; each distinct name is searched once.
    cache = {}
    matches = []

    for item in df1[column1]:
        if isinstance(item, str):
            if item not in cache:
                match_data = process.extractOne(item, choices, scorer=custom_score)
                # match_data is None when there are no candidates; otherwise
                # element 0 is the matched string and element 1 its score.
                # Explicit None check so a legitimate score of 0 is not
                # mistaken for "no score" when threshold <= 0.
                if match_data is not None and match_data[1] >= threshold:
                    cache[item] = (match_data[0], match_data[1])
                else:
                    cache[item] = (None, None)
            best_match, score = cache[item]
        else:
            # Missing / non-text value: nothing sensible to match
            best_match, score = None, None

        matches.append({
            column1: item,
            best_col: best_match,
            score_col: score,
        })

    # Explicit columns keep the schema stable even when df1 has no rows
    return pd.DataFrame(matches, columns=[column1, best_col, score_col])

# Fuzzy-match every "Institution Name" in bball_expenses against the "Team" names in
# afterLoss_winLoss, using the function's default threshold.
result_df = find_best_match_between_dfs(bball_expenses, 'Institution Name', afterLoss_winLoss, 'Team')


In [None]:
# Plain-text dump of the match table (long frames are truncated in the printed repr)
print(result_df)
# team_names is a second name for the same DataFrame object as result_df (no copy is made)
team_names = result_df

# team_names relates each "Institution Name" from the bball_expenses DataFrame to a
# teamrankings "Team" name (here taken from afterLoss_winLoss).

# --------------- CAUTION: these matches were generated automatically and are NOT fully
# reliable; some of the relations are wrong. Please help check and edit them by hand.

                         Institution Name Best_Match_in_Team  \
0            Abilene Christian University      Hsn Christian   
1                Alabama A & M University            Alabama   
2                Alabama State University         Alabama St   
3                 Alcorn State University          Alcorn St   
4                     American University           American   
...                                   ...                ...   
6970                  Winthrop University           Winthrop   
6971  Wright State University-Main Campus          Wright St   
6972                    Xavier University             Xavier   
6973                      Yale University               Yale   
6974          Youngstown State University           NC State   

      Match_Score_in_Team  
0                    83.0  
1                    94.0  
2                    98.0  
3                    98.0  
4                    98.0  
...                   ...  
6970                 98.0  
6971   