Web scraper using Beautiful Soup to extract the stats tables from saved teamrankings.com HTML pages and write each one to a CSV file.


In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import csv
import os

# Directory holding the saved HTML pages. The generated CSV files are written
# next to them, into this same directory.
directory_path = '../data/html_teamranking/'

# Iterate in sorted order so the processing log is reproducible
# (os.listdir returns entries in arbitrary, filesystem-dependent order).
for filename in sorted(os.listdir(directory_path)):
    if not filename.endswith(".html"):
        continue  # Ignore anything that is not an HTML page (e.g. earlier CSV output)

    # Construct full file path
    file_path = os.path.join(directory_path, filename)

    # Parse the page; the handle is only needed while BeautifulSoup reads it
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # Only the first <table> element of the page is used
    table = soup.find('table')
    if not table:
        print(f"No table found in {filename}")
        continue  # Skip to next file if no table found

    # Column names: the stripped text of every <th> cell inside the table
    headers = [header.text.strip() for header in table.find_all('th')]

    # Data rows: one list of stripped cell texts per <tr>. Rows that contain
    # no <td> cells (such as a row made only of <th>) yield an empty list and
    # are skipped.
    rows = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        row_data = [cell.text.strip() for cell in cells]
        if row_data:
            rows.append(row_data)

    # Swap only the trailing extension; str.replace would also rewrite any
    # ".html" that happens to appear earlier in the file name.
    csv_filename = os.path.splitext(filename)[0] + '.csv'
    csv_path = os.path.join(directory_path, csv_filename)

    # Write UTF-8 explicitly: the input is read as UTF-8, and without an
    # encoding argument open() falls back to the platform's locale encoding,
    # which can fail on (or silently re-encode) non-ASCII text.
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)  # Write headers
        writer.writerows(rows)    # Write data rows
    print(f"Data from {filename} has been saved to {csv_filename}")

print("All files processed.")


Data from away_winLoss.html has been saved to away_winLoss.csv
Data from overallWinPercentage.html has been saved to overallWinPercentage.csv
Data from asUnderdog_winLoss.html has been saved to asUnderdog_winLoss.csv
Data from equalRest_winLoss.html has been saved to equalRest_winLoss.csv
Data from afterLoss_winLoss.html has been saved to afterLoss_winLoss.csv
Data from restAdvantage_winLoss.html has been saved to restAdvantage_winLoss.csv
Data from home_winLoss.html has been saved to home_winLoss.csv
Data from neutralSite_winLoss.html has been saved to neutralSite_winLoss.csv
Data from afterWin_winLoss.html has been saved to afterWin_winLoss.csv
All files processed.


In [2]:
import pandas as pd
import os

# Directory where your CSV files are stored
directory_path = '../data/html_teamranking/'

# Read every CSV in the directory into a dict keyed by the file name without
# its ".csv" extension. A dict replaces the previous exec()-based variable
# creation, which executed a string built from file names and therefore broke
# (or ran arbitrary code) for any name that is not a valid Python identifier.
team_tables = {}
for filename in sorted(os.listdir(directory_path)):
    if not filename.endswith(".csv"):
        continue

    # Construct full file path
    csv_path = os.path.join(directory_path, filename)

    # Read CSV into a DataFrame
    df = pd.read_csv(csv_path)
    team_tables[os.path.splitext(filename)[0]] = df

    # Print to confirm
    print(f"DataFrame created for {filename}")

# Tables the rest of the notebook refers to by name. Fail here, with a clear
# message, rather than letting a later cell work on a placeholder value.
expected_tables = (
    'afterLoss_winLoss',
    'afterWin_winLoss',
    'asUnderdog_winLoss',
    'restAdvantage_winLoss',
    'equalRest_winLoss',
    'away_winLoss',
    'home_winLoss',
    'neutralSite_winLoss',
    'overallWinPercentage',
)
missing_tables = [name for name in expected_tables if name not in team_tables]
if missing_tables:
    raise FileNotFoundError(
        f"Missing CSV file(s) in {directory_path}: "
        + ", ".join(f"{name}.csv" for name in missing_tables)
    )

# Explicit bindings keep the variable names visible to readers and linters,
# so no dummy "= 0" initialisation is needed.
afterLoss_winLoss = team_tables['afterLoss_winLoss']
afterWin_winLoss = team_tables['afterWin_winLoss']
asUnderdog_winLoss = team_tables['asUnderdog_winLoss']
restAdvantage_winLoss = team_tables['restAdvantage_winLoss']
equalRest_winLoss = team_tables['equalRest_winLoss']
away_winLoss = team_tables['away_winLoss']
home_winLoss = team_tables['home_winLoss']
neutralSite_winLoss = team_tables['neutralSite_winLoss']
overallWinPercentage = team_tables['overallWinPercentage']


DataFrame created for afterLoss_winLoss.csv
DataFrame created for asUnderdog_winLoss.csv
DataFrame created for restAdvantage_winLoss.csv
DataFrame created for overallWinPercentage.csv
DataFrame created for neutralSite_winLoss.csv
DataFrame created for equalRest_winLoss.csv
DataFrame created for away_winLoss.csv
DataFrame created for afterWin_winLoss.csv
DataFrame created for home_winLoss.csv


In [None]:
# All of these datasets range from the 2003-2004 season to the 2024-2025
# season (current). Each table is printed in the order listed below.
tables_to_show = (
    # previous game was a loss -> is the next game a win/loss?
    afterLoss_winLoss,
    # previous game was a win -> is the next game a win/loss?
    afterWin_winLoss,
    # games won/lost as the underdog, aka lower team ranking/seeding
    asUnderdog_winLoss,
    # games won/lost WITH a rest advantage, i.e. having 1+ days of rest
    # compared to the other team
    restAdvantage_winLoss,
    # games won/lost with equal days of rest compared to the other team
    # (baseline for restAdvantage)
    equalRest_winLoss,
    # games won/lost as the away team
    away_winLoss,
    # games won/lost as the home team
    home_winLoss,
    # games won/lost playing on a neutral site (might be useful to compare
    # as a baseline against home/away)
    neutralSite_winLoss,
    # total wins and losses of every game
    overallWinPercentage,
)

for table_to_show in tables_to_show:
    print(table_to_show)


            Team Win-Loss Record   Win %   MOV  ATS +/-
0     Mercyhurst           1-0-0  100.0%   5.0     12.5
1         Kansas        138-24-0   85.2%  12.5      2.3
2        Gonzaga        110-24-0   82.1%  14.9      2.0
3           Duke        123-33-0   78.9%  13.0      0.8
4       Kentucky        152-57-0   72.7%   9.3      0.6
..           ...             ...     ...   ...      ...
359  San Jose St       140-396-0   26.1%  -8.1     -1.3
360  Ark Pine Bl       137-410-0   25.1% -11.6     -0.8
361  Miss Val St       127-403-0   24.0% -11.9     -0.5
362   Chicago St       128-446-0   22.3% -12.0     -1.8
363    W Georgia           0-1-0    0.0% -23.0      0.5

[364 rows x 5 columns]
             Team Win-Loss Record  Win %   MOV  ATS +/-
0         Gonzaga       634-133-0  82.7%  13.8      1.0
1            Duke       634-144-0  81.5%  13.5      0.3
2          Kansas       616-158-0  79.6%  11.4      0.4
3        Kentucky       540-170-0  76.1%  10.0      0.4
4         Arizona       