In [145]:

from bs4 import BeautifulSoup
import json
import requests

def parse_html(html):
    """Extract tournament schedules from a page of ``tablepress`` tables.

    Every ``<table class="tablepress">`` is treated as one tournament. Its
    title is the text of the closest preceding ``<h2>``; the first ``<tr>`` is
    skipped as the header row and each remaining row becomes one game.

    Parameters
    ----------
    html : str
        Raw HTML markup to parse.

    Returns
    -------
    list of dict
        One ``{'tournament': str, 'games': list}`` entry per table. Each game
        is a dict with the keys ``game``, ``time``, ``road_team``,
        ``road_score``, ``home_team`` and ``home_score``; every value is the
        whitespace-stripped text of the corresponding ``<td>`` cell.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Column order of the <td> cells in every data row.
    fields = ('game', 'time', 'road_team', 'road_score', 'home_team', 'home_score')

    tournaments = []

    for table in soup.find_all('table', class_='tablepress'):
        # find_previous() returns None when no <h2> precedes the table; fall
        # back to an empty title instead of raising AttributeError.
        heading = table.find_previous('h2')
        tournament_title = heading.get_text(strip=True) if heading is not None else ''

        games = []
        # rows[0] is the header row, so data starts at index 1.
        for row in table.find_all('tr')[1:]:
            cols = row.find_all('td')
            # Rows without the full set of cells (spacer rows, colspan notes,
            # <th>-only rows) cannot be mapped onto the fields; skip them
            # rather than fail with IndexError.
            if len(cols) < len(fields):
                continue
            games.append(
                {name: cell.get_text(strip=True) for name, cell in zip(fields, cols)}
            )

        tournaments.append({
            'tournament': tournament_title,
            'games': games
        })

    return tournaments



In [146]:


urlPath = 'https://d1baseball.com/college-baseball-tournament-central/'

# Request the url and return the body as a string.
# - timeout: without one, requests can block forever on a stalled connection.
# - raise_for_status: fail loudly on 4xx/5xx instead of parsing an error page
#   and silently writing an empty result file.
response = requests.get(urlPath, timeout=30)
response.raise_for_status()
html = response.text

## Pass html to the parse_html function and return the results

tournaments = parse_html(html)

## Write the results to a text file as JSON

# json.dump emits ASCII-only output by default; the explicit encoding just
# keeps the file independent of the platform's default codec.
with open('tournaments.txt', 'w', encoding='utf-8') as f:
    json.dump(tournaments, f)




In [147]:
import pandas as pd
import json

# Read back the scraped tournaments that were dumped as JSON
with open('tournaments.txt') as f:
    raw_data = json.load(f)

# Fields copied from every game, in output-column order
game_fields = ('game', 'time', 'road_team', 'road_score', 'home_team', 'home_score')
# Fields where an empty string means "no score yet" and is stored as None
score_fields = ('road_score', 'home_score')

# Flatten the nested structure: one record per game, tagged with its tournament
cleaned_data = []
for entry in raw_data:
    label = entry['tournament']
    for match in entry['games']:
        record = {'tournament': label}
        for field in game_fields:
            value = match[field]
            if field in score_fields and value == '':
                value = None
            record[field] = value
        cleaned_data.append(record)

# Build the working DataFrame from the flat records
df = pd.DataFrame(cleaned_data)

# Keep a CSV snapshot of the untransformed rows
df.to_csv('data/NCAA_D1/cleaned_tournaments.csv', index=False)


In [148]:
## Look at the 'game' column, extract any text that is surrounded by parentheses
## into a column called 'loc_1', and delete it from the 'game' column

df['loc_1'] = df['game'].str.extract(r'\((.*?)\)')
# regex=True is required: the default for Series.str.replace changed to a
# literal match in pandas 2.0, which would leave the parenthesised text in place.
df['game'] = df['game'].str.replace(r'\(.*?\)', '', regex=True)





  df['game'] = df['game'].str.replace(r'\(.*?\)', '')


In [149]:
## Delete everything but numerical values from the 'game' column

# regex=True is required: with a literal match (the pandas 2.0 default) the
# two-character text "\D" would be searched for and nothing would be removed.
df['game'] = df['game'].str.replace(r'\D', '', regex=True)


  df['game'] = df['game'].str.replace(r'\D', '')


In [150]:
## Strip the word "Tournament" out of every title, then expose the column as 'conference'

conference_labels = df['tournament'].str.replace('Tournament', '')
df['tournament'] = conference_labels
df.rename(columns={'tournament': 'conference'}, inplace=True)


In [151]:
# Extract the date (first "digits/digits" token) from the time column
df['date'] = df['time'].str.extract(r'(\d+/\d+)')

# Remove that date token from the time column.
# regex=True is required: the default for Series.str.replace changed to a
# literal match in pandas 2.0, which would leave the date in place.
df['time'] = df['time'].str.replace(r'\d+/\d+', '', regex=True)

# Keep only the clock time (first "digits:digits" token); rows without one become NaN
df['time'] = df['time'].str.extract(r'(\d+:\d+)')


  df['time'] = df['time'].str.replace(r'\d+/\d+', '')


In [152]:
# ## Create a list of all the conference names used in the other data sets
# (reference only: the list below is commented out and is never executed)
# conf_full_names = ['America East Conference','American Athletic Conference','Atlantic 10 Conference','Atlantic Coast Conference','Atlantic Sun Conference',
#     'Big 12 Conference','Big East Conference','Big South Conference','Big Ten Conference','Big West Conference','Colonial Athletic Association','Conference USA','Horizon League','Ivy League','Metro Atlantic Athletic Conference','Mid-American Conference',
#     'Mid-Eastern Athletic Conference','Missouri Valley Conference','Mountain West Conference','Northeast Conference','Ohio Valley Conference','Pacific-12 Conference',
#     'Patriot League','Southeastern Conference','Southern Conference','Southland Conference','Southwest Athletic Conference','Sun Belt Conference','West Coast Conference',
#     'Western Athletic Conference']

# Summary of df (count / unique / top / freq per column), shown as the cell output
df.describe()

Unnamed: 0,conference,game,time,road_team,road_score,home_team,home_score,loc_1,date
count,161,161,161,161,96,161,96,157,161
unique,29,20,26,112,19,136,20,30,10
top,ACC,1,3:00,TBA,4,Army,5,"Durham, NC",05/24
freq,12,27,21,25,12,5,9,12,73


In [153]:
# Define a dictionary mapping the short conference labels scraped from the
# page to the full names used in the other data sets.
# NOTE(review): 'Southwest Athletic Conference' is kept verbatim because it has
# to match the other data sets -- confirm that spelling is the one they use.
conference_name_map = {
    'ACC': 'Atlantic Coast Conference',
    'America East': 'America East Conference',
    'Atlantic 10': 'Atlantic 10 Conference',
    'Atlantic Sun': 'Atlantic Sun Conference',
    'Big 12': 'Big 12 Conference',
    'Big East': 'Big East Conference',
    'Big South': 'Big South Conference',
    'Big Ten': 'Big Ten Conference',
    'Big West': 'Big West Conference',
    'Colonial': 'Colonial Athletic Association',
    'Conference USA': 'Conference USA',
    'Horizon': 'Horizon League',
    'Ivy League': 'Ivy League',
    'Metro Atlantic': 'Metro Atlantic Athletic Conference',
    'Mid-American': 'Mid-American Conference',
    'MEAC': 'Mid-Eastern Athletic Conference',
    'Missouri Valley': 'Missouri Valley Conference',
    'Mountain West': 'Mountain West Conference',
    'Northeast': 'Northeast Conference',
    'Ohio Valley': 'Ohio Valley Conference',
    'Pac-12': 'Pacific-12 Conference',
    'Patriot': 'Patriot League',
    'SEC': 'Southeastern Conference',
    'SoCon': 'Southern Conference',
    'Southland': 'Southland Conference',
    'Southwestern Athletic': 'Southwest Athletic Conference',
    'Sun Belt': 'Sun Belt Conference',
    'West Coast': 'West Coast Conference',
    'Western Athletic': 'Western Athletic Conference'
}

# Normalise the scraped labels (lower case, no surrounding whitespace) so the
# lookup does not depend on capitalisation or on the space left behind when
# "Tournament" was removed.
conference_key = df['conference'].str.lower().str.strip()

# Same normalisation for the dictionary keys.
conference_name_map_lower = {k.lower().strip(): v for k, v in conference_name_map.items()}

# Look up the full name. Mapped names are used verbatim: title-casing them
# afterwards would turn 'Conference USA' into 'Conference Usa'. Only labels
# with no mapping fall back to a title-cased version of the scraped text.
mapped_name = conference_key.map(conference_name_map_lower)
conference_full = mapped_name.where(mapped_name.notna(), conference_key.str.title())

# Replace the short label with the full name and put the columns in their
# final order. .copy() makes the result an independent frame so that later
# column assignments do not trigger chained-assignment warnings.
df = df.drop(columns=['conference'])
df['conference'] = conference_full
df = df[['date', 'time', 'conference', 'game', 'loc_1', 'road_team', 'road_score', 'home_team', 'home_score']].copy()






In [154]:
## change date column to something more pleasing to the eye
## want it in this type of format: Tuesday, May 23

import pandas as pd

# The scraped dates are month/day only, so append the season year before parsing.
parsed_dates = pd.to_datetime(df['date'] + '/2023', format='%m/%d/%Y')
df['date'] = parsed_dates.dt.strftime('%A, %B %d')

# Heuristic from the original notebook: start times in the 10 and 11 o'clock
# hours are mornings, every other start time is treated as afternoon/evening.
# na=False keeps the mask strictly boolean: a missing time would otherwise put
# NaN in the mask, which boolean indexing and `~` both reject.
is_morning = df['time'].str.startswith(('10:', '11:'), na=False)

# Add the suffix in a single pass; rows with a missing time stay missing.
time_suffix = is_morning.map({True: ' am', False: ' pm'})
df['time'] = df['time'] + time_suffix






In [156]:
# df.info()
# Only the last expression of a cell is rendered, so the describe() result on
# the next line is computed and then discarded; the visible output is head(20).
df.describe()
df.head(20)

# df['time'].tolist()


Unnamed: 0,date,time,conference,game,loc_1,road_team,road_score,home_team,home_score
0,"Tuesday, May 23",11:00 am,Atlantic Coast Conference,1,"Durham, NC",Virginia Tech,7.0,Boston College,11.0
1,"Tuesday, May 23",3:30 pm,Atlantic Coast Conference,2,"Durham, NC",Georgia Tech,5.0,North Carolina,11.0
2,"Tuesday, May 23",7:00 pm,Atlantic Coast Conference,3,"Durham, NC",NC State,8.0,Duke,7.0
3,"Wednesday, May 24",11:00 am,Atlantic Coast Conference,4,"Durham, NC",Pittsburgh,9.0,Notre Dame,5.0
4,"Wednesday, May 24",3:00 pm,Atlantic Coast Conference,5,"Durham, NC",Virginia,15.0,Georgia Tech,1.0
5,"Wednesday, May 24",7:00 pm,Atlantic Coast Conference,6,"Durham, NC",Virginia Tech,,Clemson,
6,"Thursday, May 25",11:00 am,Atlantic Coast Conference,7,"Durham, NC",Pittsburgh,,Wake Forest,
7,"Thursday, May 25",3:00 pm,Atlantic Coast Conference,8,"Durham, NC",North Carolina,,Virginia,
8,"Thursday, May 25",7:00 pm,Atlantic Coast Conference,9,"Durham, NC",NC State,,Miami,
9,"Friday, May 26",11:00 am,Atlantic Coast Conference,10,"Durham, NC",Boston College,,Clemson,


In [157]:
## CSV copy of the cleaned schedule, used to eyeball the data

df.to_csv(path_or_buf='data/NCAA_D1/JSON_tournaments.csv', index=False)

## JSON copy (one object per row) consumed by the ncaa map

df.to_json(path_or_buf='data/NCAA_D1/tournaments.json', orient='records')

## Image Editing

In [52]:
## Crop the conference logo images so they don't have so much padding

from PIL import Image
import os

# NOTE(review): machine-specific absolute path; anyone else running this cell
# has to point it at their own copy of the logo folder.
folder_path = "C:\\Users\\Justin\\Desktop\\Project\\BB_parks\\data\\NCAA_D1\\assests\\conf_logos\\"

# Iterate over the files in the folder
for filename in os.listdir(folder_path):
    # Case-insensitive check so files such as "LOGO.PNG" are not skipped.
    if not filename.lower().endswith((".png", ".jpg")):
        continue

    file_path = os.path.join(folder_path, filename)

    # The context manager guarantees the source file handle is released before
    # the same path is overwritten below.
    with Image.open(file_path) as image:
        # Bounding box of the non-empty region as reported by Pillow.
        # NOTE(review): whether transparency alone defines "empty" depends on
        # the image mode and the installed Pillow version -- confirm for the
        # logo files in use.
        bbox = image.getbbox()

        # None means the image is completely empty; a box covering the whole
        # canvas means there is nothing to trim. Skip both so untouched files
        # are not rewritten (and JPEGs are not re-encoded on every run).
        if bbox is None or bbox == (0, 0) + image.size:
            continue

        # Crop the image based on the bounding box
        cropped_image = image.crop(bbox)

    # Save the cropped image, overwriting the original file
    cropped_image.save(file_path)


In [46]:
# Show the column labels of df as the cell output
df.columns

Index(['date', 'time', 'conference', 'game', 'loc_1', 'road_team',
       'road_score', 'home_team', 'home_score'],
      dtype='object')