In [114]:

from bs4 import BeautifulSoup

def parse_html(html):
    """Extract tournament schedules from the tournament page markup.

    Every ``<table class="tablepress">`` is treated as one tournament whose
    title is the text of the nearest preceding ``<h2>``. The first ``<tr>``
    of each table is skipped and every remaining row that has at least six
    ``<td>`` cells becomes one game record.

    Parameters
    ----------
    html : str
        Raw HTML markup of the page.

    Returns
    -------
    list of dict
        One ``{'tournament': str, 'games': list}`` entry per table, in
        document order. Each game is a dict with the keys ``game``, ``time``,
        ``road_team``, ``road_score``, ``home_team`` and ``home_score``; all
        values are whitespace-stripped strings (empty when the cell is empty).
        The title is ``''`` when no ``<h2>`` precedes the table.
    """
    fields = ('game', 'time', 'road_team', 'road_score', 'home_team', 'home_score')
    soup = BeautifulSoup(html, 'html.parser')

    tournaments = []
    for table in soup.find_all('table', class_='tablepress'):
        # find_previous returns None when nothing matches; guard it instead
        # of crashing with AttributeError on a table that has no heading.
        heading = table.find_previous('h2')
        tournament_title = heading.get_text(strip=True) if heading is not None else ''

        games = []
        # The first row is skipped (presumably the column-header row).
        for row in table.find_all('tr')[1:]:
            cols = row.find_all('td')
            if len(cols) < len(fields):
                # Spacer / colspan / <th>-only rows do not describe a game;
                # indexing them used to raise IndexError.
                continue
            # zip stops after the six named fields, so extra cells are ignored.
            games.append({
                field: col.get_text(strip=True)
                for field, col in zip(fields, cols)
            })

        tournaments.append({
            'tournament': tournament_title,
            'games': games,
        })

    return tournaments



In [115]:
import json

import requests  # was used below without ever being imported (NameError on a fresh kernel)

urlPath = 'https://d1baseball.com/college-baseball-tournament-central/'

# Request the url and return it as a string.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status stops an HTTP error page from being parsed as if it were
# the schedule.
response = requests.get(urlPath, timeout=30)
response.raise_for_status()
html = response.text

## Pass html to the parse_html function and return the results

tournaments = parse_html(html)

## Write the results as JSON to a text file

with open('tournaments.txt', 'w') as f:
    json.dump(tournaments, f)




In [116]:
import pandas as pd
import json

# Read the scraped tournaments back from the JSON text file
with open('tournaments.txt') as f:
    raw_data = json.load(f)

# Flatten tournaments -> games into one record per game
cleaned_data = []

for entry in raw_data:
    conference_title, matches = entry['tournament'], entry['games']

    # Each record carries its tournament name; blank scores become None
    cleaned_data.extend(
        {
            'tournament': conference_title,
            'game': match['game'],
            'time': match['time'],
            'road_team': match['road_team'],
            'road_score': None if match['road_score'] == '' else match['road_score'],
            'home_team': match['home_team'],
            'home_score': None if match['home_score'] == '' else match['home_score'],
        }
        for match in matches
    )

# One row per game
df = pd.DataFrame(cleaned_data)

# Persist the flattened table as csv (without the index column)
df.to_csv('data/NCAA_D1/cleaned_tournaments.csv', index=False)


In [117]:
## Look at the 'game' column, extract any text surrounded by parentheses into a column called 'loc_1', and delete it from the 'game' column

df['loc_1'] = df['game'].str.extract(r'\((.*?)\)')
# regex=True must be explicit: from pandas 2.0 str.replace treats the pattern
# as a literal string by default, which would leave the parenthesised text in place.
df['game'] = df['game'].str.replace(r'\(.*?\)', '', regex=True)





  df['game'] = df['game'].str.replace(r'\(.*?\)', '')


In [118]:
## Delete everything but numerical values from the 'game' column

# regex=True must be explicit: from pandas 2.0 the default is a literal match,
# so r'\D' would no longer mean "any non-digit" and nothing would be removed.
df['game'] = df['game'].str.replace(r'\D', '', regex=True)


  df['game'] = df['game'].str.replace(r'\D', '')


In [119]:
## Drop the word Tournament from each title, then expose the column under the name 'conference'

df = (
    df
    .assign(tournament=df['tournament'].str.replace('Tournament', ''))
    .rename(columns={'tournament': 'conference'})
)


In [120]:
# extract the date (month/day digits, e.g. 05/23) from the time column

df['date'] = df['time'].str.extract(r'(\d+/\d+)')
# regex=True must be explicit: from pandas 2.0 str.replace matches literally
# by default, so the date would otherwise stay in the 'time' column.
df['time'] = df['time'].str.replace(r'\d+/\d+', '', regex=True)

# extract the clock time (digits:digits, e.g. 11:00) from the time column
# NOTE(review): anything after the digits (such as an AM/PM marker) is not
# captured by this pattern — confirm that is intended.

df['time'] = df['time'].str.extract(r'(\d+:\d+)')










  df['time'] = df['time'].str.replace(r'\d+/\d+', '')


In [121]:
# Quick visual check of the cleaned frame.
# df.info()
# NOTE(review): a cell only renders its last expression, so the describe()
# result is computed and discarded here; only head(20) is displayed.
df.describe()
df.head(20)


Unnamed: 0,conference,game,time,road_team,road_score,home_team,home_score,loc_1,date
0,ACC,1,11:00,Virginia Tech,7.0,Boston College,11.0,"Durham, NC",05/23
1,ACC,2,3:30,Georgia Tech,5.0,North Carolina,11.0,"Durham, NC",05/23
2,ACC,3,7:00,NC State,8.0,Duke,7.0,"Durham, NC",05/23
3,ACC,4,11:00,Pittsburgh,,Notre Dame,,"Durham, NC",05/24
4,ACC,5,3:00,Georgia Tech,,Virginia,,"Durham, NC",05/24
5,ACC,6,7:00,Virginia Tech,,Clemson,,"Durham, NC",05/24
6,ACC,7,11:00,Pittsburgh,,Wake Forest,,"Durham, NC",05/25
7,ACC,8,3:00,North Carolina,,Virginia,,"Durham, NC",05/25
8,ACC,9,7:00,NC State,,Miami,,"Durham, NC",05/25
9,ACC,10,11:00,Boston College,,Clemson,,"Durham, NC",05/26


In [123]:
## Export targets: a csv for checking the data, and a json for the ncaa map

csv_path = 'data/NCAA_D1/JSON_tournaments.csv'
json_path = 'data/NCAA_D1/tournaments.json'

# csv is written without the index column; json is a list of row records
df.to_csv(csv_path, index=False)
df.to_json(json_path, orient='records')

In [104]:
# List the column labels of the cleaned frame.
# NOTE(review): this cell's execution count is lower than the cells above it,
# so its output may predate them — re-run after a kernel restart to confirm.
df.columns

Index(['conference', 'game', 'time', 'road_team', 'road_score', 'home_team',
       'home_score', 'loc_1', 'date'],
      dtype='object')