In [16]:
### Scrape game info to find HR totals for each field



In [17]:
import requests
from bs4 import BeautifulSoup

# Load the webpage
url = "https://www.warrennolan.com/baseball/2023/ncaa-regionals"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Find all tables in the webpage
tables = soup.find_all('tbody')

games = []

# Loop over each table
for table in tables:
    # Find all game rows in each table
    game_rows = table.find_all('tr')

    # Loop over each game row
    for row in game_rows:
        game_cells = row.find_all('td', {'class': 'data-cell'})
        
        # Check if the row is a game row
        if game_cells and "Game" in game_cells[0].text:
            game = {}

            # Game number
            game['game_number'] = game_cells[0].text.strip()

            # Team 1 name and score
            team_1 = game_cells[2].find('div', {'class': 'name-subcontainer'}).text.strip()
            score_1 = game_cells[3].text.strip()

            # Team 2 name and score
            team_2 = game_cells[5].find('div', {'class': 'name-subcontainer'}).text.strip()
            score_2 = game_cells[4].text.strip()

            game['team_1'] = team_1
            game['score_1'] = score_1
            game['team_2'] = team_2
            game['score_2'] = score_2

            games.append(game)




In [18]:
import pandas as pd

# Assuming the 'games' variable is a list of dictionaries obtained from the previous step

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(games)

# Now, let's add the region numbers. We'll assume that each region is represented by one table on the webpage.
# Therefore, we can simply increment the region number after each table.

region_number = 1
for i in range(len(df)):
    # Check if it's a new game number, which would indicate a new region
    if df.iloc[i]['game_number'] == 'Game 1':
        region_number += 1
    df.loc[i, 'region_id'] = region_number

# Now, the 'df' DataFrame should contain all the game data with an additional 'region_number' column.
# You can view the first few rows of the DataFrame with the following command:
df.head()


Unnamed: 0,game_number,team_1,score_1,team_2,score_2,region_id
0,Game 1,Samford,4,Southern Miss,2,2.0
1,Game 2,Penn,6,Auburn,3,2.0
2,Game 3,Southern Miss,7,Auburn,2,2.0
3,Game 4,Penn,5,Samford,4,2.0
4,Game 5,Southern Miss,9,Samford,4,2.0


## Scrapping espn for box score and stats

In [19]:
### ESPN Baseball Stats

base_url = 'https://www.espn.com/college-baseball/scoreboard/_/date/'

## Date Format: YYYMMDD
# Dates interested in 20230602 until present

# Not all day will have games. Will probably return 404 error
# Want to get the data for each game
# the main thing I am interested in is the home runs hit by each team and the location of the game



In [20]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import timedelta, date

# Helper function to generate dates
def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

# Start and end dates
start_date = date(2023, 6, 2)
end_date = date.today()  # today's date

base_url = 'https://www.espn.com/college-baseball/scoreboard/_/date/'
game_data = []  # this list will hold our game data

# Loop over each date
for single_date in daterange(start_date, end_date):
    url = base_url + single_date.strftime("%Y%m%d")
    
    # Try accessing the page
    try:
        response = requests.get(url)
        response.raise_for_status()  # raise exception if invalid response
    except (requests.HTTPError, requests.ConnectionError):
        # Handle the exception if it occurs
        print(f"No data for {single_date.strftime('%Y-%m-%d')}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all game sections
    game_sections = soup.find_all('section', class_='Scoreboard bg-clr-white flex flex-auto justify-between')

    # For each game section, get the required data
    for section in game_sections:
        game = {}
        
        # Find the link to the box score and extract the game Id
        box_score_link = section.find('a', text='Box Score')
        if box_score_link is not None:
            game['game_id'] = box_score_link['href'].split('/')[-1]

        game_info = section.find('div', class_='ScoreboardScoreCell__Note clr-gray-04 n9 w-auto ml0')
        if game_info is not None:
            game['game_info'] = game_info.text
            
        game_run_elements = section.find_all('div', class_='ScoreboardScoreCell__Value flex justify-center pl2 baseball')
        if game_run_elements is not None and len(game_run_elements) >= 4:  # 4 because we have runs, hits, and errors for both teams
            game['away_team_runs'] = game_run_elements[0].text
            game['home_team_runs'] = game_run_elements[3].text

        # Add game data to the list
        game_data.append(game)

# Convert the list of games to a DataFrame
df = pd.DataFrame(game_data)

# Print DataFrame
# print(df)


In [21]:
# clean up the game infor column to just have the name of the site
# Goal, drop everything but the site name. should end up with 16 unique values
# strip NCAA Baseball Championship - Regionals from the game_info column
df['game_info'] = df['game_info'].str.replace('NCAA Baseball Championship -', '')

# find Regional or Super  and drop everything after that
df['game_info'] = df['game_info'].str.split('Regional').str[0]
df['game_info'] = df['game_info'].str.split('Super').str[0]




# strip the leading and trailing whitespace
df['game_info'] = df['game_info'].str.strip()



In [22]:
df.head()

# len(df['game_info'].unique())

# print a url for the box score
# df['box_score_link'].head()

# give me some sample links to look at
# df['box_score_link'].sample(10)


# are any null?
df['away_team_runs'].isnull().sum()
# df['home_team_runs'].isnull().sum()


# ADD UP ALL THE RUN SCORED IN EACH GAME AND AT EACH SITE
# drop null scores
df = df.dropna(subset=['away_team_runs', 'home_team_runs'])
df['away_team_runs'] = df['away_team_runs'].astype(int)
df['home_team_runs'] = df['home_team_runs'].astype(int)

df['total_runs'] = df['away_team_runs'] + df['home_team_runs']
df.head()

# find number of runs scored at each site and runs per game
df.groupby('game_info')['total_runs'].agg(['sum', 'mean']).sort_values(by='sum', ascending=False)

# Count of the sites
df['game_info'].value_counts()

# number of sites
len(df['game_info'].unique())





19

In [23]:
df.head()

Unnamed: 0,game_id,game_info,away_team_runs,home_team_runs,total_runs
0,401551019,Winston-Salem,0,12,12
1,401551049,Fayetteville,6,13,19
2,401551027,Baton Rouge,2,7,9
3,401551041,Charlottesville,1,15,16
4,401551023,Coral Gables,1,9,10


## working above

In [25]:
base_game_url = 'https://www.espn.com/college-baseball/boxscore/_/gameId/'

import pandas as pd

# List of urls
urls = [base_game_url + game_id for game_id in df['game_id']]

# Create an empty dataframe to store the results
results = pd.DataFrame(columns=['team_1', 'team_2', 'runs_1', 'hits_1', 'errors_1', 'home_runs_1', 'runs_2', 'hits_2', 'errors_2', 'home_runs_2'])

for url in urls:
    tables = pd.read_html(url)

    # The second table contains the data we're interested in
    score_table = tables[1]

    # The team names are in the first table
    team_table = tables[0]
    team_1 = team_table.iloc[0, 0]
    team_2 = team_table.iloc[1, 0]

    # Extract the runs, hits, and errors for each team
    runs_1, hits_1, errors_1 = score_table.iloc[0, -3:]
    runs_2, hits_2, errors_2 = score_table.iloc[1, -3:]

    # The team's home runs are in tables 3 and 9
    # Convert the 'HR' column to numeric before summing
    home_runs_1 = pd.to_numeric(tables[3]['HR'], errors='coerce').sum() / 2  # Sum the HR column for team 1
    home_runs_2 = pd.to_numeric(tables[9]['HR'], errors='coerce').sum() / 2  # Sum the HR column for team 2

    # Add the results to the results dataframe
    results = results.append({
        'team_1': team_1, 'team_2': team_2,
        'runs_1': runs_1, 'hits_1': hits_1, 'errors_1': errors_1, 'home_runs_1': home_runs_1,
        'runs_2': runs_2, 'hits_2': hits_2, 'errors_2': errors_2, 'home_runs_2': home_runs_2,
    }, ignore_index=True)

# Now results is a dataframe with the runs, hits, errors, and home runs for each team in each game
# print(results)


  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = results.append({
  results = re

In [26]:
# Merge the results with the original dataframe
df = df.merge(results, left_index=True, right_index=True)



# Print the dataframe
# print(df)

# Save the dataframe to a csv file
df.to_csv('NCAA_GAME_DATA.csv', index=False)


In [27]:
## Calculate how many games were played at each site
df['game_info'].value_counts()
# df['game_info'].value_counts().sum()





Gainesville        9
Baton Rouge        9
Charlottesville    9
Stanford           9
Winston-Salem      8
Lexington          7
Conway             7
Auburn             7
Columbia           6
Fayetteville       6
Stillwater         6
Clemson            6
Nashville          6
Coral Gables       6
Terre Haute        6
Tuscaloosa         6
Eugene             3
Hattiesburg        3
Fort Worth         2
Name: game_info, dtype: int64

In [None]:
## Calculate how many games were played at each site
# df.head()


# Group by the site add a column for the number of games at the site and sum the runs, hits, and home runs 
by_site_df = df.groupby('game_info')['total_runs', 'hits', 'home_runs'].sum().sort_values(by='total_runs', ascending=False)

# Add a column for the number of games at the site
by_site_df['games'] = df['game_info'].value_counts()

# Calculate per game stats
by_site_df['runs_per_game'] = (by_site_df['total_runs'] / by_site_df['games']).round(0)
by_site_df['hits_per_game'] = (by_site_df['hits'] / by_site_df['games']).round(2)
by_site_df['home_runs_per_game'] = (by_site_df['home_runs'] / by_site_df['games']).round(2)

by_site_df.head()





  by_site_df = df.groupby('game_info')['total_runs', 'hits', 'home_runs'].sum().sort_values(by='total_runs', ascending=False)


Unnamed: 0_level_0,total_runs,hits,home_runs,games,runs_per_game,hits_per_game,home_runs_per_game
game_info,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Winston-Salem,135,126,32,8,16.875,15.75,4.0
Charlottesville,114,164,36,9,12.666667,18.222222,4.0
Baton Rouge,113,188,40,9,12.555556,20.888889,4.444444
Stanford,110,156,56,8,13.75,19.5,7.0
Conway,100,133,26,7,14.285714,19.0,3.714286
