In [1]:
#web scraping
import requests as req
from bs4 import BeautifulSoup as bs

#regular expressions
import re
    
#for saving workdata
import pickle as pk

#in this case, numpy for nan
import numpy as np

#for time
import datetime as dt

#for lagging requests
import time

Helper function for slicing a string starting at the first match of a pattern (provided by ChatGPT):

In [14]:
def slice_from_pattern(string, pattern):
    """Return the tail of `string` beginning at the first regex match of `pattern`.

    Args:
        string: text to search.
        pattern: regular expression handed to `re.search`.

    Returns:
        The substring from the start of the first match to the end of
        `string`, or `string` itself (unchanged) when nothing matches.
    """
    found = re.search(pattern, string)
    # no match -> hand the input back untouched; otherwise cut at the match start
    return string if found is None else string[found.start():]

Start off by pulling the boxscore links for all games so each individual boxscore can be scraped

In [15]:


def get_box_links(start_year,end_year):
    """Collect the relative boxscore links for every season in a range of years.

    Args:
        start_year: first season to scrape (inclusive).
        end_year: last season to scrape (inclusive).

    Returns:
        A list of href strings, one per "Boxscore" link found on each season's
        schedule page, in page order and season order. Empty when
        start_year > end_year.

    Raises:
        requests.HTTPError: when a schedule page answers with an error status
            (for example when the site is rate limiting), so a failed season
            is reported instead of silently contributing zero links.
    """
    box_links = []
    for year in range(start_year, end_year + 1):
        # one schedule page per season
        url = f"https://www.baseball-reference.com/leagues/majors/{year}-schedule.shtml"
        # timeout keeps a stalled connection from hanging the notebook forever
        resp = req.get(url, timeout=30)
        resp.raise_for_status()
        # NOTE(review): no parser is named, so bs4 picks whichever one is installed
        soup = bs(resp.text)

        # every <a> whose text is exactly "Boxscore" points at one game; keep only its href.
        # `string=` is the current name of the argument formerly spelled `text=`.
        box_links.extend(a["href"] for a in soup.find_all("a", string="Boxscore"))
    return box_links
        

Head over to the site to see the layout, and use "Inspect Element" to view the HTML so you can see how the boxscore links are identified.

In [46]:
# scrape every boxscore link for the 2016-2022 seasons (inclusive) and cache them on disk
box_links = get_box_links(2016,2022)
# `with` closes the file, so the pickle is fully flushed before the cell finishes
with open('box_links.pkl', 'wb') as links_file:
    pk.dump(box_links, links_file)

In [16]:
# reload the cached links instead of re-scraping the schedule pages.
# NOTE: only unpickle files produced by this notebook; pickle can execute code on load.
with open('box_links.pkl', 'rb') as links_file:
    box_links = pk.load(links_file)

Now that all the boxscore links from 2016-2022 (inclusive) have been obtained, the scraping for specific stats can begin. Start by creating a function that collects the data from the different tables and builds the dictionary for one game.

In [17]:
def game_organizer(box_link):
    """Download one boxscore page and assemble all of its data into a dictionary.

    Args:
        box_link: relative boxscore href as returned by get_box_links.

    Returns:
        A dict with seven entries: "general_game_data" (from get_game_facts),
        the four batting/pitching summaries (from get_table_summary) and the
        two starting-pitcher dicts (from get_pitcher_table).

    Raises:
        requests.HTTPError: when the page answers with an error status, so a
            rate-limit or error page is reported as such instead of failing
            later while parsing.
    """
    # same scheme and host that get_box_links uses for the schedule pages
    box_url = "https://www.baseball-reference.com" + box_link
    # timeout keeps a stalled connection from blocking the scrape indefinitely
    resp = req.get(box_url, timeout=30)
    resp.raise_for_status()
    soup = bs(resp.text)

    #isolate the specific game id from the box link (last path part minus its 6-character suffix)
    box_id = box_link.split('/')[-1][:-6]

    #get rid of weird comments found in the html, this code block from: rdoharr's MLB prediction project, thanks
    #the comment marker lines are dropped so the tables between them become part of the parsed document
    kept_lines = [
        line for line in resp.text.split('\n')
        if '<!--     <div' not in line and line.strip() not in ('<!--', '-->')
    ]
    uncommented_html = ''.join(line + '\n' for line in kept_lines)

    #the order which the tables appear is, away_batting, home_batting, away_pitching, home pitching, (the order place, is also the table number)
    uncommented_soup = bs(uncommented_html)
    gamedata = {
        "general_game_data" : get_game_facts(soup,box_id),
        "away_batting_data" : get_table_summary(uncommented_soup,0),
        "home_batting_data" : get_table_summary(uncommented_soup,1),
        "away_pitching_data" : get_table_summary(uncommented_soup,2),
        "home_pitching_data" : get_table_summary(uncommented_soup,3),
        "away_pitcher_data" : get_pitcher_table(uncommented_soup,0),
        "home_pitcher_data" : get_pitcher_table(uncommented_soup,1)
    }

    return gamedata

First, I want a dict that holds general facts about the game,  
such as which teams played, the weather, the start time, etc.

In [418]:


def _first_containing(entries, needle):
    """Return the first string in `entries` that contains `needle`, or None."""
    for entry in entries:
        if needle in entry:
            return entry
    return None


def get_game_facts(soup,box_id):
    """Extract the general facts about one game from its boxscore page.

    Args:
        soup: parsed boxscore page; must contain a div with class "scorebox"
            and at least two divs with class "section_wrapper".
        box_id: game identifier stored under "game_id".

    Returns:
        A dict with the keys game_id, away_team_abr, home_team_abr, date,
        start_time, attendance, venue, duration_hours, duration_mins,
        day_game, on_grass, hp_umpire, 1b_umpire, 2b_umpire, 3b_umpire,
        field_cond, start_weather, wind_cond and sky_cond. Any fact that is
        not present on the page is stored as np.nan.

    Raises:
        AttributeError, IndexError, KeyError: when the expected page sections
            are missing altogether.
        ValueError: when a "Duration" entry is not in "H:MM" form.
    """
    game_facts = {"game_id": box_id}

    #scorebox section holds the team links plus date, attendance, etc.
    scorebox = soup.find("div", class_="scorebox")

    #team links are the only anchors whose href contains "teams"; away is listed first
    team_links = [a for a in scorebox.find_all("a") if "teams" in a["href"]]
    game_facts["away_team_abr"] = team_links[0]["href"].split('/')[2]
    game_facts["home_team_abr"] = team_links[1]["href"].split('/')[2]

    meta_divs = scorebox.find("div", class_="scorebox_meta").find_all("div")
    #skip "Rescheduled" notices and drop the trailing entry
    otherdata = [d.text for d in meta_divs if "Rescheduled" not in d.text][:-1]

    #date always the first thing
    game_facts["date"] = otherdata[0]

    #the last entry mentioning "Start" wins; a missing entry is NaN instead of
    #a slice of the date line
    start_entries = [e for e in otherdata if "Start" in e]
    #[12:-6] strips the leading "Start Time: " and the trailing " Local"
    game_facts["start_time"] = start_entries[-1][12:-6] if start_entries else np.nan

    attendance = _first_containing(otherdata, "Attendance")
    #[12:] strips "Attendance: "; thousands separators are removed
    game_facts["attendance"] = (
        attendance[12:].replace(',', '') if attendance is not None else np.nan
    )

    venue = _first_containing(otherdata, "Venue")
    #[7:] strips "Venue: "
    game_facts["venue"] = venue[7:] if venue is not None else np.nan

    duration = _first_containing(otherdata, "Duration")
    if duration is not None:
        #[15:] strips "Game Duration: "; splitting on ':' also copes with two-digit hours
        hours_text, _, minutes_text = duration[15:].partition(':')
        hours, minutes = int(hours_text), int(minutes_text)
        game_facts["duration_hours"] = hours + (minutes / 60)
        game_facts["duration_mins"] = (hours * 60) + minutes
    else:
        game_facts["duration_hours"] = np.nan
        game_facts["duration_mins"] = np.nan

    #"<Day|Night> Game, on <surface>" is the first non-date entry containing ", "
    day_and_surface = _first_containing(otherdata[1:], ", ")
    if day_and_surface is not None:
        parts = day_and_surface.split(', ')
        game_facts["day_game"] = parts[0] == "Day Game"
        game_facts["on_grass"] = parts[1] == "on grass"
    else:
        game_facts["day_game"] = np.nan
        game_facts["on_grass"] = np.nan

    #the remaining info sits in the third section_wrapper div, or the second when
    #the third has none. Strip \n and \t, then take everything between the first
    #</strong> and the last </div>.
    info_pattern = r"</strong>.*</div>"
    wrappers = soup.find_all("div", class_="section_wrapper")
    matches = re.findall(info_pattern, str(wrappers[2]).replace('\n', '').replace('\t', ''))
    if not matches:
        matches = re.findall(info_pattern, str(wrappers[1]).replace('\n', '').replace('\t', ''))
    rawotherinfo = matches[0]

    #get rid of the strong and div tags and then split based on sentences
    otherinfo = (
        rawotherinfo.replace("</div>", '').replace("<div>", '')
        .replace("<strong>", '').replace('</strong>', '').split('.')
    )

    #get rid of annoying arrow that pops up
    if otherinfo[-1] == '    -->':
        otherinfo = otherinfo[:-1]

    #umpires are comma separated in the order HP, 1B, 2B, 3B; a missing or
    #mislabelled slot becomes NaN
    umpire_entry = _first_containing(otherinfo, "HP")
    umpires = umpire_entry.split(',') if umpire_entry is not None else []
    umpire_slots = (("hp_umpire", "HP"), ("1b_umpire", "1B"),
                    ("2b_umpire", "2B"), ("3b_umpire", "3B"))
    for position, (key, label) in enumerate(umpire_slots):
        if position < len(umpires) and label in umpires[position]:
            #[6:] strips the " XX - " prefix in front of the name
            game_facts[key] = umpires[position][6:]
        else:
            game_facts[key] = np.nan

    field_entry = _first_containing(otherinfo, "Field")
    #[17:] strips "Field Condition: "
    game_facts["field_cond"] = field_entry[17:] if field_entry is not None else np.nan

    weather_entry = _first_containing(otherinfo, "Start Time Weather")
    if weather_entry is None:
        game_facts["start_weather"] = np.nan
        game_facts["wind_cond"] = np.nan
        game_facts["sky_cond"] = np.nan
        return game_facts

    conditions = weather_entry.split(',')

    #[20:] strips "Start Time Weather: "; what is left is "<number>&deg; <unit>"
    weather = conditions[0][20:].replace('&deg', '').split('; ')
    try:
        temperature = float(weather[0])
        #just in case baseball reference uploads in celsius instead of fahrenheit;
        #the text has to be converted to a number before the arithmetic
        if len(weather) == 2 and weather[1] in ('C', 'c'):
            temperature = (temperature * 1.8) + 32
    except ValueError:
        temperature = np.nan
    game_facts["start_weather"] = temperature

    wind_entry = _first_containing(conditions, "Wind")
    #[1:] drops the space left behind by the comma split
    game_facts["wind_cond"] = wind_entry[1:] if wind_entry is not None else np.nan

    #if sky conditions were reported at all, they are the last comma separated part
    if len(conditions) > 1 and "Wind" not in conditions[-1][1:]:
        game_facts["sky_cond"] = conditions[-1][1:]
    else:
        game_facts["sky_cond"] = np.nan
    return game_facts

Next, I want a function that pulls the team totals for batting or pitching, whether for the away or the home team:

In [19]:
def get_table_summary(soup, table_num):
    """Read the footer (totals) row of one batting/pitching table into a dict.

    Args:
        soup: parsed boxscore page in which the stats tables are visible.
        table_num: which of the four tables to read; the caller uses
            0 = away batting, 1 = home batting, 2 = away pitching, 3 = home pitching.

    Returns:
        A dict mapping each footer cell's data-stat name to its text, with
        spaces and '%' removed. A 'details' entry, if present, is dropped.
    """
    #skip the first stats_table, then keep the next four (batting and pitching for away/home)
    stat_tables = soup.find_all("table", class_="stats_table")[1:5]

    #the footer row as raw html, cut into one chunk per </td>; the trailing remainder is discarded
    footer_cells = str(stat_tables[table_num].find("tfoot")).split('</td>')[:-1]

    #the first chunk still carries the <th> player cell; keep only what follows it
    footer_cells[0] = slice_from_pattern(footer_cells[0].split('</th>')[1], "data-stat")

    summary = {}
    for cell in footer_cells:
        #drop the leading 'data-stat=' (10 characters), then split the name from the value at '>'
        pieces = slice_from_pattern(cell, "data-stat")[10:].split(">")
        stat_name = pieces[0].replace('"', '')
        stat_value = pieces[1].replace(' ', '').replace("%", '')
        summary[stat_name] = stat_value

    #the free-text details column is not wanted
    summary.pop('details', None)
    return summary


Finally, I want the individual stats for each team's starting pitcher in a dictionary:

In [20]:
#raw tables for batting and pitching for away/home
def get_pitcher_table(soup, table_num):
    """Read the first row of one pitching table (the starting pitcher) into a dict.

    Args:
        soup: parsed boxscore page in which the stats tables are visible.
        table_num: 0 for the away pitching table, 1 for the home pitching table.

    Returns:
        A dict holding 'starting_pitcher' (the row header's data-append-csv
        value) plus one entry per data-stat cell among the first 26 cells,
        with spaces and '%' removed and the two inherited-runner stats deleted.

    Raises:
        KeyError: when 'inherited_runners' or 'inherited_score' is not among
            the parsed cells.
    """
    #skip the first stats_table and the two batting tables; what remains is away/home pitching
    pitching_tables = soup.find_all("table", class_="stats_table")[3:5]
    body = pitching_tables[table_num].find("tbody")

    table = {'starting_pitcher': body.find("th")["data-append-csv"]}

    #raw html of the table body, cut into one chunk per </td>
    cells = str(body).split('</td>')
    #the first chunk still carries the <th> player cell; keep only what follows it
    cells[0] = slice_from_pattern(cells[0].split('</th>')[1], "data-stat")

    #only the first 26 chunks are read
    for cell in cells[:26]:
        #drop the leading 'data-stat=' (10 characters), then split the name from the value at '>'
        pieces = slice_from_pattern(cell, "data-stat")[10:].split(">")
        stat_name = pieces[0].replace('"', '')
        stat_value = pieces[1].replace(' ', '').replace("%", '')
        table[stat_name] = stat_value

    for unwanted in ("inherited_runners", "inherited_score"):
        del table[unwanted]

    return table

Now that all the functions for creating a game dictionary are finished, the list that stores each game's dictionary can be constructed.  
This list is what will be used to build the games dataframe.

In [455]:
def game_list_maker():
    """Scrape the boxscores not yet in `game_list`, appending one dict per game.

    Works on the notebook globals `box_links` (all links) and `game_list`
    (results so far): scraping resumes at index len(game_list). Every request
    is preceded by a 5 second pause, with an extra 61 second pause on every
    20th request. The first failure is printed and ends the run, leaving
    `game_list` holding everything collected up to that point.
    """
    print(f"current time is: {dt.datetime.now()}")
    resume_at = len(game_list)
    print(f"starting at grabbing at page {resume_at}")

    #request_no counts the requests of this run, starting at 1
    for request_no, link in enumerate(box_links[resume_at:], start=1):
        #longer pause on every 20th request, on top of the regular one
        if request_no % 20 == 0:
            time.sleep(61)
        time.sleep(5)

        try:
            gamedata = game_organizer(link)
            #this will raise an error if none is returned
            len(gamedata)
            game_list.append(gamedata)
        except Exception as ex:
            print(f"stopped at the {len(game_list)} page, error was: {ex}")
            break

        #time updates for game list making process
        if len(game_list) % 1000 == 0:
            print(f"{len(game_list)} games have been processed, current time: {dt.datetime.now()}")
        if request_no % 50 == 0:
            print(request_no)
        

In [476]:
# game_list is the running result list that game_list_maker() appends to and resumes from.
# On a fresh kernel it does not exist yet, so restore it from the last checkpoint
# written by this notebook, or start empty when there is none.
try:
    game_list
except NameError:
    try:
        with open('game_list.pkl', 'rb') as checkpoint_file:
            game_list = pk.load(checkpoint_file)
    except FileNotFoundError:
        game_list = []

# how often to resume after a failed pass, and how long to back off before each resume
MAX_RETRIES = 3
RETRY_COOLDOWN_SECONDS = 600

game_list_maker()

for _ in range(MAX_RETRIES):
    if len(game_list) >= len(box_links):
        break
    # checkpoint the partial result so an interrupted kernel does not lose it
    with open('game_list.pkl', 'wb') as checkpoint_file:
        pk.dump(game_list, checkpoint_file)
    time.sleep(RETRY_COOLDOWN_SECONDS)
    game_list_maker()

with open('game_list.pkl', 'wb') as checkpoint_file:
    pk.dump(game_list, checkpoint_file)

# number of games collected so far
len(game_list)

Since Baseball-Reference only allows 20 requests per minute and enforces its rate limits strictly,  
these time lags are needed to avoid getting banned while pulling the data.  
Expect this to run for more than a day.

In [477]:
# `with` closes the file, so the pickle is fully flushed to disk before the cell finishes
with open('game_list.pkl', 'wb') as games_file:
    pk.dump(game_list, games_file)

Save game_list to disk (game_list.pkl) so we don't have to go through the requesting process all over again.

In [475]:
# every boxscore link should have produced exactly one game record
len(game_list) == len(box_links)

True

Check that all games were collected 