In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from http import HTTPStatus
from pprint import pprint as pp
from datetime import datetime, date, timezone
import re

In [2]:
mlb_today_url = r'https://www.espn.com/mlb/schedule'

s = requests.Session()
r = s.get(mlb_today_url, timeout=10)

print(f"Connection status: {HTTPStatus(r.status_code).phrase}")

Connection status: OK


In [66]:
soup = BeautifulSoup(r.text, 'lxml')

matchups = []

for day in soup.findAll("div", {"class":"ScheduleTables mb5 ScheduleTables--mlb ScheduleTables--baseball"}):
    try:
        dt = datetime.strptime(day.find("div", {"class":"Table__Title"}).text.strip(), "%A, %B %d, %Y").date()
    except:
        dt = pd.NaT

    for match in day.findAll("td", {"class":["date__col Table__TD","teams__col Table__TD"]}):
        try:
            url_find = match.find("a")['href']
            if bool(re.search(r"/mlb/game", url_find)):
                game_url = r"https://www.espn.com" + url_find
                game_row = (dt, game_url)
                matchups.append(game_row)
            else:
                continue
        except:
            continue
    
matchup_df = pd.DataFrame(matchups, columns=['MATCH_DATE','GAME_URL'])

matchup_df.head()

Unnamed: 0,MATCH_DATE,GAME_URL
0,2022-05-20,https://www.espn.com/mlb/game?gameId=401354814
1,2022-05-20,https://www.espn.com/mlb/game?gameId=401354825
2,2022-05-20,https://www.espn.com/mlb/game?gameId=401354815
3,2022-05-20,https://www.espn.com/mlb/game?gameId=401354819
4,2022-05-20,https://www.espn.com/mlb/game?gameId=401354826


In [64]:
# test_url = r'https://www.espn.com/mlb/game?gameId=401439775'

# test_s = requests.Session()
# test_r = s.get(test_url, timeout=10)

# test_soup = BeautifulSoup(test_r.text, 'lxml')

# probable_pitchers = []

# live_prev_starting_pitchers = [pitchers.a['href'] for pitchers in test_soup.findAll("table", {"data-type":"pitching"})]

# for pitchers in test_soup.findAll("div", {"class":"pitchers__player_stats"}):
#     pitcher_name = pitchers.find("span", {"class":"fullName"}).text
#     if pitcher_name == "Undecided":
#         probable_pitchers.append("TBD")
#     else:
#         probable_pitchers.append(pitchers.a["href"])

# len(probable_pitchers)

2

In [65]:
def get_game_pitchers_url(soup):
    probable_pitchers = []

    live_prev_starting_pitchers = [pitchers.a['href'] for pitchers in soup.findAll("table", {"data-type":"pitching"})]
    for pitchers in soup.findAll("div", {"class":"pitchers__player_stats"}):
        pitcher_name = pitchers.find("span", {"class":"fullName"}).text
        if pitcher_name == "Undecided":
            probable_pitchers.append("TBD")
        else:
            probable_pitchers.append(pitchers.a["href"])

    if len(live_prev_starting_pitchers) > 0:
        away_pitcher = live_prev_starting_pitchers[0]
        home_pitcher = live_prev_starting_pitchers[1]
    elif len(probable_pitchers) > 0:
        away_pitcher = probable_pitchers[0]
        home_pitcher = probable_pitchers[1]
    else:
        away_pitcher = np.nan
        home_pitcher = np.nan

    return (away_pitcher, home_pitcher)

In [6]:
def get_game_teams_url(soup):
    teams = [r"https://www.espn.com" + team.a['href'] for team in soup.findAll("div", {"class":"team-info-wrapper"})]

    if len(teams) > 0:
        away_team = teams[0]
        home_team = teams[1]
    else:
        away_team = np.nan
        home_team = np.nan

    return (away_team, home_team)

In [7]:
def get_game_time(soup):
    utc_dttm_str = soup.find("div", {"class":"game-date-time"}).find("span")['data-date']
    utc_dttm = datetime.strptime(utc_dttm_str, "%Y-%m-%dT%H:%MZ")
    local_dttm = utc_dttm.replace(tzinfo=timezone.utc).astimezone(tz=None)
    local_dttm = pd.NaT if local_dttm is None else local_dttm
    return tuple([local_dttm])
    

In [8]:
def get_game_stadium(soup):
    return tuple( [re.sub( r'\W+\W', '', soup.find("div", {"class":"game-field"}).text )] )


In [9]:
def get_game_line(soup):
    return tuple( [soup.find("div", {"class":"odds-lines-plus-logo"}).li.text] )

In [10]:
def get_game_info(game_url):
    try:
        s = requests.Session()
        r = s.get(game_url, timeout=10)

        soup = BeautifulSoup(r.text, 'lxml')

        game_pitchers_url = get_game_pitchers_url(soup)
        game_teams_url = get_game_teams_url(soup)
        game_time = get_game_time(soup)
        game_stadium = get_game_stadium(soup)
        game_line = get_game_line(soup)

        return game_pitchers_url + game_teams_url + game_time + game_stadium + game_line

    except:
        return (np.nan, np.nan, np.nan, np.nan, pd.NaT, np.nan, np.nan)

In [67]:
matchup_df = pd.concat(
    [
        matchup_df
        , pd.DataFrame(
            list(matchup_df["GAME_URL"].apply(get_game_info))
            , columns=["AWAY_PITCHER_URL","HOME_PITCHER_URL","AWAY_TEAM_URL","HOME_TEAM_URL","MATCH_DTTM","STADIUM","GAME_LINE"]
        )
    ]
    , axis="columns"
)


In [68]:
matchup_df


Unnamed: 0,MATCH_DATE,GAME_URL,AWAY_PITCHER_URL,HOME_PITCHER_URL,AWAY_TEAM_URL,HOME_TEAM_URL,MATCH_DTTM,STADIUM,GAME_LINE
0,2022-05-20,https://www.espn.com/mlb/game?gameId=401354814,https://www.espn.com/mlb/player/_/id/41831,https://www.espn.com/mlb/player/_/id/33173,https://www.espn.com/mlb/team/_/name/ari/arizo...,https://www.espn.com/mlb/team/_/name/chc/chica...,2022-05-20 13:20:00-05:00,Wrigley Field,Line: CHC -145
1,2022-05-20,https://www.espn.com/mlb/game?gameId=401354825,https://www.espn.com/mlb/player/_/id/5403,https://www.espn.com/mlb/player/_/id/40761,https://www.espn.com/mlb/team/_/name/stl/st-lo...,https://www.espn.com/mlb/team/_/name/pit/pitts...,2022-05-20 17:35:00-05:00,PNC ParkCoverageApple TV,Line: STL -150
2,2022-05-20,https://www.espn.com/mlb/game?gameId=401354815,https://www.espn.com/mlb/player/_/id/29155,https://www.espn.com/mlb/player/_/id/39640,https://www.espn.com/mlb/team/_/name/atl/atlan...,https://www.espn.com/mlb/team/_/name/mia/miami...,2022-05-20 17:40:00-05:00,loanDepot park,Line: ATL -120
3,2022-05-20,https://www.espn.com/mlb/game?gameId=401354819,https://www.espn.com/mlb/player/_/id/33223,https://www.espn.com/mlb/player/_/id/39817,https://www.espn.com/mlb/team/_/name/lad/los-a...,https://www.espn.com/mlb/team/_/name/phi/phila...,2022-05-20 18:05:00-05:00,Citizens Bank ParkCoverageESPN,Line: LAD -145
4,2022-05-20,https://www.espn.com/mlb/game?gameId=401354826,https://www.espn.com/mlb/player/_/id/36723,https://www.espn.com/mlb/player/_/id/4717904,https://www.espn.com/mlb/team/_/name/tb/tampa-...,https://www.espn.com/mlb/team/_/name/bal/balti...,2022-05-20 18:05:00-05:00,Oriole Park at Camden Yards,Line: TB -135
5,2022-05-20,https://www.espn.com/mlb/game?gameId=401354817,https://www.espn.com/mlb/player/_/id/35124,https://www.espn.com/mlb/player/_/id/32582,https://www.espn.com/mlb/team/_/name/cin/cinci...,https://www.espn.com/mlb/team/_/name/tor/toron...,2022-05-20 18:07:00-05:00,Rogers Centre,Line: TOR -180
6,2022-05-20,https://www.espn.com/mlb/game?gameId=401354818,https://www.espn.com/mlb/player/_/id/42409,https://www.espn.com/mlb/player/_/id/40934,https://www.espn.com/mlb/team/_/name/det/detro...,https://www.espn.com/mlb/team/_/name/cle/cleve...,2022-05-20 18:10:00-05:00,Progressive Field,Line: CLE -115
7,2022-05-20,https://www.espn.com/mlb/game?gameId=401354824,https://www.espn.com/mlb/player/_/id/32175,https://www.espn.com/mlb/player/_/id/32640,https://www.espn.com/mlb/team/_/name/sea/seatt...,https://www.espn.com/mlb/team/_/name/bos/bosto...,2022-05-20 18:10:00-05:00,Fenway Park,Line: BOS -125
8,2022-05-20,https://www.espn.com/mlb/game?gameId=401354820,https://www.espn.com/mlb/player/_/id/41120,https://www.espn.com/mlb/player/_/id/41227,https://www.espn.com/mlb/team/_/name/min/minne...,https://www.espn.com/mlb/team/_/name/kc/kansas...,2022-05-20 19:10:00-05:00,Kauffman Stadium,Line: MIN -130
9,2022-05-20,https://www.espn.com/mlb/game?gameId=401354827,https://www.espn.com/mlb/player/_/id/31098,https://www.espn.com/mlb/player/_/id/41261,https://www.espn.com/mlb/team/_/name/tex/texas...,https://www.espn.com/mlb/team/_/name/hou/houst...,2022-05-20 19:10:00-05:00,Minute Maid ParkCoverageApple TV,Line: HOU -175
