In [103]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from http import HTTPStatus
from pprint import pprint as pp
from datetime import datetime, date, timezone
import re

In [104]:
mlb_today_url = r'https://www.espn.com/mlb/schedule'

s = requests.Session()
r = s.get(mlb_today_url, timeout=10)

print(f"Connection status: {HTTPStatus(r.status_code).phrase}")

Connection status: OK


In [181]:
soup = BeautifulSoup(r.text, 'lxml')

matchups = []

for day in soup.findAll("div", {"class":"ScheduleTables mb5 ScheduleTables--mlb ScheduleTables--baseball"}):
    try:
        dt = datetime.strptime(day.find("div", {"class":"Table__Title"}).text.strip(), "%A, %B %d, %Y").date()
    except:
        dt = pd.NaT

    for match in day.findAll("td", {"class":["date__col Table__TD","teams__col Table__TD"]}):
        try:
            url_find = match.find("a")['href']
            if bool(re.search(r"/mlb/game", url_find)):
                game_url = r"https://www.espn.com" + url_find
                game_row = (dt, game_url)
                matchups.append(game_row)
            else:
                continue
        except:
            continue
    
matchup_df = pd.DataFrame(matchups, columns=['MATCH_DATE','GAME_URL'])

matchup_df.head()

Unnamed: 0,MATCH_DATE,GAME_URL
0,2022-05-19,https://www.espn.com/mlb/game?gameId=401354809
1,2022-05-19,https://www.espn.com/mlb/game?gameId=401354810
2,2022-05-19,https://www.espn.com/mlb/game?gameId=401354812
3,2022-05-19,https://www.espn.com/mlb/game?gameId=401439510
4,2022-05-19,https://www.espn.com/mlb/game?gameId=401354808


In [178]:
# test_url = r'https://www.espn.com/mlb/game?gameId=401354794'
test_url = r'https://www.espn.com/mlb/game?gameId=401354809'

test_s = requests.Session()
test_r = test_s.get(test_url, timeout=10)

test_soup = BeautifulSoup(test_r.text, 'lxml')

test_soup.find("div", {"class":"odds-lines-plus-logo"}).li.text



'Line: NYY -190'

In [107]:
def get_game_pitchers_url(soup):
    probable_pitchers = []

    live_prev_starting_pitchers = [pitchers.a['href'] for pitchers in soup.findAll("table", {"data-type":"pitching"})]
    for pitcher in soup.findAll("div", {"class":"pitchers__player_stats"}):
        if pitcher.find("span", {"class":"fullName"}).text=="Undecided":
            probable_pitchers.append('TBD')
        else:
            probable_pitchers.append(pitcher.a['href'])

    if len(live_prev_starting_pitchers) > 0:
        away_pitcher = live_prev_starting_pitchers[0]
        home_pitcher = live_prev_starting_pitchers[1]
    elif len(probable_pitchers) > 0:
        away_pitcher = probable_pitchers[0]
        home_pitcher = probable_pitchers[1]
    else:
        away_pitcher = np.nan
        home_pitcher = np.nan

    return (away_pitcher, home_pitcher)

In [108]:
def get_game_teams_url(soup):
    teams = [r"https://www.espn.com" + team.a['href'] for team in soup.findAll("div", {"class":"team-info-wrapper"})]

    if len(teams) > 0:
        away_team = teams[0]
        home_team = teams[1]
    else:
        away_team = np.nan
        home_team = np.nan

    return (away_team, home_team)

In [119]:
def get_game_time(soup):
    utc_dttm_str = soup.find("div", {"class":"game-date-time"}).find("span")['data-date']
    utc_dttm = datetime.strptime(utc_dttm_str, "%Y-%m-%dT%H:%MZ")
    local_dttm = utc_dttm.replace(tzinfo=timezone.utc).astimezone(tz=None)
    local_dttm = pd.NaT if local_dttm is None else local_dttm
    return tuple([local_dttm])
    

In [167]:
def get_game_stadium(soup):
    return tuple( [re.sub( r'\W+\W', '', soup.find("div", {"class":"game-field"}).text )] )


In [179]:
def get_game_line(soup):
    return tuple( [soup.find("div", {"class":"odds-lines-plus-logo"}).li.text] )

In [180]:
def get_game_info(game_url):
    try:
        s = requests.Session()
        r = s.get(game_url, timeout=10)

        soup = BeautifulSoup(r.text, 'lxml')

        game_pitchers_url = get_game_pitchers_url(soup)
        game_teams_url = get_game_teams_url(soup)
        game_time = get_game_time(soup)
        game_stadium = get_game_stadium(soup)
        game_line = get_game_line(soup)

        return game_pitchers_url + game_teams_url + game_time + game_stadium + game_line

    except:
        return (np.nan, np.nan, np.nan, np.nan, pd.NaT, np.nan, np.nan)

In [183]:
matchup_df = pd.concat(
    [
        matchup_df
        , pd.DataFrame(
            list(matchup_df["GAME_URL"].apply(get_game_info))
            , columns=["AWAY_PITCHER_URL","HOME_PITCHER_URL","AWAY_TEAM_URL","HOME_TEAM_URL","MATCH_DTTM","STADIUM","GAME_LINE"]
        )
    ]
    , axis="columns"
)


In [184]:
matchup_df


Unnamed: 0,MATCH_DATE,GAME_URL,AWAY_PITCHER_URL,HOME_PITCHER_URL,AWAY_TEAM_URL,HOME_TEAM_URL,MATCH_DTTM,STADIUM,GAME_LINE
0,2022-05-19,https://www.espn.com/mlb/game?gameId=401354809,https://www.espn.com/mlb/player/_/id/38173,https://www.espn.com/mlb/player/_/id/41118,https://www.espn.com/mlb/team/_/name/nyy/new-y...,https://www.espn.com/mlb/team/_/name/bal/balti...,2022-05-19 11:35:00-05:00,Oriole Park at Camden Yards,Line: NYY -190
1,2022-05-19,https://www.espn.com/mlb/game?gameId=401354810,https://www.espn.com/mlb/player/_/id/32055,https://www.espn.com/mlb/player/_/id/31053,https://www.espn.com/mlb/team/_/name/sd/san-di...,https://www.espn.com/mlb/team/_/name/phi/phila...,2022-05-19 12:05:00-05:00,Citizens Bank Park,Line: EVEN
2,2022-05-19,https://www.espn.com/mlb/game?gameId=401354812,https://www.espn.com/mlb/player/_/id/38680,https://www.espn.com/mlb/player/_/id/33148,https://www.espn.com/mlb/team/_/name/stl/st-lo...,https://www.espn.com/mlb/team/_/name/nym/new-y...,2022-05-19 12:10:00-05:00,Citi FieldCoverageESPN,Line: NYM -185
3,2022-05-19,https://www.espn.com/mlb/game?gameId=401439510,https://www.espn.com/mlb/player/_/id/34973,https://www.espn.com/mlb/player/_/id/39875,https://www.espn.com/mlb/team/_/name/cin/cinci...,https://www.espn.com/mlb/team/_/name/cle/cleve...,2022-05-19 12:10:00-05:00,Progressive Field,Line: CLE -130
4,2022-05-19,https://www.espn.com/mlb/game?gameId=401354808,https://www.espn.com/mlb/player/_/id/31878,https://www.espn.com/mlb/player/_/id/41233,https://www.espn.com/mlb/team/_/name/chw/chica...,https://www.espn.com/mlb/team/_/name/kc/kansas...,2022-05-19 13:10:00-05:00,Kauffman Stadium,Line: CHW -140
5,2022-05-19,https://www.espn.com/mlb/game?gameId=401354811,https://www.espn.com/mlb/player/_/id/42406,https://www.espn.com/mlb/player/_/id/6321,https://www.espn.com/mlb/team/_/name/sea/seatt...,https://www.espn.com/mlb/team/_/name/bos/bosto...,2022-05-19 18:10:00-05:00,Fenway Park,Line: BOS -125
6,2022-05-19,https://www.espn.com/mlb/game?gameId=401354807,https://www.espn.com/mlb/player/_/id/39910,https://www.espn.com/mlb/player/_/id/32815,https://www.espn.com/mlb/team/_/name/ari/arizo...,https://www.espn.com/mlb/team/_/name/chc/chica...,2022-05-19 18:40:00-05:00,Wrigley Field,Line: CHC -135
7,2022-05-19,https://www.espn.com/mlb/game?gameId=401354813,https://www.espn.com/mlb/player/_/id/41409,https://www.espn.com/mlb/player/_/id/36581,https://www.espn.com/mlb/team/_/name/tex/texas...,https://www.espn.com/mlb/team/_/name/hou/houst...,2022-05-19 19:10:00-05:00,Minute Maid Park,Line: HOU -215
8,2022-05-20,https://www.espn.com/mlb/game?gameId=401354814,,,,,NaT,,
9,2022-05-20,https://www.espn.com/mlb/game?gameId=401354825,,,,,NaT,,
