In [35]:
import asyncio
import os
import time

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout

In [36]:
# Seasons to scrape: 2012 through 2022 inclusive (range's upper bound is exclusive).
SEASONS = [*range(2012, 2023)]

In [37]:
# Display the list to confirm it runs from 2012 through 2022 inclusive.
SEASONS

[2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

In [38]:
# Folders where the scraped tables are stored: one sub-folder of DATA_DIR per table type.
DATA_DIR = "data"
(
    TEAM_BATTING_DIR,
    TEAM_FIELDING_DIR,
    TEAM_PITCHING_DIR,
    TEAM_SCHEDULE_DIR,
    SEASON_SUMMARY_DIR,
    SEASON_STATS_DIR,
    WINS_ABOVE_AVG_POSITION_DIR,
) = (
    os.path.join(DATA_DIR, subfolder)
    for subfolder in (
        "team_batting",
        "team_fielding",
        "team_pitching",
        "team_schedule",
        "season_summary",
        "season_stats",
        "wins_above_avg_position",
    )
)

Two types of URLs are used in the scraping portion of the project:

- `URL_1` is used to scrape the batting, fielding, pitching, and wins_above tables for each season.

- `URL_2` is used to scrape the schedule and season summary for each team in each season.

In [39]:
# URL templates. Fill in the placeholders with str.format, for example:
#   URL_1.format(Season_Year=2017)
#   URL_2.format(Team_Name=team, Season_Year=2017)
# These are plain strings on purpose: an f-string would be rendered right here,
# baking the empty placeholder values below into the URL.
Season_Year = ""
Team_Name = ""

URL_1 = "https://www.baseball-reference.com/leagues/majors/{Season_Year}.shtml"

URL_2 = "https://www.baseball-reference.com/teams/{Team_Name}/{Season_Year}-schedule-scores.shtml"

In [40]:
async def get_html(url, selector, sleep=5, retries=3):
    """Load a page in Firefox via Playwright and return one element's inner HTML.

    Each attempt is preceded by a pause of ``sleep * attempt_number`` seconds,
    so the wait grows linearly with every retry. Only Playwright timeouts are
    retried; any other exception propagates to the caller.

    Args:
        url: Address of the page to load.
        selector: Selector of the element whose inner HTML is wanted.
        sleep: Base pause in seconds; attempt ``i`` waits ``sleep * i``.
        retries: Maximum number of attempts.

    Returns:
        The inner HTML of the selected element, or ``None`` if every attempt
        timed out (or ``retries`` is less than 1).
    """
    html = None
    for i in range(1, retries + 1):
        # asyncio.sleep hands control back to the event loop while waiting;
        # time.sleep would freeze the whole loop (and the notebook kernel).
        await asyncio.sleep(sleep * i)
        try:
            print(f"Starting scrape. Attempt {i}")
            async with async_playwright() as p:
                browser = await p.firefox.launch()
                try:
                    page = await browser.new_page()
                    await page.goto(url)
                    print(await page.title())
                    html = await page.inner_html(selector)
                finally:
                    # Release the browser on success and on timeout alike.
                    await browser.close()
        except PlaywrightTimeout:
            print(f"Timeout error on {url}")
            continue
        else:
            break
    return html

In [43]:
async def scrape_season_stats(Season_Year):
    """Download one season's league page and save its ``#content`` HTML to disk.

    The markup is written to ``SEASON_STATS_DIR/<Season_Year>.shtml`` as UTF-8.
    The directory is created if it does not exist yet.

    Args:
        Season_Year: Season to scrape, e.g. ``2017``.

    Returns:
        None. When ``get_html`` returns ``None`` a message is printed and no
        file is written, so a failed scrape never leaves an empty file behind.
    """
    url = f"https://www.baseball-reference.com/leagues/majors/{Season_Year}.shtml"
    save_path = os.path.join(SEASON_STATS_DIR, url.split("/")[-1])

    html = await get_html(url, "#content")
    if html is None:
        # Writing None would raise TypeError after the file was already truncated.
        print(f"No HTML retrieved for {url}; skipping {save_path}")
        return

    os.makedirs(SEASON_STATS_DIR, exist_ok=True)
    # Explicit encoding: the platform default may not be able to encode the page text.
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(html)

In [42]:
# Smoke test: scrape a single season before looping over all of them.
season = 2017
await scrape_season_stats(season)

Starting scrape. Attempt 1
Timeout error on https://www.baseball-reference.com/leagues/majors/2017.shtml
Starting scrape. Attempt 2
Timeout error on https://www.baseball-reference.com/leagues/majors/2017.shtml
Starting scrape. Attempt 3


CancelledError: 

In [21]:
# One-time setup: uncomment and run to download the browser binaries Playwright needs.
#!playwright install

In [31]:
# Loop through the SEASONS list and scrape each season's data, one season at a time.

for season in SEASONS:
    await scrape_season_stats(season)

Timeout error on https://www.baseball-reference.com/leagues/majors/2012.shtml


CancelledError: 