In [20]:
import os
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
import time

In [13]:
#We need to grab seasons 2012 through 2022
SEASONS = list(range(2012,2023))

In [14]:
SEASONS

[2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

In [21]:
#Initialize where the data tables are going to be stored once they are scrapped.
DATA_DIR = "data"
TEAM_BATTING_DIR = os.path.join(DATA_DIR, "team_batting")
TEAM_FIELDING_DIR = os.path.join(DATA_DIR, "team_fielding")
TEAM_PITCHING_DIR = os.path.join(DATA_DIR, "team_pitching")
TEAM_SCHEDULE_DIR = os.path.join(DATA_DIR, "team_schedule")
SEASON_SUMMARY_DIR = os.path.join(DATA_DIR, "season_summary")
SEASON_STATS_DIR = os.path.join(DATA_DIR, "season_stats")
GAMES_DIR = os.path.join(DATA_DIR, "games")
WINS_ABOVE_AVG_POSITION_DIR = os.path.join(DATA_DIR, "wins_above_avg_position")

There's going to be two types of URLs used in the scrapping portion of the project:
    
    - URL_1 is to be used to scrape the batting, fielding, pitching, and wins_above tables.

    - URL_2 is to be used to scrape the schedule and season summary for each team in each season.

In [16]:
Season_Year = ""
Team_Name = ""

URL_1 = f"https://www.baseball-reference.com/leagues/majors/{Season_Year}.shtml"

URL_2 = f"https://www.baseball-reference.com/teams/{Team_Name}/{Season_Year}-schedule-scores.shtml"

In [28]:
#Function to scrape html from a webpage

async def get_html(url, selector, sleep=3, retries=10):
    html = None
    for i in range(1, retries+1):
        time.sleep(sleep * i)
        try:
            print(f"Scraping page. Attempt: {i}")
            async with async_playwright() as p:
                browser = await p.chromium.launch()
                page = await browser.new_page()
                await page.goto(url)
                print(await page.title())
                html = await page.inner_html(selector)
        except PlaywrightTimeout:
            print(f"Timeout error on {url}")
            continue
        else:
            break
    return html

In [18]:
#Scrape content from each mlb season stats page

async def scrape_single_season_stats(Season):
    
    URL_1 = f"https://www.baseball-reference.com/leagues/majors/{Season}.shtml"
   
    save_path = os.path.join(SEASON_STATS_DIR, URL_1.split("/")[-1])

    html = await get_html(URL_1, "#content")
    with open(save_path, "w+") as f:
        f.write(html)

    return "Completed"
    

In [6]:
#Scrape content from each mlb season stats page

async def scrape_season_stats(Seasons):

    for season in Seasons:
        URL_1 = f"https://www.baseball-reference.com/leagues/majors/{season}.shtml"
       
        save_path = os.path.join(SEASON_STATS_DIR, URL_1.split("/")[-1])
    
        html = await get_html(URL_1, "#content")
        with open(save_path, "w+") as f:
            f.write(html)

    return "Completed"
    

In [19]:
#Test single season scrape
season1 = 2012
await scrape_single_season_stats(season1)

Starting scrape. Attempt 1
Timeout error on https://www.baseball-reference.com/leagues/majors/2012.shtml
Starting scrape. Attempt 2
Timeout error on https://www.baseball-reference.com/leagues/majors/2012.shtml
Starting scrape. Attempt 3
2012 Major League Baseball Team Statistics | Baseball-Reference.com


'Completed'

In [21]:
#!playwright install

In [14]:
#Now loop through Seasons list and scrape each seasons data
season_list = list(range(2019,2023))
await scrape_season_stats(season_list)

Starting scrape. Attempt 1
Timeout error on https://www.baseball-reference.com/leagues/majors/2019.shtml
Starting scrape. Attempt 2
Timeout error on https://www.baseball-reference.com/leagues/majors/2019.shtml
Starting scrape. Attempt 3
2019 Major League Baseball Team Statistics | Baseball-Reference.com
Starting scrape. Attempt 1
Timeout error on https://www.baseball-reference.com/leagues/majors/2020.shtml
Starting scrape. Attempt 2
Timeout error on https://www.baseball-reference.com/leagues/majors/2020.shtml
Starting scrape. Attempt 3
2020 Major League Baseball Team Statistics | Baseball-Reference.com
Starting scrape. Attempt 1
2021 Major League Baseball Team Statistics | Baseball-Reference.com
Starting scrape. Attempt 1
Timeout error on https://www.baseball-reference.com/leagues/majors/2022.shtml
Starting scrape. Attempt 2
2022 Major League Baseball Team Statistics | Baseball-Reference.com


'Completed'

In [15]:
#Now loop through Seasons list and scrape each seasons data
season_list = list(range(2010,2014))
await scrape_season_stats(season_list)

Starting scrape. Attempt 1
2010 Major League Baseball Team Statistics | Baseball-Reference.com
Starting scrape. Attempt 1
Timeout error on https://www.baseball-reference.com/leagues/majors/2011.shtml
Starting scrape. Attempt 2
2011 Major League Baseball Team Statistics | Baseball-Reference.com
Starting scrape. Attempt 1
Timeout error on https://www.baseball-reference.com/leagues/majors/2012.shtml
Starting scrape. Attempt 2
Timeout error on https://www.baseball-reference.com/leagues/majors/2012.shtml
Starting scrape. Attempt 3
Timeout error on https://www.baseball-reference.com/leagues/majors/2012.shtml


TypeError: write() argument must be str, not None

In [40]:
import time

#Scrape content from each mlb game of season. Start with 2012 b/c miami marlines began then
Team_LIST = ['ARI', 'ATL', 'BAL', 'BOS', 'CHC', 'CHW', 'CIN', 'CLE', 'COL', 'DET', 'HOU', 'KAN', 'LAA', 'LAD', 'MIA', 'MIL', 
             'MIN', 'NYM', 'NYY', 'OAK', 'PHI', 'PIT', 'SD', 'SF', 'SEA', 'STL', 'TB','TEX', 'TOR', 'WSN']


# ['ARI', 'ATL', 'BAL', 'BOS', 'CHC', 'CHW', 'CIN', 'CLE', 'COL', 'DET', 'HOU', 'KAN', 'LAA', 'LAD', 'MIA', 'MIL', 'MIN', 'NYM', 'NYY', 'OAK', 'PHI', 'PIT', 'SD', 'SF', 'SEA', 'STL', 'TB',
#Season = 2012


async def scrape_games(Team_list, Season):
    s_t = time.time()
    for team in Team_list:
        URL_2 = f"https://www.baseball-reference.com/teams/{team}/{Season}-schedule-scores.shtml"

        save_name = f"{team}_{Season}"
        save_path = os.path.join(GAMES_DIR, save_name)
    
        html = await get_html(URL_2, "#content")
        if (html):
            with open(save_path, "w+") as f:
                f.write(html)
    t = (time.time() - s_t)
    print(f"Completed. Total time: {t}")
    return True
    

In [26]:
#Due to complexity, manually scrape each season from 2012 - 2022
#Season = 2016
#await scrape_games(Team_LIST, Season)


One of the big issues I ran into was simple error with the playwright execution. It would work for most of the files, but after my initial build of the data set, I know that I will be missing
1 - 2 files per season and will need to go back and grab those.

In [None]:
#Test single season scrape
#await scrape_games(Team_LIST, Season)

So now I have a full season of data for the 2012 season. I have this data in a series of html files, so the next step will be to preprocess these files.
I want to go through the html and pull out the relavant data tables, and then put that data in an organized file system. I can then use python and SQL to 
configure training and testing datasets. 

Once the script to process and organize the data is complete, I will write a systematic process for gather data from X to Y years, processing that data, and saving everything in valuable way for training & testing models.

Now that we have our scraping script pretty well built, lets try scraping multiple seasons of data, and naming the files in a way that makes it easy to sort afterwards

In [41]:
SEASONS = list(range(2012,2017))

for Season in SEASONS:
    await scrape_games(Team_LIST, Season)

Scraping page. Attempt: 1
2012 Arizona Diamondbacks Schedule | Baseball-Reference.com
Scraping page. Attempt: 1
Timeout error on https://www.baseball-reference.com/teams/ATL/2012-schedule-scores.shtml
Scraping page. Attempt: 2
Timeout error on https://www.baseball-reference.com/teams/ATL/2012-schedule-scores.shtml
Scraping page. Attempt: 3
Timeout error on https://www.baseball-reference.com/teams/ATL/2012-schedule-scores.shtml
Scraping page. Attempt: 4
2012 Atlanta Braves Schedule | Baseball-Reference.com
Scraping page. Attempt: 1
Timeout error on https://www.baseball-reference.com/teams/BAL/2012-schedule-scores.shtml
Scraping page. Attempt: 2
Timeout error on https://www.baseball-reference.com/teams/BAL/2012-schedule-scores.shtml
Scraping page. Attempt: 3
Timeout error on https://www.baseball-reference.com/teams/BAL/2012-schedule-scores.shtml
Scraping page. Attempt: 4
2012 Baltimore Orioles Schedule | Baseball-Reference.com
Scraping page. Attempt: 1
2012 Boston Red Sox Schedule | Base