In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from bs4 import BeautifulSoup as bs

import re
import threading
import queue
import time

import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
driver = webdriver.Chrome('./chromedriver')

In [3]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
def get_box_data(day, driver):
    url = 'https://www.mlb.com/scores/%s' % day
    driver.get(url)
    time.sleep(5) # give it a couple of more second to load
    soup = bs(driver.page_source.encode("utf-8"), "lxml")
    games = []
    matchups = soup.findAll('div',{'class':'sc-pliRl kBFOtq'})
    for i in matchups:
        game = {}
        team_names = i.findAll('div',{'class':'sc-prorn gDCUMt'})
        game['away_team_name'] = team_names[0].text.strip('')
        game['home_team_name'] = team_names[1].text.strip('')

        team_record = i.findAll('div',{'class':'sc-fzqOul sc-fzoCCn lxBud'})
        game['away_team_wins'] = team_record[0].text.split('-')[0]
        game['away_team_losses'] = team_record[0].text.split('-')[1]
        game['home_team_wins'] = team_record[1].text.split('-')[0]
        game['home_team_losses'] = team_record[1].text.split('-')[1]

        away_runs = i.findAll('div',{'class':'sc-fznNTe bxeZNG'})
        home_runs = i.findAll('div',{'class':'sc-fznNTe jUJTWQ'})
        if len(away_runs) > 0:
            game['away_team_runs'] = away_runs[0].contents[0]
            game['home_team_runs'] = home_runs[0].contents[0]
        else:
            game['away_team_runs'] = 'Game Cancelled'
            game['home_team_runs'] = 'Game Cancelled'

        away_hits = i.findAll('div',{'class':'sc-fznNTe iXeXYx'})
        home_hits = i.findAll('div',{'class':'sc-fznNTe ijaSXv'})
        if len(away_hits) > 0:
            game['away_team_hits'] = away_hits[0].contents[0]
            game['home_team_hits'] = home_hits[0].contents[0]
        else:
            game['away_team_hits'] = 'Game Cancelled'
            game['home_team_hits'] = 'Game Cancelled'

        away_errors = i.findAll('div',{'class':'sc-fznNTe iXeXYx'})
        home_errors = i.findAll('div',{'class':'sc-fznNTe ijaSXv'})
        if len(away_errors) > 0:
            game['away_team_errors'] = away_errors[1].contents[0]
            game['home_team_errors'] = home_errors[1].contents[0]
        else:
            game['away_team_errors'] = 'Game Cancelled'
            game['home_team_errors'] = 'Game Cancelled'

        team_pitcher = i.findAll('div',{'class':'sc-pIJJz hMgjpE'})
        pitcher_stats = i.findAll('div',{'class':'sc-pANHa rtjrj'})
        try:
            pattern = "(\d+)\s\-\s(\d+),\s(\d+\.\d+)\sERA"
            winning_pitcher = team_pitcher[0].text
            g = re.search(pattern, pitcher_stats[0].text)
            game['winning_pitcher'] = winning_pitcher
            game['winning_pitcher_wins'] = g.group(1)
            game['winning_pitcher_losses'] = g.group(2)
            game['winning_pitcher_era'] = g.group(3)
            g = re.search(pattern, pitcher_stats[1].text)
            losing_pitcher = team_pitcher[1].text
            game['losing_pitcher'] = losing_pitcher
            game['losing_pitcher_wins'] = g.group(1)
            game['losing_pitcher_losses'] = g.group(2)
            game['losing_pitcher_era'] = g.group(3)
        except:
            pass

        games.append(game)
    return games

In [5]:
def do_work(q, lock):
    #start a web browser
    driver = webdriver.Chrome('./chromedriver')
    driver.implicitly_wait(10)
    
    #start working through the queue
    while not q.empty():
        day = q.get()
        games = get_box_data(day, driver)
        #some days have no games
        if len(games) == 0:
            q.task_done()
            print(f"{day} no games.")
            continue
        
        new_games = pd.DataFrame(games)
        new_games['date']=day
        
        #save the games to disk
        lock.acquire()
        try:
            game_df = pd.read_csv('data.csv', low_memory=False)
        except:
            game_df = pd.DataFrame()
        game_df = pd.concat([game_df,new_games])
        game_df['date'] = pd.to_datetime(game_df.date).dt.date
        game_df.to_csv('data.csv', index=False)
        lock.release()
        
        q.task_done()
        print(f"{day} done.")
    driver.quit()

In [6]:
#fill the queue with dates that we need games from
q = queue.Queue(maxsize=0)

#get last date from disk if we've already saved some data
get_day = '2018-03-29'
    
# fill queue with all the dates until yesterday
yesterday = pd.datetime.now().date() - pd.Timedelta(days=1)
days = []
while  pd.to_datetime(get_day).date() < yesterday:
    get_day = (pd.to_datetime(get_day).date() + pd.Timedelta(days=1)).strftime('%Y-%m-%d')
    if pd.to_datetime(get_day).month<3:continue    # baseball doesn't happen before march
    if pd.to_datetime(get_day).month>11:continue   # baseball doesn't happen in december
    q.put(get_day)
q.qsize()

764

In [None]:
num_threads = 3    # num of firefox windows
lock = threading.Lock()

#start the workers
for i in tqdm(range(num_threads)):
    worker = threading.Thread(target=do_work, args=(q,lock,))
    worker.setDaemon(True)
    worker.start()
        
#wait for workers to finish
q.join()

100%|████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<?, ?it/s]


2018-04-01 done.
2018-03-30 done.
2018-03-31 done.


Exception in thread Thread-7:
Traceback (most recent call last):
  File "c:\users\jaype\miniconda3\lib\threading.py", line 932, in _bootstrap_inner
    self.run()
  File "c:\users\jaype\miniconda3\lib\threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-5-fc531f6e74c0>", line 9, in do_work
  File "<ipython-input-4-caf6badcde6e>", line 5, in get_box_data
  File "c:\users\jaype\miniconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 679, in page_source
    return self.execute(Command.GET_PAGE_SOURCE)['value']
  File "c:\users\jaype\miniconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "c:\users\jaype\miniconda3\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 242, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.WebDriverException: Message: unknown error: session 