In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:

import os

# Every file this notebook touches lives in ./data under the current working
# directory. Create the directory up front: open(..., 'w') raises
# FileNotFoundError when the parent directory is missing.
DATA_DIR = os.path.join(os.getcwd(), "data")
os.makedirs(DATA_DIR, exist_ok=True)

LINKS_FILE = os.path.join(DATA_DIR, "links.txt")
df_before_cleanup_path = os.path.join(DATA_DIR, "df_before_cleanup.csv")
df_w_features_path = os.path.join(DATA_DIR, "df_w_features.csv")
game_summaries_path = os.path.join(DATA_DIR, "game_summaries.csv")
teams_path = os.path.join(DATA_DIR, "teams.csv")
pitching_path = os.path.join(DATA_DIR, "pitching.csv")
pitchers_path = os.path.join(DATA_DIR, "pitchers.csv")
batting_path = os.path.join(DATA_DIR, "batting.csv")

# WARNING: destructive. Opening in 'w' mode truncates each file, so re-running
# this cell wipes any previously collected data. LINKS_FILE is appended to by a
# later cell, which is why it has to start out empty.
# NOTE(review): the CSV files are presumably filled by the scrape module - confirm.
for _path in (
    pitching_path,
    pitchers_path,
    batting_path,
    game_summaries_path,
    teams_path,
    df_w_features_path,
    LINKS_FILE,
    df_before_cleanup_path,
):
    with open(_path, 'w'):
        pass

In [4]:
import warnings
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to local modules are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides DeprecationWarnings from the libraries in use.
warnings.filterwarnings('ignore')
# Explicitly ignore FutureWarning as well (already covered by the blanket
# filter above, so this line is redundant but harmless).
warnings.simplefilter(action='ignore', category=FutureWarning)


In [5]:
import threading
import queue

In [6]:
import scrape_in_code_folder as scrape

# Get links for individual games

In [7]:
# fill the queue with dates that we need games from
day_q = queue.Queue(maxsize=0)

# Exclusive start: the first date queued is the day AFTER get_day.
get_day = pd.to_datetime('2020-07-15')

# Inclusive end: yesterday at midnight.
# pd.datetime was removed in pandas 2.0, so use pd.Timestamp.now() instead.
# Normalizing to midnight keeps the range from running past yesterday; the old
# increment-then-enqueue loop also queued today's date.
yesterday = pd.Timestamp.now().normalize() - pd.Timedelta(days=1)

# Keep March through November only: baseball doesn't happen before March or
# in December. An empty range (start after yesterday) yields an empty list.
days = [
    day
    for day in pd.date_range(get_day + pd.Timedelta(days=1), yesterday, freq='D')
    if 3 <= day.month <= 11
]
for day in days:
    day_q.put(day)
day_q.qsize()


491

In [8]:
def do_work(q, lock):
    """Worker loop: collect the game links for every day waiting in ``q``.

    Takes days off ``q`` until it is empty, asks ``scrape.get_game_links`` for
    that day's links and appends them, one per line, to ``LINKS_FILE``.

    Args:
        q: ``queue.Queue`` of days to look up. Every item taken is acknowledged
            with ``task_done()`` exactly once, even when processing fails, so a
            ``q.join()`` in the caller cannot hang on a failed day.
        lock: lock shared by all workers; held while appending to ``LINKS_FILE``
            so lines written by different threads do not interleave.

    Returns:
        None. A day whose lookup or write raises is reported on stdout and
        skipped; the worker carries on with the next day.
    """
    while True:
        # get_nowait() rather than `empty()` followed by `get()`: another
        # worker can take the last item between those two calls, leaving this
        # thread blocked in get() forever.
        try:
            day = q.get_nowait()
        except queue.Empty:
            return

        try:
            games = scrape.get_game_links(day)

            #some days have no games
            if len(games)==0:
                print(f"{day} no games.")
                continue

            #save the games to disk
            with lock:
                with open(LINKS_FILE, "a") as myfile:
                    myfile.writelines(g+'\n' for g in games)
            print(f"{day} done.")
        except Exception as exc:
            # Report and move on: one bad day should not kill the worker.
            print(f"{day} failed: {exc!r}")
        finally:
            # Runs on success, on the `continue` above and on failure.
            q.task_done()

In [9]:
# Number of concurrent workers. Per the original note this is the number of
# Firefox windows (presumably one per worker - confirm in the scrape module).
num_threads = 25    # num of firefox windows
# One lock shared by every worker; do_work holds it while writing to disk.
lock = threading.Lock()

#start the workers
for i in range(num_threads):
    # daemon=True replaces Thread.setDaemon(), which is deprecated since
    # Python 3.10. Daemon threads do not keep the interpreter alive.
    worker = threading.Thread(target=do_work, args=(day_q, lock), daemon=True)
    worker.start()

#wait for workers to finish
# join() returns once task_done() has been called for every queued day.
day_q.join()

2020-07-17 00:00:00 no games.
2020-07-22 00:00:00 no games.
2020-07-16 00:00:00 no games.
2020-07-30 00:00:00 done.
2020-07-18 00:00:00 no games.
2020-07-28 00:00:00 done.
2020-07-19 00:00:00 no games.
2020-08-03 00:00:00 done.
2020-07-29 00:00:00 done.
2020-07-21 00:00:00 no games.
2020-08-06 00:00:00 done.
2020-08-04 00:00:00 done.
2020-07-20 00:00:00 no games.
2020-07-31 00:00:00 done.
2020-08-09 00:00:00 done.
2020-08-01 00:00:00 done.
2020-07-26 00:00:00 done.
2020-07-24 00:00:00 done.
2020-07-23 00:00:00 done.
2020-08-08 00:00:00 done.
2020-08-02 00:00:00 done.
2020-08-07 00:00:00 done.
2020-07-27 00:00:00 done.
2020-08-05 00:00:00 done.
2020-07-25 00:00:00 done.
2020-08-10 00:00:00 done.
2020-08-13 00:00:00 done.
2020-08-17 00:00:00 done.
2020-08-11 00:00:00 done.
2020-08-25 00:00:00 done.
2020-08-27 00:00:00 done.
2020-09-03 00:00:00 done.
2020-08-19 00:00:00 done.
2020-08-26 00:00:00 done.
2020-08-30 00:00:00 done.
2020-08-12 00:00:00 done.
2020-08-18 00:00:00 done.
2020-08-20

# Parse individual game links

In [10]:
# Load every collected game link (one per line in LINKS_FILE, surrounding
# whitespace stripped) and queue it for the worker threads.
q = queue.Queue()
with open(LINKS_FILE, 'r') as links_file:
    links = [line.strip() for line in links_file]
for link in links:
    q.put(link)
q.qsize()

3960

In [11]:
def do_work(q, lock):
    """Worker loop: process every game link waiting in ``q``.

    NOTE: this redefines (shadows) the ``do_work`` declared earlier in the
    notebook for the link-collection step.

    Args:
        q: ``queue.Queue`` of game links. Every item taken is acknowledged with
            ``task_done()`` exactly once, even when processing fails, so a
            ``q.join()`` in the caller cannot hang on a failed link.
        lock: lock shared by all workers; passed straight through to
            ``scrape.process_link``.

    Returns:
        None. A link whose processing raises is reported on stdout and
        skipped; the worker carries on with the next link.
    """
    while True:
        # get_nowait() rather than `empty()` followed by `get()`: another
        # worker can take the last item between those two calls, leaving this
        # thread blocked in get() forever.
        try:
            link = q.get_nowait()
        except queue.Empty:
            return

        try:
            scrape.process_link(link,lock)
        except Exception as exc:
            # Report and move on: one bad link should not kill the worker.
            print(f"{link} failed: {exc!r}")
        finally:
            q.task_done()

In [12]:
# Number of concurrent workers. Per the original note this is the number of
# Firefox windows (presumably one per worker - confirm in the scrape module).
num_threads = 25   # num of firefox windows
# One lock shared by every worker; do_work hands it to scrape.process_link.
lock = threading.Lock()

#start the workers
for i in range(num_threads):
    # daemon=True replaces Thread.setDaemon(), which is deprecated since
    # Python 3.10. Daemon threads do not keep the interpreter alive.
    worker = threading.Thread(target=do_work, args=(q, lock), daemon=True)
    worker.start()

#wait for workers to finish
# join() returns once task_done() has been called for every queued link.
q.join()

https://www.baseball-reference.com/boxes/WAS/WAS202007300.shtml
https://www.baseball-reference.com/boxes/BAL/BAL202007300.shtml
https://www.baseball-reference.com/boxes/MIN/MIN202007300.shtml
https://www.baseball-reference.com/boxes/NYN/NYN202007300.shtml
https://www.baseball-reference.com/boxes/ATL/ATL202007300.shtml
https://www.baseball-reference.com/boxes/DET/DET202007300.shtml
https://www.baseball-reference.com/boxes/ANA/ANA202007300.shtml
https://www.baseball-reference.com/boxes/ARI/ARI202007300.shtml
https://www.baseball-reference.com/boxes/SFN/SFN202007300.shtml
https://www.baseball-reference.com/boxes/CLE/CLE202007282.shtml
https://www.baseball-reference.com/boxes/CLE/CLE202007281.shtml
https://www.baseball-reference.com/boxes/WAS/WAS202007280.shtml
https://www.baseball-reference.com/boxes/CIN/CIN202007280.shtml
https://www.baseball-reference.com/boxes/TBA/TBA202007280.shtml
https://www.baseball-reference.com/boxes/PIT/PIT202007280.shtml
https://www.baseball-reference.com/boxes