In [1]:
import requests, bs4, re
from bs4 import BeautifulSoup as bs
import pandas as pd
from bs4 import Comment
import time
import random
from fake_useragent import UserAgent
import pickle


## Get the data via web scraping.

In [2]:
# Team codes that are substituted into the roster URL below.
teams = ["buf", "mia", "nwe", "nyj", "pit", "rav", "cle", "cin", "oti", "clt", "htx", "jax", "kan", "rai", 
         "den", "sdg", "was", "nyg", "dal", "phi", "gnb", "chi", "min", "det", "nor", "tam", "atl", "car", 
         "sea", "ram", "crd", "sfo"]

# One dict per player (column header -> cell text), accumulated across all teams.
data_accum = []

# Use random user agent when scraping.
ua = UserAgent()
user_agent = {'User-agent': ua.random}

for i, t in enumerate(teams, start=1):
    # Add (1.5-15sec) delay that makes the web scraping more human-like.
    # Sleeping *before* every request except the first keeps the pause in place
    # even when a team is skipped below, and avoids a pointless wait after the
    # final team.
    if i > 1:
        delay = 1.5*random.randint(1,10)
        time.sleep(delay)

    url = f"https://www.pro-football-reference.com/teams/{t}/2020_roster.htm"

    # Bound the request so a single stalled connection cannot hang the whole run.
    response = requests.get(url, headers = user_agent, timeout = 30)
    status = response.status_code
    if status != 200:
        # Skip this team entirely: carrying on would either fail on an undefined
        # `soup` or silently re-parse the previous team's page under this team's code.
        print(f"Oops! Received status code {status}")
        print(f"Skipping team {t}.")
        continue

    page = response.text
    soup = bs(page, "html.parser")

    # The table is in a comment so parse it again with BeautifulSoup.
    temp_table = soup.find(attrs = {"class":"table_wrapper", "id":"all_roster"})
    comment = None
    if temp_table is not None:
        comment = temp_table.find(string=lambda text: isinstance(text, Comment))
    if comment is None:
        print(f"Oops! Could not find the roster table for team {t}. Skipping.")
        continue
    commentSoup = bs(comment, "html.parser")

    # Now sort through the web page.
    rows = commentSoup.find_all("tr")
    if not rows:
        print(f"Oops! Roster table for team {t} has no rows. Skipping.")
        continue

    # Get column names.
    column_headers = [x.text for x in rows[0].find_all("th")]
    column_headers.extend(["Team", "Record"])

    # From original page get the team's record. It is the same for every player
    # on the team, so look it up once per page rather than once per row.
    record = soup.find(string=re.compile("Record")).next_element.split(",")[0].strip()

    # Get player stats. Rows are collected per team first and only added to the
    # accumulator once the whole page parsed, so a failure mid-page cannot leave
    # a partial roster behind.
    players = []
    for e in rows[1:]:
        # Table has uniform number as "th".
        temp = [e.find("th").text] + [x.text for x in e.find_all("td")]
        temp.append(t)
        temp.append(record)
        # Convert collected data to dictionary.
        players.append(dict(zip(column_headers, temp)))

    # Add this team's players to the list accumulator.
    data_accum.extend(players)

    print("Completed", i, "of", len(teams), "teams")

Completed 1 of 32 teams
Completed 2 of 32 teams
Completed 3 of 32 teams
Completed 4 of 32 teams
Completed 5 of 32 teams
Completed 6 of 32 teams
Completed 7 of 32 teams
Completed 8 of 32 teams
Completed 9 of 32 teams
Completed 10 of 32 teams
Completed 11 of 32 teams
Completed 12 of 32 teams
Completed 13 of 32 teams
Completed 14 of 32 teams
Completed 15 of 32 teams
Completed 16 of 32 teams
Completed 17 of 32 teams
Completed 18 of 32 teams
Completed 19 of 32 teams
Completed 20 of 32 teams
Completed 21 of 32 teams
Completed 22 of 32 teams
Completed 23 of 32 teams
Completed 24 of 32 teams
Completed 25 of 32 teams
Completed 26 of 32 teams
Completed 27 of 32 teams
Completed 28 of 32 teams
Completed 29 of 32 teams
Completed 30 of 32 teams
Completed 31 of 32 teams
Completed 32 of 32 teams


In [3]:
# Build one DataFrame from the accumulated per-player records, then
# serialize it to disk with pickle.
df = pd.DataFrame(data_accum)

pickle_path = './data/web_scraped_dataframe.pickle'
with open(pickle_path, 'wb') as out_file:
    pickle.dump(df, out_file)