In [1]:
import requests
import pandas as pd
import random
import time
from io import StringIO
from bs4 import BeautifulSoup

In [2]:
def getTeamUrls(stats_url):
    start = time.time()
    data = requests.get(stats_url)
    delay = time.time() - start
    time.sleep(random.randint(60, 80)*delay)

    soup = BeautifulSoup(data.text)
    afc_standings = soup.select('table.stats_table')
    afc_team_links = []
    for tag in afc_standings:
        links = tag.find_all('a')
        afc_team_links.extend(links)
    afc_team_links = [link.get("href") for link in afc_team_links]
    
    
    
    nfc_standings = soup.select('#NFC')
    nfc_team_links = []
    for tag in nfc_standings:
        links = tag.find_all('a')
        nfc_team_links.extend(links)
    nfc_team_links = [link.get("href") for link in nfc_team_links]
    team_links = afc_team_links + nfc_team_links
    team_urls = [f"https://pro-football-reference.com{link}" for link in team_links]
    return team_urls

In [3]:
#Making pandas database out of Game Results and Basic Stats
def makeTeamDb (team_url):
    start = time.time()
    data = requests.get(team_url)
    delay = time.time() - start
    time.sleep(random.randint(60, 80)*delay)
    
    matches = pd.read_html(StringIO(data.text), match = "Schedule & Game Results Table")[0]

    #Sanitizing for teams that went to playoffs (removes playoff rows)
    while len(matches) > 18:
        print(len(matches))
        matches = matches.drop([18])
        matches = matches.reset_index(drop = 'true')

    
    #Adding names for unnamed columns manually
    matches = matches.rename(columns={'Unnamed: 3_level_1' : 'Time', 'Unnamed: 4_level_1' : 'Game Link', 'Unnamed: 5_level_1' : 'Result',
                                      'Unnamed: 8_level_1' : 'Home/Away'},)
    
    
    #Renaming columns 
    new_columns = []
    for column in matches.columns:
        if column[0] == 'Offense':
            new_columns.append('Off' + column[1])
                
        elif column[0] == 'Defense':
            new_columns.append('Def' + column[1])
    
        elif column[0] == 'Expected Points':
            new_columns.append('EP' + column[1][:3])
    
        elif column[0] == 'Score':
            new_columns.append(column[1] + 'Sc')
    
        else:
            new_columns.append(column[1])
    
    matches.columns = new_columns
    
    
    #Fixing empty data values
    matches['OT'] = matches['OT'].apply(lambda x : 'N' if pd.isna(x) or x=='N' else 'Y')
    matches['Result'] = matches['Result'].apply(lambda x : '' if pd.isna(x) or x=='' else x)
    matches['Rec'] = matches['Rec'].apply(lambda x : '' if pd.isna(x) or x=='' else x)
    matches['Home/Away'] = matches['Home/Away'].apply(lambda x : 'H' if pd.isna(x) or x=='H' else 'A')
    
    #Fixing empty data numbers to 0.0
    for column in matches.columns[13:]:
        matches[column] = matches[column].apply(lambda x : '0.0' if pd.isna(x) or x=='0.0' else x)
    
    #Fixing links to more detailed game stats
    data = requests.get(team_url)
    soup = BeautifulSoup(data.text)
    game_table = soup.select('#games')
    game_links = []
    for tag in game_table:
        links = tag.find_all('a')
        game_links.extend(links)
    
    game_links = [link.get("href") for link in game_links]
    game_links = [link for link in game_links if '/boxscores' in link]
    game_links = [f"https://pro-football-reference.com{link}" for link in game_links]

    #Drops bye week to ensure proper adding of game_links and to remove it from the dataframes
    bye_week = 0
    while (matches['Opp'][bye_week] != 'Bye Week'):
        bye_week+=1

    matches = matches.drop([bye_week])
    matches = matches.reset_index(drop = 'true')


    #Sanitizes game_links to ensure that playoff game links are removed as the dataframe won't contain playoff matches
    game_links = game_links[:17]
    matches['Game Link'] = game_links

    
    #Finding the team name and adding it to the table
    soup = BeautifulSoup(data.text)
    team_name = soup.find_all('h1')[0].find_all('span')[1].text
    matches['Team'] = team_name
    
    #Finally calculating and adding the year to the table
    year = team_url.split('/')[-1].replace('.htm', '')
    matches['Season'] = year
    return matches

In [4]:
years = list(range(2024,2021, -1))

In [5]:
all_matches = [] #list of dataframes of all the game data for each year for each team

for year in years: 
    stats_url = 'https://www.pro-football-reference.com/years/' + str(year) + '/index.htm' #sets the url to the current year's url (updates as the year changes)
    
    time.sleep(random.randint(60,80)) #extra sleep in between years to be safe
    
    team_urls = getTeamUrls(stats_url) #gets all the team urls from the page

    
    for team_url in team_urls:
        print(team_url) #used to monitor how quickly scraping occurs
        db = makeTeamDb(team_url)
        all_matches.append(db)



match_df = pd.concat(all_matches)
match_df.to_csv("matches.csv")
match_df

https://pro-football-reference.com/teams/buf/2024.htm
https://pro-football-reference.com/teams/mia/2024.htm
https://pro-football-reference.com/teams/nyj/2024.htm
https://pro-football-reference.com/teams/nwe/2024.htm
https://pro-football-reference.com/teams/pit/2024.htm
https://pro-football-reference.com/teams/rav/2024.htm
https://pro-football-reference.com/teams/cin/2024.htm
https://pro-football-reference.com/teams/cle/2024.htm
https://pro-football-reference.com/teams/htx/2024.htm
https://pro-football-reference.com/teams/clt/2024.htm
https://pro-football-reference.com/teams/jax/2024.htm
https://pro-football-reference.com/teams/oti/2024.htm
https://pro-football-reference.com/teams/kan/2024.htm
https://pro-football-reference.com/teams/sdg/2024.htm
https://pro-football-reference.com/teams/den/2024.htm
https://pro-football-reference.com/teams/rai/2024.htm
https://pro-football-reference.com/teams/phi/2024.htm
https://pro-football-reference.com/teams/was/2024.htm
https://pro-football-referen

Unnamed: 0,Week,Day,Date,Time,Game Link,Result,OT,Rec,Home/Away,Opp,...,Def1stD,DefTotYd,DefPassY,DefRushY,DefTO,EPOff,EPDef,EPSp.,Team,Season
0,1,Sun,September 8,1:00PM ET,https://pro-football-reference.com/boxscores/2...,W,N,1-0,H,Arizona Cardinals,...,18.0,270.0,146.0,124.0,1.0,13.51,-3.22,-2.25,Buffalo Bills,2024
1,2,Thu,September 12,8:15PM ET,https://pro-football-reference.com/boxscores/2...,W,N,2-0,A,Miami Dolphins,...,20.0,351.0,212.0,139.0,3.0,8.59,14.11,-2.53,Buffalo Bills,2024
2,3,Mon,September 23,7:30PM ET,https://pro-football-reference.com/boxscores/2...,W,N,3-0,H,Jacksonville Jaguars,...,19.0,239.0,147.0,92.0,2.0,28.39,12.44,-6.15,Buffalo Bills,2024
3,4,Sun,September 29,8:20PM ET,https://pro-football-reference.com/boxscores/2...,L,N,3-1,A,Baltimore Ravens,...,22.0,427.0,156.0,271.0,1.0,-6.92,-19.69,2.37,Buffalo Bills,2024
4,5,Sun,October 6,1:00PM ET,https://pro-football-reference.com/boxscores/2...,L,N,3-2,A,Houston Texans,...,18.0,425.0,331.0,94.0,2.0,-1.59,0.68,-2.09,Buffalo Bills,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,14,Mon,December 12,8:15PM ET,https://pro-football-reference.com/boxscores/2...,L,N,4-9,H,New England Patriots,...,18.0,328.0,225.0,103.0,1.0,-17.43,0.47,0.9,Arizona Cardinals,2022
13,15,Sun,December 18,4:05PM ET,https://pro-football-reference.com/boxscores/2...,L,N,4-10,A,Denver Broncos,...,23.0,324.0,156.0,168.0,2.0,-19.01,0.79,6.6,Arizona Cardinals,2022
14,16,Sun,December 25,8:20PM ET,https://pro-football-reference.com/boxscores/2...,L,Y,4-11,H,Tampa Bay Buccaneers,...,21.0,396.0,281.0,115.0,2.0,-13.64,4.6,6.77,Arizona Cardinals,2022
15,17,Sun,January 1,1:00PM ET,https://pro-football-reference.com/boxscores/2...,L,N,4-12,A,Atlanta Falcons,...,20.0,298.0,166.0,132.0,1.0,6.1,-3.27,-5.1,Arizona Cardinals,2022


In [9]:
match_df.to_csv('matches.csv', index=False, encoding='utf-8')


In [11]:
#code to save/cache all the html urls to this project in the html folder in case website goes down

#team_urls = getTeamUrls(stats_url)
#team_urls = ['https://www.pro-football-reference.com/teams/cle/2024.htm']


#for team_url in team_urls:
    #start = time.time()
    #data = requests.get(team_url)
    #delay = time.time() - start
    #time.sleep(random.randint(20, 60)*delay)
    #team_abr = team_url.split('/')[-2]
    #year = team_url.split('/')[-1].replace('.htm', '')[-2:]
    #with open ('./htmls/'+ team_abr + year + '.txt', 'w') as f:
        #f.write(data.text)

#with open ('./htmls/' + team_abr + '.txt', 'w') as f:
    #f.write("Goodbye")