In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Set year of desired dataset
year = 2023

In [3]:
# create beautifulSoup object for passing
url = "https://www.pro-football-reference.com/years/{}/passing.htm".format(year)
html = urlopen(url)
soup = BeautifulSoup(html)

# get header data
headers = [th.getText() for th in soup.findAll('tr')[0].findAll('th')]
headers = headers[1:]


# get table row data
rows = soup.findAll('tr', class_ = lambda table_rows: table_rows != "thead")
player_stats = [[td.getText() for td in rows[i].findAll('td')]
                for i in range(len(rows))]
player_stats = player_stats[2:]

# create pandas DataFrame object
pass_stats = pd.DataFrame(player_stats, columns = headers)

# drop unnecessary columns
pass_stats = pass_stats.drop(['QBrec', '1D', 'Lng', 'Succ%'], axis=1)
pass_stats = pass_stats.drop(pass_stats.columns[-4:], axis=1) # drop last 4 columns
pass_stats = pass_stats.drop(pass_stats.columns[-2], axis=1) # drop yards lost to sack column
pass_stats = pass_stats.drop(pass_stats.columns[1:6], axis=1) # drop all repeating stats but name

# adjust column headers to be more descriptive
pass_stats = pass_stats.rename(columns={'Att' : 'PassAtt'   , 
                                        'Yds' : 'PassYds'   , 
                                        'TD'  : 'PassTD'    ,
                                        'Y/A' : 'PassYds/A' ,
                                        'Y/G' : 'PassYds/G'  })



pass_stats.head()

Unnamed: 0,Player,Cmp,PassAtt,Cmp%,PassTD,TD%,Int,Int%,PassYds/A,AY/A,Y/C,PassYds/G,Rate,QBR,Sk,Sk%
0,Jared Goff,407,605,67.3,30,5.0,12,2.0,7.6,7.7,11.2,269.1,97.9,60.3,30,4.7
1,Dak Prescott*,410,590,69.5,36,6.1,9,1.5,7.7,8.2,11.0,265.6,105.9,72.7,39,6.2
2,Josh Allen,385,579,66.5,29,5.0,18,3.1,7.4,7.0,11.2,253.3,92.2,69.6,24,4.0
3,Brock Purdy*,308,444,69.4,31,7.0,11,2.5,9.6,9.9,13.9,267.5,113.0,72.7,28,5.9
4,Patrick Mahomes*,401,597,67.2,27,4.5,14,2.3,7.0,6.9,10.4,261.4,92.6,63.0,27,4.3


In [4]:
# create beautifulSoup object for rushing
url = "https://www.pro-football-reference.com/years/{}/rushing.htm".format(year)
html = urlopen(url)
soup = BeautifulSoup(html)

# get header data
headers = [th.getText() for th in soup.findAll('tr')[1].findAll('th')]
headers = headers[1:]
# adjust column headers to be more descriptive

# get table row data
rows = soup.findAll('tr', class_ = lambda table_rows: table_rows != "thead")
player_stats = [[td.getText() for td in rows[i].findAll('td')]
                for i in range(len(rows))]
player_stats = player_stats[2:]

# create pandas DataFrame object
rush_stats = pd.DataFrame(player_stats, columns = headers)

# drop unnecessary columns
rush_stats = rush_stats.drop(['1D', 'Lng', 'Succ%'], axis=1)
rush_stats = rush_stats.drop(rush_stats.columns[1:6], axis=1) # drop all repeating stats but name
rush_stats = rush_stats.drop(rush_stats.columns[-1], axis=1) #remove fumbles

# adjust column headers to be more descriptive
rush_stats = rush_stats.rename(columns={'Att' : 'RushAtt'   , 
                                        'Yds' : 'RushYds'   , 
                                        'TD'  : 'RushTD'    ,
                                        'Y/A' : 'RushYds/A' ,
                                        'Y/G' : 'RushYds/G'  })

rush_stats.head()

Unnamed: 0,Player,RushAtt,RushYds,RushTD,RushYds/A,RushYds/G
0,Derrick Henry*,280,1167,12,4.2,68.6
1,Christian McCaffrey*+,272,1459,14,5.4,91.2
2,Rachaad White,272,990,6,3.6,58.2
3,Travis Etienne,267,1008,11,3.8,59.3
4,Joe Mixon,257,1034,9,4.0,60.8


In [5]:
# create beautifulSoup object for receiving
url = "https://www.pro-football-reference.com/years/{}/receiving.htm".format(year)
html = urlopen(url)
soup = BeautifulSoup(html)

# get header data
headers = [th.getText() for th in soup.findAll('tr')[0].findAll('th')]
headers = headers[1:]
# adjust column headers to be more descriptive

# get table row data
rows = soup.findAll('tr', class_ = lambda table_rows: table_rows != "thead")
player_stats = [[td.getText() for td in rows[i].findAll('td')]
                for i in range(len(rows))]
player_stats = player_stats[2:]

# create pandas DataFrame object
rec_stats = pd.DataFrame(player_stats, columns = headers)

# drop unnecessary columns
rec_stats = rec_stats.drop(['1D', 'Lng', 'Succ%'], axis=1)
rec_stats = rec_stats.drop(rec_stats.columns[1:6], axis=1) # drop all repeating stats but name
rec_stats = rec_stats.drop(rec_stats.columns[-1], axis=1) #remove fumbles

# adjust column headers to be more descriptive
rec_stats = rec_stats.rename(columns={'Yds' : 'RecYds'   , 
                                        'TD'  : 'RecTD'    ,
                                        'Y/G' : 'RecYds/G'  })

rec_stats.head()

Unnamed: 0,Player,Tgt,Rec,Ctch%,RecYds,Y/R,RecTD,Y/Tgt,R/G,RecYds/G
0,Tyreek Hill*+,171,119,69.6%,1799,15.1,13,10.5,7.4,112.4
1,Amon-Ra St. Brown*+,164,119,72.6%,1515,12.7,10,9.2,7.4,94.7
2,Evan Engram*,143,114,79.7%,963,8.4,4,6.7,6.7,56.6
3,Michael Pittman Jr.,156,109,69.9%,1152,10.6,4,7.4,6.8,72.0
4,Keenan Allen*,150,108,72.0%,1243,11.5,7,8.3,8.3,95.6


In [6]:
# create beautifulSoup object for total scrimmage yards
url = "https://www.pro-football-reference.com/years/{}/scrimmage.htm".format(year)
html = urlopen(url)
soup = BeautifulSoup(html)

# get header data
headers = [th.getText() for th in soup.findAll('tr')[1].findAll('th')]
headers = headers[1:]
# adjust column headers to be more descriptive

# get table row data
rows = soup.findAll('tr', class_ = lambda table_rows: table_rows != "thead")
player_stats = [[td.getText() for td in rows[i].findAll('td')]
                for i in range(len(rows))]
player_stats = player_stats[2:]

# create pandas DataFrame object
scrim_stats = pd.DataFrame(player_stats, columns = headers)

# drop unnecessary columns
scrim_stats = scrim_stats.drop(['Succ%'], axis=1)
scrim_stats = scrim_stats.drop(scrim_stats.columns[1:-6], axis=1) #take first and last few columns
scrim_stats = scrim_stats.drop(scrim_stats.columns[-1], axis=1) #remove fumbles

scrim_stats.head()

Unnamed: 0,Player,A/G,Touch,Y/Tch,YScm,RRTD
0,Christian McCaffrey*+,17.0,339,6.0,2023,21
1,CeeDee Lamb*+,0.8,149,12.5,1862,14
2,Tyreek Hill*+,0.4,125,14.5,1814,13
3,Breece Hall,13.1,299,5.3,1585,9
4,Puka Nacua*,0.7,117,13.5,1575,6


In [7]:
# create beautifulSoup object for fantasy stats
url = "https://www.pro-football-reference.com/years/{}/fantasy.htm".format(year)
html = urlopen(url)
soup = BeautifulSoup(html)

# get header data
headers = [th.getText() for th in soup.findAll('tr')[1].findAll('th')]
headers = headers[1:]
# adjust column headers to be more descriptive

# get table row data
rows = soup.findAll('tr', class_ = lambda table_rows: table_rows != "thead")
player_stats = [[td.getText() for td in rows[i].findAll('td')]
                for i in range(len(rows))]
player_stats = player_stats[2:]

# create pandas DataFrame object
fantasy_stats = pd.DataFrame(player_stats, columns = headers)

# drop unnecessary columns
fantasy_stats = fantasy_stats.drop(fantasy_stats.columns[4:-7], axis=1) #take first and last few columns

# adjust column headers to be more descriptive
fantasy_stats = fantasy_stats.rename(columns={'FantPos' : 'Pos'})

fantasy_stats.head()

Unnamed: 0,Player,Tm,Pos,Age,FantPt,PPR,DKPt,FDPt,VBD,PosRank,OvRank
0,Christian McCaffrey*+,SFO,RB,27,324,391.3,399.3,357.8,157,1,1
1,CeeDee Lamb*+,DAL,WR,24,268,403.2,411.2,335.7,131,1,2
2,Josh Allen,BUF,QB,27,393,392.6,420.6,410.6,122,1,3
3,Tyreek Hill*+,MIA,WR,29,257,376.4,380.4,316.9,120,2,4
4,Jalen Hurts*,PHI,QB,25,357,356.8,382.8,371.8,89,2,5


In [8]:
# combine all data
stats = pd.merge(fantasy_stats, pass_stats, on='Player', how='inner')
stats = pd.merge(stats, rush_stats, on='Player', how='inner')
stats = pd.merge(stats, rec_stats, on='Player', how='inner')
stats = pd.merge(stats, scrim_stats, on='Player', how='inner')

In [9]:
# data manipulation
stats = stats.replace(r'', 0, regex=True) # replace empty columns with zero
stats = stats.replace(r'\+|\*', '', regex=True) # remove trailing + and * from names
stats.head()

Unnamed: 0,Player,Tm,Pos,Age,FantPt,PPR,DKPt,FDPt,VBD,PosRank,...,Y/R,RecTD,Y/Tgt,R/G,RecYds/G,A/G,Touch,Y/Tch,YScm,RRTD
0,Derrick Henry,TEN,RB,29,219,246.7,252.7,232.7,57,5,...,7.6,0,5.9,1.6,12.6,16.5,308,4.5,1381,12
1,Deebo Samuel,SFO,WR,27,184,243.7,249.7,213.7,51,7,...,14.9,7,10.0,4.0,59.5,2.5,97,11.5,1117,12
2,Keenan Allen,LAC,WR,31,171,278.9,282.9,224.9,39,11,...,11.5,7,8.3,8.3,95.6,0.2,110,11.4,1249,7
3,Ja'Marr Chase,CIN,WR,23,163,262.7,265.7,212.7,31,13,...,12.2,7,8.4,6.3,76.0,0.2,103,11.7,1210,7
4,Taysom Hill,NOR,TE,33,111,143.5,150.5,127.0,25,7,...,8.8,2,7.3,2.1,18.2,5.1,114,6.1,692,6


In [10]:
stats.to_csv('{}playerstats.csv'.format(year)) # csv output format {year}playerstats.csv