**Creating a Web Scraping Tool for 2020 NFL Fantasy Stats**

In [2]:
#Installs 
!pip install beautifulsoup4

In [3]:
#Libraries 
import pandas as pd 
import numpy as np
from bs4 import BeautifulSoup
from urllib.request import urlopen

In [6]:
#Creating the BeautifulSoup object
year = 2020
url = 'https://www.pro-football-reference.com/years/{}/fantasy.htm'.format(year)
# Open the page in a context manager so the HTTP response is closed once it has been parsed.
with urlopen(url) as html:
    # Name the parser explicitly so the parse does not depend on which optional
    # parsers happen to be installed (and so BeautifulSoup does not have to guess).
    soup = BeautifulSoup(html, 'html.parser')

In [None]:
#getting the header data for CSV file
# The column names are the text of every <th> in the second <tr> of the page.
header_row = soup.find_all('tr')[1]
headers = [th.get_text() for th in header_row.find_all('th')]
headers = headers[1:] #Do not need the first (0 index) column header
print(headers[:5])

In [8]:
#Getting the Table Row Data for the CSV file
# Select every <tr> that the class filter does not identify as a "thead" row.
rows = soup.find_all('tr', class_ = lambda table_rows: table_rows != "thead")
# One list of cell texts per row; a row without any <td> cell gives an empty list.
player_stats = [[td.get_text() for td in row.find_all('td')]
                for row in rows]
# Drop the empty lists by content instead of slicing off a fixed two rows, so a
# header-only row anywhere in the table cannot end up as an all-empty record.
player_stats = [cells for cells in player_stats if cells]

In [9]:
#Build the DataFrame: one record per scraped row, labelled with the scraped column names
stats = pd.DataFrame(data = player_stats, columns = headers)
stats.head(5)

Unnamed: 0,Player,Tm,FantPos,Age,G,GS,Cmp,Att,Yds,TD,Int,Att.1,Yds.1,Y/A,TD.1,Tgt,Rec,Yds.2,Y/R,TD.2,Fmb,FL,TD.3,2PM,2PP,FantPt,PPR,DKPt,FDPt,VBD,PosRank,OvRank
0,Derrick Henry *+,TEN,RB,26,16,16,0,0,0,0,0,378,2027,5.36,17,31,19,114,6.0,0,3,2,17,1.0,,314,333.1,341.1,323.6,184,1,1
1,Alvin Kamara*,NOR,RB,25,15,10,0,0,0,0,0,187,932,4.98,16,107,83,756,9.11,5,1,0,21,,,295,377.8,383.8,336.3,165,2,2
2,Dalvin Cook*,MIN,RB,25,14,14,0,0,0,0,0,312,1557,4.99,16,54,44,361,8.2,1,5,3,17,3.0,,294,337.8,346.8,315.8,164,3,3
3,Travis Kelce*+,KAN,TE,31,15,15,1,2,4,0,0,0,0,,0,145,105,1416,13.49,11,1,1,11,1.0,,208,312.8,316.8,260.3,117,1,4
4,Davante Adams*+,GNB,WR,28,14,14,0,0,0,0,0,0,0,,0,149,115,1374,11.95,18,1,1,18,,,243,358.4,362.4,300.9,117,1,5


In [10]:
# Replace cells that are exactly the empty string with 0. A plain (non-regex)
# replace is used because an empty regular expression matches every string, so
# exact matching states the intent without depending on how an empty pattern is handled.
stats = stats.replace('', 0)
stats['Year'] = year #create a new column as the year variable
stats.head()

Unnamed: 0,Player,Tm,FantPos,Age,G,GS,Cmp,Att,Yds,TD,Int,Att.1,Yds.1,Y/A,TD.1,Tgt,Rec,Yds.2,Y/R,TD.2,Fmb,FL,TD.3,2PM,2PP,FantPt,PPR,DKPt,FDPt,VBD,PosRank,OvRank,Year
0,Derrick Henry *+,TEN,RB,26,16,16,0,0,0,0,0,378,2027,5.36,17,31,19,114,6.0,0,3,2,17,1,0,314,333.1,341.1,323.6,184,1,1,2020
1,Alvin Kamara*,NOR,RB,25,15,10,0,0,0,0,0,187,932,4.98,16,107,83,756,9.11,5,1,0,21,0,0,295,377.8,383.8,336.3,165,2,2,2020
2,Dalvin Cook*,MIN,RB,25,14,14,0,0,0,0,0,312,1557,4.99,16,54,44,361,8.2,1,5,3,17,3,0,294,337.8,346.8,315.8,164,3,3,2020
3,Travis Kelce*+,KAN,TE,31,15,15,1,2,4,0,0,0,0,0.0,0,145,105,1416,13.49,11,1,1,11,1,0,208,312.8,316.8,260.3,117,1,4,2020
4,Davante Adams*+,GNB,WR,28,14,14,0,0,0,0,0,0,0,0.0,0,149,115,1374,11.95,18,1,1,18,0,0,243,358.4,362.4,300.9,117,1,5,2020


In [12]:
#Creating the CSV file
# Build the file name from `year` so it always matches the season that was scraped.
stats.to_csv('{}playerstats.csv'.format(year))

In [14]:
#Function to get a CSV file of player data by year
def player_csv(year):
    """Scrape one season of fantasy stats and save it as ``<year>playerstats.csv``.

    Downloads the Pro Football Reference fantasy page for ``year``, reads the
    column names and the player rows from the page's table rows, replaces
    empty cells with ``'N/A'``, adds a ``Year`` column and writes the result
    to a CSV file (relative path, so it lands in the current working directory).

    Parameters
    ----------
    year : int or str
        Season to download; it is inserted into the page URL, the ``Year``
        column and the output file name.

    Returns
    -------
    None
        The data is written to disk and a confirmation message is printed.
    """
    url = "https://www.pro-football-reference.com/years/{}/fantasy.htm".format(year)
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(url) as html:
        # Name the parser explicitly so the parse does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(html, 'html.parser')

    # Column names are the <th> texts of the second <tr>; the first one is
    # dropped (presumably because the row data below only reads <td> cells -- confirm).
    header_row = soup.find_all('tr')[1]
    headers = [th.get_text() for th in header_row.find_all('th')][1:]

    # Select every <tr> that the class filter does not identify as a "thead"
    # row (the site repeats a table header row roughly every 30 rows).
    rows = soup.find_all('tr', class_ = lambda table_rows: table_rows != "thead")
    player_stats = [[td.get_text() for td in row.find_all('td')] for row in rows]
    # Rows without any <td> cell give empty lists; drop them by content rather
    # than slicing off a fixed two rows, so no header-only row becomes a record.
    player_stats = [cells for cells in player_stats if cells]

    stats = pd.DataFrame(player_stats, columns = headers)

    # Exact-match replace: an empty regular expression would match every string.
    stats = stats.replace('', 'N/A')
    stats['Year'] = year

    stats.to_csv('{}playerstats.csv'.format(year)) #add your desired path to the function

    print("Player data for the year {} has been created.".format(year))