In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

<h2> Cleaning the Dataset </h2>

In [2]:
# Read the CSV file into a DataFrame; later cells reassign and modify `nfl`.
nfl = pd.read_csv('nflwr_data.csv')
# Show the first rows as a quick check of the loaded columns.
nfl.head()

Unnamed: 0,Rk,Year,Rnd,Pick,Player,Pos,DrAge,Tm,From,To,...,G,GS,Att,Yds,TD,Rec,Yds.1,TD.1,College/Univ,Unnamed: 23
0,1,2019,1,25,Marquise Brown\BrowMa04,WR,22.0,BAL,2019.0,2019.0,...,5.0,4.0,,,,21.0,326.0,3.0,Oklahoma,College Stats
1,2,2019,1,32,N'Keal Harry\HarrNK00,WR,21.0,NWE,,,...,,,,,,,,,Arizona St.,College Stats
2,3,2019,2,36,Deebo Samuel\SamuDe00,WR,23.0,SFO,2019.0,2019.0,...,6.0,3.0,5.0,37.0,1.0,18.0,187.0,1.0,South Carolina,College Stats
3,4,2019,2,51,A.J. Brown\BrowAJ00,WR,22.0,TEN,2019.0,2019.0,...,8.0,5.0,1.0,-2.0,0.0,22.0,348.0,3.0,Mississippi,College Stats
4,5,2019,2,56,Mecole Hardman\HardMe00,WR,21.0,KAN,2019.0,2019.0,...,8.0,3.0,2.0,1.0,0.0,20.0,374.0,4.0,Georgia,College Stats


In [3]:
# Drop the columns that are not going to be used.
# The result is reassigned to `nfl` instead of mutating the frame with inplace=True.
nfl = nfl.drop(columns=['Rk', 'Unnamed: 23'])
nfl.head()

Unnamed: 0,Year,Rnd,Pick,Player,Pos,DrAge,Tm,From,To,AP1,...,CarAV,G,GS,Att,Yds,TD,Rec,Yds.1,TD.1,College/Univ
0,2019,1,25,Marquise Brown\BrowMa04,WR,22.0,BAL,2019.0,2019.0,0,...,0.0,5.0,4.0,,,,21.0,326.0,3.0,Oklahoma
1,2019,1,32,N'Keal Harry\HarrNK00,WR,21.0,NWE,,,0,...,,,,,,,,,,Arizona St.
2,2019,2,36,Deebo Samuel\SamuDe00,WR,23.0,SFO,2019.0,2019.0,0,...,0.0,6.0,3.0,5.0,37.0,1.0,18.0,187.0,1.0,South Carolina
3,2019,2,51,A.J. Brown\BrowAJ00,WR,22.0,TEN,2019.0,2019.0,0,...,0.0,8.0,5.0,1.0,-2.0,0.0,22.0,348.0,3.0,Mississippi
4,2019,2,56,Mecole Hardman\HardMe00,WR,21.0,KAN,2019.0,2019.0,0,...,0.0,8.0,3.0,2.0,1.0,0.0,20.0,374.0,4.0,Georgia


<h3> Deleting Some Missing Values </h3>

In [4]:
# Count the missing values in each column.
nfl.isna().sum()

Year              0
Rnd               0
Pick              0
Player            0
Pos               0
DrAge           312
Tm                0
From            342
To              342
AP1               0
PB                0
St                0
CarAV           342
G               346
GS              349
Att             868
Yds             868
TD              868
Rec             471
Yds.1           471
TD.1            471
College/Univ      5
dtype: int64

In [5]:
# Report how many rows the dataset currently holds.
print("Number of Observations: {}".format(len(nfl)))

Number of Observations: 1541


In [6]:
# Delete the observations for which not enough info was found:
# only rows that have a value in 'From' are kept.
has_from = nfl['From'].notna()
nfl = nfl[has_from]
print('Number of Observations: {}'.format(len(nfl)))

Number of Observations: 1199


In [7]:
# Delete the rows whose 'To' value is exactly 2018.
# NOTE(review): if the goal is to keep only players who had retired by 2018, this filter
# does not do that - rows with any other 'To' value (e.g. 2019) are kept. Confirm the intent.
to_is_not_2018 = nfl['To'].ne(2018)
nfl = nfl[to_is_not_2018]
print('Number of Observations: {}'.format(len(nfl)))

Number of Observations: 1155


In [8]:
# Count the missing values left in each column after the filters above.
nfl.isna().sum()

Year              0
Rnd               0
Pick              0
Player            0
Pos               0
DrAge             0
Tm                0
From              0
To                0
AP1               0
PB                0
St                0
CarAV             0
G                 4
GS                7
Att             511
Yds             511
TD              511
Rec             125
Yds.1           125
TD.1            125
College/Univ      0
dtype: int64

<h3> Finding New Data </h3>

The function below splits each `Player` value into the player's name and the corresponding Pro Football Reference URL, returning them as a two-item list. We apply it to every row and then write the names and URLs back into the dataset as the `Player` and `PFR_URL` columns.

In [9]:
baseurl = "http://www.pro-football-reference.com/players/"
def spliting(row):
    """Split a row's 'Player' field into the player's name and page URL.
    The field is split on the backslash: the first piece is the name and the
    second is the URL code. The URL is baseurl, then the first character of
    the code, then '/', the code itself and '.htm'.
    Returns a two-item list: [player_name, full_url].
    """
    pieces = row['Player'].split('\\')
    player_name = pieces[0]
    url_code = pieces[1]
    # Player pages are grouped in a folder named after the code's first character.
    full_url = '{}{}/{}.htm'.format(baseurl, url_code[0], url_code)
    return [player_name, full_url]
# Run spliting on every row (axis=1). The next cell reads element 0 of each
# result as the player's name and element 1 as the URL.
result = nfl.apply(spliting,axis=1)

In [10]:
# Convert the names and URLs in `result` into numpy arrays and store them in the dataset.
nfl['Player'] = np.array([name for name, _ in result])
nfl['PFR_URL'] = np.array([url for _, url in result])

In [11]:
# Preview the frame with the cleaned 'Player' names and the new 'PFR_URL' column.
nfl.head()

Unnamed: 0,Year,Rnd,Pick,Player,Pos,DrAge,Tm,From,To,AP1,...,G,GS,Att,Yds,TD,Rec,Yds.1,TD.1,College/Univ,PFR_URL
0,2019,1,25,Marquise Brown,WR,22.0,BAL,2019.0,2019.0,0,...,5.0,4.0,,,,21.0,326.0,3.0,Oklahoma,http://www.pro-football-reference.com/players/...
2,2019,2,36,Deebo Samuel,WR,23.0,SFO,2019.0,2019.0,0,...,6.0,3.0,5.0,37.0,1.0,18.0,187.0,1.0,South Carolina,http://www.pro-football-reference.com/players/...
3,2019,2,51,A.J. Brown,WR,22.0,TEN,2019.0,2019.0,0,...,8.0,5.0,1.0,-2.0,0.0,22.0,348.0,3.0,Mississippi,http://www.pro-football-reference.com/players/...
4,2019,2,56,Mecole Hardman,WR,21.0,KAN,2019.0,2019.0,0,...,8.0,3.0,2.0,1.0,0.0,20.0,374.0,4.0,Georgia,http://www.pro-football-reference.com/players/...
5,2019,2,57,JJ Arcega-Whiteside,WR,22.0,PHI,2019.0,2019.0,0,...,8.0,1.0,,,,2.0,14.0,0.0,Stanford,http://www.pro-football-reference.com/players/...


<h3> Getting the height and weight </h3>

This function takes a row containing a player's Pro Football Reference URL, downloads the page, and parses it with BeautifulSoup4 to find the player's height and weight.

In [12]:
def player_info(row, timeout=30):
    """Download a player's page and scrape the listed height and weight.
    Parameters:
        row: mapping or Series whose 'PFR_URL' entry is the page URL.
        timeout: seconds to wait for the server before giving up
            (passed straight to requests.get).
    Returns:
        (height, weight) as the text of the first elements carrying
        itemprop='height' and itemprop='weight', or (None, None) when the
        request fails or either element is missing from the page.
    """
    try:
        # requests waits forever by default, so a stalled connection would
        # freeze the whole row-by-row scrape without an explicit timeout.
        response = requests.get(row['PFR_URL'], timeout=timeout)
    except requests.RequestException:
        # Treat an unreachable page like a page without the data, so a single
        # network error does not abort the scrape of every other row.
        return None, None
    parser = BeautifulSoup(response.content, 'html.parser')
    height_tags = parser.find_all(itemprop='height')
    weight_tags = parser.find_all(itemprop='weight')
    # Both values are required; if either is missing, report neither.
    if not height_tags or not weight_tags:
        return None, None
    return height_tags[0].text, weight_tags[0].text

# Run player_info on every row (axis=1). Each call performs one HTTP request,
# so this cell may be slow.
result = nfl.apply(player_info,axis=1)
# Show the first few (height, weight) pairs that were scraped.
print(result.head())

0     (5-9, 170lb)
2    (5-11, 214lb)
3     (6-0, 226lb)
4    (5-10, 187lb)
5     (6-2, 225lb)
dtype: object


In [13]:
# Store the scraped heights and weights as new columns.
nfl['Height'] = np.array([height for height, _ in result])
nfl['Weight'] = np.array([weight for _, weight in result])

In [14]:
# Delete the observations where no height or no weight could be parsed.
nfl = nfl.dropna(subset=['Height', 'Weight'])

<h3> Converting the Height and Weight to Integer </h3>

In [15]:
def convert_height(row):
    """Convert a row's 'Height' text of the form 'feet-inches' into total inches.
    Returns an int: 12 times the number before the dash plus the number after it.
    """
    parts = row['Height'].split('-')
    feet = int(parts[0])
    inches = int(parts[1])
    return feet * 12 + inches
# Replace the 'Height' text with the value computed by convert_height for each row.
nfl['Height'] = nfl.apply(convert_height,axis=1)

In [16]:
def convert_weight(row):
    """Convert a row's 'Weight' text (for example '170lb') into an integer.
    The leading run of digits is parsed, so the number may have any length;
    slicing a fixed three characters would fail on a two-digit weight such as
    '99lb' and would truncate longer numbers.
    Returns the weight as an int.
    Raises ValueError when the text does not start with a digit.
    """
    text = row['Weight'].strip()
    end = 0
    # Advance past the leading digits; whatever follows (the unit) is ignored.
    while end < len(text) and text[end] in '0123456789':
        end += 1
    weight = int(text[:end])
    return weight
# Replace the 'Weight' text with the integer computed by convert_weight for each row.
nfl['Weight'] = nfl.apply(convert_weight, axis=1)

In [17]:
# Preview the frame with 'Height' and 'Weight' converted to numbers.
nfl.head()

Unnamed: 0,Year,Rnd,Pick,Player,Pos,DrAge,Tm,From,To,AP1,...,Att,Yds,TD,Rec,Yds.1,TD.1,College/Univ,PFR_URL,Height,Weight
0,2019,1,25,Marquise Brown,WR,22.0,BAL,2019.0,2019.0,0,...,,,,21.0,326.0,3.0,Oklahoma,http://www.pro-football-reference.com/players/...,69,170
2,2019,2,36,Deebo Samuel,WR,23.0,SFO,2019.0,2019.0,0,...,5.0,37.0,1.0,18.0,187.0,1.0,South Carolina,http://www.pro-football-reference.com/players/...,71,214
3,2019,2,51,A.J. Brown,WR,22.0,TEN,2019.0,2019.0,0,...,1.0,-2.0,0.0,22.0,348.0,3.0,Mississippi,http://www.pro-football-reference.com/players/...,72,226
4,2019,2,56,Mecole Hardman,WR,21.0,KAN,2019.0,2019.0,0,...,2.0,1.0,0.0,20.0,374.0,4.0,Georgia,http://www.pro-football-reference.com/players/...,70,187
5,2019,2,57,JJ Arcega-Whiteside,WR,22.0,PHI,2019.0,2019.0,0,...,,,,2.0,14.0,0.0,Stanford,http://www.pro-football-reference.com/players/...,74,225


<h3> Output the Dataset </h3>

In [18]:
# Write the cleaned dataset to nfl.csv.
# NOTE(review): with pandas defaults the row index is written as an extra first column;
# confirm whether that is wanted.
nfl.to_csv('nfl.csv')