# _Fantasy Football Analytics_

I want to see if I can analyze last season's NFL individual player statistics to create the best team possible. 

First though I have to get the data!

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# import libraries
import pandas as pd
pd.options.display.max_columns = None
import numpy as np
import random
import os

# Matplotlib
%matplotlib inline
%config InlineBackend.figure_format='retina'
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

In [3]:
os.getcwd()

'/Users/jai/Documents/projects/fantasy-football'

### _Gather Passing Data_

In [4]:
# import libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup

To get things started, I'm just going to focus on pulling the 2018 passing statistics from [pro-football-reference.com](https://www.pro-football-reference.com/). Much like [basketball-reference.com](https://www.basketball-reference.com/), which I used for my first capstone project, I think this website alone will provide the data I need to better analyze NFL players.

In [5]:
# url of website
year = 2018
statistic = 'passing'
url = 'https://www.pro-football-reference.com/years/{}/{}.htm'.format(year, statistic); url

'https://www.pro-football-reference.com/years/2018/passing.htm'

In [8]:
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

In [6]:
import urllib.request

with urllib.request.urlopen(url) as resonse:
    the_page = resonse.read()


# open url
#htm = urlopen(url, verify=F)
soup = BeautifulSoup(the_page)

URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1056)>

In [11]:
for i in [the_page, soup]:
    print(type(i))

<class 'bytes'>
<class 'bs4.BeautifulSoup'>


In [12]:
headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]; headers

['Rk',
 'Player',
 'Tm',
 'Age',
 'Pos',
 'G',
 'GS',
 'QBrec',
 'Cmp',
 'Att',
 'Cmp%',
 'Yds',
 'TD',
 'TD%',
 'Int',
 'Int%',
 'Lng',
 'Y/A',
 'AY/A',
 'Y/C',
 'Y/G',
 'Rate',
 'QBR',
 'Sk',
 'Yds',
 'NY/A',
 'ANY/A',
 'Sk%',
 '4QC',
 'GWD']

In [13]:
# drop first column, don't need ranking order
headers = headers[1:]; headers

['Player',
 'Tm',
 'Age',
 'Pos',
 'G',
 'GS',
 'QBrec',
 'Cmp',
 'Att',
 'Cmp%',
 'Yds',
 'TD',
 'TD%',
 'Int',
 'Int%',
 'Lng',
 'Y/A',
 'AY/A',
 'Y/C',
 'Y/G',
 'Rate',
 'QBR',
 'Sk',
 'Yds',
 'NY/A',
 'ANY/A',
 'Sk%',
 '4QC',
 'GWD']

In [14]:
# gather rows
rows = soup.findAll('tr')[1:]

In [15]:
player_stats = [[td.getText() for td in rows[i].findAll('td')] for i in range(len(rows))]; 
player_stats[:2]

[['Ben Roethlisberger',
  'PIT',
  '36',
  'QB',
  '16',
  '16',
  '9-6-1',
  '452',
  '675',
  '67.0',
  '5129',
  '34',
  '5.0',
  '16',
  '2.4',
  '97',
  '7.6',
  '7.5',
  '11.3',
  '320.6',
  '96.5',
  '73.0',
  '24',
  '166',
  '7.10',
  '7.04',
  '3.4',
  '2',
  '3'],
 ['Andrew Luck*',
  'IND',
  '29',
  'QB',
  '16',
  '16',
  '10-6-0',
  '430',
  '639',
  '67.3',
  '4593',
  '39',
  '6.1',
  '15',
  '2.3',
  '68',
  '7.2',
  '7.4',
  '10.7',
  '287.1',
  '98.7',
  '71.5',
  '18',
  '134',
  '6.79',
  '6.95',
  '2.7',
  '3',
  '3']]

In [16]:
# create a new DataFrame with player_stats and headers
df = pd.DataFrame(player_stats, columns = headers); df.head()

Unnamed: 0,Player,Tm,Age,Pos,G,GS,QBrec,Cmp,Att,Cmp%,Yds,TD,TD%,Int,Int%,Lng,Y/A,AY/A,Y/C,Y/G,Rate,QBR,Sk,Yds.1,NY/A,ANY/A,Sk%,4QC,GWD
0,Ben Roethlisberger,PIT,36,QB,16,16,9-6-1,452,675,67.0,5129,34,5.0,16,2.4,97,7.6,7.5,11.3,320.6,96.5,73.0,24,166,7.1,7.04,3.4,2,3
1,Andrew Luck*,IND,29,QB,16,16,10-6-0,430,639,67.3,4593,39,6.1,15,2.3,68,7.2,7.4,10.7,287.1,98.7,71.5,18,134,6.79,6.95,2.7,3,3
2,Matt Ryan,ATL,33,QB,16,16,7-9-0,422,608,69.4,4924,35,5.8,7,1.2,75,8.1,8.7,11.7,307.8,108.1,70.6,42,296,7.12,7.71,6.5,1,1
3,Kirk Cousins,MIN,30,QB,16,16,8-7-1,425,606,70.1,4298,30,5.0,10,1.7,75,7.1,7.3,10.1,268.6,99.7,60.6,40,262,6.25,6.48,6.2,1,0
4,Aaron Rodgers*,GNB,35,QB,16,16,6-9-1,372,597,62.3,4442,25,4.2,2,0.3,75,7.4,8.1,11.9,277.6,97.6,56.8,49,353,6.33,6.96,7.6,3,3


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 29 columns):
Player    106 non-null object
Tm        106 non-null object
Age       106 non-null object
Pos       106 non-null object
G         106 non-null object
GS        106 non-null object
QBrec     106 non-null object
Cmp       106 non-null object
Att       106 non-null object
Cmp%      106 non-null object
Yds       106 non-null object
TD        106 non-null object
TD%       106 non-null object
Int       106 non-null object
Int%      106 non-null object
Lng       106 non-null object
Y/A       106 non-null object
AY/A      106 non-null object
Y/C       106 non-null object
Y/G       106 non-null object
Rate      106 non-null object
QBR       106 non-null object
Sk        106 non-null object
Yds       106 non-null object
NY/A      106 non-null object
ANY/A     106 non-null object
Sk%       106 non-null object
4QC       106 non-null object
GWD       106 non-null object
dtypes: object(29)
memory

### _Test Wrangling Function from custom `football` library_

In [18]:
from football import wrangling

In [19]:
df = wrangling.football_stats(2018, 'passing'); df.head()

Unnamed: 0,Player,Tm,Age,Pos,G,GS,QBrec,Cmp,Att,Cmp%,Yds,TD,TD%,Int,Int%,Lng,Y/A,AY/A,Y/C,Y/G,Rate,QBR,Sk,Yds.1,NY/A,ANY/A,Sk%,4QC,GWD,year
0,Ben Roethlisberger,PIT,36,QB,16,16,9-6-1,452,675,67.0,5129,34,5.0,16,2.4,97,7.6,7.5,11.3,320.6,96.5,73.0,24,166,7.1,7.04,3.4,2,3,2018
1,Andrew Luck*,IND,29,QB,16,16,10-6-0,430,639,67.3,4593,39,6.1,15,2.3,68,7.2,7.4,10.7,287.1,98.7,71.5,18,134,6.79,6.95,2.7,3,3,2018
2,Matt Ryan,ATL,33,QB,16,16,7-9-0,422,608,69.4,4924,35,5.8,7,1.2,75,8.1,8.7,11.7,307.8,108.1,70.6,42,296,7.12,7.71,6.5,1,1,2018
3,Kirk Cousins,MIN,30,QB,16,16,8-7-1,425,606,70.1,4298,30,5.0,10,1.7,75,7.1,7.3,10.1,268.6,99.7,60.6,40,262,6.25,6.48,6.2,1,0,2018
4,Aaron Rodgers*,GNB,35,QB,16,16,6-9-1,372,597,62.3,4442,25,4.2,2,0.3,75,7.4,8.1,11.9,277.6,97.6,56.8,49,353,6.33,6.96,7.6,3,3,2018


### _Edit DataFrame_

In [20]:
df.dtypes

Player    object
Tm        object
Age       object
Pos       object
G         object
GS        object
QBrec     object
Cmp       object
Att       object
Cmp%      object
Yds       object
TD        object
TD%       object
Int       object
Int%      object
Lng       object
Y/A       object
AY/A      object
Y/C       object
Y/G       object
Rate      object
QBR       object
Sk        object
Yds       object
NY/A      object
ANY/A     object
Sk%       object
4QC       object
GWD       object
year       int64
dtype: object

In [21]:
df = df.astype({'Player': 'object', 'Tm': 'category', 'Pos': 'category', 'year': 'category'}); df.dtypes

Player      object
Tm        category
Age         object
Pos       category
G           object
GS          object
QBrec       object
Cmp         object
Att         object
Cmp%        object
Yds         object
TD          object
TD%         object
Int         object
Int%        object
Lng         object
Y/A         object
AY/A        object
Y/C         object
Y/G         object
Rate        object
QBR         object
Sk          object
Yds         object
NY/A        object
ANY/A       object
Sk%         object
4QC         object
GWD         object
year      category
dtype: object

In [62]:
df_other = df[['Player', 'Tm', 'Pos', 'year']]
#df.drop(labels=['Player', 'Tm', 'Pos', 'year'], axis=1, inplace=True); df

In [63]:
df.loc[:, 'Tm':].select_dtypes(include='object').apply(pd.to_numeric, errors='coerce')

Unnamed: 0,Age,G,GS,QBrec,Cmp,Att,Cmp%,Yds,TD,TD%,Int,Int%,Lng,Y/A,AY/A,Y/C,Y/G,Rate,QBR,Sk,Yds.1,NY/A,ANY/A,Sk%,4QC,GWD
0,36.0,16.0,16.0,,452.0,675.0,67.0,5129.0,34.0,5.0,16.0,2.4,97.0,7.6,7.5,11.3,320.6,96.5,73.0,24.0,166.0,7.10,7.04,3.4,2.0,3.0
1,29.0,16.0,16.0,,430.0,639.0,67.3,4593.0,39.0,6.1,15.0,2.3,68.0,7.2,7.4,10.7,287.1,98.7,71.5,18.0,134.0,6.79,6.95,2.7,3.0,3.0
2,33.0,16.0,16.0,,422.0,608.0,69.4,4924.0,35.0,5.8,7.0,1.2,75.0,8.1,8.7,11.7,307.8,108.1,70.6,42.0,296.0,7.12,7.71,6.5,1.0,1.0
3,30.0,16.0,16.0,,425.0,606.0,70.1,4298.0,30.0,5.0,10.0,1.7,75.0,7.1,7.3,10.1,268.6,99.7,60.6,40.0,262.0,6.25,6.48,6.2,1.0,0.0
4,35.0,16.0,16.0,,372.0,597.0,62.3,4442.0,25.0,4.2,2.0,0.3,75.0,7.4,8.1,11.9,277.6,97.6,56.8,49.0,353.0,6.33,6.96,7.6,3.0,3.0
5,30.0,16.0,16.0,,365.0,586.0,62.3,3890.0,18.0,3.1,15.0,2.6,64.0,6.6,6.1,10.7,243.1,81.2,47.9,34.0,235.0,5.90,5.39,5.5,3.0,4.0
6,23.0,16.0,16.0,,383.0,580.0,66.0,5097.0,50.0,8.6,12.0,2.1,89.0,8.8,9.6,13.3,318.6,113.8,82.0,26.0,171.0,8.13,8.89,4.3,2.0,2.0
7,37.0,16.0,16.0,,380.0,576.0,66.0,4299.0,21.0,3.6,11.0,1.9,58.0,7.5,7.3,11.3,268.7,92.4,51.2,47.0,358.0,6.33,6.21,7.5,1.0,2.0
8,41.0,16.0,16.0,,375.0,570.0,65.8,4355.0,29.0,5.1,11.0,1.9,63.0,7.6,7.8,11.6,272.2,97.7,68.8,21.0,147.0,7.12,7.26,3.6,1.0,2.0
9,24.0,16.0,16.0,,364.0,561.0,64.9,4688.0,32.0,5.7,12.0,2.1,70.0,8.4,8.5,12.9,293.0,101.1,65.4,33.0,223.0,7.52,7.69,5.6,4.0,4.0


In [64]:
df.dtypes

Player      object
Tm        category
Age         object
Pos       category
G           object
GS          object
QBrec       object
Cmp         object
Att         object
Cmp%        object
Yds         object
TD          object
TD%         object
Int         object
Int%        object
Lng         object
Y/A         object
AY/A        object
Y/C         object
Y/G         object
Rate        object
QBR         object
Sk          object
Yds         object
NY/A        object
ANY/A       object
Sk%         object
4QC         object
GWD         object
year      category
dtype: object