In [1]:
import urllib.request
from urllib.request import urlopen
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt


In [2]:
urllib.request.urlopen("https://www.pro-football-reference.com/years/2021/passing.htm")
# print(r.status)
# print(r)

url = "https://www.pro-football-reference.com/years/2021/passing.htm"

html = urlopen(url)
stats_page = BeautifulSoup(html, "html.parser")

In [3]:
# Collect table headers
column_headers = stats_page.findAll('tr')[0]
column_headers = [i.getText() for i in column_headers.findAll('th')]
print(column_headers)

['Rk', 'Player', 'Tm', 'Age', 'Pos', 'G', 'GS', 'QBrec', 'Cmp', 'Att', 'Cmp%', 'Yds', 'TD', 'TD%', 'Int', 'Int%', '1D', 'Lng', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate', 'QBR', 'Sk', 'Yds', 'Sk%', 'NY/A', 'ANY/A', '4QC', 'GWD']


In [4]:
# Collect table rows
rows = stats_page.findAll('tr')[1:]

# Get stats from each row
qb_stats = []
for i in range(len(rows)):
    qb_stats.append([col.getText() for col in rows[i].findAll('td')])
print(qb_stats[0])

['Tom Brady*', 'TAM', '44', 'QB', '17', '17', '13-4-0', '485', '719', '67.5', '5316', '43', '6', '12', '1.7', '269', '62', '7.4', '7.8', '11.0', '312.7', '102.1', '68.1', '22', '144', '3', '6.98', '7.41', '3', '5']


In [5]:
# Create DataFrame from our scraped data
data = pd.DataFrame(qb_stats, columns=column_headers[1:])
data.head()

Unnamed: 0,Player,Tm,Age,Pos,G,GS,QBrec,Cmp,Att,Cmp%,...,Y/G,Rate,QBR,Sk,Yds,Sk%,NY/A,ANY/A,4QC,GWD
0,Tom Brady*,TAM,44,QB,17,17,13-4-0,485,719,67.5,...,312.7,102.1,68.1,22,144,3.0,6.98,7.41,3,5
1,Justin Herbert*,LAC,23,QB,17,17,9-8-0,443,672,65.9,...,294.9,97.7,65.6,31,214,4.4,6.83,6.95,5,5
2,Matthew Stafford,LAR,33,QB,17,17,12-5-0,404,601,67.2,...,287.4,102.9,63.8,30,243,4.8,7.36,7.45,3,4
3,Patrick Mahomes*,KAN,26,QB,17,17,12-5-0,436,658,66.3,...,284.6,98.5,62.2,28,146,4.1,6.84,7.07,3,3
4,Derek Carr,LVR,30,QB,17,17,10-7-0,428,626,68.4,...,282.6,94.0,52.4,40,241,6.0,6.85,6.6,3,6


In [6]:
# View columns in data
data.columns

Index(['Player', 'Tm', 'Age', 'Pos', 'G', 'GS', 'QBrec', 'Cmp', 'Att', 'Cmp%',
       'Yds', 'TD', 'TD%', 'Int', 'Int%', '1D', 'Lng', 'Y/A', 'AY/A', 'Y/C',
       'Y/G', 'Rate', 'QBR', 'Sk', 'Yds', 'Sk%', 'NY/A', 'ANY/A', '4QC',
       'GWD'],
      dtype='object')

In [7]:
# Rename sack yards column to "Yds_Sack"
new_columns = data.columns.values
new_columns[-6] = 'Yds_Sack'
data.columns = new_columns

In [8]:
# View columns in data with Sack Yards replacing Yds column
data.columns

Index(['Player', 'Tm', 'Age', 'Pos', 'G', 'GS', 'QBrec', 'Cmp', 'Att', 'Cmp%',
       'Yds', 'TD', 'TD%', 'Int', 'Int%', '1D', 'Lng', 'Y/A', 'AY/A', 'Y/C',
       'Y/G', 'Rate', 'QBR', 'Sk', 'Yds_Sack', 'Sk%', 'NY/A', 'ANY/A', '4QC',
       'GWD'],
      dtype='object')

In [9]:
# Select stat categories 
categories = ['Cmp%', 'Yds', 'TD', 'Int', 'Y/A', 'Rate']

# Create data subset for radar chart
data_radar = data[['Player', 'Tm'] + categories]
data_radar.head()

Unnamed: 0,Player,Tm,Cmp%,Yds,TD,Int,Y/A,Rate
0,Tom Brady*,TAM,67.5,5316,43,12,7.4,102.1
1,Justin Herbert*,LAC,65.9,5014,38,15,7.5,97.7
2,Matthew Stafford,LAR,67.2,4886,41,17,8.1,102.9
3,Patrick Mahomes*,KAN,66.3,4839,37,13,7.4,98.5
4,Derek Carr,LVR,68.4,4804,23,14,7.7,94.0


In [10]:
# Check data types
data_radar.dtypes

Player    object
Tm        object
Cmp%      object
Yds       object
TD        object
Int       object
Y/A       object
Rate      object
dtype: object

In [11]:
# Convert data to numerical values
for i in categories:
    data_radar[i] = pd.to_numeric(data[i])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
# Check data types again
data_radar.dtypes

Player     object
Tm         object
Cmp%      float64
Yds       float64
TD        float64
Int       float64
Y/A       float64
Rate      float64
dtype: object

In [13]:
# Remove characters for achievements 
data_radar['Player'] = data_radar['Player'].str.replace('*', '')
data_radar['Player'] = data_radar['Player'].str.replace('+', '')

  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
# Filter by passing yards
data_radar_filtered = data_radar[data_radar['Yds'] > 1500]

In [15]:
# Create columns with percentile rank
for i in categories:
    data_radar_filtered[i + '_Rank'] = data_radar_filtered[i].rank(pct=True)
    
# Flip the rank for interceptions
data_radar_filtered['Int_Rank'] = 1 - data_radar_filtered['Int_Rank']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
# Examine data
data_radar_filtered.head()

Unnamed: 0,Player,Tm,Cmp%,Yds,TD,Int,Y/A,Rate,Cmp%_Rank,Yds_Rank,TD_Rank,Int_Rank,Y/A_Rank,Rate_Rank
0,Tom Brady,TAM,67.5,5316.0,43.0,12.0,7.4,102.1,0.75,1.0,1.0,0.4375,0.640625,0.8125
1,Justin Herbert,LAC,65.9,5014.0,38.0,15.0,7.5,97.7,0.46875,0.96875,0.9375,0.09375,0.734375,0.6875
2,Matthew Stafford,LAR,67.2,4886.0,41.0,17.0,8.1,102.9,0.6875,0.9375,0.96875,0.015625,0.9375,0.84375
3,Patrick Mahomes,KAN,66.3,4839.0,37.0,13.0,7.4,98.5,0.515625,0.90625,0.875,0.3125,0.640625,0.71875
4,Derek Carr,LVR,68.4,4804.0,23.0,14.0,7.7,94.0,0.875,0.875,0.625,0.1875,0.828125,0.59375


In [17]:
# General plot parameters
mpl.rcParams['font.family'] = 'Avenir'
mpl.rcParams['font.size'] = 16
mpl.rcParams['axes.linewidth'] = 0
mpl.rcParams['xtick.major.pad'] = 15

In [18]:
# HEX codes of the NFL team colors
team_colors = {'ARI':'#97233f', 'ATL':'#a71930', 'BAL':'#241773', 'BUF':'#00338d', 'CAR':'#0085ca', 'CHI':'#0b162a', 'CIN':'#fb4f14', 'CLE':'#311d00', 'DAL':'#041e42', 'DEN':'#002244', 'DET':'#0076b6', 'GNB':'#203731', 'HOU':'#03202f', 'IND':'#002c5f', 'JAX':'#006778', 'KAN':'#e31837', 'LAC':'#002a5e', 'LAR':'#003594', 'MIA':'#008e97', 'MIN':'#4f2683', 'NWE':'#002244', 'NOR':'#d3bc8d', 'NYG':'#0b2265', 'NYJ':'#125740', 'OAK':'#000000', 'PHI':'#004c54', 'PIT':'#ffb612', 'SFO':'#aa0000', 'SEA':'#002244', 'TAM':'#d50a0a', 'TEN':'#0c2340', 'WAS':'#773141'}

In [19]:
# Calculate angles for radar chart
offset = np.pi/6
angles = np.linspace(0, 2*np.pi, len(categories) + 1) + offset