In [32]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


In [79]:
# Load the play-by-play CSVs for the 2014, 2015 and 2016 seasons
pbp_data_path = '../nflscrapR-data-master/data/season_play_by_play'
years = ['20{}'.format(14 + offset) for offset in range(3)]

# Dotted column names are replaced so the columns can be reached with
# attribute access (e.g. df.AcceptedPenalty)
column_renames = {
    "Accepted.Penalty": "AcceptedPenalty",
    "Penalty.Yards": "PenaltyYards",
    "Yards.Gained": "YardsGained",
    "Challenge.Replay": "ChallengeReplay",
}

dfs = {}
for year in years:
    season_csv = pbp_data_path + '/pbp_{}.csv'.format(year)
    # error_bad_lines=False: malformed rows are dropped rather than raising
    season = pd.read_csv(season_csv, error_bad_lines=False)
    dfs[year] = season.rename(columns=column_renames)

# Stack every season into one DataFrame; concatenating a dict adds the
# year key as an outer index level
df = pd.concat(dfs)
del dfs

In [81]:
# Shared constants used by the cells below
# Bin edges at every integer from -100 to 100 (yards)
yd_bins = np.arange(-100, 101)
# Team identifiers, read as strings from 'team_ids.in'
team_ids = list(np.genfromtxt('team_ids.in', dtype='str'))

In [82]:
# Restrict the data to plays whose PassOutcome is 'Complete' or 'Incomplete'
# (per the original note: passing plays only, sacks excluded)
pass_outcomes = ['Complete', 'Incomplete']
play_filter = df['PassOutcome'].isin(pass_outcomes)
filtered_data = df.loc[play_filter]


In [83]:
# Passing Totals
# Sum of yards gained per offensive team (posteam), computed on the
# passing-only plays in `filtered_data`. Aggregating the unfiltered `df`
# would fold every other play type into the "passing" total.
passing_totals = (filtered_data.groupby('posteam')
                  .agg({'YardsGained': 'sum'})
                  .reset_index())

# Index to sort: row positions ordering teams from fewest to most yards
# NOTE(review): these positions refer to rows of `passing_totals`; later cells
# apply them to arrays ordered by `team_ids` -- confirm both use the same
# team order and contain the same set of teams.
idx = passing_totals['YardsGained'].argsort().values


In [90]:
# Build distributions for each team
# Distance of throw in air on all pass attempts
# Row i holds the normalized AirYards histogram of team_ids[i].
distributions = np.zeros((len(team_ids), len(yd_bins) - 1))
for i, team in enumerate(team_ids):
    # Passing plays by this offense on which no penalty was accepted
    single_team = filtered_data[(filtered_data.posteam == team)
                                & (filtered_data.AcceptedPenalty == 0)]

    # Yards ball was in air: count plays per bin. np.histogram only counts,
    # whereas plt.hist would also draw every team's histogram as a side effect.
    counts, _ = np.histogram(single_team.AirYards, bins=yd_bins)
    total_plays = single_team.PlayAttempted.count()

    # Normalized histogram for each team. float() keeps this a true division
    # on Python 2 (np.histogram returns integer counts); a team without any
    # plays keeps its all-zero row instead of becoming 0/0 = NaN.
    if total_plays > 0:
        distributions[i, :] = counts / float(total_plays)

In [91]:
# Average of the per-team distributions, bin by bin (mean over rows)
mean_distribution = distributions.mean(axis=0)

# Fit a PCA with default settings to the per-team distributions
pca = PCA()
pca.fit(distributions)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [93]:
# See breakdown of PCA components
# (explained-variance ratio of each fitted component)
# print(...) with a single argument behaves the same on Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print(pca.explained_variance_ratio_)

[  2.79289563e-01   1.89769649e-01   9.59269417e-02   6.70221472e-02
   6.23077431e-02   5.72754028e-02   4.51651811e-02   3.57190978e-02
   2.92123026e-02   2.58354181e-02   2.07122523e-02   1.51015221e-02
   1.12595718e-02   1.08122788e-02   9.69293623e-03   7.77298017e-03
   6.41901384e-03   5.22244931e-03   4.99338779e-03   4.65519104e-03
   2.98711159e-03   2.64088551e-03   2.34369932e-03   2.12628043e-03
   1.50375240e-03   1.42241735e-03   8.27008810e-04   6.45595394e-04
   5.71413116e-04   5.30333384e-04   2.36472614e-04   5.50801544e-32]


In [240]:
# Calculate PCA coefficients
# Center each team's distribution on the mean distribution
B = distributions - mean_distribution

# pca_coefficients[i, j] is the dot product of component i with the centered
# distribution of team j, i.e. the projection of team j onto component i.
# One matrix product replaces the element-by-element double loop.
pca_coefficients = np.dot(pca.components_, B.T)


In [241]:
# Plot of First 2 PCA Components
n = 2

fig, (ax_mean, ax_components) = plt.subplots(2, 1, figsize=(6, 9))

# Top panel: mean distribution, plotted against the left edge of each bin
ax_mean.set_title('PCA Analysis of passing attempts', fontsize=18)
ax_mean.plot(yd_bins[:-1], mean_distribution, 'k--', linewidth=2,
             label='Mean Distribution')
ax_mean.set_xlim([-20, 20])
ax_mean.set_yticks(np.arange(0, .1, 0.01))
ax_mean.legend(frameon=False, loc=2)

# Bottom panel: the first n principal components over the same x range
for i in range(n):
    ax_components.plot(yd_bins[:-1], pca.components_[i],
                       label='Component: {}'.format(i + 1), linewidth=2)
ax_components.set_xlim([-20, 20])
ax_components.legend(frameon=False, loc=2)
ax_components.set_xlabel('Yards in the air', fontsize=12, labelpad=8)

fig.tight_layout()
fig.savefig('pca1.pdf')


In [242]:
# Plot PCA coefficients of teams
# 2x2 grid: columns are the first and second PCA coefficient, the top row
# shows the n lowest-ranked teams and the bottom row the n highest-ranked
# teams according to `idx`.
n = 10
bins = np.arange(-.07, .07, .02)
width = .9 * (bins[1] - bins[0])  # bars slightly narrower than a bin
count_ticks = np.arange(0, 9, 1)
coefficient_labels = ['First PCA Coefficient', 'Second PCA Coefficient']

fig, axes = plt.subplots(2, 2, figsize=(9.25, 9.25))

for coeff_num, coefficient_label in enumerate(coefficient_labels):
    # Coefficients of this component for every team, reordered by idx
    # NOTE(review): assumes idx positions line up with the team order of the
    # pca_coefficients columns -- confirm.
    coeffs = pca_coefficients[coeff_num, :][idx]
    ax_worst = axes[0, coeff_num]
    ax_best = axes[1, coeff_num]

    ax_worst.hist(coeffs[:n], bins=bins, color='g', width=width,
                  label='{} worst passing teams'.format(n))
    ax_worst.set_yticks(count_ticks)

    ax_best.hist(coeffs[-n:], bins=bins, color='b', width=width,
                 label='{} best passing teams'.format(n))
    ax_best.set_yticks(count_ticks)
    ax_best.set_xlabel(coefficient_label, fontsize=14, labelpad=8)

    # Only the left column carries the y-axis label and the legend
    if coeff_num == 0:
        for ax in (ax_worst, ax_best):
            ax.set_ylabel('Number of Teams', fontsize=14, labelpad=8)
            ax.legend(frameon=False, loc=2)

fig.tight_layout()
fig.savefig('pca2.pdf')


    