# Grouping

In [4]:
import matplotlib.pyplot as plt
import pandas as pd
import altair as alt
import numpy as np

In [5]:
# Read the NFL plays CSV, forcing explicit dtypes for four of its columns.
nfl_plays = pd.read_csv(
    '/data/JakeOliver28/cleaned_nfl_plays.csv',
    dtype={
        'down': np.float64,
        'DefTwoPoint': object,
        'BlockingPlayer': str,
        'ScoreDiff': float,
    },
)

## Wins

In [6]:
# Plays in the fourth quarter of games
fourth_quarter = nfl_plays[nfl_plays['qtr'] == 4]

# Per-game count of non-null values in every column (one row per GameID)
plays_per_game = nfl_plays.groupby('GameID').count()

# Plays where a touchdown was scored
touchdowns = nfl_plays[nfl_plays['Touchdown'] == 1]

# Plays where field goals were attempted (FieldGoalDistance of at least 1)
field_goal_attempts = nfl_plays[nfl_plays['FieldGoalDistance'] >= 1]

# Last play of each game: within every GameID, the row holding the smallest
# TimeSecs value (presumably seconds remaining in the game -- verify against
# the dataset's documentation).
last_plays = nfl_plays.loc[nfl_plays.groupby('GameID')['TimeSecs'].idxmin()]

# Game id, play description and win probability for each game's last play
onlyWP = last_plays[['GameID', 'desc', 'Win_Prob']]

## Winning Percentages

For this study, I considered the same teams in different seasons as distinct teams.
To calculate winning percentages, I found the last play of each game, and looked at a team's
win probability after that play. Naturally, for each game, the win probability is 1.00 for 
the winning team and 0.00 for the losing team.

In [7]:
def _games_per_team(games, team_column):
    """Count the non-null GameID entries in ``games`` for each ``team_column`` value."""
    return games.groupby(team_column)[['GameID']].count()


# Split the final plays by outcome, based on the home team's win probability
# after the play: >= .99 is treated as a home win, <= .01 as an away win.
home_wp_final = last_plays['Home_WP_post']
home_team_win = last_plays[home_wp_final.ge(.99)]
away_team_win = last_plays[home_wp_final.le(.01)]

# Wins and losses per team-season, separately for home and away games
home_wins = _games_per_team(home_team_win, 'HomeTeamYear')
home_losses = _games_per_team(away_team_win, 'HomeTeamYear')
away_losses = _games_per_team(home_team_win, 'AwayTeamYear')
away_wins = _games_per_team(away_team_win, 'AwayTeamYear')

# fill_value=0 keeps team-seasons that appear in only one of the two tables
total_wins = home_wins.add(away_wins, fill_value=0)
total_losses = home_losses.add(away_losses, fill_value=0)
total_games = total_wins.add(total_losses, fill_value=0)

win_percentage = (total_wins / total_games).rename(
    index=str, columns={'GameID': 'WinPercentage'}
)

win_percentage.head()

Unnamed: 0,WinPercentage
ARI2009,0.714286
ARI2010,0.1
ARI2011,0.416667
ARI2012,0.4
ARI2013,0.571429


## Passing Statistics

Here, I calculate multiple passing statistics for the entirety of each season, such as total
passing yards, air yards, and yards after the catch.

In [8]:
# Row masks: completed passes, and every pass attempt
is_completion = nfl_plays['PassOutcome'] == 'Complete'
is_pass_attempt = nfl_plays['PassAttempt'] == 1

# Completed passes grouped by offensive team-season
pass_plays_groupby = nfl_plays[is_completion].groupby('posteamYear')

# Season totals over completed passes
pass_yards = (
    pass_plays_groupby[['Yards.Gained']]
    .sum()
    .rename(index=str, columns={'Yards.Gained': 'PassYards'})
)
air_yards = pass_plays_groupby[['AirYards']].sum()
yards_after_catch = pass_plays_groupby[['YardsAfterCatch']].sum()

# Pass attempts per team-season, counted as rows with a non-null Yards.Gained
pass_attempts = (
    nfl_plays[is_pass_attempt]
    .groupby('posteamYear')[['Yards.Gained']]
    .count()
    .rename(index=str, columns={'Yards.Gained': 'PassAttempts'})
)

pass_yards.head()

Unnamed: 0_level_0,PassYards
posteamYear,Unnamed: 1_level_1
ARI2009,4222
ARI2010,3161
ARI2011,3992
ARI2012,3791
ARI2013,4400


## Run Statistics

One statistic I examined was the percentage of a team's rushing plays that kept the team
"on-pace". A rush keeps a team on-pace if it gains at least 40% of the yards to go on first
down, at least 60% of the yards to go on second down, and at least 100% of the yards to go
on third or fourth down.

In [9]:
def _on_pace(down_runs, share_needed):
    """Rows of ``down_runs`` gaining at least ``share_needed`` times the yards to go."""
    return down_runs[down_runs['Yards.Gained'] >= share_needed * down_runs['ydstogo']]


def _rushes_per_team(plays):
    """Non-null RushAttempt count in ``plays`` for each posteamYear."""
    return plays.groupby('posteamYear')[['RushAttempt']].count()


def _sum_counts(tables):
    """Add count tables together, treating a team-season missing from one table as 0."""
    combined = tables[0]
    for extra in tables[1:]:
        combined = combined.add(extra, fill_value=0)
    return combined


# Rushing plays, then the same plays split by down
run_plays = nfl_plays[nfl_plays['RushAttempt'] == 1]

first_down_runs, second_down_runs, third_down_runs, fourth_down_runs = [
    run_plays[run_plays['down'] == down] for down in (1, 2, 3, 4)
]

# On-pace thresholds: 40% of yards to go on 1st down, 60% on 2nd, all of it on 3rd/4th
succ_first = _on_pace(first_down_runs, .4)
succ_second = _on_pace(second_down_runs, .6)
succ_third = _on_pace(third_down_runs, 1)
succ_fourth = _on_pace(fourth_down_runs, 1)

# Per team-season counts: on-pace rushes and all rushes, for each down
succ_first_gb = _rushes_per_team(succ_first)
first_down_gb = _rushes_per_team(first_down_runs)
succ_second_gb = _rushes_per_team(succ_second)
second_down_gb = _rushes_per_team(second_down_runs)
succ_third_gb = _rushes_per_team(succ_third)
third_down_gb = _rushes_per_team(third_down_runs)
succ_fourth_gb = _rushes_per_team(succ_fourth)
fourth_down_gb = _rushes_per_team(fourth_down_runs)

# Totals across the four downs
runs = _sum_counts([first_down_gb, second_down_gb, third_down_gb, fourth_down_gb])
succ_runs = _sum_counts([succ_first_gb, succ_second_gb, succ_third_gb, succ_fourth_gb])

# Share of rushes that were on-pace
percent_succ_runs = (succ_runs / runs).rename(
    index=str, columns={'RushAttempt': 'PercentOnPaceRuns'}
)

# Total rushing yards per team-season
run_yards = (
    run_plays.groupby('posteamYear')[['Yards.Gained']]
    .sum()
    .rename(index=str, columns={'Yards.Gained': 'RushYards'})
)

run_yards.head()

Unnamed: 0_level_0,RushYards
posteamYear,Unnamed: 1_level_1
ARI2009,1484
ARI2010,1367
ARI2011,1578
ARI2012,1344
ARI2013,1567


## EPA

An offense's expected points (EP) for a certain drive are essentially the number of points
a similar team in a similar situation would score, on average. Expected points added (EPA)
is the change in an offense's expected points produced by an individual play; it is negative
when the play leaves the offense worse off.

In [10]:
# Offense: summed EPA and number of plays with a non-null EPA, per team-season
epa = nfl_plays.groupby('posteamYear')[['EPA']].sum()
total_plays = nfl_plays.groupby('posteamYear')[['EPA']].count()

epa_per_play = epa.divide(total_plays)
epa_per_play = epa_per_play.rename(index=str, columns={'EPA':'EPAPerPlay'})


# Defense: the same two aggregates, grouped by the defending team-season
def_epa = nfl_plays.groupby('DefensiveTeamYear')[['EPA']].sum()
def_total_plays = nfl_plays.groupby('DefensiveTeamYear')[['EPA']].count()

# Divide by the number of plays the defense faced (def_total_plays). Dividing by
# the offensive total_plays would align on the same team-season labels and
# silently yield a wrong per-play average.
def_epa_per_play = def_epa.divide(def_total_plays)
def_epa_per_play = def_epa_per_play.rename(index=str, columns={'EPA':'DefEPAPerPlay'})

epa_per_play.head()

Unnamed: 0_level_0,EPAPerPlay
posteamYear,Unnamed: 1_level_1
ARI2009,0.013203
ARI2010,-0.126148
ARI2011,-0.090028
ARI2012,-0.156911
ARI2013,-0.007928


## Defense

Here, I sum different defensive statistics: sacks, fumbles, and interceptions.

In [11]:
# Season totals of each defensive event, grouped by the defending team-season
sacks = nfl_plays.groupby('DefensiveTeamYear')[['Sack']].sum()
fumbles = nfl_plays.groupby('DefensiveTeamYear')[['Fumble']].sum()
interceptions = nfl_plays.groupby('DefensiveTeamYear')[['InterceptionThrown']].sum()

sacks = sacks.rename(index=str, columns={'Sack':'Sacks'})
fumbles = fumbles.rename(index=str, columns={'Fumble':'ForcedFumbles'})
# The source column is 'InterceptionThrown'; rename() silently ignores keys that
# match no column, so the mapping must use the real column name to take effect.
interceptions = interceptions.rename(index=str, columns={'InterceptionThrown':'Interceptions'})

sacks.head()

Unnamed: 0_level_0,Sacks
DefensiveTeamYear,Unnamed: 1_level_1
ARI2009,42
ARI2010,34
ARI2011,42
ARI2012,42
ARI2013,47


## Form Single DataFrame

In [12]:
# Join every per-team-season statistic column-wise, then keep only the
# team-seasons that have a value for all of them.
team_df = pd.concat(
    [
        win_percentage,
        pass_yards,
        pass_attempts,
        percent_succ_runs,
        run_yards,
        sacks,
        fumbles,
        interceptions,
        epa_per_play,
        def_epa_per_play,
    ],
    axis=1,
).dropna(axis=0, how='any')

# Persist the combined table
team_df.to_csv('/data/JakeOliver28/teams.csv')