## Diving into velocity of late shifts

In [23]:
import pandas as pd

plays2 = pd.read_csv("~/nfl-data-bowl/Sample_Data/Raw/plays.csv")

v_late_total = pd.read_csv("~/nfl-data-bowl/Sample_Data/Raw/velocity_late_shifts.csv")
v_late_individual = v_late_total 

aggregated_rows = []

for (gameId, playId), group in v_late_total.groupby(['gameId', 'playId']):
    team_total_distance = group['teamTotalDistance'].iloc[0]
    team_average_speed = group['averageSpeed'].sum()
    num_players_moved = group['numPlayerMoved'].iloc[0]

    aggregated_rows.append({
        'gameId': gameId,
        'playId': playId,
        'teamTotalDistance': team_total_distance,
        'teamAverageSpeed': team_average_speed,
        'numPlayerMoved': num_players_moved
    })

v_late_total = pd.DataFrame(aggregated_rows)

print(v_late_total.head)

<bound method NDFrame.head of            gameId  playId  teamTotalDistance  teamAverageSpeed  numPlayerMoved
0      2022090800      56               6.35            3.0460               9
1      2022090800      80              10.71            4.8040              11
2      2022090800     101               5.50            2.5665              11
3      2022090800     122               4.54            1.8990              11
4      2022090800     167               5.93            2.7170              11
...           ...     ...                ...               ...             ...
12809  2022102400    3623               8.47            3.9865              11
12810  2022102400    3649               6.26            3.0710              11
12811  2022102400    3670               5.97            2.7185              11
12812  2022102400    3698               5.10            1.9420              11
12813  2022102400    3724              12.56            6.0720              11

[12814 rows x 5 colum

##### If you look at summary statistics, almost every play says 11 players shifted late. This won't help the model since every play is the exact same. You need to make the conditions more strict. Late shifts must be greater than blank speed, or blank distance etc... I'm going to look over a few plays to see what should actually be considered a "late shift".

In [11]:
v_late_total.describe()

Unnamed: 0,gameId,playId,teamTotalDistance,teamAverageSpeed,numPlayerMoved
count,12814.0,12814.0,12814.0,12814.0,12814.0
mean,2022097000.0,2033.989387,8.456089,3.978492,10.790464
std,4738.406,1186.678817,4.485115,2.354741,0.618051
min,2022091000.0,54.0,0.33,0.047,5.0
25%,2022092000.0,1001.25,5.38,2.374125,11.0
50%,2022100000.0,2029.5,7.61,3.5255,11.0
75%,2022102000.0,3037.75,10.49,5.008875,11.0
max,2022102000.0,5120.0,42.93,22.229375,11.0


In [12]:
v_late_individual.describe()

Unnamed: 0,gameId,playId,nflId,totalDistance,teamTotalDistance,averageSpeed,numPlayerMoved
count,140954.0,140954.0,140954.0,140954.0,140954.0,140954.0,140954.0
mean,2022097000.0,2033.989387,47407.4759,0.768735,8.456089,0.361681,10.790464
std,4738.238,1186.636721,4977.426674,0.997052,4.484956,0.515771,0.618029
min,2022091000.0,54.0,33131.0,0.0,0.33,0.0,5.0
25%,2022092000.0,1001.0,43478.0,0.18,5.38,0.034,11.0
50%,2022100000.0,2029.5,46456.0,0.42,7.61,0.17525,11.0
75%,2022102000.0,3038.0,52538.0,0.98,10.49,0.484,11.0
max,2022102000.0,5120.0,55241.0,13.58,42.93,6.7125,11.0


In [32]:
print(v_late_individual.head)

<bound method NDFrame.head of             gameId  playId    nflId  totalDistance  teamTotalDistance  \
0       2022090800      56  38577.0           0.13               6.35   
1       2022090800      56  41239.0           0.10               6.35   
2       2022090800      56  42816.0           0.48               6.35   
3       2022090800      56  43294.0           1.12               6.35   
4       2022090800      56  43298.0           0.00               6.35   
...            ...     ...      ...            ...                ...   
140949  2022102400    3724  44991.0           0.65              12.56   
140950  2022102400    3724  46212.0           1.77              12.56   
140951  2022102400    3724  47826.0           0.87              12.56   
140952  2022102400    3724  52495.0           1.02              12.56   
140953  2022102400    3724  52991.0           2.19              12.56   

        averageSpeed  numPlayerMoved  
0             0.0315               9  
1             0

In [16]:
# Assuming your DataFrame is named v_late_total

# First, calculate the quantiles
q1_dist = v_late_total['teamTotalDistance'].quantile(0.25)
q3_dist = v_late_total['teamTotalDistance'].quantile(0.75)

q1_speed = v_late_total['teamAverageSpeed'].quantile(0.25)
q3_speed = v_late_total['teamAverageSpeed'].quantile(0.75)

# Now create groups for teamTotalDistance
dist_low = v_late_total[v_late_total['teamTotalDistance'] < q1_dist][['gameId', 'playId']]
dist_mid = v_late_total[(v_late_total['teamTotalDistance'] >= q1_dist) & 
                        (v_late_total['teamTotalDistance'] <= q3_dist)][['gameId', 'playId']]
dist_high = v_late_total[v_late_total['teamTotalDistance'] > q3_dist][['gameId', 'playId']]

# Create groups for teamAverageSpeed
speed_low = v_late_total[v_late_total['teamAverageSpeed'] < q1_speed][['gameId', 'playId']]
speed_mid = v_late_total[(v_late_total['teamAverageSpeed'] >= q1_speed) & 
                         (v_late_total['teamAverageSpeed'] <= q3_speed)][['gameId', 'playId']]
speed_high = v_late_total[v_late_total['teamAverageSpeed'] > q3_speed][['gameId', 'playId']]

In [None]:
plays = plays2[["gameId", "playId","expectedPointsAdded","yardsGained"]]

# Join each distance group with plays
dist_low_joined = plays.merge(dist_low, on=['gameId', 'playId'], how='inner')
dist_mid_joined = plays.merge(dist_mid, on=['gameId', 'playId'], how='inner')
dist_high_joined = plays.merge(dist_high, on=['gameId', 'playId'], how='inner')

# Join each speed group with plays
speed_low_joined = plays.merge(speed_low, on=['gameId', 'playId'], how='inner')
speed_mid_joined = plays.merge(speed_mid, on=['gameId', 'playId'], how='inner')
speed_high_joined = plays.merge(speed_high, on=['gameId', 'playId'], how='inner')

print(speed_high_joined.head)
print(dist_high_joined.head)

<bound method NDFrame.head of           gameId  playId  expectedPointsAdded  yardsGained
0     2022091111     923             2.483530           42
1     2022091803    1222            -1.418779            0
2     2022092509    2694            -0.587710           -1
3     2022101700    3813            -0.573664            5
4     2022101609     888             0.683543            6
...          ...     ...                  ...          ...
3199  2022100910    3452             3.036551           27
3200  2022101606    1217             3.055281           18
3201  2022091900    3189            -0.106339            3
3202  2022100901    2213             1.512348           23
3203  2022092502    2337             1.073898           10

[3204 rows x 4 columns]>
<bound method NDFrame.head of           gameId  playId  expectedPointsAdded  yardsGained
0     2022102308    3304            -0.373242            0
1     2022091111     923             2.483530           42
2     2022091803    1222     

In [18]:
def print_group_stats(group, group_name):
    epa_mean = group['expectedPointsAdded'].mean()
    yards_mean = group['yardsGained'].mean()
    print(f"{group_name} — EPA Mean: {epa_mean:.3f}, Yards Gained Mean: {yards_mean:.2f}")

# Distance-based groups
print_group_stats(dist_low_joined, "Distance < 25%")
print_group_stats(dist_mid_joined, "Distance 25%-75%")
print_group_stats(dist_high_joined, "Distance > 75%")

# Speed-based groups
print_group_stats(speed_low_joined, "Speed < 25%")
print_group_stats(speed_mid_joined, "Speed 25%-75%")
print_group_stats(speed_high_joined, "Speed > 75%")


Distance < 25% — EPA Mean: -0.018, Yards Gained Mean: 5.69
Distance 25%-75% — EPA Mean: -0.029, Yards Gained Mean: 5.48
Distance > 75% — EPA Mean: -0.046, Yards Gained Mean: 5.37
Speed < 25% — EPA Mean: -0.011, Yards Gained Mean: 5.61
Speed 25%-75% — EPA Mean: -0.038, Yards Gained Mean: 5.49
Speed > 75% — EPA Mean: -0.035, Yards Gained Mean: 5.43


#### Negative correlation with yards and epa???? Didn't expect this?!

Distance < 25% — EPA Mean: -0.018, Yards Gained Mean: 5.69
Distance 25%-75% — EPA Mean: -0.029, Yards Gained Mean: 5.48
Distance > 75% — EPA Mean: -0.046, Yards Gained Mean: 5.37
Speed < 25% — EPA Mean: -0.011, Yards Gained Mean: 5.61
Speed 25%-75% — EPA Mean: -0.038, Yards Gained Mean: 5.49
Speed > 75% — EPA Mean: -0.035, Yards Gained Mean: 5.43

## Editing it so only greater movements count

In [21]:
# Filter the high-speed group directly from v_late_total
q3_speed = v_late_total['teamAverageSpeed'].quantile(0.75)

high_speed_group = v_late_total[v_late_total['teamAverageSpeed'] > q3_speed]

# Sort by teamAverageSpeed in descending order and select just gameId and playId
sorted_high_speed = high_speed_group.sort_values(by='teamAverageSpeed', ascending=False)[['gameId', 'playId','teamAverageSpeed']]

print(high_speed_group)


           gameId  playId  teamTotalDistance  teamAverageSpeed  numPlayerMoved
6      2022090800     212              14.19          6.875000              11
8      2022090800     299              16.24          7.969500              11
9      2022090800     343              11.59          5.715500              11
11     2022090800     393              10.37          5.022000              11
12     2022090800     414              10.32          5.089500              11
...           ...     ...                ...               ...             ...
12797  2022102400    3233              11.88          5.897500              11
12805  2022102400    3527              12.44          6.043000              11
12807  2022102400    3580              24.94         12.278000              11
12808  2022102400    3601              16.66          9.153889              11
12813  2022102400    3724              12.56          6.072000              11

[3204 rows x 5 columns]


### 4th quarter is weird

In [26]:
plays = plays2[["gameId", "playId","expectedPointsAdded","yardsGained", "quarter"]]

# Join each distance group with plays
v_late_total_joined = plays.merge(v_late_total, on=['gameId', 'playId'], how='inner')

v_late_123 = v_late_total_joined[v_late_total_joined["quarter"] != 4]
v_late_4 = v_late_total_joined[v_late_total_joined["quarter"] == 4]

v_late_123.describe()

Unnamed: 0,gameId,playId,expectedPointsAdded,yardsGained,quarter,teamTotalDistance,teamAverageSpeed,numPlayerMoved
count,9287.0,9287.0,9287.0,9287.0,9287.0,9287.0,9287.0,9287.0
mean,2022097000.0,1490.902014,-0.015349,5.611823,2.034564,8.439511,3.956427,10.774308
std,4729.074,901.112606,1.389243,9.003419,0.846112,4.432711,2.318864,0.638434
min,2022091000.0,54.0,-10.357486,-68.0,1.0,0.5,0.051,5.0
25%,2022092000.0,729.5,-0.655365,0.0,1.0,5.38,2.372,11.0
50%,2022100000.0,1439.0,-0.181058,3.0,2.0,7.62,3.5165,11.0
75%,2022102000.0,2219.0,0.728634,8.0,3.0,10.45,4.981214,11.0
max,2022102000.0,5120.0,8.698986,98.0,5.0,42.93,21.448889,11.0


In [27]:
v_late_4.describe()

Unnamed: 0,gameId,playId,expectedPointsAdded,yardsGained,quarter,teamTotalDistance,teamAverageSpeed,numPlayerMoved
count,3527.0,3527.0,3527.0,3527.0,3527.0,3527.0,3527.0,3527.0
mean,2022097000.0,3464.001418,-0.070339,5.216898,4.0,8.499742,4.036594,10.833003
std,4761.811,395.470403,1.441421,8.664757,0.0,4.620615,2.44608,0.558703
min,2022091000.0,2537.0,-13.0236,-16.0,4.0,0.33,0.047,5.0
25%,2022092000.0,3166.0,-0.614184,0.0,4.0,5.365,2.382,11.0
50%,2022100000.0,3449.0,-0.150488,3.0,4.0,7.58,3.568,11.0
75%,2022102000.0,3745.0,0.608765,8.0,4.0,10.63,5.0965,11.0
max,2022102000.0,4596.0,6.326126,75.0,4.0,41.68,22.229375,11.0


In [56]:
v_late_individual_filtered = v_late_individual[
    ((v_late_individual["totalDistance"] > 5) | (v_late_individual["averageSpeed"] > 5))
]

print(v_late_individual_filtered.head)

<bound method NDFrame.head of             gameId  playId    nflId  totalDistance  teamTotalDistance  \
76      2022090800     212  53532.0           5.28              14.19   
203     2022090800     569  47844.0           5.01              17.86   
279     2022090800     818  44976.0           6.86              11.26   
452     2022090800    1358  40166.0           7.07              14.22   
1558    2022091100     828  52542.0           6.94              18.95   
...            ...     ...      ...            ...                ...   
140106  2022102400    1301  54586.0           7.07              13.41   
140314  2022102400    1853  54586.0           6.11              19.86   
140381  2022102400    2130  52991.0           5.52              12.01   
140420  2022102400    2260  43522.0           7.01              11.82   
140885  2022102400    3580  54504.0           7.78              24.94   

        averageSpeed  numPlayerMoved  
76            2.7090              11  
203           2

In [57]:
# Group and aggregate
aggregated = v_late_individual_filtered.groupby(['gameId', 'playId']).agg({
    'totalDistance': 'sum',
    'averageSpeed': 'sum',
    'nflId': 'count'  # use count to get numPlayerMoved
}).reset_index()

# Rename columns for clarity
aggregated = aggregated.rename(columns={
    'totalDistance': 'teamTotalDistance',
    'averageSpeed': 'teamAverageSpeed',
    'nflId': 'numPlayerMoved'
})

print(aggregated.head)


<bound method NDFrame.head of           gameId  playId  teamTotalDistance  teamAverageSpeed  numPlayerMoved
0     2022090800     212               5.28            2.7090               1
1     2022090800     569               5.01            2.5125               1
2     2022090800     818               6.86            3.4815               1
3     2022090800    1358               7.07            3.5275               1
4     2022091100     828               6.94            3.4690               1
...          ...     ...                ...               ...             ...
1062  2022102400    1301               7.07            3.5750               1
1063  2022102400    1853               6.11            2.9980               1
1064  2022102400    2130               5.52            2.7930               1
1065  2022102400    2260               7.01            3.5210               1
1066  2022102400    3580               7.78            3.8825               1

[1067 rows x 5 columns]>


In [62]:
player_play = pd.read_csv("~/nfl-data-bowl/Sample_Data/Raw/player_play.csv")
motion_zero = player_play[player_play['motionSinceLineset'] == 0]

# Step 2: Get unique gameId/playId pairs from that filtered set
valid_plays = motion_zero[['gameId', 'playId']].drop_duplicates()

# Step 3: Merge with plays to keep only those rows
filtered_plays = plays2.merge(valid_plays, on=['gameId', 'playId'], how='inner')

print(filtered_plays.head)


<bound method NDFrame.head of            gameId  playId                                    playDescription  \
0      2022102302    2655  (1:54) (Shotgun) J.Burrow pass short middle to...   
1      2022091809    3698  (2:13) (Shotgun) J.Burrow pass short right to ...   
2      2022103004    3146  (2:00) (Shotgun) D.Mills pass short right to D...   
3      2022110610     348  (9:28) (Shotgun) P.Mahomes pass short left to ...   
4      2022102700    2799  (2:16) (Shotgun) L.Jackson up the middle to TB...   
...           ...     ...                                                ...   
14031  2022100901    2213  (15:00) K.Pickett pass deep right to D.Johnson...   
14032  2022091100    1046  (13:41) (Shotgun) C.Patterson right tackle to ...   
14033  2022110604    1051  (12:49) (Shotgun) T.Lawrence pass incomplete s...   
14034  2022103005    3492  (12:32) (Shotgun) K.Cousins pass incomplete de...   
14035  2022101602    3046  (15:00) Br.Hall up the middle for 34 yards, TO...   

       qu

In [64]:
plays = filtered_plays[["gameId", "playId","expectedPointsAdded","yardsGained"]]

# Left join plays with the aggregated data
merged = plays.merge(aggregated, on=['gameId', 'playId'], how='left')

# Fill missing values (NaN) with 0 for the aggregated stats
merged[['teamTotalDistance', 'teamAverageSpeed', 'numPlayerMoved']] = \
    merged[['teamTotalDistance', 'teamAverageSpeed', 'numPlayerMoved']].fillna(0)
    
merged.describe()

Unnamed: 0,gameId,playId,expectedPointsAdded,yardsGained,teamTotalDistance,teamAverageSpeed,numPlayerMoved
count,14036.0,14036.0,14036.0,14036.0,14036.0,14036.0,14036.0
mean,2022099000.0,1983.514107,0.082547,6.226275,0.508545,0.253973,0.074451
std,6037.901,1175.193247,1.360552,8.780623,2.068198,1.033531,0.298816
min,2022091000.0,54.0,-13.0236,-61.0,0.0,0.0,0.0
25%,2022092000.0,958.75,-0.568298,0.0,0.0,0.0,0.0
50%,2022101000.0,1960.5,-0.090117,4.0,0.0,0.0,0.0
75%,2022102000.0,2979.0,0.773558,9.0,0.0,0.0,0.0
max,2022111000.0,5120.0,8.698986,98.0,29.63,14.694,4.0


In [1]:
# --- TEAM TOTAL DISTANCE QUANTILES ---
q1_dist = merged['teamTotalDistance'].quantile(0.25)
q3_dist = merged['teamTotalDistance'].quantile(0.75)

dist_low = merged[merged['teamTotalDistance'] < q1_dist]
dist_mid = merged[(merged['teamTotalDistance'] >= q1_dist) & (merged['teamTotalDistance'] <= q3_dist)]
dist_high = merged[merged['teamTotalDistance'] > q3_dist]

print("=== TEAM TOTAL DISTANCE GROUPS ===")
print("Low:")
print(dist_low[['expectedPointsAdded', 'yardsGained']].mean())
print("\nMid:")
print(dist_mid[['expectedPointsAdded', 'yardsGained']].mean())
print("\nHigh:")
print(dist_high[['expectedPointsAdded', 'yardsGained']].mean())


# --- TEAM AVERAGE SPEED QUANTILES ---
q1_speed = merged['teamAverageSpeed'].quantile(0.25)
q3_speed = merged['teamAverageSpeed'].quantile(0.75)

speed_low = merged[merged['teamAverageSpeed'] < q1_speed]
speed_mid = merged[(merged['teamAverageSpeed'] >= q1_speed) & (merged['teamAverageSpeed'] <= q3_speed)]
speed_high = merged[merged['teamAverageSpeed'] > q3_speed]

print("\n=== TEAM AVERAGE SPEED GROUPS ===")
print("Low:")
print(speed_low[['expectedPointsAdded', 'yardsGained']].mean())
print("\nMid:")
print(speed_mid[['expectedPointsAdded', 'yardsGained']].mean())
print("\nHigh:")
print(speed_high[['expectedPointsAdded', 'yardsGained']].mean())

# --- NUMBER OF PLAYERS MOVED QUANTILES ---
q1_moved = merged['numPlayerMoved'].quantile(0.25)
q3_moved = merged['numPlayerMoved'].quantile(0.75)

moved_low = merged[merged['numPlayerMoved'] < q1_moved]
moved_mid = merged[(merged['numPlayerMoved'] >= q1_moved) & (merged['numPlayerMoved'] <= q3_moved)]
moved_high = merged[merged['numPlayerMoved'] > q3_moved]

print("\n=== NUM PLAYERS MOVED GROUPS ===")
print("Low:")
print(moved_low[['expectedPointsAdded', 'yardsGained']].mean())
print("\nMid:")
print(moved_mid[['expectedPointsAdded', 'yardsGained']].mean())
print("\nHigh:")
print(moved_high[['expectedPointsAdded', 'yardsGained']].mean())



NameError: name 'merged' is not defined