# Lines Analysis

## Import

In [1]:
import os

import numpy as np
import pandas as pd

import janitor

## Constants

In [2]:
# Event-type vocabularies used to classify rows of the play-by-play frame
# by their `event_type` value.
#
# NOTE(review): the data also contains 'PULL_OUR_OFFSIDES' and NaN event
# types (see the `df['event_type'].unique()` output at the end of the
# notebook); neither is covered by any list below -- confirm that is intended.

# Events that change the score, then the same events split by the side the
# name credits with the point.
scores = [
    "GOAL",
    "SCORED_ON",
    "CALLAHAN",
    "CALLAHAN_THROWN",
]
we_score = ["GOAL", "CALLAHAN"]
they_score = ["SCORED_ON", "CALLAHAN_THROWN"]

# Possession changes: ours lost (`turnovers`) versus ours gained (`takeaways`).
turnovers = [
    "THROWAWAY",
    "DROP",
    "STALL",
    "CALLAHAN_THROWN",
]
takeaways = [
    "BLOCK",
    "CALLAHAN",
    "STALL_CAUSED",
    "THROWAWAY_CAUSED",
]

pulls = ["PULL_OUT_OF_BOUNDS", "PULL_INBOUNDS"]

pens = [
    "D_PENALTY_ON_THEM",
    "D_PENALTY_ON_US",
    "O_PENALTY_ON_THEM",
    "O_PENALTY_ON_US",
]

quarter_ends = [
    "START_OF_GAME",
    "END_OF_Q1",
    "HALFTIME",
    "END_OF_Q3",
    "GAME_OVER",
]

# Line-setting events; `o_lines` and `d_lines` partition `line_sets`.
line_sets = [
    "SET_D_LINE",
    "SET_O_LINE",
    "SET_D_LINE_NO_PULL",
    "SET_O_LINE_NO_PULL",
]
o_lines = ["SET_O_LINE", "SET_O_LINE_NO_PULL"]
d_lines = ["SET_D_LINE", "SET_D_LINE_NO_PULL"]

timeouts = ["THEIR_MIDPOINT_TIMEOUT", "OUR_MIDPOINT_TIMEOUT"]

other = ["REF_TIMEOUT_DISCUSSION???", "INJURY_ON_O", "INJURY_ON_D"]


## Analysis

### File IO

In [3]:
# Load the three event CSVs. Each file's leading column has no header, so
# pandas labels it "Unnamed: 0"; rename it to `event_index` on load.
# (The `_home` / `_away` files are presumably per-side extracts -- verify.)
index_column_rename = {"Unnamed: 0": "event_index"}

df = pd.read_csv("Flyers_Games.csv").rename(columns=index_column_rename)
df_home = pd.read_csv("Flyers_Games_home.csv").rename(columns=index_column_rename)
df_away = pd.read_csv("Flyers_Games_away.csv").rename(columns=index_column_rename)

In [4]:
# Preview the first 35 rows of the loaded event frame to check the column layout.
df.head(35)

Unnamed: 0,event_index,game_id,date,home_team,away_team,event_counter,team_id,current_quarter,time,event_type,player,x,y,o_point,d_point,point_id,our_score,their_score,line
0,0,2908,2022-06-25,245,242,0,245,1.0,,START_OF_GAME,,,,False,False,0,0,0,
1,1,2908,2022-06-25,245,242,1,245,1.0,,SET_D_LINE,"Michael Arbutine, Cody Coates, Garrett Knobel,...",,,False,True,1,0,0,"['Michael Arbutine', 'Cody Coates', 'Garrett K..."
2,2,2908,2022-06-25,245,242,2,245,1.0,,PULL_INBOUNDS,Michael Arbutine,7.57,80.31,False,True,1,0,0,
3,3,2908,2022-06-25,245,242,3,245,1.0,,BLOCK,Michael Arbutine,,,False,True,1,0,0,
4,4,2908,2022-06-25,245,242,4,245,1.0,,POSSESSION,Michael Arbutine,-0.48,39.95,False,True,1,0,0,
5,5,2908,2022-06-25,245,242,5,245,1.0,,POSSESSION,Sean Plunkett,7.36,44.31,False,True,1,0,0,
6,6,2908,2022-06-25,245,242,6,245,1.0,,DROP,Clayton Partlow,22.91,60.61,False,True,1,0,0,
7,7,2908,2022-06-25,245,242,7,245,1.0,Q1 11:10,SCORED_ON,,,,False,True,1,0,1,
8,8,2908,2022-06-25,245,242,8,245,1.0,,SET_O_LINE,"Tannon Hedges, Carl Johnson, Billy O'Bryan, Un...",,,True,False,2,0,1,"['Tannon Hedges', 'Carl Johnson', ""Billy O'Bry..."
9,9,2908,2022-06-25,245,242,9,245,1.0,,POSSESSION,Tannon Hedges,0.0,40.0,True,False,2,0,1,


### Cleaning

In [5]:
# Mark whether the team a row belongs to is the fixture's home or away side.
df['is_home'] = df['home_team'].eq(df['team_id'])
df['is_away'] = df['away_team'].eq(df['team_id'])

# Composite point key: game id shifted left three decimal digits plus the
# point number (assumes point_id < 1000 -- TODO confirm).
df["game_point_id"] = 1000 * df["game_id"] + df["point_id"]

# Keep `player` only on line-setting rows (where it holds the roster string);
# every other row's `player` is overwritten with an empty string in `df`.
line_event_mask = df['event_type'].isin(line_sets)
df['player'] = np.where(line_event_mask, df['player'], "")

# 1/0 indicators: the row is a `we_score` event recorded for the home / away team.
own_score_mask = df['event_type'].isin(we_score)
df['home_score'] = np.where(df['is_home'] & own_score_mask, 1, 0)
df['away_score'] = np.where(df['is_away'] & own_score_mask, 1, 0)

### LINES

In [6]:
# Every line-setting event is one fielded line; work on an independent copy.
all_lines = df.loc[df['event_type'].isin(line_sets)].copy()

# Number the lines inside each point, 0-based, in row order. cumcount()
# depends only on row order within the group, not on any column's values.
all_lines['line_number_in_point'] = all_lines.groupby('game_point_id').cumcount()

# Largest 0-based line number seen in any point (NaN if there are no lines).
max_line_number = all_lines['line_number_in_point'].max()
print("Max Number of Lines in a point:", max_line_number)

# The line id below reserves exactly one decimal digit for the line number.
# Enforce that instead of trusting it: with 10+ lines in a point, ids from
# neighbouring points would silently collide.
if max_line_number >= 10:
    raise ValueError(
        f"line_number_in_point reaches {max_line_number}; "
        "game_point_line_id only has room for values 0-9"
    )

# Line id within point within game: point key followed by the line number.
all_lines['game_point_line_id'] = all_lines['game_point_id'] * 10 + all_lines['line_number_in_point']

Max Number of Lines in a point: 7


In [7]:
def _parse_line_players(raw):
    """Split a comma-separated roster string into a list of clean names.

    Parameters
    ----------
    raw : str or any
        The `player` cell of a line-setting row.

    Returns
    -------
    list of str
        Names with surrounding whitespace removed. Empty tokens (such as the
        one produced by a trailing separator) are dropped. A non-string
        value such as NaN yields an empty list.
    """
    if not isinstance(raw, str):
        return []
    return [name.strip() for name in raw.split(",") if name.strip()]


# Players in each line.
# The previous version always discarded the final token (`y[:-1]`). That is
# only right if every roster string ends with a separator (presumably the case
# in this export -- verify); otherwise it drops a real player, and a NaN
# `player` raised TypeError. Dropping empty tokens gives the same result for
# trailing-separator strings and is safe for the other cases.
all_lines['line_players'] = all_lines['player'].map(_parse_line_players)

In [8]:
# Last line fielded by each team in each point.
# Line ids increase with the order lines were set inside a point, so a team's
# last line is the row whose id equals that team's maximum id for the point.
last_line_by_point_idx = (
    all_lines.groupby(['game_point_id', 'team_id'], sort=False)['game_point_line_id'].transform('max')
    == all_lines['game_point_line_id']
)
last_lines = all_lines.loc[last_line_by_point_idx].sort_values('game_point_id')

# Attach the flag straight from the boolean mask. The earlier version rebuilt
# it by left-merging `all_lines` with `last_lines` and reading the merge
# indicator, which would duplicate rows if line ids were ever not unique.
# reset_index keeps the fresh 0..n-1 index that merge used to produce.
all_lines = all_lines.assign(is_last_line=last_line_by_point_idx).reset_index(drop=True)

In [9]:
# Split the lines by side: rows flagged `is_home` versus all remaining rows.
home_row_mask = all_lines['is_home']
home_lines = all_lines[home_row_mask].copy()
away_lines = all_lines[~home_row_mask].copy()

In [10]:
# List the columns available on `all_lines` before choosing which to keep.
all_lines.columns

Index(['event_index', 'game_id', 'date', 'home_team', 'away_team',
       'event_counter', 'team_id', 'current_quarter', 'time', 'event_type',
       'player', 'x', 'y', 'o_point', 'd_point', 'point_id', 'our_score',
       'their_score', 'line', 'is_home', 'is_away', 'game_point_id',
       'home_score', 'away_score', 'line_number_in_point',
       'game_point_line_id', 'line_players', 'is_last_line'],
      dtype='object')

In [11]:
# Columns carried over from the home-side lines ("a"): game/point context
# plus the line's own details.
keep_cols_a = [
    "date", "game_id", "game_point_id", "game_point_line_id",
    "home_team", "away_team", "team_id", "current_quarter",
    "event_type", "o_point", "d_point", "point_id",
    "our_score", "their_score", "is_home", "is_away", "line_players",
    "is_last_line", "line_number_in_point",
]

# Columns carried over from the away-side lines ("b"): the join key plus
# line details only, since the game/point context already comes from side "a".
keep_cols_b = [
    "game_point_id", "game_point_line_id", "team_id", "line_players", "is_last_line",
]

# Suffix the per-side columns so both sides can sit in one row.
rename_cols_a = dict(
    team_id="team_id_a",
    is_home="is_home_a",
    is_away="is_away_a",
    game_point_line_id="line_id_a",
    line_players="line_players_a",
    is_last_line="is_last_line_a",
)

rename_cols_b = dict(
    team_id="team_id_b",
    game_point_line_id="line_id_b",
    line_players="line_players_b",
    is_last_line="is_last_line_b",
)

home_side = home_lines[keep_cols_a].rename(columns=rename_cols_a)
away_side = away_lines[keep_cols_b].rename(columns=rename_cols_b)

# Outer join on the point key: every home line of a point is paired with every
# away line of the same point, and points present on only one side are kept.
final_lines = home_side.merge(away_side, how="outer", on="game_point_id")
print(final_lines.shape)
final_lines.head()

(568, 23)


Unnamed: 0,date,game_id,game_point_id,line_id_a,home_team,away_team,team_id_a,current_quarter,event_type,o_point,...,their_score,is_home_a,is_away_a,line_players_a,is_last_line_a,line_number_in_point,line_id_b,team_id_b,line_players_b,is_last_line_b
0,2022-06-25,2908,2908001,29080010,245,242,245,1.0,SET_D_LINE,False,...,0,True,False,"[Michael Arbutine, Cody Coates, Garrett Knobel...",True,0,29080011,242,"[Terrence Mitchell, Matt Gouchoe-Hanas, Alex D...",True
1,2022-06-25,2908,2908002,29080020,245,242,245,1.0,SET_O_LINE,True,...,1,True,False,"[Tannon Hedges, Carl Johnson, Billy O'Bryan, U...",True,0,29080021,242,"[Suraj Madiraju, Seth Weaver, David Richardson...",True
2,2022-06-25,2908,2908003,29080030,245,242,245,1.0,SET_O_LINE,True,...,2,True,False,"[Billy O'Bryan, Logan Diehl, Sean Plunkett, Au...",True,0,29080031,242,"[Wilson Matthews, Alex Cloud, Michael Lee, Dyl...",True
3,2022-06-25,2908,2908004,29080040,245,242,245,1.0,SET_D_LINE,False,...,2,True,False,"[Cody Coates, Garrett Knobel, Clayton Partlow,...",False,0,29080042,242,"[Alex Davis, Jacob Fairfax, Henry Fisher, Matt...",False
4,2022-06-25,2908,2908004,29080040,245,242,245,1.0,SET_D_LINE,False,...,2,True,False,"[Cody Coates, Garrett Knobel, Clayton Partlow,...",False,0,29080043,242,"[Seth Weaver, Suraj Madiraju, William Coffin, ...",True


In [12]:
# Rows whose event type is one of the `quarter_ends` markers.
df_quarter_ends = df.loc[df['event_type'].isin(quarter_ends)].copy()

# A bare `.shape` in the middle of a cell is evaluated and thrown away (only
# the last expression is displayed), so print it explicitly.
print(df_quarter_ends.shape)
df_quarter_ends.head()

Unnamed: 0,event_index,game_id,date,home_team,away_team,event_counter,team_id,current_quarter,time,event_type,...,d_point,point_id,our_score,their_score,line,is_home,is_away,game_point_id,home_score,away_score
0,0,2908,2022-06-25,245,242,0,245,1.0,,START_OF_GAME,...,False,0,0,0,,True,False,2908000,0,0
110,110,2908,2022-06-25,245,242,110,245,2.0,,END_OF_Q1,...,True,9,3,5,,True,False,2908009,0,0
227,227,2908,2022-06-25,245,242,227,245,3.0,,HALFTIME,...,False,19,6,11,,True,False,2908019,0,0
324,324,2908,2022-06-25,245,242,324,245,4.0,,END_OF_Q3,...,True,27,9,15,,True,False,2908027,0,0
422,422,2908,2022-06-25,245,242,422,245,4.0,,GAME_OVER,...,False,42,15,23,,True,False,2908042,0,0


### POINTS

In [13]:
# Rows whose event type is in `we_score` (GOAL / CALLAHAN), copied so later
# edits cannot touch `df`.
scoring_row_mask = df['event_type'].isin(we_score)
df_scores = df[scoring_row_mask].copy()
df_scores.head(10)

Unnamed: 0,event_index,game_id,date,home_team,away_team,event_counter,team_id,current_quarter,time,event_type,...,d_point,point_id,our_score,their_score,line,is_home,is_away,game_point_id,home_score,away_score
37,37,2908,2022-06-25,245,242,37,245,1.0,Q1 07:17,GOAL,...,False,3,1,2,,True,False,2908003,1,0
47,47,2908,2022-06-25,245,242,47,245,1.0,Q1 05:23,GOAL,...,True,4,2,2,,True,False,2908004,1,0
107,107,2908,2022-06-25,245,242,107,245,1.0,Q1 00:02,GOAL,...,False,8,3,5,,True,False,2908008,1,0
124,124,2908,2022-06-25,245,242,124,245,2.0,Q2 10:36,GOAL,...,False,10,4,5,,True,False,2908010,1,0
177,177,2908,2022-06-25,245,242,177,245,2.0,Q2 04:43,GOAL,...,False,14,5,8,,True,False,2908014,1,0
196,196,2908,2022-06-25,245,242,196,245,2.0,Q2 02:44,GOAL,...,False,16,6,9,,True,False,2908016,1,0
245,245,2908,2022-06-25,245,242,245,245,3.0,Q3 09:31,GOAL,...,False,21,7,12,,True,False,2908021,1,0
287,287,2908,2022-06-25,245,242,287,245,3.0,Q3 03:55,GOAL,...,False,23,8,13,,True,False,2908023,1,0
317,317,2908,2022-06-25,245,242,317,245,3.0,Q3 00:24,GOAL,...,False,26,9,15,,True,False,2908026,1,0
333,333,2908,2022-06-25,245,242,333,245,4.0,Q4 11:19,GOAL,...,False,28,10,15,,True,False,2908028,1,0


In [14]:
# List the columns available on `df_scores` before choosing which to keep.
df_scores.columns

Index(['event_index', 'game_id', 'date', 'home_team', 'away_team',
       'event_counter', 'team_id', 'current_quarter', 'time', 'event_type',
       'player', 'x', 'y', 'o_point', 'd_point', 'point_id', 'our_score',
       'their_score', 'line', 'is_home', 'is_away', 'game_point_id',
       'home_score', 'away_score'],
      dtype='object')

In [15]:
# Columns describing each scoring event, keyed by point.
# NOTE(review): 'o_point', 'our_score' and 'their_score' also exist in
# `final_lines`, so the later merge with it emits `_x` / `_y` suffixed
# duplicates -- consider renaming here if those columns should be told apart.
score_keep_cols = [
    "game_point_id",
    "time",
    "event_type",
    "o_point",
    "is_home",
    "is_away",
    "our_score",
    "their_score",
]

score_rename_cols = dict(event_type="score_type")

final_scores = (
    df_scores[score_keep_cols]
    .rename(columns=score_rename_cols)
    .sort_values("game_point_id")
)

### FINAL DF

In [16]:
# Attach the scoring event (if any) to every line pairing of the same point.
# The outer join with `indicator=True` adds a `_merge` column recording whether
# each row matched on both sides or exists on one side only.
df_final = pd.merge(
    final_lines,
    final_scores,
    how="outer",
    on="game_point_id",
    indicator=True,
)

# Match diagnostics: how many rows found a score and how many did not.
df_final["_merge"].value_counts()

both          527
left_only      41
right_only      0
Name: _merge, dtype: int64

In [17]:
df_final.columns

Index(['date', 'game_id', 'game_point_id', 'line_id_a', 'home_team',
       'away_team', 'team_id_a', 'current_quarter', 'event_type', 'o_point_x',
       'd_point', 'point_id', 'our_score_x', 'their_score_x', 'is_home_a',
       'is_away_a', 'line_players_a', 'is_last_line_a', 'line_number_in_point',
       'line_id_b', 'team_id_b', 'line_players_b', 'is_last_line_b', 'time',
       'score_type', 'o_point_y', 'is_home', 'is_away', 'our_score_y',
       'their_score_y', '_merge'],
      dtype='object')

In [21]:
# Export the merged lines-and-scores table without the row index.
# The export includes the `_merge` diagnostic column, and the list-valued
# `line_players_*` columns are written as their string representation.
# NOTE(review): this cell's execution count (21) is out of sequence with the
# cells around it -- restart the kernel and run all cells before trusting the file.
df_final.to_csv("Test_Data_220730_1000.csv", index=False)

In [19]:
# Inventory of every distinct event type in the data, for checking against the
# constant lists defined at the top of the notebook.
# NOTE(review): the output includes NaN and 'PULL_OUR_OFFSIDES', which none of
# those lists contain.
df['event_type'].unique()

array(['START_OF_GAME', 'SET_D_LINE', 'PULL_INBOUNDS', 'BLOCK',
       'POSSESSION', 'DROP', 'SCORED_ON', 'SET_O_LINE', 'THROWAWAY',
       'GOAL', 'OUR_MIDPOINT_TIMEOUT', 'SET_O_LINE_NO_PULL',
       'PULL_OUT_OF_BOUNDS', 'THROWAWAY_CAUSED', 'O_PENALTY_ON_US',
       'D_PENALTY_ON_US', 'END_OF_Q1', 'D_PENALTY_ON_THEM',
       'O_PENALTY_ON_THEM', 'INJURY_ON_D', 'SET_D_LINE_NO_PULL', 'STALL',
       'HALFTIME', 'THEIR_MIDPOINT_TIMEOUT', 'END_OF_Q3', 'GAME_OVER',
       nan, 'INJURY_ON_O', 'STALL_CAUSED', 'PULL_OUR_OFFSIDES',
       'REF_TIMEOUT_DISCUSSION???', 'CALLAHAN', 'CALLAHAN_THROWN'],
      dtype=object)