In [99]:
import os
import datetime
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import Tracab as tracab
import Tracking_Visuals as vis
import Tracking_Velocities as vel
import Tracking_Fatigue as fatigue
import helpers
import importlib
# importlib.reload(tracab)
# importlib.reload(helpers)
# importlib.reload(vis)

from scipy.optimize import curve_fit
from collections import defaultdict

# Fatigue model for large dataset

## example read for one player

In [94]:
# EX data
with open('../data_processed/EX_time_series_game_984472.pkl', 'rb') as infile:
    data_dict = pickle.load(infile)

In [95]:
data_dict.keys()

dict_keys(['data_path', 'player_info', 'energy_x', 'ball_in_play'])

In [41]:
# players data
PLAYER_ID_to_JERSEY_NUM_LOC = '../playerid_jerseynum_map.csv'
players_df = pd.read_csv(PLAYER_ID_to_JERSEY_NUM_LOC)

In [40]:
players_df.head()

Unnamed: 0,Match ID,Team,Playerid,Player name,Jersey Num,Starting Position,Starts Match,start_pos_super,first_name,last_name
0,984455,Home,50310,Sebastian Mielitz,28,GK,Start,GK,Sebastian,Mielitz
1,984455,Home,247047,Nicholas Marfelt,18,LFB,Start,FB,Nicholas,Marfelt
2,984455,Home,207273,Stefan Gartenmann,2,RFB,Start,FB,Stefan,Gartenmann
3,984455,Home,56317,Kees Luijckx,5,CD,Start,CB,Kees,Luijckx
4,984455,Home,52560,Marc Pedersen,3,CD,Start,CB,Marc,Pedersen


# Get all games data path

In [91]:
# get paths
data_path = os.path.join(os.path.dirname(os.getcwd()), 'data_processed')
all_data_processed = [os.path.join(data_path, f) for f in os.listdir(data_path)]
game_dicts = {f.split('/')[-1].split('.')[0].split('_')[-1]: f for f in all_data_processed 
              if f.find('.pkl') != -1}

# Fatigue Model

## Fit 2nd degree polynomial on energy expenditure

In [85]:
def poly_2nd(x, b, c):
    return 0 + (b * x) + (c * x ** 2)

def plays_full_game(EX_series, thresh=0.95):
    is_full = np.count_nonzero(EX_series)/len(EX_series) > thresh 
    return is_full

In [None]:
coefs = []

# for each game
for match_id, path in game_dicts.items():    
    with open(path, 'rb') as infile:
        data_dict = pickle.load(infile)
    
    # fit model for each game
    for _, player_id, series in data_dict['energy_x']:
        EX_cumul = np.cumsum(series)
        frame_lst = np.array(range(len(EX_cumul)))
        
        # check if is subbed out
        if not plays_full_game(series):
            continue
        
        # fit second degree polynomial that passes through origin
        popt, pcov = curve_fit(poly_2nd, frame_lst, EX_cumul)
        
        # store info
        coefs.append([match_id, player_id, *popt, *np.sqrt(np.diag(pcov))])
        
    print('Done with match {}'.format(match_id))
    
# dataframe of coefficients
df_coefs = pd.DataFrame(coefs, 
                        columns=['match_id', 'player_id', 'b1', 'b2', 'sigma_b1', 'sigma_b2'])
df_coefs.to_csv('../data_model/model_coefs.csv', index=0)

Done with match 984600
Done with match 984601
Done with match 984577
Done with match 984603
Done with match 984617
Done with match 984602
Done with match 984613
Done with match 984565
Done with match 984571
Done with match 984567
Done with match 984605
Done with match 984610


In [None]:
pd.read_csv(('../data_model/model_coefs.csv'))

Unnamed: 0,match_id,player_id,b1,b2,sigma_b1,sigma_b2
0,984600,68250,2.807164,6.748634e-07,0.001069,9.575698e-09
1,984600,208904,8.995215,-5.770922e-06,0.000853,7.641641e-09
2,984600,51656,10.243082,-1.114597e-05,0.001466,1.313169e-08
3,984600,126317,9.749276,-1.292588e-07,0.00187,1.674551e-08
4,984600,85007,8.067055,-5.418892e-06,0.001702,1.524209e-08
5,984600,435556,8.343799,-2.509435e-06,0.001457,1.304969e-08
6,984600,157704,8.76889,1.687213e-06,0.001297,1.161813e-08
7,984600,423805,9.119596,2.620463e-08,0.002727,2.442427e-08
8,984600,39738,8.666626,-3.67946e-07,0.001152,1.032071e-08
9,984600,162038,9.39689,6.886191e-06,0.001817,1.627038e-08


In [102]:
df_coefs.sample(5)

Unnamed: 0,match_id,player_id,b1,b2,sigma_b1,sigma_b2
8,984600,39738,8.666626,-3.67946e-07,0.001152,1.032071e-08
3,984600,126317,9.749276,-1.292588e-07,0.00187,1.674551e-08
10,984600,113774,9.247364,5.274016e-06,0.001253,1.121963e-08
15,984600,102005,8.39939,-4.889316e-06,0.001419,1.271178e-08
6,984600,157704,8.76889,1.687213e-06,0.001297,1.161813e-08


## Merge player data

In [17]:
# add in info 
df_coefs_w_info = pd.merge(df_coefs.astype({'match_id':int}),
                            players_df[['Match ID', 'Playerid', 'start_pos_super', 'Team']],
                            left_on=['match_id', 'player_id'],
                            right_on=['Match ID', 'Playerid'],
                            how='left').drop(['Match ID', 'Playerid'], axis=1)

# Distribution of coefficients

In [94]:
# get players who play the most, filter out player with few games
n_games = df_coefs['player_id'].value_counts()
most_play_players = n_games[n_games > 3]

In [95]:
# distribution per player
coefs_per_player = df_coefs_w_info.set_index('player_id')\
                                  .loc[most_play_players.index]\
                                  .reset_index()

### B1

# Integrate player features to model
### Get player features

In [33]:
first_last.head(10)

Unnamed: 0,first_name,last_name
0,R.,Skov
1,A.,Skov Olsen
2,V.,Fischer
3,P.,Onuachu
4,D.,Vavro
5,M.,Duelund
6,B.,Sanneh
7,Evander,
8,H.,Mukhtar
9,P.,Ankersen


In [4]:
# read  player data
danish_player_data = pd.read_csv('../DanishSuperLiga_1819_PlayerData.csv')

# get player id to name mapping
name_and_id = players_df[['Playerid', 'Player name', 'first_name', 'last_name']].drop_duplicates()

# split first and last name
first_last = danish_player_data['Player'].str.split(expand=True, n=1)
first_last.columns = ['first_name', 'last_name']
danish_player_data[['first_name', 'last_name']] = first_last

NameError: name 'players_df' is not defined

In [98]:
# adjust to give last name = first name if not first name
last_name_nulls_idx = danish_player_data[danish_player_data['last_name'].isnull()].index
last_nulls = danish_player_data.loc[last_name_nulls_idx]
danish_player_data.loc[last_name_nulls_idx, 'last_name'] = danish_player_data.loc[
    last_name_nulls_idx, 'first_name'
]

In [99]:
danish_player_data.loc[ danish_player_data[danish_player_data['last_name'].isnull()].index]

Unnamed: 0,Player,Team,Position,Age,Market value,Contract expires,Matches played,Minutes played,Goals,xG,...,xA,Birth country,Passport country,Foot,Height,Weight,On loan,first_name,last_name,player_id


In [100]:
# get player id in the player data
for i, first, last, name in danish_player_data[['first_name', 'last_name', 'Player']].reset_index().values:
    # last name
    if last:
        last_matches = name_and_id['Player name'].str.contains(last)
    
    # first name
    if last_matches.any():
        first_matches = name_and_id[last_matches]['first_name'].str.slice(stop=1).str.contains(first[0])
        if len(first_matches[first_matches].index) == 1:
            p_id = name_and_id.loc[first_matches[first_matches].index]['Playerid']
            
            # set
            danish_player_data.loc[i, 'player_id'] = p_id.values[0]
        else:
            print(len(first_matches[first_matches].index))
            

0
3
2
2
3
3
2
2
2
2
2
3
2
2
0
0
3
3
0
0
0
0
2
0
0
0
2
0
3
0
0
2
0
3
0
0
0
0
0


In [64]:
# write clean data
danish_player_data.to_csv('../DanishSuperLiga_1819_PlayerData_processed.csv', index=0)

### Read clean player data set

In [65]:
danish_player_data = pd.read_csv('../DanishSuperLiga_1819_PlayerData_processed.csv')

## Build dataset with player info and EX

In [101]:
# mean EX per player
EX_mean_and_info = pd.merge(
    df_coefs_w_info.groupby('player_id')[['b1', 'b2']].mean().reset_index(),
    danish_player_data,
    on='player_id'
)

In [131]:
EX_mean_and_info[['b1', 'b2']].isnull().count()

b1    121
b2    121
dtype: int64

In [144]:
B.loc[[98, 3]]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


Unnamed: 0,player_id,match_id,b1,b2,sigma_b1,sigma_b2,start_pos_super,Team
98,95194.0,984554.0,13.657691,-1.6e-05,0.001531,1.32554e-08,F,Away


# Regression
One regression per position

In [117]:
from sklearn.linear_model import LinearRegression

In [104]:
# we keep players who played more than 5 games (coefs_per_player)
positions = coefs_per_player['start_pos_super'].unique()

In [194]:
# one model per position
for pos in positions:
    print(pos)
    if pos == 'CB':
        # NEED TO FIGURE CENTER BACKS
        continue
    
    # processing
    features = ['Age', 'Weight']
    X = EX_mean_and_info.set_index(['player_id']).loc[B['player_id'].unique()].dropna()[features]
    Y = coefs_per_player[coefs_per_player['start_pos_super'] == pos][['player_id', 'match_id', 'b1', 'b2']]
    XY = pd.merge(Y, X,
        left_on='player_id', right_on='player_id',
        how='left')
    
    if pos == 'M':
        test1, test2 = X, Y
    
    X = XY[['b1', 'b2']]
    y = XY[['Age', 'Weight']]
    
    
    
    
    # regression
    reg = LinearRegression()
    reg.fit(X, y)
    
    
    
    

GK
CB
M


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [193]:
test1

Unnamed: 0_level_0,Age,Weight
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1
182413,26,86
