In [18]:
import os
import datetime
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import Tracab as tracab
import Tracking_Visuals as vis
import Tracking_Velocities as vel
import Tracking_Fatigue as fatigue
import helpers
import importlib
# importlib.reload(tracab)
# importlib.reload(helpers)
# importlib.reload(vis)

# Model 2

# Read player and energy expenditure data

In [19]:
# EX data
with open(os.path.join('../data_processed/EX_time_series_game_984472.pkl'), 'rb') as infile:
    data_dict = pickle.load(infile)

In [20]:
data_dict.keys()

dict_keys(['data_path', 'player_info', 'energy_x'])

In [7]:
# players data
players_df = pd.read_csv('../playerid_jerseynum_map.csv')

In [8]:
players_df.head()

Unnamed: 0,Match ID,Team,Playerid,Player name,Jersey Num,Starting Position,Starts Match,start_pos_super,first_name,last_name
0,984455,Home,50310,Sebastian Mielitz,28,GK,Start,GK,Sebastian,Mielitz
1,984455,Home,247047,Nicholas Marfelt,18,LFB,Start,FB,Nicholas,Marfelt
2,984455,Home,207273,Stefan Gartenmann,2,RFB,Start,FB,Stefan,Gartenmann
3,984455,Home,56317,Kees Luijckx,5,CD,Start,CB,Kees,Luijckx
4,984455,Home,52560,Marc Pedersen,3,CD,Start,CB,Marc,Pedersen


In [9]:
# for player, positions in players_df[['Playerid', 'start_pos_super']]\
#         .groupby('Playerid')['start_pos_super'].apply(list).iteritems():
#     print(player, np.unique(positions))

In [10]:
# players_df[players_df['Playerid']==45097]

In [11]:
# players_df[['Match ID', 'Playerid', 'start_pos_super']]

In [5]:
PLAYER_ID_to_JERSEY_NUM_LOC = '../playerid_jerseynum_map.csv'

In [12]:
df = pd.read_csv(PLAYER_ID_to_JERSEY_NUM_LOC)

In [14]:
df['Match ID'].max()

984634

In [3]:
all_cluster_games = [x for x, _, _ in os.walk(cluster_dir_path) if x.split('/')[-1].isnumeric()]


NameError: name 'cluster_dir_path' is not defined

# Fatigue Model

## Fit 2nd degree polynomial on energy expenditure

In [12]:
from scipy.optimize import curve_fit
from collections import defaultdict

In [16]:
def poly_2nd(x, b, c):
    return 0 + (b * x) + (c * x ** 2)

def plays_full_game(EX_series, thresh=0.95):
    is_full = np.count_nonzero(EX_series)/len(EX_series) > thresh 
    return is_full

In [14]:
coefs = []

# for each game
for match_id, data in data_dict.items():    
    for _, player_id, series in data['energy_x']:
        EX_cumul = np.cumsum(series)
        frame_lst = np.array(range(len(EX_cumul)))
        
        # check if is subbed out
        if not plays_full_game(series):
            continue
        
        # fit second degree polynomial that passes through origin
        popt, pcov = curve_fit(poly_2nd, frame_lst, EX_cumul)
        
        
        # store info
        coefs.append([match_id, player_id, *popt, *np.sqrt(np.diag(pcov))])
        
    print('Done with match {}'.format(match_id))

Done with match 984554
Done with match 984509
Done with match 984590
Done with match 984539
Done with match 984530
Done with match 984481
Done with match 984495
Done with match 984468
Done with match 984460
Done with match 984523
Done with match 984570
Done with match 984579
Done with match 984505
Done with match 984558
Done with match 984476
Done with match 984491
Done with match 984455
Done with match 984518
Done with match 984544
Done with match 984575


In [15]:
# dataframe of coefficients
df_coefs = pd.DataFrame(coefs, 
                        columns=['match_id', 'player_id', 'b1', 'b2', 'sigma_b1', 'sigma_b2'])

NameError: name 'coefs' is not defined

In [16]:
df_coefs.sample(5)

Unnamed: 0,match_id,player_id,b1,b2,sigma_b1,sigma_b2
292,984544,79462,8.076311,1.006757e-05,0.001671,1.484766e-08
44,984590,95194,11.714477,-3.435686e-06,0.00183,1.614592e-08
119,984468,111319,3.565601,2.284048e-06,0.000898,8.30028e-09
216,984558,76001,10.906742,1.701355e-06,0.001365,1.186156e-08
62,984539,180169,8.845356,-6.258773e-07,0.002367,2.062615e-08


## Merge player data

In [17]:
# add in info 
df_coefs_w_info = pd.merge(df_coefs.astype({'match_id':int}),
                            players_df[['Match ID', 'Playerid', 'start_pos_super', 'Team']],
                            left_on=['match_id', 'player_id'],
                            right_on=['Match ID', 'Playerid'],
                            how='left').drop(['Match ID', 'Playerid'], axis=1)

# Distribution of coefficients

In [94]:
# get players who play the most, filter out player with few games
n_games = df_coefs['player_id'].value_counts()
most_play_players = n_games[n_games > 3]

In [95]:
# distribution per player
coefs_per_player = df_coefs_w_info.set_index('player_id')\
                                  .loc[most_play_players.index]\
                                  .reset_index()

### B1

# Integrate player features to model
### Get player features

In [33]:
first_last.head(10)

Unnamed: 0,first_name,last_name
0,R.,Skov
1,A.,Skov Olsen
2,V.,Fischer
3,P.,Onuachu
4,D.,Vavro
5,M.,Duelund
6,B.,Sanneh
7,Evander,
8,H.,Mukhtar
9,P.,Ankersen


In [4]:
# read  player data
danish_player_data = pd.read_csv('../DanishSuperLiga_1819_PlayerData.csv')

# get player id to name mapping
name_and_id = players_df[['Playerid', 'Player name', 'first_name', 'last_name']].drop_duplicates()

# split first and last name
first_last = danish_player_data['Player'].str.split(expand=True, n=1)
first_last.columns = ['first_name', 'last_name']
danish_player_data[['first_name', 'last_name']] = first_last

NameError: name 'players_df' is not defined

In [98]:
# adjust to give last name = first name if not first name
last_name_nulls_idx = danish_player_data[danish_player_data['last_name'].isnull()].index
last_nulls = danish_player_data.loc[last_name_nulls_idx]
danish_player_data.loc[last_name_nulls_idx, 'last_name'] = danish_player_data.loc[
    last_name_nulls_idx, 'first_name'
]

In [99]:
danish_player_data.loc[ danish_player_data[danish_player_data['last_name'].isnull()].index]

Unnamed: 0,Player,Team,Position,Age,Market value,Contract expires,Matches played,Minutes played,Goals,xG,...,xA,Birth country,Passport country,Foot,Height,Weight,On loan,first_name,last_name,player_id


In [100]:
# get player id in the player data
for i, first, last, name in danish_player_data[['first_name', 'last_name', 'Player']].reset_index().values:
    # last name
    if last:
        last_matches = name_and_id['Player name'].str.contains(last)
    
    # first name
    if last_matches.any():
        first_matches = name_and_id[last_matches]['first_name'].str.slice(stop=1).str.contains(first[0])
        if len(first_matches[first_matches].index) == 1:
            p_id = name_and_id.loc[first_matches[first_matches].index]['Playerid']
            
            # set
            danish_player_data.loc[i, 'player_id'] = p_id.values[0]
        else:
            print(len(first_matches[first_matches].index))
            

0
3
2
2
3
3
2
2
2
2
2
3
2
2
0
0
3
3
0
0
0
0
2
0
0
0
2
0
3
0
0
2
0
3
0
0
0
0
0


In [64]:
# write clean data
danish_player_data.to_csv('../DanishSuperLiga_1819_PlayerData_processed.csv', index=0)

### Read clean player data set

In [65]:
danish_player_data = pd.read_csv('../DanishSuperLiga_1819_PlayerData_processed.csv')

## Build dataset with player info and EX

In [101]:
# mean EX per player
EX_mean_and_info = pd.merge(
    df_coefs_w_info.groupby('player_id')[['b1', 'b2']].mean().reset_index(),
    danish_player_data,
    on='player_id'
)

In [131]:
EX_mean_and_info[['b1', 'b2']].isnull().count()

b1    121
b2    121
dtype: int64

In [144]:
B.loc[[98, 3]]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


Unnamed: 0,player_id,match_id,b1,b2,sigma_b1,sigma_b2,start_pos_super,Team
98,95194.0,984554.0,13.657691,-1.6e-05,0.001531,1.32554e-08,F,Away


# Regression
One regression per position

In [117]:
from sklearn.linear_model import LinearRegression

In [104]:
# we keep players who played more than 5 games (coefs_per_player)
positions = coefs_per_player['start_pos_super'].unique()

In [194]:
# one model per position
for pos in positions:
    print(pos)
    if pos == 'CB':
        # NEED TO FIGURE CENTER BACKS
        continue
    
    # processing
    features = ['Age', 'Weight']
    X = EX_mean_and_info.set_index(['player_id']).loc[B['player_id'].unique()].dropna()[features]
    Y = coefs_per_player[coefs_per_player['start_pos_super'] == pos][['player_id', 'match_id', 'b1', 'b2']]
    XY = pd.merge(Y, X,
        left_on='player_id', right_on='player_id',
        how='left')
    
    if pos == 'M':
        test1, test2 = X, Y
    
    X = XY[['b1', 'b2']]
    y = XY[['Age', 'Weight']]
    
    
    
    
    # regression
    reg = LinearRegression()
    reg.fit(X, y)
    
    
    
    

GK
CB
M


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [193]:
test1

Unnamed: 0_level_0,Age,Weight
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1
182413,26,86
