In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import joblib
import pickle
from mplsoccer.pitch import Pitch
from ipynb.fs.defs.PassesData import get_df_passes
from ipynb.fs.defs.ExpectedGoalsData import get_df_shots
# Render frames without truncating columns or rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)
# Disable pandas' chained-assignment check for the whole notebook.
pd.options.mode.chained_assignment = None

In [2]:
# Apply the 'fivethirtyeight' style, then override title, axis-label and tick-label sizes.
plt.style.use('fivethirtyeight')
params = {'axes.titlesize': 'x-large'}
params.update(dict.fromkeys(['axes.labelsize', 'xtick.labelsize', 'ytick.labelsize'], 'large'))
plt.rcParams.update(params)

In [3]:
# Match and player metadata are read from Excel files in extracted_data/.
matches, players = (
    pd.read_excel(excel_path)
    for excel_path in ('extracted_data/matches.xlsx', 'extracted_data/players.xlsx')
)

In [4]:
def get_player_name(player_id):
    """Return the display name of a player from the global ``players`` frame.

    The nickname is preferred; the full ``player_name`` is used when the
    nickname is missing. An ``IndexError`` is raised when no row of
    ``players`` has the given ``player_id``.
    """
    matching_rows = players[players['player_id'] == player_id]
    nickname = matching_rows['player_nickname'].iloc[0]
    if pd.notna(nickname):
        return nickname
    return matching_rows['player_name'].iloc[0]

In [5]:
def get_competition_data(comp_matches):
    """Load the events of all given matches and attach model predictions.

    Parameters
    ----------
    comp_matches : pandas.DataFrame
        Needs a ``match_id`` column. For every unique id the file
        ``data/events/<match_id>.json`` is read.

    Returns
    -------
    tuple
        ``(events, comp_passes, comp_shots)``:

        * ``events`` - only the pass and shot events, reduced to the columns
          ``id``, ``match_id``, ``possession``, ``pass_event`` and ``shot_event``.
        * ``comp_passes`` - result of ``get_df_passes`` plus ``pred_accuracy``
          (second column of the pass model's ``predict_proba``) and ``id``.
        * ``comp_shots`` - result of ``get_df_shots`` plus ``pred_xg``
          (second column of the xG model's ``predict_proba``) and ``id``.
    """
    # get all events from competition matches
    events_folder = 'data/events'
    match_frames = []
    for match_id in comp_matches['match_id'].unique():
        events_path = os.path.join(events_folder, str(match_id) + '.json')
        # The context manager closes each file right after it has been parsed.
        with open(events_path, encoding='utf8') as events_file:
            match_frames.append(pd.DataFrame(json.load(events_file)).assign(match_id=match_id))
    # reset_index() keeps the per-match row number as an 'index' column.
    events = pd.concat(match_frames).reset_index()

    # NOTE: joblib.load / pickle.load can execute arbitrary code while loading;
    # only use model and feature files from a trusted source.

    # add predictions for the pass accuracy
    comp_passes = get_df_passes(events)
    passes_model = joblib.load('models/pass_accuracy_model.pkl')
    with open("models/passes_selected_features.txt", "rb") as fp:
        passes_selected_features = pickle.load(fp)
    comp_passes.loc[:, 'pred_accuracy'] = passes_model.predict_proba(comp_passes[passes_selected_features])[:, 1]

    # add expected goals predictions
    comp_shots = get_df_shots(events)
    xg_model = joblib.load('models/expected_goals_model.pkl')
    with open("models/xg_selected_features.txt", "rb") as fp:
        xg_selected_features = pickle.load(fp)
    comp_shots.loc[:, 'pred_xg'] = xg_model.predict_proba(comp_shots[xg_selected_features])[:, 1]

    # flag pass events (type id 30) and shot events (type id 16)
    type_ids = [event_type['id'] for event_type in events['type']]
    events['pass_event'] = [1 if type_id == 30 else 0 for type_id in type_ids]
    events['shot_event'] = [1 if type_id == 16 else 0 for type_id in type_ids]

    # add the event ids to the passes and shots
    # NOTE(review): the assignment aligns on index labels 0..n-1, so it assumes
    # comp_passes / comp_shots carry a default integer index with one row per
    # pass / shot event in event order - confirm against get_df_passes / get_df_shots.
    comp_passes.loc[:, 'id'] = events.loc[events['pass_event'] == 1, 'id'].reset_index(drop=True)
    comp_shots.loc[:, 'id'] = events.loc[events['shot_event'] == 1, 'id'].reset_index(drop=True)

    is_pass_or_shot = (events['pass_event'] == 1) | (events['shot_event'] == 1)
    events = events.loc[is_pass_or_shot, ['id', 'match_id', 'possession', 'pass_event', 'shot_event']]
    return events, comp_passes, comp_shots

In [6]:
def calculate_players_playtime(comp_matches):
    """Compute the minutes played per player over all given matches.

    Parameters
    ----------
    comp_matches : pandas.DataFrame
        Needs a ``match_id`` column. For every unique id the file
        ``data/events/<match_id>.json`` is read.

    Returns
    -------
    dict
        Maps ``player_id`` to the summed minutes played.

    Notes
    -----
    The length of a match is the largest ``minute`` among its events. Every
    player in the first two ``tactics`` events of a match is credited with the
    full match length. Each substitution credits the replacement with the
    remaining minutes and deducts them from the replaced player. No other
    event type is taken into account. A ``KeyError`` is raised if a replaced
    player has no playtime entry yet.
    """
    # get all events from competition matches
    events_folder = 'data/events'
    match_ids = comp_matches['match_id'].unique()
    match_frames = []
    for match_id in match_ids:
        events_path = os.path.join(events_folder, str(match_id) + '.json')
        # The context manager closes each file right after it has been parsed.
        with open(events_path, encoding='utf8') as events_file:
            match_frames.append(pd.DataFrame(json.load(events_file)).assign(match_id=match_id))
    events = pd.concat(match_frames).reset_index()

    # calculate the playtime for each player
    players_playtime = {}
    for match_id in match_ids:
        match_events = events[events['match_id'] == match_id]
        total_minutes = match_events['minute'].max()

        # The first two tactics events of a match are used as the starting line-ups.
        starting_tactics = match_events.loc[match_events['tactics'].notna(), 'tactics'].iloc[:2]
        # Series.items() replaces iteritems(), which was removed in pandas 2.0.
        for _, tactic in starting_tactics.items():
            for lineup_entry in tactic['lineup']:
                starter_id = lineup_entry['player']['id']
                players_playtime[starter_id] = players_playtime.get(starter_id, 0) + total_minutes

        substitutions = match_events.loc[match_events['substitution'].notna(), ['substitution', 'player', 'minute']]
        for _, sub in substitutions.iterrows():
            on_player_id = sub['substitution']['replacement']['id']
            off_player_id = sub['player']['id']
            remaining_minutes = total_minutes - sub['minute']
            players_playtime[on_player_id] = players_playtime.get(on_player_id, 0) + remaining_minutes
            # Deliberately no default: a replaced player must already have an entry.
            players_playtime[off_player_id] -= remaining_minutes
    return players_playtime

In [7]:
def calculate_xg_chain(comp_events, comp_passes, comp_shots):
    """Add the xG value of a shot to every pass of the possession that led to it.

    Parameters
    ----------
    comp_events : pandas.DataFrame
        Pass and shot events with the columns ``id``, ``match_id``,
        ``possession``, ``pass_event`` and ``shot_event``.
    comp_passes : pandas.DataFrame
        Passes with an ``id`` column. Modified in place: the column
        ``xg_chain`` is (re)created.
    comp_shots : pandas.DataFrame
        Shots with the columns ``id`` and ``pred_xg``.

    Returns
    -------
    pandas.DataFrame
        The same ``comp_passes`` object. ``xg_chain`` holds the ``pred_xg`` of
        the first shot of the pass's possession, or 0.0 if the possession
        contains no shot.
    """
    # Float from the start: the column later receives float xG values.
    comp_passes['xg_chain'] = 0.0
    # One group per (match, possession) instead of re-filtering the frame per possession.
    for _, chain_events in comp_events.groupby(['match_id', 'possession'], sort=False):
        shot_ids = chain_events.loc[chain_events['shot_event'] == 1, 'id']
        if len(shot_ids) == 0:
            continue
        # Only the first shot of a possession determines the chain value.
        shot_xg = comp_shots.loc[comp_shots['id'] == shot_ids.iloc[0], 'pred_xg'].iloc[0]
        pass_chain_ids = chain_events.loc[chain_events['pass_event'] == 1, 'id']
        comp_passes.loc[comp_passes['id'].isin(pass_chain_ids), 'xg_chain'] = shot_xg
    return comp_passes

In [8]:
def get_xg_chain(comp_passes, comp_shots, players_playtime):
    """Build one summary row of xG-chain values per player.

    Parameters
    ----------
    comp_passes : pandas.DataFrame
        Passes with the columns ``player_id``, ``match_id``, ``xg_chain``,
        ``shot_assist``, ``goal_assist`` and ``position``.
    comp_shots : pandas.DataFrame
        Shots with the columns ``player_id`` and ``pred_xg``.
    players_playtime : dict
        Maps ``player_id`` to minutes played; a ``KeyError`` is raised for a
        passing player without an entry.

    Returns
    -------
    pandas.DataFrame
        One row per player that appears in ``comp_passes``, in order of first
        appearance, with the columns ``player_id``, ``Position``, ``Spieler``,
        ``Anzahl Spiele``, ``Spielminuten``, ``xG-Chain``, ``xG-Chain/90``,
        ``pre-xG-Chain`` and ``pre-xG-Chain/90``.

    Notes
    -----
    Chain values are summed over the *unique* ``xg_chain`` values of a player's
    passes, so several passes in one chain count once; the player's own shots
    are added on top. ``pre-xG-Chain`` only uses passes that are neither a shot
    assist nor a goal assist.
    """
    columns = ['player_id', 'Position', 'Spieler', 'Anzahl Spiele', 'Spielminuten', 'xG-Chain', 'xG-Chain/90',
               'pre-xG-Chain', 'pre-xG-Chain/90']
    # Rows are collected and the frame is built once; DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for player_id, player_passes in comp_passes.groupby('player_id', sort=False):
        player_shots = comp_shots[comp_shots['player_id'] == player_id]
        # NOTE(review): the rendered outputs of this notebook show values such as 87
        # in 'Anzahl Spiele' for a player with 478 minutes - verify what
        # comp_passes['match_id'] actually contains.
        num_matches = player_passes['match_id'].nunique(dropna=False)
        num_minutes = players_playtime[player_id]
        xg_chain_sum = np.round(np.sum(player_passes['xg_chain'].unique()) + player_shots['pred_xg'].sum(), 2)
        is_buildup_pass = (player_passes['shot_assist'] == 0) & (player_passes['goal_assist'] == 0)
        pre_xg_chain_sum = np.round(np.sum(player_passes.loc[is_buildup_pass, 'xg_chain'].unique()), 2)
        # The position of the player's first listed pass decides the position group.
        pos = player_passes['position'].iloc[0]
        position = 'Torwart' if pos == 1 else 'Verteidiger' if pos <= 8 else 'Mittelfeldspieler' if pos <= 21 else 'Stürmer'
        records.append({
            'player_id': player_id,
            'Position': position,
            'Spieler': get_player_name(player_id),
            'Anzahl Spiele': num_matches,
            'Spielminuten': num_minutes,
            'xG-Chain': xg_chain_sum,
            'xG-Chain/90': np.round(90 * xg_chain_sum / num_minutes, 2),
            'pre-xG-Chain': pre_xg_chain_sum,
            'pre-xG-Chain/90': np.round(90 * pre_xg_chain_sum / num_minutes, 2),
        })
    return pd.DataFrame(records, columns=columns)

In [9]:
def calculate_pass_effectiveness(the_pass, comp_events):
    """Rate the effectiveness of a single pass on a scale from -2 to 2.

    * -2: incomplete pass after which the next possession of the same match
      contains at most four passes and at least one shot
    * -1: any other incomplete pass
    *  0: neutral completed pass
    *  1: completed pass that gains space (see the conditions below), or a
      switch, cross or cut-back
    *  2: completed pass that is a shot assist or a goal assist

    ``the_pass`` is one pass record (indexable by column name) and
    ``comp_events`` the frame of pass and shot events it is looked up in.
    """
    if the_pass['complete'] == 0:
        # Find the pass among the events to learn its match and possession number.
        event = comp_events[comp_events['id'] == the_pass['id']].iloc[0]
        in_same_match = comp_events['match_id'] == event['match_id']
        in_next_possession = comp_events['possession'] == event['possession'] + 1
        next_possession_chain = comp_events[in_same_match & in_next_possession]
        led_to_quick_shot = (next_possession_chain['pass_event'].sum() <= 4) & (next_possession_chain['shot_event'].sum() > 0)
        return -2 if led_to_quick_shot else -1

    created_shot = (the_pass['shot_assist'] == 1) | (the_pass['goal_assist'] == 1)
    if created_shot:
        return 2

    start_x = the_pass['location_x']
    end_x = the_pass['end_location_x']
    # Forward pass starting beyond x = 60 (presumably the halfway line - confirm pitch coordinates).
    forward_from_advanced_position = (start_x > 60) & (end_x > start_x)
    # Pass that moves the ball more than 10 units forward, wherever it starts.
    gains_more_than_ten = end_x - 10 > start_x
    special_pass_type = (the_pass['switch'] == 1) | (the_pass['cross'] == 1) | (the_pass['cut_back'] == 1)
    if forward_from_advanced_position | gains_more_than_ten | special_pass_type:
        return 1
    return 0

In [10]:
def calculate_pass_effectivenesses(comp_passes, comp_events):
    """Add the column ``pass_effectiveness`` to ``comp_passes`` and return it.

    Every pass row is rated with ``calculate_pass_effectiveness``. The frame
    is modified in place and the same object is returned.
    """
    ratings = []
    for _, pass_row in comp_passes.iterrows():
        ratings.append(calculate_pass_effectiveness(pass_row, comp_events))
    comp_passes.loc[:, 'pass_effectiveness'] = ratings
    return comp_passes

In [11]:
def calculate_pass_scores(comp_passes):
    """Add the column ``pass_score`` combining effectiveness and difficulty.

    Parameters
    ----------
    comp_passes : pandas.DataFrame
        Passes with the columns ``pass_effectiveness`` and ``pred_accuracy``.
        Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``comp_passes`` object with ``pass_score`` set to
        ``effectiveness * (1 - pred_accuracy)`` for non-negative effectiveness
        and to ``effectiveness * pred_accuracy`` for negative effectiveness.
    """
    effectiveness = comp_passes['pass_effectiveness']
    pred_accuracy = comp_passes['pred_accuracy']
    # Vectorised replacement for a per-row Python loop over the whole frame.
    comp_passes['pass_score'] = np.where(
        effectiveness >= 0,
        effectiveness * (1 - pred_accuracy),  # good passes score higher the less likely they were to succeed
        effectiveness * pred_accuracy,        # bad passes are penalised more the more likely they were to succeed
    )
    return comp_passes

In [12]:
def get_pass_scores(comp_passes, players_playtime):
    """Build one summary row of pass effectiveness and pass scores per player.

    Parameters
    ----------
    comp_passes : pandas.DataFrame
        Passes with the columns ``player_id``, ``match_id``,
        ``pass_effectiveness``, ``pass_score`` and ``position``.
    players_playtime : dict
        Maps ``player_id`` to minutes played; a ``KeyError`` is raised for a
        passing player without an entry.

    Returns
    -------
    pandas.DataFrame
        One row per player that appears in ``comp_passes``, in order of first
        appearance. ``Passeffekt`` and ``Passwert`` are the sums of
        ``pass_effectiveness`` and ``pass_score``; the ``/90`` columns scale
        them to 90 minutes; ``Pässe`` is the number of passes; the columns
        ``-2`` to ``2`` count the passes per effectiveness rating.
    """
    columns = ['player_id', 'Position', 'Spieler', 'Anzahl Spiele', 'Spielminuten', 'Passeffekt',
               'Passeffekt/90', 'Pässe', '-2', '-1', '0', '1', '2', 'Passwert', 'Passwert/90']
    # Rows are collected and the frame is built once; DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for player_id, player_passes in comp_passes.groupby('player_id', sort=False):
        # NOTE(review): the rendered outputs of this notebook show values such as 107
        # in 'Anzahl Spiele' for a player with 409 minutes - verify what
        # comp_passes['match_id'] actually contains.
        num_matches = player_passes['match_id'].nunique(dropna=False)
        num_minutes = players_playtime[player_id]
        effectiveness = player_passes['pass_effectiveness']
        pass_effect_sum = effectiveness.sum()
        pass_score_sum = np.round(player_passes['pass_score'].sum(), 2)
        # The position of the player's first listed pass decides the position group.
        pos = player_passes['position'].iloc[0]
        position = 'Torwart' if pos == 1 else 'Verteidiger' if pos <= 8 else 'Mittelfeldspieler' if pos <= 21 else 'Stürmer'
        record = {
            'player_id': player_id,
            'Position': position,
            'Spieler': get_player_name(player_id),
            'Anzahl Spiele': num_matches,
            'Spielminuten': num_minutes,
            'Passeffekt': pass_effect_sum,
            'Passeffekt/90': np.round(90 * pass_effect_sum / num_minutes, 2),
            'Pässe': len(player_passes),
            'Passwert': pass_score_sum,
            'Passwert/90': np.round(90 * pass_score_sum / num_minutes, 2),
        }
        # One count column per effectiveness rating.
        for rating in (-2, -1, 0, 1, 2):
            record[str(rating)] = int((effectiveness == rating).sum())
        records.append(record)
    return pd.DataFrame(records, columns=columns)

Calculation of Expected Goals Chain Values and Pass Scores for the World Cup 2018

In [13]:
# Select the World Cup 2018 matches (competition id 43) and load their
# pass/shot events together with the model predictions.
comp_id = 43
is_selected_competition = matches['competition_id'] == comp_id
comp_matches = matches[is_selected_competition]
wc_events, wc_passes, wc_shots = get_competition_data(comp_matches)



In [14]:
# Minutes played per player_id over all selected matches (re-reads the event files).
wc_players_playtime = calculate_players_playtime(comp_matches)

In [15]:
# Adds the 'xg_chain' column to the passes (the frame is modified in place and returned).
wc_passes = calculate_xg_chain(wc_events, wc_passes, wc_shots)

In [16]:
# One summary row of xG-chain values per player.
df_xg_chain = get_xg_chain(wc_passes, wc_shots, wc_players_playtime)

In [17]:
# Top 20 players with at least 200 minutes played, ranked by xG-Chain per 90 minutes.
df_xg_chain[df_xg_chain['Spielminuten'] >= 200].sort_values('xG-Chain/90', ascending=False).head(20)

Unnamed: 0,player_id,Position,Spieler,Anzahl Spiele,Spielminuten,xG-Chain,xG-Chain/90,pre-xG-Chain,pre-xG-Chain/90
103,4320,Mittelfeldspieler,Neymar,87,478,12.42,2.34,5.75,1.08
132,5574,Mittelfeldspieler,Toni Kroos,95,289,6.14,1.91,5.07,1.58
224,3196,Stürmer,Wahbi Khazri,48,264,4.97,1.69,2.83,0.96
104,3501,Mittelfeldspieler,Philippe Coutinho,89,453,8.32,1.65,5.15,1.02
419,5216,Mittelfeldspieler,Andrés Iniesta,87,294,5.26,1.61,3.54,1.08
414,3064,Mittelfeldspieler,David Silva,79,328,5.77,1.58,3.96,1.09
138,5559,Mittelfeldspieler,Marco Reus,63,230,3.74,1.46,3.17,1.24
416,5198,Stürmer,Diego Costa,45,315,5.03,1.44,2.12,0.61
90,5539,Mittelfeldspieler,Casemiro,91,346,5.45,1.42,5.21,1.36
327,5630,Mittelfeldspieler,Dries Mertens,69,310,4.79,1.39,2.83,0.82


In [18]:
# Adds the 'pass_effectiveness' rating (-2 to 2) to every pass.
wc_passes = calculate_pass_effectivenesses(wc_passes, wc_events)

In [19]:
# Adds the 'pass_score' column; requires 'pass_effectiveness' and 'pred_accuracy'.
wc_passes = calculate_pass_scores(wc_passes)

In [20]:
# One summary row of pass effectiveness and pass scores per player.
df_pass_scores = get_pass_scores(wc_passes, wc_players_playtime)

In [21]:
# Top 10 goalkeepers ('Torwart') with at least 200 minutes, ranked by pass score per 90 minutes.
df_pass_scores[(df_pass_scores['Spielminuten'] >= 200) & (df_pass_scores['Position'] == 'Torwart')].sort_values('Passwert/90', ascending=False).head(10)

Unnamed: 0,player_id,Position,Spieler,Anzahl Spiele,Spielminuten,Passeffekt,Passeffekt/90,Pässe,-2,-1,0,1,2,Passwert,Passwert/90
177,5601,Torwart,Vladimir Stojković,51,285,5,1.58,73,1,25,15,32,0,9.55,3.02
89,5597,Torwart,Keylor Navas,54,290,14,4.34,73,1,25,6,41,0,9.45,2.93
15,3815,Torwart,Kasper Schmeichel,70,409,-4,-0.88,122,7,48,9,58,0,13.2,2.9
163,3240,Torwart,Mat Ryan,56,282,46,14.68,111,0,16,33,62,0,8.14,2.6
255,3099,Torwart,Hugo Lloris,78,569,19,3.01,138,3,50,10,75,0,13.02,2.06
276,5172,Torwart,Igor Akinfeev,73,535,-24,-4.04,114,2,64,4,44,0,11.8,1.99
544,3468,Torwart,Jordan Pickford,97,720,22,2.75,226,5,65,59,97,0,14.18,1.77
329,3175,Torwart,Eiji Kawashima,59,377,8,1.91,89,2,30,15,42,0,6.42,1.53
105,5547,Torwart,Alisson,52,478,13,2.45,67,1,11,29,26,0,5.41,1.02
269,5267,Torwart,Fernando Muslera,60,473,14,2.66,106,2,33,20,51,0,5.12,0.97


In [22]:
# Top 10 defenders ('Verteidiger') with at least 200 minutes, ranked by pass score per 90 minutes.
df_pass_scores[(df_pass_scores['Spielminuten'] >= 200) & (df_pass_scores['Position'] == 'Verteidiger')].sort_values('Passwert/90', ascending=False).head(10)

Unnamed: 0,player_id,Position,Spieler,Anzahl Spiele,Spielminuten,Passeffekt,Passeffekt/90,Pässe,-2,-1,0,1,2,Passwert,Passwert/90
407,5201,Verteidiger,Sergio Ramos,107,409,272,59.85,496,0,27,173,293,3,30.88,6.8
9,5534,Verteidiger,Simon Kjær,86,409,45,9.9,221,1,29,117,72,2,14.03,3.09
420,5721,Verteidiger,Daniel Carvajal,88,246,29,10.61,181,0,24,109,43,5,8.37,3.06
537,3336,Verteidiger,Harry Maguire,110,675,124,16.53,434,2,51,207,169,5,21.99,2.93
535,3205,Verteidiger,Kyle Walker,98,506,96,17.08,385,1,38,211,134,1,15.15,2.69
158,5478,Verteidiger,Trent Sainsbury,79,282,57,18.19,191,4,18,88,79,2,8.25,2.63
119,5573,Verteidiger,Héctor Moreno,69,282,31,9.89,146,1,20,74,49,2,8.05,2.57
408,5213,Verteidiger,Gerard Piqué,105,409,125,27.51,354,1,30,168,153,2,11.61,2.55
157,5488,Verteidiger,Mark Milligan,78,282,79,25.21,249,1,20,128,99,1,7.56,2.41
94,5552,Verteidiger,Marcelo,91,298,56,16.91,291,10,43,127,103,8,7.46,2.25


In [23]:
# Top 20 midfielders ('Mittelfeldspieler') with at least 200 minutes, ranked by pass score per 90 minutes.
df_pass_scores[(df_pass_scores['Spielminuten'] >= 200) & (df_pass_scores['Position'] == 'Mittelfeldspieler')].sort_values('Passwert/90', ascending=False).head(20)

Unnamed: 0,player_id,Position,Spieler,Anzahl Spiele,Spielminuten,Passeffekt,Passeffekt/90,Pässe,-2,-1,0,1,2,Passwert,Passwert/90
132,5574,Mittelfeldspieler,Toni Kroos,95,289,160,49.83,323,3,28,105,180,7,24.15,7.52
406,5199,Mittelfeldspieler,Koke,98,243,79,29.26,270,0,21,152,94,3,16.05,5.94
411,4926,Mittelfeldspieler,Isco,110,409,115,25.31,475,4,57,244,160,10,16.89,3.72
410,5203,Mittelfeldspieler,Sergio Busquets,99,409,95,20.9,312,0,25,170,114,3,16.38,3.6
22,5520,Mittelfeldspieler,Lasse Schöne,63,236,24,9.15,122,2,16,60,44,0,8.58,3.27
419,5216,Mittelfeldspieler,Andrés Iniesta,87,294,78,23.88,285,7,30,133,108,7,10.36,3.17
103,4320,Mittelfeldspieler,Neymar,87,478,62,11.67,283,7,53,118,81,24,15.39,2.9
104,3501,Mittelfeldspieler,Philippe Coutinho,89,453,110,21.85,325,0,36,156,120,13,14.08,2.8
358,5504,Mittelfeldspieler,Éver Banega,74,231,65,25.32,211,4,27,85,90,5,6.77,2.64
580,10956,Mittelfeldspieler,Eric Dier,86,276,36,11.74,194,0,22,114,58,0,7.41,2.42


In [24]:
# Top 20 midfielders with at least 200 minutes, ranked by summed pass effectiveness per 90 minutes
# (score columns left out).
df_pass_scores.loc[(df_pass_scores['Spielminuten'] >= 200) & (df_pass_scores['Position'] == 'Mittelfeldspieler'), ['Spieler', 'Anzahl Spiele', 'Spielminuten', 'Passeffekt', 'Passeffekt/90', 'Pässe', '-2', '-1', '0', '1', '2']].sort_values('Passeffekt/90', ascending=False).head(20)

Unnamed: 0,Spieler,Anzahl Spiele,Spielminuten,Passeffekt,Passeffekt/90,Pässe,-2,-1,0,1,2
132,Toni Kroos,95,289,160,49.83,323,3,28,105,180,7
406,Koke,98,243,79,29.26,270,0,21,152,94,3
346,Javier Mascherano,93,378,116,27.62,348,4,33,157,151,3
432,Abdullah Otayf,80,252,77,27.5,257,1,23,133,98,2
358,Éver Banega,74,231,65,25.32,211,4,27,85,90,5
411,Isco,110,409,115,25.31,475,4,57,244,160,10
419,Andrés Iniesta,87,294,78,23.88,285,7,30,133,108,7
232,Granit Xhaka,89,382,101,23.8,346,2,34,174,133,3
104,Philippe Coutinho,89,453,110,21.85,325,0,36,156,120,13
410,Sergio Busquets,99,409,95,20.9,312,0,25,170,114,3


In [25]:
# Top 10 forwards ('Stürmer') with at least 200 minutes, ranked by pass score per 90 minutes.
df_pass_scores[(df_pass_scores['Spielminuten'] >= 200) & (df_pass_scores['Position'] == 'Stürmer')].sort_values('Passwert/90', ascending=False).head(10)

Unnamed: 0,player_id,Position,Spieler,Anzahl Spiele,Spielminuten,Passeffekt,Passeffekt/90,Pässe,-2,-1,0,1,2,Passwert,Passwert/90
447,3083,Stürmer,Son Heung-Min,50,288,19,5.94,79,2,13,35,22,7,3.1,0.97
279,4319,Stürmer,Edinson Cavani,54,355,15,3.8,83,1,14,40,25,3,2.75,0.7
416,5198,Stürmer,Diego Costa,45,315,10,2.86,64,0,11,36,13,4,-1.33,-0.38
543,3233,Stürmer,Raheem Sterling,65,456,16,3.16,135,4,18,76,32,5,-2.5,-0.49
495,5207,Stürmer,Cristiano Ronaldo,63,381,19,4.49,113,1,21,53,34,4,-2.89,-0.68
58,5668,Stürmer,Robert Lewandowski,52,282,8,2.55,73,1,12,41,16,3,-2.3,-0.73
542,10955,Stürmer,Harry Kane,75,595,-1,-0.15,148,5,45,46,50,2,-4.97,-0.75
127,3058,Stürmer,Javier Hernández,49,341,15,3.96,67,1,16,23,21,6,-3.02,-0.8
47,5473,Stürmer,Ahmed Musa,39,220,-6,-2.45,62,3,16,31,8,4,-2.02,-0.83
545,3318,Stürmer,Marcus Rashford,56,239,0,0.0,89,1,20,48,18,2,-2.23,-0.84
