In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import FCPython 
# import statsmodels.api as sm
# import statsmodels.formula.api as smf
import sys
# import seaborn as sns
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc,recall_score,precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from tqdm import tqdm
# from sklearn.preprocessing import PolynomialFeatures

In [2]:
with open('/home/karim/staff/football_analytics/Wyscout/Wyscout/events/events_England.json') as f:
    data = json.load(f)    
raw = pd.DataFrame(data)

In [3]:
passes = raw[raw['eventName']=='Pass']
passes = passes.sample(n = 50000, random_state = 42)

In [4]:
cols = ['Acc_pass', 'X_start', 'Y_start', 'C_start', 
        'X_end', 'Y_end', 'C_end', 'Distance_to_keep_start', 
        'Distance_to_keep_end', 'Distance_pass', 'match_period', "speed", "X_start_squared", "Y_start_squared", "C_start_squared", "X_end_squared", "Y_end_squared", "C_end_squared" ,"playerId"]
pass_model = pd.DataFrame(columns=cols)    

In [5]:
# Created only for exploration purposes.
passes_temp = passes.head(50000) # 50000
passes_temp['positions']

25374     [{'y': 58, 'x': 68}, {'y': 33, 'x': 73}]
490415    [{'y': 92, 'x': 90}, {'y': 39, 'x': 90}]
142901    [{'y': 97, 'x': 35}, {'y': 75, 'x': 75}]
336240    [{'y': 13, 'x': 32}, {'y': 32, 'x': 70}]
320543    [{'y': 39, 'x': 13}, {'y': 90, 'x': 19}]
                            ...                   
238635    [{'y': 18, 'x': 94}, {'y': 69, 'x': 96}]
467433    [{'y': 31, 'x': 29}, {'y': 30, 'x': 25}]
220990    [{'y': 60, 'x': 12}, {'y': 71, 'x': 64}]
328105    [{'y': 81, 'x': 26}, {'y': 39, 'x': 33}]
554348    [{'y': 45, 'x': 45}, {'y': 75, 'x': 17}]
Name: positions, Length: 50000, dtype: object

In [6]:
# Creating & preparing the dataset
for i,pass_ in tqdm(passes.iterrows()):
    
    # Pass start location
    pass_model.at[i,'X_start']=100-pass_['positions'][0]['x']
    pass_model.at[i,'Y_start']=pass_['positions'][0]['y']
    pass_model.at[i,'C_start']=abs(pass_['positions'][0]['y']-50)
    
    #Pass end location
    pass_model.at[i,'X_end']=100-pass_['positions'][1]['x']
    pass_model.at[i,'Y_end']=pass_['positions'][1]['y']
    pass_model.at[i,'C_end']=abs(pass_['positions'][1]['y']-50)
    pass_model.at[i, "player_id"] = pass_["playerId"]
    
    ## squared 
    pass_model.at[i,'X_start_squared']=(100-pass_['positions'][0]['x'])**2
    pass_model.at[i,'Y_start_squared']=(pass_['positions'][0]['y'])**2
    pass_model.at[i,'C_start_squared']=(abs(pass_['positions'][0]['y']-50))**2

    pass_model.at[i,'X*Y_start_squared']=(100-pass_['positions'][0]['x']) * (pass_['positions'][0]['y'])
    
    #Pass end location
    pass_model.at[i,'X_end_squared']=(100-pass_['positions'][1]['x'])**2
    pass_model.at[i,'Y_end_squared']=(pass_['positions'][1]['y'])**2
    pass_model.at[i,'C_end_squared']=(abs(pass_['positions'][1]['y']-50))**2

    pass_model.at[i,'X*Y_end_squared']= (100-pass_['positions'][1]['x']) * (pass_['positions'][1]['y'])
    ## end squared
    
    #Distances (from the start location of the pass to the keep)
    x_start=pass_model.at[i,'X_start']*105/100
    y_start=pass_model.at[i,'C_start']*65/100
    pass_model.at[i,'Distance_to_keep_start']=np.sqrt(x_start**2 + y_start**2)
    
    #Distances (from the end location of the pass to the keep)
    x_end=pass_model.at[i,'X_end']*105/100
    y_end=pass_model.at[i,'C_end']*65/100
    pass_model.at[i,'Distance_to_keep_end']=np.sqrt(x_end**2 + y_end**2)
    
    #Pass distance
    pass_model.at[i,'Distance_pass'] = np.sqrt(abs(x_start-x_end)**2 + abs(y_start-y_end)**2)
    
    pass_model.at[i, "timeelapsed"] = pass_["eventSec"]
    pass_model.at[i, "match_period"] = pass_["matchPeriod"]
        
    #Accurate passes   
    pass_model.at[i,'Acc_pass']=0
    for passtags in pass_['tags']:
        if passtags['id']==1801:
            pass_model.at[i,'Acc_pass']=1
    sys.stdout.write('.'); sys.stdout.flush(); #Just for visual check while the code is runnung, whether the loop works or not.

# I added to more columns, which may come in handy while modelling. 
pass_model['dX'] = pass_model['X_start'] - pass_model['X_end']
pass_model['d_Distance'] = pass_model['Distance_to_keep_start'] - pass_model['Distance_to_keep_end']
pass_model["match_period"] = pass_model["match_period"].astype("category").cat.codes
pass_model = pass_model.astype(float)

pass_model['speed'] = pass_model["dX"] / pass_model["timeelapsed"]
# Turn them into floats as correlations cannot be plotted while all columns are objects.
# Pass types are added from raw data.
subEventName = passes['subEventName']
pass_model = pass_model.join(subEventName)
pass_model["subEventName"] = pass_model["subEventName"].astype("category").cat.codes

50000it [16:10, 51.53it/s]


In [7]:
#condition: last third passes
pass_model = pass_model[pass_model['Distance_to_keep_end'] < 33]

In [8]:
#preparing data for training
pass_model2train = pass_model.drop(["speed", "timeelapsed"], axis=1)
x_axis_dataframe = pass_model2train.drop(["Acc_pass", "player_id", "playerId"], axis=1)
y_axis_dataframe = pass_model2train["Acc_pass"]
# to list in order preparing the input for sklearn
x_axis_features = x_axis_dataframe.values.tolist() 
#normalisation
scaler = preprocessing.MinMaxScaler()
scaler = scaler.fit(x_axis_features)
x_axis_features = scaler.transform(x_axis_features)
# to list in order preparing the input for sklearn
y_axis_labels = y_axis_dataframe.values.tolist()

In [9]:
#linear regression training
X_train, X_test, y_train, y_test = train_test_split(x_axis_features, y_axis_labels, test_size=0.33)
clf = LogisticRegression(random_state=0, max_iter=10000, tol=0.000001, verbose=1).fit(X_train, y_train) 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished


In [10]:
y_predictions = clf.predict(X_test)
y_prediction_labels = list(zip(y_predictions, y_test))
pred_correctness = [x[0] == x[1] for x in y_prediction_labels]
accuracy_now = pred_correctness.count(True) / len(pred_correctness)
# accuracy test blind
accuracy_now

0.7295965506621497

In [11]:
# testing on the training set to check if the model complexity needs to be increased
y_predictions_train = clf.predict(X_train)
y_prediction_labels_train = list(zip(y_predictions_train, y_train))
pred_correctness_train = [x[0] == x[1] for x in y_prediction_labels_train]
accuracy_now_train = pred_correctness_train.count(True) / len(pred_correctness_train)
accuracy_now_train

0.7191198786039453

In [12]:
## Checked train and test accuracy -> found out that the train accuracy is 85%, so there is room for increasing the model's complexity -> I hearby keep checking the accuracy on test to make sure we are not falling into the curse of dimensionality aka overfitting

# by normalizing the data 0-1 improved gradient descent and convergence

# all features polynomial degree 2

In [13]:
#extracting the coefficients
list_of_coeffs_INIT = clf.coef_
list_of_coeffs = [(x, y) for x,y in enumerate(list_of_coeffs_INIT[0])]
list_of_coeffs2 = [(x, y) for x,y in enumerate(list_of_coeffs_INIT[0])]
list_of_coeffs.sort(key= lambda x: x[1])
list_of_coeffs2.sort(key= lambda x: x[1], reverse=True)

In [14]:
# negative coeffs
list_of_coeffs[:10]

[(10, -2.1113943669763193),
 (18, -1.0313563674400996),
 (4, -0.8957791071797098),
 (0, -0.862928919217068),
 (16, -0.7665215501290408),
 (14, -0.6933695114340648),
 (12, -0.6790331082107489),
 (6, -0.6251963963401043),
 (13, -0.5243464784091069),
 (19, -0.5243343398628163)]

In [15]:
# positive coeffs
list_of_coeffs2[:10]

[(8, 3.0399294859649526),
 (17, 1.8377409655463393),
 (20, 1.737648671714509),
 (3, 1.2682358521550252),
 (15, 0.7716477596419659),
 (5, 0.7072606896257371),
 (2, 0.3884148913252003),
 (1, 0.1747061818909441),
 (11, 0.014445560673396174),
 (9, 0.013634710768439114)]

In [16]:
#corresponding features to those coeffs
x_axis_dataframe.columns

Index(['X_start', 'Y_start', 'C_start', 'X_end', 'Y_end', 'C_end',
       'Distance_to_keep_start', 'Distance_to_keep_end', 'Distance_pass',
       'match_period', 'X_start_squared', 'Y_start_squared', 'C_start_squared',
       'X_end_squared', 'Y_end_squared', 'C_end_squared', 'X*Y_start_squared',
       'X*Y_end_squared', 'dX', 'd_Distance', 'subEventName'],
      dtype='object')

In [17]:
## Retrieving list of best player, worst players in the sample taken as well as their score according to my model

In [18]:
pass_model_results = pass_model2train.drop(["playerId"], axis=1)
distinct_player_ids = pass_model2train["player_id"].unique()

In [19]:
#processing the IDs and their respective score in the model
player_ids_to_scores = {}
player_ids_to_scores_proba = {}
player_ids_to_no_passes = {}
player_ids_with_diffs = []
for player_id in tqdm(distinct_player_ids):
    temp_df = pass_model2train[pass_model2train["player_id"] == player_id]
    temp_df = temp_df.drop(["player_id", "Acc_pass", "playerId"], axis=1)
    temp_vals = temp_df.values.tolist()
    player_ids_to_no_passes[int(player_id)] = len(temp_vals)
    temp_vals_scaled = scaler.transform(temp_vals)
    y_predictions = clf.predict(temp_vals_scaled)
    y_predictions_average = np.mean(y_predictions)
    y_predictions_proba = clf.predict_proba(temp_vals_scaled)
    y_predictions_average_proba = np.mean([x[1] for x in y_predictions_proba])
    player_ids_to_scores[int(player_id)] = y_predictions_average
    player_ids_to_scores_proba[int(player_id)] = y_predictions_average_proba
    if (y_predictions_average_proba >= 0.5) != (y_predictions_average >= 0.5):
        player_ids_with_diffs.append(player_id)

100%|██████████| 467/467 [00:00<00:00, 494.03it/s]


In [20]:
#processing the mapping (id <--> player Name)
with open('/home/karim/staff/football_analytics/Wyscout/Wyscout/players.json') as f:
    playersdata = json.load(f)
player_name_to_id = {}
player_id_to_name = {}
all_player_data_w_id = {}
for data_entry in playersdata:
    player_name_to_id[data_entry["shortName"]] = data_entry["wyId"]
    all_player_data_w_id[data_entry["wyId"]] = data_entry
    player_id_to_name[data_entry["wyId"]] = data_entry["shortName"]

In [21]:
#Retrieve best and worst players and their model score
id_to_score_tuple_list = [(x, y) for x,y in player_ids_to_scores_proba.items()]
id_to_score_tuple_list2 = [(x, y) for x,y in player_ids_to_scores_proba.items()]

In [22]:
#Retrieve best 10 and worst 10 players and their model score
id_to_score_tuple_list.sort(key=lambda x:x[1], reverse=False)
id_to_score_tuple_list2.sort(key=lambda x:x[1], reverse=True)
number_wanted = 10
#id_to_score_tuple_list[:number_wanted] # top 10 worst players in passing (id, score)
#id_to_score_tuple_list2[:number_wanted] # top 10 best players in passing (id, score)

In [23]:
#Top 10 --> top 10 best players in passing (id, score)
top_players_info = [(rank+1, player_id_to_name[x[0]], x[0], x[1]) for rank, x in enumerate(id_to_score_tuple_list2[:number_wanted])] 
fbest = open('bestPlayers.txt', 'w')
for listing in top_players_info:
    print(listing)
    fbest.write("".join(str(listing)))
    fbest.write("\n")
fbest.close()

(1, 'P. Foden', 447205, 0.8687470937156003)
(2, 'Brahim Diaz', 404397, 0.8564162931262473)
(3, 'K. Stafylidis', 92947, 0.8367654000660716)
(4, 'C. Tosun', 32636, 0.8274985749171635)
(5, 'R. Barkley', 8246, 0.8215066313020184)
(6, 'P. Mertesacker', 7856, 0.8207282801013299)
(7, 'P. Jones', 7918, 0.8189403958911874)
(8, 'N. Wells', 9179, 0.8159358824826997)
(9, 'S. Kaikai', 297258, 0.8106025640136579)
(10, 'I. Afellay', 3343, 0.8104130367890183)


In [24]:
worst_players_info = [(rank+1, player_id_to_name[x[0]], x[0], x[1]) for rank, x in enumerate(id_to_score_tuple_list[:number_wanted])] # top 10 best players in passing (id, score)
#worst_players_names = []
fworst = open('worstPlayers.txt', 'w')
for listing in worst_players_info:
    print(listing)
    fworst.write("".join(str(listing)))
    fworst.write("\ n")
    #worst_players_names = str(listing) + '\n'
fworst.close()

(1, 'J. Hart', 8301, 0.19040813948480106)
(2, 'E. Jakupovic', 93084, 0.20408590945642777)
(3, 'F. Forster', 61941, 0.23487431376587295)
(4, 'J. Murphy', 62091, 0.25036253480893617)
(5, 'Ederson', 71654, 0.2520156676391796)
(6, 'L. Grant', 8935, 0.2588606926423368)
(7, 'Y. Kaboul', 8272, 0.3066054856575409)
(8, '\\u0141. Fabia\\u0144ski', 7847, 0.30957300481769306)
(9, 'H. Lloris', 25381, 0.3118138470959332)
(10, 'M. Ryan', 61390, 0.32165116687017886)


In [25]:
#check some players
def retrievePlayerDetails(wanted_player_inquiry):
    print(player_id_to_name[wanted_player_inquiry])
    return all_player_data_w_id[wanted_player_inquiry]
wanted_player_inquiry = 14886
print(player_id_to_name[wanted_player_inquiry])
all_player_data_w_id[wanted_player_inquiry]


M. Be\u0161i\u0107


{'passportArea': {'name': 'Bosnia-Herzegovina',
  'id': 70,
  'alpha3code': 'BIH',
  'alpha2code': 'BA'},
 'weight': 78,
 'firstName': 'Muhamed',
 'middleName': '',
 'lastName': 'Be\\u0161i\\u0107',
 'currentTeamId': 1620,
 'birthDate': '1992-09-10',
 'height': 177,
 'role': {'code2': 'MD', 'code3': 'MID', 'name': 'Midfielder'},
 'birthArea': {'name': 'Germany',
  'id': 276,
  'alpha3code': 'DEU',
  'alpha2code': 'DE'},
 'wyId': 14886,
 'foot': 'right',
 'shortName': 'M. Be\\u0161i\\u0107',
 'currentNationalTeamId': 10001}