# Understand why Son scores (or not) in his games

## Imports

In [3]:
from sklearn.cluster import KMeans
import pandas as pd 
import numpy as np
from datetime import datetime

## K-Means from a csv file

In [4]:
def KMC(filename, n):
    """Run K-Means on every column of a CSV file and print a cluster summary.

    For each cluster, prints the center value of every column (rounded to
    4 decimals) followed by the number of rows assigned to that cluster.
    Clusters are numbered from 1 in the printed headers.

    Parameters
    ----------
    filename : str or path-like
        CSV file read with ``pd.read_csv``; every column is used as a feature.
    n : int
        Number of clusters.

    Returns
    -------
    None
        The summary is only printed.
    """
    data = pd.read_csv(filename)

    # n_init is pinned to 10, the scikit-learn default before version 1.4.
    # Newer releases default to 'auto' (a single run with k-means++), which
    # can yield different clusters for the same random_state.
    # NOTE(review): features are clustered on their raw scale, so columns with
    # a large numeric range weigh more in the distance; consider standardizing.
    kmeans = KMeans(n_clusters=n, random_state=0, n_init=10)
    kmeans.fit(data)

    for c, center in enumerate(kmeans.cluster_centers_):
        print("---------------------- Values for Cluster ---------------", c+1, " ---------------------")
        for column, value in zip(data.columns.values, center):
            print(column, ": ", np.round(value, 4))
        print(np.count_nonzero(kmeans.labels_ == c), " Games included in this cluster")

## K-Means from a Pandas DataFrame

In [5]:
def KMC_df(df, n):
    """Run K-Means on every column of a DataFrame and print a cluster summary.

    For each cluster, prints the center value of every column (rounded to
    4 decimals) followed by the number of rows assigned to that cluster.
    Clusters are numbered from 1 in the printed headers.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to cluster; every column is used as a feature.
    n : int
        Number of clusters.

    Returns
    -------
    None
        The summary is only printed.
    """
    # n_init is pinned to 10, the scikit-learn default before version 1.4.
    # Newer releases default to 'auto' (a single run with k-means++), which
    # can yield different clusters for the same random_state.
    # NOTE(review): features are clustered on their raw scale, so columns with
    # a large numeric range weigh more in the distance; consider standardizing.
    kmeans = KMeans(n_clusters=n, random_state=0, n_init=10)
    kmeans.fit(df)

    for c, center in enumerate(kmeans.cluster_centers_):
        print("---------------------- Values for Cluster ---------------", c+1, " ---------------------")
        for column, value in zip(df.columns.values, center):
            print(column, ": ", np.round(value, 4))
        print(np.count_nonzero(kmeans.labels_ == c), " Games included in this cluster")

### Get the index of each game included in a cluster

In [None]:
def ClusterIndicesNumpy(clustNum, labels_array):
    """Return the positions in ``labels_array`` whose label equals ``clustNum``.

    Parameters
    ----------
    clustNum : int
        Cluster label to look for.
    labels_array : array-like
        Labels, one per sample (for example ``KMeans.labels_``). Plain lists
        and tuples are accepted as well as NumPy arrays.

    Returns
    -------
    numpy.ndarray
        Indices along the first axis where the label matches; empty when no
        sample carries that label.
    """
    # Coerce to an ndarray so that `==` is element-wise; comparing a plain
    # list with an int would yield a single False instead of a mask.
    labels = np.asarray(labels_array)
    return np.where(labels == clustNum)[0]

In [None]:
def KMC_details(filename, n):
    """Run K-Means on a CSV file, print a cluster summary and the member rows.

    Same report as ``KMC``, with the row positions of the games of each
    cluster printed after the cluster size. Positions are 0-based row numbers
    of the CSV file that is read, not indices of any upstream DataFrame.

    Parameters
    ----------
    filename : str or path-like
        CSV file read with ``pd.read_csv``; every column is used as a feature.
    n : int
        Number of clusters.

    Returns
    -------
    dict
        Maps each 0-based cluster label (as found in ``KMeans.labels_``) to
        the array of row positions assigned to it. The printed headers number
        the same clusters from 1.
    """
    data = pd.read_csv(filename)

    # n_init is pinned to 10, the scikit-learn default before version 1.4.
    # Newer releases default to 'auto' (a single run with k-means++), which
    # can yield different clusters for the same random_state.
    kmeans = KMeans(n_clusters=n, random_state=0, n_init=10)
    kmeans.fit(data)

    members = {}
    for c, center in enumerate(kmeans.cluster_centers_):
        print("---------------------- Values for Cluster ---------------", c+1, " ---------------------")
        for column, value in zip(data.columns.values, center):
            print(column, ": ", np.round(value, 4))
        indices = ClusterIndicesNumpy(c, kmeans.labels_)
        print(len(indices), " Games included in this cluster :")
        print(indices)
        members[c] = indices
    return members

## Load the dataset of Son's league and UCL games

In [6]:
# Load the per-match dataset; the path is relative to the notebook's working directory.
data_match = pd.read_csv("../../data/club/all_season/Son_data_champ-ldc_matchs_details_all_season.csv")

In [7]:
# Parse 'Date' into datetimes so it can be compared with the datetime club boundaries used for the split.
data_match['Date'] = pd.to_datetime(data_match['Date'])

In [8]:
# Names of the columns kept as K-Means features for every club subset.
data_match_kmeans = [
    'Start',
    'Gls',
    'Sh',
    'SoT',
    'Home_1_Away_0',
    'xG',
    'SCA',
    'GCA',
    'Elo_Diff',
]

## Split the data according to the club Son was playing for

In [7]:
# Boundary dates used to split the matches by club (midnight, naive datetimes).
start_leverkusen = datetime(2013, 8, 10)
start_spurs = datetime(2015, 9, 13)

In [8]:
# Hamburg period: every match strictly before the Leverkusen start date.
hamburg_data = data_match.loc[data_match['Date'] < start_leverkusen]

In [91]:
# Leverkusen period: from the Leverkusen start date up to, but excluding, the Spurs start date.
leverkusen_data = data_match[
    (data_match['Date'] >= start_leverkusen) & (data_match['Date'] < start_spurs)
]

In [103]:
# Spurs period: every match on or after the Spurs start date.
spurs_data = data_match.loc[data_match['Date'] >= start_spurs]

## Select the columns that will be used in the K-Means

### Hamburg data

In [59]:
# Keep only the feature columns for the Hamburg matches.
hamburg_data_kmeans = hamburg_data.loc[:, data_match_kmeans]

In [60]:
# Normalise missing values to NumPy NaN.
# NOTE(review): for data loaded by read_csv this is presumably a no-op (missing cells are
# already NaN); it does return a new DataFrame — confirm before removing.
hamburg_data_kmeans = hamburg_data_kmeans.fillna(np.nan)

In [61]:
hamburg_data_kmeans

Unnamed: 0,Start,Gls,Sh,SoT,Home_1_Away_0,xG,SCA,GCA,Elo_Diff
0,1,1,,,0.0,,,,152
1,0,0,,,1.0,,,,25
2,0,0,,,0.0,,,,-70
3,1,2,,,0.0,,,,96
4,1,0,,,1.0,,,,-20
...,...,...,...,...,...,...,...,...,...
68,1,0,,,1.0,,,,64
69,1,0,,,0.0,,,,-71
70,1,0,,,1.0,,,,-30
71,1,1,,,0.0,,,,50


In [62]:
# Convert every feature column to a numeric dtype (column by column).
hamburg_data_kmeans = hamburg_data_kmeans.apply(pd.to_numeric)

In [65]:
# The columns that contain NaN appear to be entirely NaN for the Hamburg matches
# (see the preview above), so a mean cannot be computed; fill them with 0 instead.
hamburg_data_kmeans = hamburg_data_kmeans.fillna(0)

### Leverkusen data

In [130]:
# Keep only the feature columns for the Leverkusen matches.
leverkusen_data_kmeans = leverkusen_data.loc[:, data_match_kmeans]

In [131]:
# Normalise missing values to NumPy NaN.
# NOTE(review): for data loaded by read_csv this is presumably a no-op (missing cells are
# already NaN); it does return a new DataFrame — confirm before removing.
leverkusen_data_kmeans = leverkusen_data_kmeans.fillna(np.nan)

In [132]:
# Replace the 'On m' placeholder with NaN (per the original note, it marks matches in
# which Son was not on the match sheet).
# np.nan is passed instead of None: pandas versions before 1.4 treat value=None as
# "no value given" and forward-fill from the previous row rather than inserting NaN.
leverkusen_data_kmeans = leverkusen_data_kmeans.replace('On m', np.nan)

In [133]:
# Convert every feature column to a numeric dtype (column by column).
leverkusen_data_kmeans = leverkusen_data_kmeans.apply(pd.to_numeric)

In [134]:
# Impute each remaining NaN with the mean of its own column.
leverkusen_data_kmeans = leverkusen_data_kmeans.fillna(
    value=leverkusen_data_kmeans.mean()
)

In [136]:
# A column that is entirely NaN has a NaN mean, so a mean fill leaves it untouched;
# set whatever is still missing to 0.
leverkusen_data_kmeans = leverkusen_data_kmeans.fillna(0)

### Spurs data

In [104]:
# Keep only the feature columns for the Spurs matches.
spurs_data_kmeans = spurs_data.loc[:, data_match_kmeans]

In [105]:
# Normalise missing values to NumPy NaN.
# NOTE(review): for data loaded by read_csv this is presumably a no-op (missing cells are
# already NaN); it does return a new DataFrame — confirm before removing.
spurs_data_kmeans = spurs_data_kmeans.fillna(np.nan)

In [106]:
# Replace the 'On m' placeholder with NaN (per the original note, it marks matches in
# which Son was not on the match sheet).
# np.nan is passed instead of None: pandas versions before 1.4 treat value=None as
# "no value given" and forward-fill from the previous row rather than inserting NaN.
spurs_data_kmeans = spurs_data_kmeans.replace('On m', np.nan)

In [108]:
# Convert every feature column to a numeric dtype (column by column).
spurs_data_kmeans = spurs_data_kmeans.apply(pd.to_numeric)

In [142]:
# Impute each remaining NaN with the mean of its own column.
spurs_data_kmeans = spurs_data_kmeans.fillna(
    value=spurs_data_kmeans.mean()
)

## Split the data by whether Son scored or not, and store each subset in a CSV file (for silhouette analysis)

### Hamburg data

In [110]:
# Hamburg games whose 'Gls' value is below 1 (no goal scored).
hamburg_data_kmeans_no_goals = hamburg_data_kmeans.loc[hamburg_data_kmeans['Gls'] < 1]

In [116]:
# Written to the working directory. index=False omits the row index, so rows are
# renumbered from 0 when the file is read back.
hamburg_data_kmeans_no_goals.to_csv("Son_hamburg_no_goals.csv", index=False)

In [121]:
# Hamburg games whose 'Gls' value is at least 1 (Son scored).
hamburg_data_kmeans_goals = hamburg_data_kmeans.loc[hamburg_data_kmeans['Gls'] >= 1]

In [123]:
# Written to the working directory. index=False omits the row index, so rows are
# renumbered from 0 when the file is read back.
hamburg_data_kmeans_goals.to_csv("Son_hamburg_goals.csv", index=False)

### Leverkusen data

In [138]:
# Leverkusen games whose 'Gls' value is below 1 (no goal scored).
leverkusen_data_kmeans_no_goals = leverkusen_data_kmeans.loc[
    leverkusen_data_kmeans['Gls'] < 1
]

In [139]:
# Written to the working directory. index=False omits the row index, so rows are
# renumbered from 0 when the file is read back.
leverkusen_data_kmeans_no_goals.to_csv("Son_leverkusen_no_goals.csv", index=False)

In [140]:
# Leverkusen games whose 'Gls' value is at least 1 (Son scored).
leverkusen_data_kmeans_goals = leverkusen_data_kmeans.loc[
    leverkusen_data_kmeans['Gls'] >= 1
]

In [141]:
# Written to the working directory. index=False omits the row index, so rows are
# renumbered from 0 when the file is read back.
leverkusen_data_kmeans_goals.to_csv("Son_leverkusen_goals.csv", index=False)

### Spurs data

In [145]:
# Spurs games whose 'Gls' value is below 1 (no goal scored).
spurs_data_kmeans_no_goals = spurs_data_kmeans.loc[spurs_data_kmeans['Gls'] < 1]

In [146]:
# Written to the working directory. index=False omits the row index, so rows are
# renumbered from 0 when the file is read back.
spurs_data_kmeans_no_goals.to_csv("Son_spurs_no_goals.csv", index=False)

In [147]:
# Spurs games whose 'Gls' value is at least 1 (Son scored).
spurs_data_kmeans_goals = spurs_data_kmeans.loc[spurs_data_kmeans['Gls'] >= 1]

In [148]:
# Written to the working directory. index=False omits the row index, so rows are
# renumbered from 0 when the file is read back.
spurs_data_kmeans_goals.to_csv("Son_spurs_goals.csv", index=False)

## Apply K-means on Hamburg .csv files

In [9]:
KMC("Son_hamburg_goals.csv", 2)

---------------------- Values for Cluster --------------- 1  ---------------------
Start :  1.0
Gls :  1.1429
Sh :  0.0
SoT :  0.0
Home_1_Away_0 :  0.3571
xG :  0.0
SCA :  0.0
GCA :  0.0
Elo_Diff :  15.9286
14  Games included in this cluster
---------------------- Values for Cluster --------------- 2  ---------------------
Start :  1.0
Gls :  2.0
Sh :  0.0
SoT :  0.0
Home_1_Away_0 :  0.5
xG :  0.0
SCA :  0.0
GCA :  0.0
Elo_Diff :  -276.5
2  Games included in this cluster


In [10]:
KMC("Son_hamburg_no_goals.csv", 4)

---------------------- Values for Cluster --------------- 1  ---------------------
Start :  0.5652
Gls :  0.0
Sh :  0.0
SoT :  0.0
Home_1_Away_0 :  0.5217
xG :  0.0
SCA :  0.0
GCA :  0.0
Elo_Diff :  -15.9565
23  Games included in this cluster
---------------------- Values for Cluster --------------- 2  ---------------------
Start :  0.6
Gls :  0.0
Sh :  0.0
SoT :  0.0
Home_1_Away_0 :  0.4
xG :  0.0
SCA :  0.0
GCA :  0.0
Elo_Diff :  -89.6
15  Games included in this cluster
---------------------- Values for Cluster --------------- 3  ---------------------
Start :  0.75
Gls :  0.0
Sh :  0.0
SoT :  0.0
Home_1_Away_0 :  0.5
xG :  0.0
SCA :  0.0
GCA :  0.0
Elo_Diff :  -238.75
4  Games included in this cluster
---------------------- Values for Cluster --------------- 4  ---------------------
Start :  0.6
Gls :  0.0
Sh :  0.0
SoT :  0.0
Home_1_Away_0 :  0.6667
xG :  0.0
SCA :  0.0
GCA :  0.0
Elo_Diff :  60.0
15  Games included in this cluster


## Apply K-means on Leverkusen .csv files

In [11]:
KMC("Son_leverkusen_goals.csv", 4)

---------------------- Values for Cluster --------------- 1  ---------------------
Start :  0.8571
Gls :  1.7143
Sh :  2.4424
SoT :  1.3456
Home_1_Away_0 :  0.5714
xG :  0.0
SCA :  0.0
GCA :  0.0
Elo_Diff :  169.7143
7  Games included in this cluster
---------------------- Values for Cluster --------------- 2  ---------------------
Start :  1.0
Gls :  1.6667
Sh :  3.0215
SoT :  1.5376
Home_1_Away_0 :  0.5079
xG :  0.0
SCA :  0.0
GCA :  0.0
Elo_Diff :  -60.6667
3  Games included in this cluster
---------------------- Values for Cluster --------------- 3  ---------------------
Start :  1.0
Gls :  1.1667
Sh :  2.3548
SoT :  0.871
Home_1_Away_0 :  0.4206
xG :  0.0
SCA :  0.0
GCA :  0.0
Elo_Diff :  69.3333
6  Games included in this cluster
---------------------- Values for Cluster --------------- 4  ---------------------
Start :  1.0
Gls :  1.0
Sh :  2.0323
SoT :  0.8065
Home_1_Away_0 :  0.5238
xG :  0.0
SCA :  0.0
GCA :  0.0
Elo_Diff :  274.0
2  Games included in this cluster


In [12]:
KMC("Son_leverkusen_no_goals.csv", 2)

---------------------- Values for Cluster --------------- 1  ---------------------
Start :  0.75
Gls :  0.0
Sh :  1.9409
SoT :  0.6882
Home_1_Away_0 :  0.5119
xG :  0.0
SCA :  0.0
GCA :  0.0
Elo_Diff :  -154.0
12  Games included in this cluster
---------------------- Values for Cluster --------------- 2  ---------------------
Start :  0.9216
Gls :  0.0
Sh :  1.9013
SoT :  0.7097
Home_1_Away_0 :  0.5331
xG :  0.0
SCA :  0.0
GCA :  0.0
Elo_Diff :  120.4118
51  Games included in this cluster


## Apply K-means on Spurs .csv files

In [13]:
KMC("Son_spurs_goals.csv", 2)

---------------------- Values for Cluster --------------- 1  ---------------------
Start :  0.1951
Gls :  1.2927
Sh :  3.878
SoT :  2.0244
Home_1_Away_0 :  0.6103
xG :  0.5491
SCA :  3.5155
GCA :  0.8677
Elo_Diff :  217.3415
41  Games included in this cluster
---------------------- Values for Cluster --------------- 2  ---------------------
Start :  0.5455
Gls :  1.2273
Sh :  3.3182
SoT :  1.8636
Home_1_Away_0 :  0.5244
xG :  0.429
SCA :  2.7234
GCA :  0.548
Elo_Diff :  -2.8636
22  Games included in this cluster


In [14]:
KMC("Son_spurs_no_goals.csv", 2)

---------------------- Values for Cluster --------------- 1  ---------------------
Start :  0.1413
Gls :  0.0
Sh :  1.6304
SoT :  0.4783
Home_1_Away_0 :  0.4461
xG :  0.1907
SCA :  2.5805
GCA :  0.4536
Elo_Diff :  213.0543
92  Games included in this cluster
---------------------- Values for Cluster --------------- 2  ---------------------
Start :  0.2462
Gls :  0.0
Sh :  1.7231
SoT :  0.7692
Home_1_Away_0 :  0.5164
xG :  0.2277
SCA :  2.4804
GCA :  0.3413
Elo_Diff :  -11.0923
65  Games included in this cluster


In [43]:
test = KMC_details("Son_spurs_no_goals.csv", 2)

---------------------- Values for Cluster --------------- 1  ---------------------
Start :  0.1413
Gls :  0.0
Sh :  1.6304
SoT :  0.4783
Home_1_Away_0 :  0.4461
xG :  0.1907
SCA :  2.5805
GCA :  0.4536
Elo_Diff :  213.0543
92  Games included in this cluster
[  0   3   5   6   8  11  12  13  14  16  17  19  20  23  24  25  27  29
  31  35  38  40  41  42  44  45  46  48  49  50  52  54  55  57  58  59
  60  61  62  63  68  69  71  72  73  78  80  82  83  84  85  86  87  89
  90  91  93  94  95  97  98  99 100 103 104 106 110 112 114 115 116 119
 121 122 123 124 125 127 128 129 135 137 138 140 142 144 145 146 148 151
 152 154]
---------------------- Values for Cluster --------------- 2  ---------------------
Start :  0.2462
Gls :  0.0
Sh :  1.7231
SoT :  0.7692
Home_1_Away_0 :  0.5164
xG :  0.2277
SCA :  2.4804
GCA :  0.3413
Elo_Diff :  -11.0923
65  Games included in this cluster
[  1   2   4   7   9  10  15  18  21  22  26  28  30  32  33  34  36  37
  39  43  47  51  53  56  64  65  66