# Understand why Son scores (or not) in his games

## Imports

In [1]:
from sklearn.cluster import KMeans
import pandas as pd 
import numpy as np
from datetime import datetime

## K-Means from a csv file

In [2]:
def KMC(filename, n):
    """Fit K-Means on the contents of a CSV file and print a per-cluster summary.

    Args:
        filename: Path of the CSV file to load with ``pd.read_csv``; every
            column of the file is passed to ``KMeans.fit`` as a feature.
        n: Number of clusters to fit.

    Returns:
        None. For each cluster, the centre value of every column (rounded to
        4 decimals) and the number of rows assigned to it are printed.
    """
    data = pd.read_csv(filename)
    # random_state is pinned so repeated runs produce the same clusters.
    kmeans = KMeans(n_clusters=n, random_state=0)
    kmeans.fit(data)

    for c, center in enumerate(kmeans.cluster_centers_):
        print("---------------------- Values for Cluster ---------------", c+1, " ---------------------")
        # Centre coordinates are ordered like the columns of the fitted data.
        for column, value in zip(data.columns.values, center):
            print(column, ": ", np.round(value, 4))
        print(np.count_nonzero(kmeans.labels_ == c), " Games included in this cluster")

## K-Means from a Pandas DataFrame

In [3]:
def KMC_df(df, n):
    """Fit K-Means on a DataFrame and print a per-cluster summary.

    Args:
        df: DataFrame whose columns are all passed to ``KMeans.fit`` as
            features; its column labels are used in the printed summary.
        n: Number of clusters to fit.

    Returns:
        None. For each cluster, the centre value of every column (rounded to
        4 decimals) and the number of rows assigned to it are printed.
    """
    # random_state is pinned so repeated runs produce the same clusters.
    kmeans = KMeans(n_clusters=n, random_state=0)
    kmeans.fit(df)

    for c, center in enumerate(kmeans.cluster_centers_):
        print("---------------------- Values for Cluster ---------------", c+1, " ---------------------")
        # Centre coordinates are ordered like the columns of the fitted data.
        for column, value in zip(df.columns.values, center):
            print(column, ": ", np.round(value, 4))
        print(np.count_nonzero(kmeans.labels_ == c), " Games included in this cluster")

## Load the dataset of Son's league and UCL games

In [96]:
# Per-match details for Son across all seasons (league and Champions League).
data_match = pd.read_csv(
    filepath_or_buffer="../../data/club/all_season/Son_data_champ-ldc_matchs_details_all_season.csv"
)

In [97]:
# Parse the 'Date' column into datetimes so it can be compared with the club start dates.
data_match['Date'] = pd.to_datetime(arg=data_match['Date'])

In [58]:
# Names of the columns that are kept as K-Means features.
data_match_kmeans = [
    'Start',
    'Gls',
    'Sh',
    'SoT',
    'Home_1_Away_0',
    'xG',
    'SCA',
    'GCA',
    'Elo_Diff',
]

## Split the data by the club Son was playing for

In [7]:
# Cut-off dates used to split the matches by club: matches before
# start_leverkusen go to Hamburg, matches from start_spurs onwards go to Spurs.
start_leverkusen = datetime(2013, 8, 10)
start_spurs = datetime(2015, 9, 13)

In [8]:
# Hamburg period: every match played before the Leverkusen start date.
hamburg_data = data_match.loc[data_match['Date'] < start_leverkusen]

In [91]:
# Leverkusen period: from the Leverkusen start date (inclusive) up to the Spurs start date (exclusive).
leverkusen_data = data_match.loc[
    (data_match['Date'] >= start_leverkusen) & (data_match['Date'] < start_spurs)
]

In [103]:
# Spurs period: every match from the Spurs start date onwards.
spurs_data = data_match.loc[data_match['Date'] >= start_spurs]

## Select the columns that will be used in the K-Means

### Hamburg data

In [59]:
# Keep only the feature columns listed in data_match_kmeans.
hamburg_data_kmeans = hamburg_data.loc[:, data_match_kmeans]

In [60]:
# Work on an independent copy of the selected columns. The previous
# fillna(np.nan) filled missing values with NaN, which leaves the data
# unchanged; the fresh frame it returned was its only practical effect.
hamburg_data_kmeans = hamburg_data_kmeans.copy()

In [61]:
# Display the selected Hamburg columns for a visual check.
hamburg_data_kmeans

Unnamed: 0,Start,Gls,Sh,SoT,Home_1_Away_0,xG,SCA,GCA,Elo_Diff
0,1,1,,,0.0,,,,152
1,0,0,,,1.0,,,,25
2,0,0,,,0.0,,,,-70
3,1,2,,,0.0,,,,96
4,1,0,,,1.0,,,,-20
...,...,...,...,...,...,...,...,...,...
68,1,0,,,1.0,,,,64
69,1,0,,,0.0,,,,-71
70,1,0,,,1.0,,,,-30
71,1,1,,,0.0,,,,50


In [62]:
# Convert every feature column to a numeric dtype.
hamburg_data_kmeans = hamburg_data_kmeans.apply(pd.to_numeric)

In [65]:
# The Hamburg columns that contain NaN are NaN in every row, so no mean can be
# computed for them; fill them with 0 as a constant placeholder.
hamburg_data_kmeans = hamburg_data_kmeans.fillna(value=0)

### Leverkusen data

In [130]:
# Keep only the feature columns listed in data_match_kmeans.
leverkusen_data_kmeans = leverkusen_data.loc[:, data_match_kmeans]

In [131]:
# Work on an independent copy of the selected columns. The previous
# fillna(np.nan) filled missing values with NaN, which leaves the data
# unchanged; the fresh frame it returned was its only practical effect.
leverkusen_data_kmeans = leverkusen_data_kmeans.copy()

In [132]:
# Blank out the 'On m' placeholder, which marks matches in which Son was not
# on the match sheet. np.nan is passed explicitly: pandas releases before 1.4
# ignored an explicit value=None and forward-filled the matching cells from
# the previous row instead of turning them into missing values.
leverkusen_data_kmeans = leverkusen_data_kmeans.replace('On m', np.nan)

In [133]:
# Convert every feature column to a numeric dtype.
leverkusen_data_kmeans = leverkusen_data_kmeans.apply(pd.to_numeric)

In [134]:
# Impute each missing value with the mean of its column.
# NOTE(review): 'Gls' is one of the imputed columns, so a match whose goal
# count was missing gets a mean goal count before the goals / no-goals
# split - confirm this is intended.
leverkusen_data_kmeans = leverkusen_data_kmeans.fillna(value=leverkusen_data_kmeans.mean())

In [136]:
# Zero-fill whatever is still missing after the mean imputation - presumably
# columns with no values at all for this period, whose mean is itself NaN
# (TODO confirm).
leverkusen_data_kmeans = leverkusen_data_kmeans.fillna(value=0)

### Spurs data

In [104]:
# Keep only the feature columns listed in data_match_kmeans.
spurs_data_kmeans = spurs_data.loc[:, data_match_kmeans]

In [105]:
# Work on an independent copy of the selected columns. The previous
# fillna(np.nan) filled missing values with NaN, which leaves the data
# unchanged; the fresh frame it returned was its only practical effect.
spurs_data_kmeans = spurs_data_kmeans.copy()

In [106]:
# Blank out the 'On m' placeholder, which marks matches in which Son was not
# on the match sheet. np.nan is passed explicitly: pandas releases before 1.4
# ignored an explicit value=None and forward-filled the matching cells from
# the previous row instead of turning them into missing values.
spurs_data_kmeans = spurs_data_kmeans.replace('On m', np.nan)

In [108]:
# Convert every feature column to a numeric dtype.
spurs_data_kmeans = spurs_data_kmeans.apply(pd.to_numeric)

In [142]:
# Impute each missing value with the mean of its column.
# NOTE(review): 'Gls' is one of the imputed columns, so a match whose goal
# count was missing gets a mean goal count before the goals / no-goals
# split - confirm this is intended.
spurs_data_kmeans = spurs_data_kmeans.fillna(value=spurs_data_kmeans.mean())

## Split the data by whether Son scored, and store each subset in a CSV file (for Silhouette Analysis)

### Hamburg data

In [110]:
# Hamburg matches with a 'Gls' value below 1, i.e. without a goal.
hamburg_data_kmeans_no_goals = hamburg_data_kmeans.loc[hamburg_data_kmeans['Gls'] < 1]

In [116]:
# Write the subset to disk without the index column.
hamburg_data_kmeans_no_goals.to_csv(path_or_buf="Son_hamburg_no_goals.csv", index=False)

In [121]:
# Hamburg matches with a 'Gls' value of at least 1.
hamburg_data_kmeans_goals = hamburg_data_kmeans.loc[hamburg_data_kmeans['Gls'] >= 1]

In [123]:
# Write the subset to disk without the index column.
hamburg_data_kmeans_goals.to_csv(path_or_buf="Son_hamburg_goals.csv", index=False)

### Leverkusen data

In [138]:
# Leverkusen matches with a 'Gls' value below 1, i.e. without a goal.
leverkusen_data_kmeans_no_goals = leverkusen_data_kmeans.loc[leverkusen_data_kmeans['Gls'] < 1]

In [139]:
# Write the subset to disk without the index column.
leverkusen_data_kmeans_no_goals.to_csv(path_or_buf="Son_leverkusen_no_goals.csv", index=False)

In [140]:
# Leverkusen matches with a 'Gls' value of at least 1.
leverkusen_data_kmeans_goals = leverkusen_data_kmeans.loc[leverkusen_data_kmeans['Gls'] >= 1]

In [141]:
# Write the subset to disk without the index column.
leverkusen_data_kmeans_goals.to_csv(path_or_buf="Son_leverkusen_goals.csv", index=False)

### Spurs data

In [145]:
# Spurs matches with a 'Gls' value below 1, i.e. without a goal.
spurs_data_kmeans_no_goals = spurs_data_kmeans.loc[spurs_data_kmeans['Gls'] < 1]

In [146]:
# Write the subset to disk without the index column.
spurs_data_kmeans_no_goals.to_csv(path_or_buf="Son_spurs_no_goals.csv", index=False)

In [147]:
# Spurs matches with a 'Gls' value of at least 1.
spurs_data_kmeans_goals = spurs_data_kmeans.loc[spurs_data_kmeans['Gls'] >= 1]

In [148]:
# Write the subset to disk without the index column.
spurs_data_kmeans_goals.to_csv(path_or_buf="Son_spurs_goals.csv", index=False)