## Imports

In [2]:
from sklearn.cluster import KMeans
import pandas as pd 
import numpy as np
from datetime import datetime

## K-Means from a csv file

In [3]:
def KMC (filename, n) :
    """Run K-Means on the contents of a CSV file and print a cluster summary.

    The file is loaded with ``pd.read_csv`` and passed to ``KMC_df``, which
    fits the model and prints, for every cluster, the centre value of each
    column and the number of rows assigned to that cluster.

    Parameters
    ----------
    filename : str or path-like
        CSV file to load; all of its columns are passed to K-Means.
    n : int
        Number of clusters.

    Returns
    -------
    None
        The summary is printed; nothing is returned.
    """
    # Delegate to the DataFrame variant so that fitting and reporting live in
    # a single place instead of two copy-pasted bodies. KMC_df is defined in
    # the following cell and must have been run before KMC is called.
    KMC_df(pd.read_csv(filename), n)
    return

## K-Means from a dataframe

In [4]:
def KMC_df (df, n) :
    """Run K-Means on a DataFrame and print a summary of every cluster.

    For each cluster the function prints a header line, the centre value of
    every column (rounded to 4 decimals) and the number of rows whose label
    is that cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to cluster; all columns are passed to ``KMeans.fit`` as they are
        (no scaling is applied here).
    n : int
        Number of clusters.

    Returns
    -------
    None
        The summary is printed; nothing is returned.
    """
    # n_init is pinned explicitly: scikit-learn changed its default from 10
    # to 'auto', so relying on the default can yield different clusters
    # depending on the installed version.
    kmeans = KMeans(n_clusters=n, random_state=0, n_init=10)
    kmeans.fit(df)

    for c, center in enumerate(kmeans.cluster_centers_):
        print ("---------------------- Values for Cluster ---------------", c+1, " ---------------------")
        # Centre coordinates are in the same order as the DataFrame columns.
        for column, value in zip(df.columns, center):
            print (column, ": ", np.round(value, 4))
        print (np.count_nonzero(kmeans.labels_ == c), " Games included in this cluster")
    return

## Load Liverpool's Premier League match data from the csv file

In [5]:
# Load the full match-details table; every later cell derives from this frame.
liverpool_data = pd.read_csv(filepath_or_buffer="Liverpool_data_epl_match_details.csv")

### Transform the type of column 'date_match'

In [6]:
# Convert 'date_match' to datetime values so it can be compared with a date later on.
# NOTE(review): no explicit format/dayfirst is passed — confirm the dates in the CSV are unambiguous.
liverpool_data = liverpool_data.assign(date_match=pd.to_datetime(liverpool_data['date_match']))

### Create a 'matchday' column, knowing that each season has 38 games and the dataset covers 8 seasons

In [7]:
# Matchday numbers 1..38, repeated once per season (8 seasons -> 304 values).
# NOTE(review): assumes the rows are ordered chronologically within each season — confirm.
matchdays = list(range(1, 39)) * 8

In [8]:
# Attach the matchday numbers as a new column (the list length must match the number of rows).
liverpool_data = liverpool_data.assign(matchday=matchdays)

### Create a 'points_won' column: 3 points for a win, 1 point for a draw and 0 for a loss

In [9]:
# Points per full-time result code: "D" -> 0, "N" -> 1, any other value -> 3.
# NOTE(review): presumably French codes (Defaite / Nul / Victoire) — confirm against the CSV.
points_by_result = {"D": 0, "N": 1}
liverpool_data['points_won'] = liverpool_data['liverpool_result_ft'].apply(
    lambda result: points_by_result.get(result, 3)
)

### Create a column 'home_0_away_1' to have a binary variable for the location of the match (Anfield or not)

In [10]:
# Binary venue flag: 0 when 'at_anfield' is truthy (home game), 1 otherwise (away game).
liverpool_data['home_0_away_1'] = liverpool_data['at_anfield'].apply(
    lambda played_at_anfield: int(not played_at_anfield)
)

# Filter the data to keep only the games under Klopp, whose tenure started on 17/10/2015

In [11]:
# First match date counted as a Klopp game (midnight, 17 October 2015).
start_klopp_date = datetime(2015, 10, 17)

In [12]:
# Keep the games played on or after Klopp's start date.
# .copy() makes klopp_data an independent frame: later cells assign to its
# 'Possession' column, which on a plain filtered slice raises
# SettingWithCopyWarning and may not behave as intended.
klopp_data = liverpool_data.loc[liverpool_data["date_match"] >= start_klopp_date].copy()

In [13]:
# Save the Klopp-era games; index=False leaves the row index out of the file.
klopp_data.to_csv(path_or_buf="klopp_data_epl_matchs_details.csv", index=False)

### We take the columns that we will use for K-means algorithm (quantitative variables)

### We convert the Possession (str to int) to be able to use it in our algorithm

In [14]:
# Make sure 'Possession' holds strings before the '%' sign is stripped.
# assign() builds a new frame instead of writing into the filtered slice of
# liverpool_data, which avoids SettingWithCopyWarning.
klopp_data = klopp_data.assign(Possession=klopp_data['Possession'].astype(str))

In [15]:
# Strip the '%' sign and convert 'Possession' to integers for every row at once.
# This replaces a loop over the hard-coded index labels 122..303 that wrote
# through chained indexing (klopp_data['Possession'][i] = ...): that form
# raises SettingWithCopyWarning, breaks as soon as the row labels change, and
# left the column with object dtype. astype(str) keeps the cell re-runnable.
klopp_data = klopp_data.assign(
    Possession=klopp_data['Possession']
    .astype(str)
    .str.replace("%", "", regex=False)
    .astype(int)
)

In [16]:
# Quantitative columns used as K-Means features.
klopp_data_kmeans = klopp_data.loc[:, [
    'matchday',
    'Rest_Days',
    'home_0_away_1',
    'Elo_Opponent',
    'Elo_Liverpool',
    'Elo_Diff',
    'points_won',
    'Shots',
    'Shots On Target',
    'Passes',
    'Possession',
]]

In [17]:
# Save the feature table for the Klopp games (row index not written).
klopp_data_kmeans.to_csv(path_or_buf="klopp_data_epl_kmeans.csv", index=False)

## We select only the games in which Liverpool lost some points (defeats and draws)

In [18]:
# Games with 0 or 1 point won, i.e. the games where points were dropped.
lost_points = [0, 1]
klopp_data_kmeans_lost_points = klopp_data_kmeans.loc[
    lambda games: games['points_won'].isin(lost_points)
]

In [19]:
# Save the dropped-points games; this file is the input of the K-Means run below.
klopp_data_kmeans_lost_points.to_csv(path_or_buf="klopp_data_epl_kmeans_lost_points.csv", index=False)

In [20]:
# Cluster the dropped-points games into 2 groups; KMC prints the summary and
# returns None, so the variable on the left holds no result.
klopp_kmeans_lost_points = KMC(filename="klopp_data_epl_kmeans_lost_points.csv", n=2)

---------------------- Values for Cluster --------------- 1  ---------------------
matchday :  23.9412
Rest_Days :  8.1765
home_0_away_1 :  0.5882
Elo_Opponent :  1655.9412
Elo_Liverpool :  1878.0739
Elo_Diff :  222.1328
points_won :  0.6471
Shots :  15.9706
Shots On Target :  4.7353
Passes :  552.8529
Possession :  63.8529
34  Games included in this cluster
---------------------- Values for Cluster --------------- 2  ---------------------
matchday :  18.0667
Rest_Days :  6.2667
home_0_away_1 :  0.6
Elo_Opponent :  1823.5
Elo_Liverpool :  1831.8279
Elo_Diff :  8.3279
points_won :  0.6333
Shots :  13.9667
Shots On Target :  4.1333
Passes :  413.7667
Possession :  52.9
30  Games included in this cluster


## We select only the games in which Liverpool gained 3 points (wins)

In [21]:
# Games with 3 points won, i.e. the wins.
gained_points = [3]
klopp_data_kmeans_gained_points = klopp_data_kmeans.loc[
    lambda games: games['points_won'].isin(gained_points)
]

In [22]:
# Save the wins; this file is the input of the K-Means run below.
klopp_data_kmeans_gained_points.to_csv(path_or_buf="klopp_data_epl_kmeans_gained_points.csv", index=False)

In [23]:
# Cluster the wins; KMC prints the summary and returns None.
# NOTE(review): 7 clusters are requested here while every other run uses 2 — confirm this is intended.
klopp_kmeans_gained_points = KMC(filename="klopp_data_epl_kmeans_gained_points.csv", n=7)

---------------------- Values for Cluster --------------- 1  ---------------------
matchday :  15.25
Rest_Days :  7.3333
home_0_away_1 :  0.3333
Elo_Opponent :  1822.5833
Elo_Liverpool :  1785.2694
Elo_Diff :  -37.3139
points_won :  3.0
Shots :  15.0833
Shots On Target :  6.9167
Passes :  359.75
Possession :  51.4167
12  Games included in this cluster
---------------------- Values for Cluster --------------- 2  ---------------------
matchday :  21.0345
Rest_Days :  5.2069
home_0_away_1 :  0.6207
Elo_Opponent :  1660.7931
Elo_Liverpool :  1821.8199
Elo_Diff :  161.0268
points_won :  3.0
Shots :  17.2759
Shots On Target :  6.4828
Passes :  439.5862
Possession :  57.3448
29  Games included in this cluster
---------------------- Values for Cluster --------------- 3  ---------------------
matchday :  21.125
Rest_Days :  4.625
home_0_away_1 :  0.5
Elo_Opponent :  1659.25
Elo_Liverpool :  2009.5743
Elo_Diff :  350.3243
points_won :  3.0
Shots :  17.75
Shots On Target :  7.125
Passes :  803.5


# We implement the algorithm for Brendan Rodgers

In [24]:
# Keep the games played before Klopp's start date.
# .copy() makes rodgers_data an independent frame: later cells assign to its
# 'Possession' column, which on a plain filtered slice raises
# SettingWithCopyWarning and may not behave as intended.
rodgers_data = liverpool_data.loc[liverpool_data["date_match"] < start_klopp_date].copy()

### We take the columns that we will use for K-means algorithm (quantitative variables)

In [25]:
# Make sure 'Possession' holds strings before the '%' sign is stripped.
# assign() builds a new frame instead of writing into the filtered slice of
# liverpool_data, which avoids SettingWithCopyWarning.
rodgers_data = rodgers_data.assign(Possession=rodgers_data['Possession'].astype(str))

In [26]:
# Save the pre-Klopp games; index=False leaves the row index out of the file.
rodgers_data.to_csv(path_or_buf="rodgers_data_epl_matchs_details.csv", index=False)

In [27]:
# Strip the '%' sign and convert 'Possession' to integers for every row at once.
# This replaces a loop over the hard-coded index labels 0..121 that wrote
# through chained indexing (rodgers_data['Possession'][i] = ...): that form
# raises SettingWithCopyWarning, breaks as soon as the row labels change, and
# left the column with object dtype. astype(str) keeps the cell re-runnable.
rodgers_data = rodgers_data.assign(
    Possession=rodgers_data['Possession']
    .astype(str)
    .str.replace("%", "", regex=False)
    .astype(int)
)

In [28]:
# Quantitative columns used as K-Means features.
rodgers_data_kmeans = rodgers_data.loc[:, [
    'matchday',
    'Rest_Days',
    'home_0_away_1',
    'Elo_Opponent',
    'Elo_Liverpool',
    'Elo_Diff',
    'points_won',
    'Shots',
    'Shots On Target',
    'Passes',
    'Possession',
]]

In [29]:
# Save the feature table for the pre-Klopp games (row index not written).
rodgers_data_kmeans.to_csv(path_or_buf="rodgers_data_epl_kmeans.csv", index=False)

## We select only the games in which Liverpool lost some points (defeats and draws)


In [30]:
# Games with 0 or 1 point won, i.e. the games where points were dropped.
lost_points = [0, 1]
rodgers_data_kmeans_lost_points = rodgers_data_kmeans.loc[
    lambda games: games['points_won'].isin(lost_points)
]

In [31]:
# Save the dropped-points games; this file is the input of the K-Means run below.
rodgers_data_kmeans_lost_points.to_csv(path_or_buf="rodgers_data_epl_kmeans_lost_points.csv", index=False)

In [32]:
# Cluster the dropped-points games into 2 groups; KMC prints the summary and
# returns None, so the variable on the left holds no result.
rodgers_kmeans_lost_points = KMC(filename="rodgers_data_epl_kmeans_lost_points.csv", n=2)

---------------------- Values for Cluster --------------- 1  ---------------------
matchday :  16.25
Rest_Days :  5.9643
home_0_away_1 :  0.6071
Elo_Opponent :  1850.1429
Elo_Liverpool :  1780.6647
Elo_Diff :  -69.4782
points_won :  0.5
Shots :  14.75
Shots On Target :  5.25
Passes :  350.3571
Possession :  50.5714
28  Games included in this cluster
---------------------- Values for Cluster --------------- 2  ---------------------
matchday :  17.6129
Rest_Days :  6.5484
home_0_away_1 :  0.5806
Elo_Opponent :  1634.129
Elo_Liverpool :  1793.618
Elo_Diff :  159.4889
points_won :  0.5161
Shots :  16.7419
Shots On Target :  5.5484
Passes :  433.0
Possession :  59.1613
31  Games included in this cluster


## We select only the games in which Liverpool gained 3 points (wins)

In [33]:
# Games with 3 points won, i.e. the wins.
gained_points = [3]
rodgers_data_kmeans_gained_points = rodgers_data_kmeans.loc[
    lambda games: games['points_won'].isin(gained_points)
]

In [34]:
# Save the wins; this file is the input of the K-Means run below.
rodgers_data_kmeans_gained_points.to_csv(path_or_buf="rodgers_data_epl_kmeans_gained_points.csv", index=False)

In [35]:
# Cluster the wins into 2 groups; KMC prints the summary and returns None.
rodgers_kmeans_gained_points = KMC(filename="rodgers_data_epl_kmeans_gained_points.csv", n=2)

---------------------- Values for Cluster --------------- 1  ---------------------
matchday :  19.3333
Rest_Days :  5.8431
home_0_away_1 :  0.4314
Elo_Opponent :  1629.4118
Elo_Liverpool :  1797.0416
Elo_Diff :  167.6298
points_won :  3.0
Shots :  18.5098
Shots On Target :  7.6863
Passes :  429.8431
Possession :  57.0196
51  Games included in this cluster
---------------------- Values for Cluster --------------- 2  ---------------------
matchday :  22.6667
Rest_Days :  6.25
home_0_away_1 :  0.3333
Elo_Opponent :  1852.0833
Elo_Liverpool :  1837.7571
Elo_Diff :  -14.3263
points_won :  3.0
Shots :  14.1667
Shots On Target :  6.75
Passes :  291.9167
Possession :  46.1667
12  Games included in this cluster
