# Clustering

In [23]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from scipy import stats

# Visualization
import plotly.express as px
import plotly.io as pio
# Route DataFrame.plot / DataFrame.boxplot through plotly instead of matplotlib.
pd.set_option("plotting.backend", "plotly")
# Default plotly figure template for every figure created below.
pio.templates.default = "seaborn"

In [24]:
# Load the per-player table; the first CSV column becomes the index.
players_csv_path = "./datasets/players.csv"
df_players = pd.read_csv(
    players_csv_path,
    index_col=0,
)
# Optional restriction to one gender (disabled):
#   df_players = df_players[df_players.gender == "m"]

In [25]:
# Columns of df_players used as clustering input.
# NOTE: the (misspelled) name `feautures` is kept because later cells reference it.
feautures = ["rel_df", "rel_1stIn", "rel_2ndWon", "1WonOn1In", "rel_bpSaved"]

# Work on a positional 0..n-1 index so rows line up with the label array produced later.
selected_columns = df_players[feautures].reset_index(drop=True)

# Scaling: QuantileTransformer with default settings.
# Alternatives tried and left disabled: StandardScaler, RobustScaler(unit_variance=True),
# and dropping rows with any |z-score| >= 2 (scipy.stats.zscore).
quantile_scaled = QuantileTransformer().fit_transform(selected_columns)
df_data = pd.DataFrame(quantile_scaled, columns=selected_columns.columns).round(3)

# Visual sanity check of the per-feature distributions after scaling.
df_data.boxplot(column=feautures)

In [26]:
# Preview the scaled feature matrix (pandas truncates long frames when rendering).
df_data

Unnamed: 0,rel_df,rel_1stIn,rel_2ndWon,1WonOn1In,rel_bpSaved
0,0.724,0.081,0.902,0.619,0.581
1,0.831,0.273,0.449,0.193,0.067
2,0.611,0.007,0.997,0.907,0.809
3,0.422,0.360,0.110,0.549,0.243
4,0.826,0.294,0.628,0.396,0.759
...,...,...,...,...,...
2294,0.088,0.916,0.342,0.791,0.713
2295,0.439,0.347,0.547,0.298,0.697
2296,0.861,0.026,0.117,0.110,0.050
2297,0.215,0.384,0.836,0.498,0.488


## K-means

### Find Optimal K

In [27]:
# Model-selection sweep: fit K-means for every k in [2, max_k] and record four
# quality metrics per k so the best k can be picked from the plots that follow.
#   - SSE (inertia): sum of squared distances of samples to their closest centre
#   - Silhouette and Calinski-Harabasz: higher is better
#   - Davies-Bouldin: lower is better
sse_scores = []
silhoutte_scores = []
davies_scores = []
calinski_harabasz_scores = []

max_k = 30
for k in range(2, max_k + 1):
    # A fixed random_state makes the centroid initialisation, and therefore every
    # curve below, reproducible across "Restart & Run All".
    kmeans = KMeans(
        n_clusters=k,
        n_init=10,
        max_iter=100,
        init="k-means++",
        random_state=42,
    )
    kmeans.fit(df_data)
    labels = kmeans.labels_

    sse_scores.append(kmeans.inertia_)
    davies_scores.append(davies_bouldin_score(df_data, labels))
    silhoutte_scores.append(silhouette_score(df_data, labels))
    calinski_harabasz_scores.append(calinski_harabasz_score(df_data, labels))


In [28]:
# Collect the per-k metrics into one frame and draw one line chart per metric group.
k_values = list(range(2, max_k + 1))
df = pd.DataFrame(
    {
        "K": k_values,
        "sse": sse_scores,
        "sil": silhoutte_scores,
        "davies": davies_scores,
        "calinski": calinski_harabasz_scores,
    }
)

# Silhouette and Davies-Bouldin share a chart; SSE and Calinski-Harabasz get their own.
for metric_group in (["sse"], ["calinski"], ["sil", "davies"]):
    figure = df.plot(x="K", y=metric_group, kind="line")
    figure.update_traces(mode='lines+markers').show()

In [29]:
# Fit the final K-means model with the chosen number of clusters.
optimal_k = 4
# A fixed random_state keeps cluster ids and sizes stable between runs, so the
# per-cluster analysis further down always refers to the same groups.
kmeans = KMeans(
    n_clusters=optimal_k,
    n_init=10,
    max_iter=100,
    init="k-means++",
    random_state=42,
)
kmeans.fit(df_data)

# Report the scores of the model that was just fitted. Reading them from the
# sweep's score lists would describe a different fit and would fail if
# optimal_k were outside the swept range.
print("SSE:", kmeans.inertia_, " - SILHOUETTE:", silhouette_score(df_data, kmeans.labels_))

SSE: 420.40533710270154  - SILHOUETTE: 0.24405871869984336


In [30]:
# Count how many samples were assigned to each cluster label.
distinct_label_count = len(set(kmeans.labels_))
# Integer bin edges 0..n give one bin per label id.
label_bin_edges = range(0, distinct_label_count + 1)
hist, bins = np.histogram(kmeans.labels_, bins=label_bin_edges)

# zip stops at the shorter sequence, so the final (right-most) edge is dropped.
clust_dict = {left_edge: count for left_edge, count in zip(bins, hist)}
print(clust_dict)

{0: 546, 1: 638, 2: 585, 3: 530}


### Result analysis

In [32]:
# Attach each row's cluster label as a string column (presumably so plots treat it
# as a discrete category), then round to 3 decimals.
cluster_labels = kmeans.labels_.astype(str)
df_players = df_players.assign(cluster=cluster_labels).round(3)

In [33]:
# Pairwise scatter plots of the clustering features, coloured by cluster label.
scatter_matrix_options = dict(dimensions=feautures, color="cluster")
px.scatter_matrix(df_players, **scatter_matrix_options)

In [38]:

# One histogram per column (every column except "name"), faceted by cluster and
# coloured by gender, to see how each attribute is distributed inside the clusters.
columns_to_plot = df_players.columns.drop(["name"]).to_list()
for feature in columns_to_plot:
    figure = px.histogram(
        df_players,
        x=feature,
        facet_col="cluster",
        color=df_players.gender,
    )
    figure.show()

In [37]:
# Show every column of the (wide) per-cluster summary instead of an elided view.
pd.set_option('display.max_columns', None)
# Per-cluster average of every numeric column. numeric_only=True is required on
# pandas >= 2.0, where mean() raises TypeError on non-numeric columns (df_players
# still holds e.g. "name" and "gender") instead of silently dropping them.
df_players.groupby("cluster").mean(numeric_only=True)

Unnamed: 0_level_0,total_tourneys_played,total_matches_played,total_matches_won,matches_won_ratio,mean_performance_index,max_performance_index,min_performance_index,ht,age,mean_minutes,max_minutes,minutes_entropy,mean_rank_points,max_rank_points,last_rank_points,variance_rank_points,mean_tourney_spectators,max_tourney_spectators,mean_tourney_revenue,max_tourney_revenue,rel_ace,rel_df,rel_1stIn,rel_1stWon,rel_2ndWon,1WonOn1In,1WonOnTotWon,rel_ptsWon,rel_bpFaced,rel_bpSaved,rel_gmsWon
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
0,57.305861,119.278388,64.208791,0.421227,0.233938,0.596478,0.001223,172.886076,22.297628,90.249337,154.228938,2.91694,224.963811,425.880601,251.443707,45129.188128,3814.500606,10155.915751,761334.775467,2495299.0,0.02246,0.050531,0.653795,0.381676,0.145668,0.584179,0.72419,0.473872,0.066832,0.475311,0.465764
1,48.667712,100.915361,53.0721,0.457017,0.244519,0.605218,0.005752,184.525714,23.592375,93.528486,176.183386,3.668182,285.060027,564.6691,344.537324,155186.07218,4273.740614,10550.208464,854005.493735,2581004.0,0.068542,0.050378,0.570672,0.400741,0.209577,0.702227,0.65611,0.498997,0.049542,0.567489,0.490165
2,57.054701,121.567521,65.811966,0.478161,0.262291,0.661629,0.0044,183.824859,23.26439,95.20027,186.729915,3.935441,391.193038,747.664844,468.820508,164611.863764,4428.094203,11618.013675,888900.654978,2943637.0,0.057294,0.037703,0.63313,0.431605,0.181306,0.681814,0.703791,0.501562,0.048855,0.575191,0.499229
3,38.971698,78.307547,40.575472,0.358453,0.188626,0.480658,0.001215,176.480769,21.78816,85.828098,140.084906,2.52203,140.117343,274.239045,167.705806,26581.613823,3589.18644,8416.624528,715294.817055,1943040.0,0.036066,0.073264,0.567515,0.345758,0.180057,0.608808,0.656696,0.468375,0.067262,0.465881,0.436472


## Density-based

## Hierarchical

## Optional

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=56da3ab5-e195-41aa-a609-f5fefeb3379d' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>