# Clustering

In [57]:
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from yellowbrick.cluster.elbow import KElbowVisualizer
from yellowbrick.cluster import silhouette_visualizer, intercluster_distance

# Visualization
import plotly.express as px
import plotly.io as pio
pd.options.plotting.backend = "plotly"
pio.templates.default = "seaborn"

In [58]:
df_players = pd.read_csv("./datasets/players.csv", index_col=0)
df_players = df_players[df_players.gender == "f"]

In [59]:
feautures = ["rel_df", "rel_1stIn", "rel_2ndWon", "1WonOn1In", "rel_bpSaved"]
df_data = df_players[feautures].reset_index(drop=True)
# df_data = pd.DataFrame(StandardScaler().fit_transform(df_data), columns=df_data.columns)
#df_data = pd.DataFrame(RobustScaler(unit_variance=True).fit_transform(df_data), columns=df_data.columns)
#df_data = df_data[(np.abs(stats.zscore(df_data)) < 2).all(axis=1)]
df_data = pd.DataFrame(QuantileTransformer().fit_transform(df_data), columns=df_data.columns)
df_data = df_data.round(3)

df_data.boxplot(column=feautures)


n_quantiles (1000) is greater than the total number of samples (714). n_quantiles is set to n_samples.



In [60]:
df_data

Unnamed: 0,rel_df,rel_1stIn,rel_2ndWon,1WonOn1In,rel_bpSaved
0,0.750,0.234,0.694,0.393,0.087
1,0.295,0.302,0.213,0.886,0.369
2,0.743,0.248,0.850,0.729,0.931
3,0.042,0.526,0.649,0.830,0.849
4,0.620,0.290,0.668,0.816,0.499
...,...,...,...,...,...
709,0.530,0.874,0.168,0.450,0.412
710,0.748,0.285,0.607,0.112,0.942
711,0.310,0.292,0.795,0.576,0.896
712,0.785,0.017,0.227,0.233,0.060


In [140]:
# Create a PCA instance: pca
pca = PCA(n_components=5)
principalComponents = pca.fit_transform(df_data)
# Plot the explained variances
features = range(pca.n_components_)
px.bar(x=features, y=pca.explained_variance_ratio_)

# Save components to a DataFrame
PCA_components = pd.DataFrame(principalComponents)

px.scatter(x=PCA_components[0], y=PCA_components[1])

## K-means

### Find Optimal K

In [None]:
model = KMeans()
sse_visualizer = KElbowVisualizer(model, k=(2,12), timings=False)
sse_visualizer.fit(df_data)
sse_visualizer.show()

silhouette_visualizer = KElbowVisualizer(model, k=(2,12), timings=False, metric="silhouette")
silhouette_visualizer.fit(df_data)
silhouette_visualizer.show()

calinski_visualizer = KElbowVisualizer(model, k=(2,12), metric='calinski_harabasz', timings=False)
calinski_visualizer.fit(df_data)
calinski_visualizer.show()

In [None]:
silhouette_visualizer(KMeans(5, random_state=42), df_data, colors='yellowbrick')

In [None]:
intercluster_distance(kmeans, df_data)

In [61]:
# sse_scores = list()
# silhoutte_scores = list()
# davies_scores = list()
# calinski_harabasz_scores = list()

# max_k = 30
# for k in range(2, max_k + 1):
#     kmeans = KMeans(n_clusters=k, n_init=10, max_iter=100, init="k-means++")
#     kmeans.fit(df_data)    
    
#     # Sum of squared distances of samples to their closest cluster center
#     sse_scores.append(kmeans.inertia_)
#     davies_scores.append(davies_bouldin_score(df_data, kmeans.labels_))
#     silhoutte_scores.append(silhouette_score(df_data, kmeans.labels_))
#     calinski_harabasz_scores.append(calinski_harabasz_score(df_data, kmeans.labels_))

# df = pd.DataFrame({"K": list(range(2, max_k + 1)), "sse": sse_scores, "sil": silhoutte_scores, "davies": davies_scores, "calinski": calinski_harabasz_scores})
# df.plot(x="K", y=["sse"], kind="line").update_traces(mode='lines+markers').show()
# df.plot(x="K", y=["calinski"], kind="line").update_traces(mode='lines+markers').show()
# df.plot(x="K", y=["sil", "davies"], kind="line").update_traces(mode='lines+markers').show()


In [142]:
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, n_init=10, max_iter=100, init="k-means++")
kmeans.fit(df_data)
# print("SSE:",sse_scores[optimal_k - 2]," - SILHOUETTE:",silhoutte_scores[optimal_k - 2])

df_players["cluster"] = kmeans.labels_.astype(str)
df_players = df_players.round(3)

### Result analysis

Cluster distribution

In [None]:
df_players.groupby("cluster").count()["name"].plot.bar()

#### Interpretatation

In [102]:
px.scatter_matrix(df_players,
    dimensions=feautures,
    color="cluster")

In [None]:

print(f'Most frequent values per cluster')
out_dict = {}
for cluster in range(0, optimal_k):
    temp_df = df_players.groupby(by='cluster').get_group(str(cluster))
    temp_dict = {}

    for col in temp_df.columns:
        temp_dict[col] = temp_df[col].value_counts().idxmax()
    out_dict[cluster] = temp_dict

pd.DataFrame(out_dict)

In [103]:

for feature in df_players.columns.drop(["name"]).to_list():
  px.histogram(df_players, x=feature, facet_col="cluster", color=df_players.gender).show()

In [69]:
pd.set_option('display.max_columns', None)
df_players.groupby("cluster").agg(["mean", "std"])

Unnamed: 0_level_0,total_tourneys_played,total_tourneys_played,total_matches_played,total_matches_played,total_matches_won,total_matches_won,matches_won_ratio,matches_won_ratio,mean_performance_index,mean_performance_index,max_performance_index,max_performance_index,min_performance_index,min_performance_index,ht,ht,age,age,mean_minutes,mean_minutes,max_minutes,max_minutes,minutes_entropy,minutes_entropy,mean_rank_points,mean_rank_points,max_rank_points,max_rank_points,last_rank_points,last_rank_points,variance_rank_points,variance_rank_points,mean_tourney_spectators,mean_tourney_spectators,max_tourney_spectators,max_tourney_spectators,mean_tourney_revenue,mean_tourney_revenue,max_tourney_revenue,max_tourney_revenue,rel_ace,rel_ace,rel_df,rel_df,rel_1stIn,rel_1stIn,rel_1stWon,rel_1stWon,rel_2ndWon,rel_2ndWon,1WonOn1In,1WonOn1In,1WonOnTotWon,1WonOnTotWon,rel_ptsWon,rel_ptsWon,rel_bpFaced,rel_bpFaced,rel_bpSaved,rel_bpSaved,rel_gmsWon,rel_gmsWon
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2,Unnamed: 58_level_2,Unnamed: 59_level_2,Unnamed: 60_level_2,Unnamed: 61_level_2,Unnamed: 62_level_2
0,82.269461,39.580699,172.095808,87.193472,93.275449,51.407264,0.510251,0.113088,0.300156,0.089672,0.786347,0.178742,0.0,0.0,170.45,6.708008,21.988701,4.519976,92.008569,13.667401,156.473054,37.564212,2.869353,1.113891,194.294814,217.364093,384.879988,445.729787,243.790952,302.94704,20748.186281,61628.12,3569.118653,1049.095111,11590.982036,5676.549156,710149.491856,202266.098055,2914981.0,1720064.0,0.013485,0.009057,0.049389,0.018296,0.670317,0.034057,0.369934,0.034035,0.133641,0.019371,0.55191,0.041607,0.735072,0.03467,0.468695,0.03276,0.07121,0.010937,0.453024,0.050921,0.505072,0.06243
1,84.757143,37.310325,188.609524,86.659151,107.695238,54.143991,0.556386,0.081807,0.334019,0.074373,0.842871,0.142042,0.0,0.0,175.821429,5.665709,21.815867,4.845711,94.669281,8.080995,169.033333,32.539322,3.790214,1.313698,597.0529,818.64044,1269.999886,1699.857382,850.213848,1342.937635,306254.57311,1045453.0,4572.610714,1453.269798,14561.457143,4815.616531,909523.85629,281665.875678,3794302.0,1534021.0,0.045329,0.020583,0.064576,0.023753,0.57469,0.031341,0.373748,0.028369,0.19051,0.014548,0.651167,0.034584,0.662486,0.026969,0.502857,0.020407,0.0596,0.008857,0.5331,0.045232,0.528552,0.032165
2,86.942529,37.671797,190.965517,86.895851,108.166667,53.514741,0.548316,0.091839,0.322897,0.074269,0.813144,0.133672,0.0,0.0,172.369231,5.964921,22.650828,4.485358,95.975557,10.032858,177.626437,39.577006,3.831891,1.410053,643.192057,911.761237,1218.212437,1630.456823,628.375644,832.582828,227399.20704,612442.7,4549.455644,1477.200214,13990.643678,5512.952188,906324.406523,287674.111757,3670944.0,1641750.0,0.027626,0.014591,0.045236,0.015284,0.64527,0.028228,0.398299,0.024419,0.159701,0.017552,0.617902,0.030193,0.714207,0.02839,0.502897,0.024945,0.061724,0.00945,0.53854,0.046666,0.526011,0.03873
3,64.171779,39.93878,131.90184,87.97083,70.687117,52.4438,0.484509,0.138989,0.284926,0.110682,0.728147,0.211129,0.0,0.0,172.5,6.654155,20.912951,4.212326,87.396491,13.323308,141.865031,33.074461,2.430166,0.889051,138.168693,114.586255,253.901215,192.925444,150.946988,147.096041,11036.497828,17668.12,3216.970706,1108.195521,9658.134969,5742.072458,642034.493957,220150.747776,2228940.0,1527345.0,0.024012,0.014874,0.077982,0.02974,0.582258,0.035275,0.332503,0.036646,0.164264,0.022689,0.570546,0.049548,0.668534,0.037812,0.463914,0.038378,0.072252,0.013317,0.439945,0.06497,0.492294,0.078353


## Density-based

## Hierarchical

## Optional

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=56da3ab5-e195-41aa-a609-f5fefeb3379d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>