# Clustering

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer
from sklearn.decomposition import PCA

# KMEANS
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from yellowbrick.cluster.elbow import KElbowVisualizer 
from yellowbrick.cluster import silhouette_visualizer, intercluster_distance 

# DBSCAN
from sklearn.cluster import DBSCAN
from scipy.spatial.distance import pdist, squareform
from sklearn.neighbors import NearestNeighbors

# Visualization
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pd.options.plotting.backend = "plotly"
pio.templates.default = "seaborn"
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)

In [None]:
# Load the players table; the first CSV column becomes the DataFrame index.
df_players = pd.read_csv("./datasets/players.csv", index_col=0)

## Features preparation

In [None]:
# Columns of df_players used as clustering features.
# The variable name is misspelled, but every later cell refers to it as `feautures`.
feautures = [
    'lrpOnMxrp',
    'matches_won_ratio',
    'mean_rank_points',
    'variance_rank_points',
    'max_tourney_revenue',
]

# Other feature sets that were tried:
#feautures = ['lrpOnAvgrp', 'lrpOnMxrp', 'variance_rank_points', 'max_tourney_revenue'] # Sil:  0.5278228177769032
#['lrpOnAvgrp', 'lrpOnMxrp', 'matches_won_ratio', 'mean_rank_points', 'variance_rank_points', 'mean_tourney_spectators', 'max_tourney_revenue', 'rel_ptsWon']
#['lrpOnAvgrp', 'lrpOnMxrp', 'variance_rank_points', 'max_tourney_revenue']
#['matches_won_ratio', 'mean_rank_points', 'mean_tourney_spectators', 'mean_tourney_revenue'] -> Silhouette score

# Per-feature histograms (disabled)
# for feature in feautures:
#     df_players[feature].hist().show()

### Transformations

In [None]:
# Work on a copy of df_players in which the two rank-point columns are
# log-transformed (plain log for the mean, log1p for the variance).
df = df_players.assign(
    mean_rank_points=np.log(df_players['mean_rank_points']),
    variance_rank_points=np.log1p(df_players['variance_rank_points']),
)

# Show the distribution of each transformed column.
for transformed_column in ('mean_rank_points', 'variance_rank_points'):
    df[transformed_column].hist().show()

### Normalization

In [None]:
# Min-max scale the selected (transformed) features and keep three decimals.
selected_features = df[feautures].reset_index(drop=True)
scaled_values = MinMaxScaler().fit_transform(selected_features)
df_data = pd.DataFrame(scaled_values, columns=selected_features.columns).round(3)

# Alternative scalers / outlier filters that were tried:
#df_data = pd.DataFrame(RobustScaler(unit_variance=True).fit_transform(df_data), columns=df_data.columns)
#df_data = df_data[(np.abs(stats.zscore(df_data)) < 2).all(axis=1)]
#df_data = pd.DataFrame(QuantileTransformer().fit_transform(df_data), columns=df_data.columns)

df_data.boxplot(column=feautures)

## PCA

In [None]:
# # PCA
# pca = PCA(n_components=2)
# components = pca.fit_transform(df_data)
# components_df = pd.DataFrame(components)

# # Cluster on PCA
# model = KMeans()
# sse_visualizer = KElbowVisualizer(model, k=(2,8), timings=False)
# sse_visualizer.fit(components_df.iloc[:,:3])

# kmeans = KMeans(n_clusters=sse_visualizer.elbow_value_, n_init=10, max_iter=100, init="k-means++")
# kmeans.fit(components_df.iloc[:,:3])

# px.bar(x=range(pca.n_components), y=pca.explained_variance_ratio_, title="Explained variances").show()
# px.scatter(x=components_df[0], y=components_df[1], color=kmeans.labels_.astype(str))

In [None]:
# PCA: project the scaled features onto three principal components.
pca = PCA(n_components=3)
components = pca.fit_transform(df_data)
components_df = pd.DataFrame(components)

# Cluster on PCA: pick k with the elbow method, then fit k-means on all
# three components. The estimators are seeded so that re-running the cell
# gives the same elbow and the same labels.
model = KMeans(random_state=42)
sse_visualizer = KElbowVisualizer(model, k=(2,8), timings=False)
sse_visualizer.fit(components_df)

kmeans = KMeans(n_clusters=sse_visualizer.elbow_value_, n_init=10, max_iter=100, init="k-means++", random_state=42)
kmeans.fit(components_df)

px.bar(x=range(pca.n_components), y=pca.explained_variance_ratio_, title="Explained variances").show()
px.scatter_3d(x=components_df[0], y=components_df[1], z=components_df[2], color=kmeans.labels_.astype(str))

## K-means

### Find Optimal K

In [None]:
# Seed the estimator so the elbow (and therefore the K picked from it)
# does not change between runs.
model = KMeans(random_state=42)

# Elbow on the default distortion metric.
sse_visualizer = KElbowVisualizer(model, k=(2,8), timings=False)
sse_visualizer.fit(df_data)
sse_visualizer.show()

# Same search scored with the mean silhouette coefficient.
sil_visualizer = KElbowVisualizer(model, k=(2,8), timings=False, metric="silhouette")
sil_visualizer.fit(df_data)
sil_visualizer.show()

Picking optimal K

In [None]:
optimal_k = sse_visualizer.elbow_value_
if optimal_k is None:
    # The elbow search can fail to find a knee; fail with a clear message
    # instead of passing None on to KMeans.
    raise ValueError("KElbowVisualizer found no elbow; set optimal_k manually.")

# Seeded so the cluster labels are reproducible across runs.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, max_iter=100, init="k-means++", random_state=42)
kmeans.fit(df_data)

df_players["cluster_kmeans"] = kmeans.labels_.astype(str)

# Score the model that produced the labels above rather than a second,
# independently fitted KMeans, so the reported silhouette matches the clusters.
silhouette_viz = silhouette_visualizer(kmeans, df_data)
print("The silhoutte score is: " + str(silhouette_viz.silhouette_score_))

In [None]:
# Yellowbrick inter-cluster distance plot for the k-means model on the scaled features.
# NOTE(review): df_data is reassigned in the density-based section below, so this
# cell presumably has to run before that one — confirm when re-running out of order.
intercluster_distance(kmeans, df_data)

### Result analysis

#### Cluster distribution

In [None]:
# Number of players (non-null names) in each k-means cluster.
df_players.groupby("cluster_kmeans")["name"].count().plot.bar()

#### Interpretation

In [None]:
# Per-cluster mean of every numeric column. numeric_only=True is required on
# pandas >= 2.0, where averaging the string columns (e.g. "name") raises TypeError.
df_players.groupby("cluster_kmeans").mean(numeric_only=True)

In [None]:
print('Most frequent values per cluster')

# Group once, then look up each cluster by its string label.
players_by_cluster = df_players.groupby(by='cluster_kmeans')

out_dict = {}
for cluster in range(0, optimal_k):
    members = players_by_cluster.get_group(str(cluster))
    # Most frequent value of every column within this cluster.
    out_dict[cluster] = {
        col: members[col].value_counts().idxmax() for col in members.columns
    }

pd.DataFrame(out_dict)

In [None]:
# Pairwise scatter plots of the clustering features, coloured by k-means cluster.
px.scatter_matrix(df_players, dimensions=feautures, color="cluster_kmeans")

In [None]:
# TODO - show only the most significant features
# One histogram per column (all except "name"), split into one facet per
# k-means cluster and coloured by gender.
for feature in df_players.columns.drop("name"):
    px.histogram(df_players, x=feature, facet_col="cluster_kmeans", color=df_players.gender).show()

## Density-based

In [None]:
# Standardise the clustering features for DBSCAN and keep three decimals.
# NOTE(review): this reads the features from df_players, i.e. without the log
# transforms stored in `df` that the k-means section used — confirm this is intended.
unscaled_features = df_players[feautures].reset_index(drop=True)
standardised = StandardScaler().fit_transform(unscaled_features)
df_data = pd.DataFrame(standardised, columns=unscaled_features.columns).round(3)

df_data.boxplot(column=feautures)

In [None]:
# Pairwise Euclidean distances between all records, expanded from the condensed
# vector returned by pdist into a square (n x n) matrix.
dist = squareform(pdist(X=df_data, metric='euclidean'))

kmin, kmax = 3, 30

# Sorting every row once puts the distance from each record to its k-th nearest
# neighbour in column k (column 0 is the zero distance of a record to itself).
sorted_dist = np.sort(dist, axis=1)
kth_distances = {k: sorted_dist[:, k] for k in range(kmin, kmax + 1)}

# k-distance plot: one line per k, records ordered by increasing k-th NN distance.
fig = go.Figure()
for k, k_dist in kth_distances.items():
    fig.add_trace(go.Scatter(x=np.arange(len(k_dist)), y=np.sort(k_dist), mode='lines', name=str(k)))

fig.update_layout(
    title="k-distance plot",
    xaxis_title="Records sorted by k-th nearest-neighbour distance",
    yaxis_title="Distance to k-th nearest neighbour",
    legend_title="k",
)
fig.show()

In [None]:
def get_metrics(eps, min_samples, dataset, iter_, n_neighbors=6):
    """Fit DBSCAN for one (eps, min_samples) pair, log and return two metrics.

    Parameters
    ----------
    eps : float
        ``eps`` value passed to ``DBSCAN``.
    min_samples : int
        ``min_samples`` value passed to ``DBSCAN``.
    dataset : DataFrame or dense array-like of shape (n_samples, n_features)
        Data to cluster.
    iter_ : int
        Progress counter; only used in the printed log line.
    n_neighbors : int, default=6
        Number of neighbours queried for every noise point. The first returned
        distance (the point to itself) is dropped, so ``n_neighbors - 1``
        distances per noise point enter the mean. Clamped to the number of
        samples so small datasets do not raise.

    Returns
    -------
    tuple
        ``(noise_mean_distance, number_of_clusters)``. ``noise_mean_distance``
        is the mean neighbour distance of the noise points rounded to three
        decimals, or ``None`` when DBSCAN labelled no point as noise (or the
        dataset is too small to have any neighbour).
    """
    # One dense array is used for both estimators so that the neighbour query
    # below can be restricted to the noise rows with a boolean mask.
    data = np.asarray(dataset)

    # Fitting ======================================================================

    dbscan_model_ = DBSCAN(eps=eps, min_samples=min_samples)
    dbscan_model_.fit(data)
    labels = dbscan_model_.labels_

    # Mean Noise Point Distance metric =============================================

    noise_mask = labels == -1
    n_query = min(n_neighbors, len(data))

    if noise_mask.any() and n_query > 1:
        neighbours = NearestNeighbors(n_neighbors=n_query).fit(data)
        # Only the noise points are queried; the other rows were never used.
        distances, _ = neighbours.kneighbors(data[noise_mask])
        noise_mean_distance = round(distances[:, 1:].mean(), 3)
    else:
        noise_mean_distance = None

    # Number of found Clusters metric ==============================================

    # Label -1 marks noise and is not counted as a cluster.
    number_of_clusters = len(set(labels[labels >= 0]))

    # Log ==========================================================================

    print("%3d | Tested with eps = %3s and min_samples = %3s | %5s %4s" % (iter_, eps, min_samples, str(noise_mean_distance), number_of_clusters))

    return noise_mean_distance, number_of_clusters

### Find optimal hyper-parameters

In [None]:
# Hyper-parameter grid: eps in steps of 0.1, odd min_samples values.
eps_to_test = [round(eps, 1) for eps in np.arange(0.1, 3, 0.1)]
min_samples_to_test = range(3, 30, 2)

print("EPS:", eps_to_test)
print("MIN_SAMPLES:", list(min_samples_to_test))

# Result table for the mean distance of the noise points from their nearest
# neighbours: one row per eps, one column per min_samples, initialised to 0.
results_noise = pd.DataFrame(
    data=np.zeros((len(eps_to_test), len(min_samples_to_test))),
    index=eps_to_test,
    columns=min_samples_to_test,
)

# Result table for the number of clusters found, same layout.
results_clusters = pd.DataFrame(
    data=np.zeros((len(eps_to_test), len(min_samples_to_test))),
    index=eps_to_test,
    columns=min_samples_to_test,
)

print("ITER| INFO%s |  DIST    CLUS" % (" "*39))
print("-"*65)

# Every (eps, min_samples) combination, eps varying slowest.
parameter_grid = [
    (eps, min_samples)
    for eps in eps_to_test
    for min_samples in min_samples_to_test
]

for iter_, (eps, min_samples) in enumerate(parameter_grid, start=1):
    # Compute the metrics for this combination
    noise_metric, cluster_metric = get_metrics(eps, min_samples, df_data, iter_)

    # Store the results in the corresponding tables
    results_noise.loc[eps, min_samples] = noise_metric
    results_clusters.loc[eps, min_samples] = cluster_metric

In [None]:
# Keep only the grid cells where exactly three clusters were found;
# everything else is masked out of both heat maps.
sm = results_clusters == 3  # & (results_clusters <= 5)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

heatmap_panels = (
    (results_noise, ax1, "METRIC: Mean Noise Points Distance"),
    (results_clusters, ax2, "METRIC: Number of clusters"),
)
for panel_data, panel_ax, panel_title in heatmap_panels:
    sns.heatmap(panel_data[sm], annot=True, ax=panel_ax, cbar=False).set_title(panel_title)
    panel_ax.set_xlabel("N")
    panel_ax.set_ylabel("EPSILON")

plt.tight_layout()
plt.show()

### Result analysis

In [None]:
# NOTE(review): eps=1 / min_samples=3 are hard-coded; presumably read off the
# grid-search heat maps — confirm.
dbscan = DBSCAN(eps=1, min_samples=3).fit(df_data)

# Cluster labels (-1 is DBSCAN's noise label) and how many records carry each.
results = np.unique(dbscan.labels_, return_counts=True)
print(f"Clusters labels: {results[0]}\nElements per cluster: {results[1]}")

df_players["cluster_dbscan"] = dbscan.labels_.astype(str)
df_players = df_players.round(3)

# Per-cluster mean of every numeric column. numeric_only=True is required on
# pandas >= 2.0, where averaging the string columns (e.g. "name") raises TypeError.
df_players.groupby("cluster_dbscan").mean(numeric_only=True)

## Hierarchical

## Optional

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=56da3ab5-e195-41aa-a609-f5fefeb3379d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>