In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

In [6]:
# Read the athlete-level workbook, then discard the 'Athlete' column so the
# frame previewed below no longer carries it.
df = pd.read_excel("athletewise data.xlsx")
df = df.drop(columns='Athlete')
df.head()

Unnamed: 0,PTS,MIN,FTM,PA,FGA,FGM,PM,FTA,OREB,DREB,REB,AST,BLK,STL,TO,PF,Game Score
0,5.0,6.0,0.0,2.0,3.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.9
1,0.0,3.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,-0.7
2,0.0,12.0,0.0,3.0,4.0,0.0,0.0,2.0,1.0,0.0,1.0,2.0,0.0,1.0,4.0,1.0,-4.9
3,4.0,5.0,0.0,1.0,3.0,2.0,0.0,0.0,3.0,0.0,3.0,2.0,0.0,0.0,0.0,1.0,5.8
4,6.639344,24.983607,1.393443,0.262295,5.721311,2.606557,0.032787,2.344262,2.688525,4.852459,7.540984,0.819672,0.262295,0.754098,2.508197,2.147541,4.778689


In [7]:
# Restrict the frame to its float/int columns and standardise them.
# The fitted scaler is kept in `scaler`; the standardised matrix in `X_scaled`.
df_numeric = df.select_dtypes(include=[float, int])
scaler = StandardScaler().fit(df_numeric)
X_scaled = scaler.transform(df_numeric)
print(X_scaled)

[[-0.15755696 -1.23199354 -0.75357133 ...  0.47058423  0.51541143
  -0.6326    ]
 [-1.12474062 -1.54519568 -0.75357133 ... -0.3288583  -1.35276342
  -0.95471039]
 [-1.12474062 -0.60558924 -0.75357133 ...  2.06946931 -0.418676
  -1.80025018]
 ...
 [-0.55110066 -0.64878954 -0.41185117 ... -0.35642529  0.51541143
  -0.58400576]
 [-0.15755696 -0.34458745  1.53332515 ... -0.3288583   0.98245514
  -0.04877491]
 [ 1.12396138  1.13007266  0.10401485 ...  1.36995709  0.74893328
   1.07861147]]


In [8]:
# Candidate DBSCAN neighbourhood radii: 0.1, 0.2, ..., 1.9 (19 values).
# The grid is built from an integer range and divided once, rather than using
# np.arange with a 0.1 step: a fractional step accumulates rounding error
# (e.g. 0.30000000000000004) and can make the number of elements unreliable.
eps_values = np.arange(1, 20) / 10

# Accumulator for one summary record per eps value tried.
results = []

In [9]:
# Sweep DBSCAN over every candidate eps and record, for each setting, the
# number of clusters, the number of noise points and three internal validity
# metrics.
#
# Start from an empty list so that re-running this cell does not append a
# second copy of every row to `results`.
results = []

for eps in eps_values:
    dbscan = DBSCAN(eps=eps)
    labels = dbscan.fit_predict(X_scaled)

    # Label -1 is DBSCAN's noise marker: it is not counted as a cluster and
    # is tallied separately.
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = int(np.sum(labels == -1))

    # The metrics are only computed when at least two real clusters exist.
    # NOTE: `labels` is passed unfiltered, so noise points (-1) take part in
    # the scores as well.
    if n_clusters > 1:
        silhouette_avg = silhouette_score(X_scaled, labels)
        calinski_harabasz_avg = calinski_harabasz_score(X_scaled, labels)
        davies_bouldin_avg = davies_bouldin_score(X_scaled, labels)
    else:
        silhouette_avg = calinski_harabasz_avg = davies_bouldin_avg = None

    results.append({
        'eps': eps,
        'n_clusters': n_clusters,
        'n_noise': n_noise,
        'silhouette_score': silhouette_avg,
        'calinski_harabasz_score': calinski_harabasz_avg,
        'davies_bouldin_score': davies_bouldin_avg
    })

In [10]:
# Tabulate the sweep and pick the eps whose silhouette score is highest.
results_df = pd.DataFrame(results)

# Rows without a silhouette score hold None/NaN. If that is true for every
# row (or there are no rows at all) there is nothing to maximise, so fail
# with a clear message instead of an opaque indexing error.
if results_df.empty or results_df['silhouette_score'].isna().all():
    raise ValueError(
        "silhouette_score is missing for every eps value; cannot select optimal_eps"
    )

# Single .loc[row, column] lookup instead of chained .loc[...][...] indexing.
best_row = results_df['silhouette_score'].astype(float).idxmax()
optimal_eps = results_df.loc[best_row, 'eps']

In [11]:
# Show the sweep table together with the selected eps as the cell output.
(results_df, optimal_eps)

(    eps  n_clusters  n_noise  silhouette_score  calinski_harabasz_score  \
 0   0.1           4     2950         -0.116138                14.342729   
 1   0.2           6     2915         -0.111105                16.045412   
 2   0.3           5     2873         -0.095989                28.637207   
 3   0.4           7     2804         -0.155599                30.791636   
 4   0.5           1     2734               NaN                      NaN   
 5   0.6           2     2655         -0.125616               178.430323   
 6   0.7           2     2523         -0.099837               233.860620   
 7   0.8           4     2391         -0.196907               142.665108   
 8   0.9           3     2245         -0.177061               219.780155   
 9   1.0           3     2125         -0.134837               247.877143   
 10  1.1           5     1990         -0.140013               159.595398   
 11  1.2           5     1815         -0.200847               163.886982   
 12  1.3    

In [12]:
# Refit DBSCAN with the selected eps (fit returns the estimator itself) and
# keep the per-sample cluster labels.
dbscan = DBSCAN(eps=optimal_eps).fit(X_scaled)
labels = dbscan.labels_

In [13]:
# Summarise the final clustering: label -1 is tallied as noise, every other
# distinct label counts as one cluster.
distinct_labels = set(labels)
n_clusters_ = len(distinct_labels - {-1})
n_noise_ = sum(1 for label in labels if label == -1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

Estimated number of clusters: 3
Estimated number of noise points: 1534


In [14]:
# Score the final labelling with the same three metrics used in the sweep,
# then report them in a fixed order.
silhouette_avg = silhouette_score(X_scaled, labels)
calinski_harabasz_avg = calinski_harabasz_score(X_scaled, labels)
davies_bouldin_avg = davies_bouldin_score(X_scaled, labels)

for caption, value in (
    ('Silhouette Score:', silhouette_avg),
    ('Calinski-Harabasz Index:', calinski_harabasz_avg),
    ('Davies-Bouldin Index:', davies_bouldin_avg),
):
    print(caption, value)


Silhouette Score: -0.0034165224006341794
Calinski-Harabasz Index: 273.4336840302565
Davies-Bouldin Index: 2.2629418811567366
