In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from scipy.cluster.hierarchy import dendrogram
from sklearn.metrics import recall_score, precision_score, f1_score

In [2]:
# Load the tissue dataset and transpose it, so the columns of the file become
# the rows of X (presumably one row per sample -- TODO confirm against the file).
X = pd.read_csv('datasets/dataset_tissue.txt', index_col=0).transpose()

# Per the original author's note, clase.txt holds the tissue type of each sample.
y = pd.read_csv('datasets/clase.txt', index_col=0)

In [3]:
# Rescale every column of X with a default-configured MinMaxScaler
# (feature_range defaults to (0, 1)) before the PCA step.
min_max_scaler = MinMaxScaler()
X_sc = min_max_scaler.fit_transform(X)

In [4]:
# Declare a PCA instance that keeps the first 60 principal components;
# random_state is pinned so any randomized solver gives repeatable results.
pca = PCA(n_components = 60, random_state = 23)

# Fit the PCA on the scaled data and project it onto those components.
X_pca = pca.fit_transform(X_sc)

In [5]:
# Display the PCA-projected array; the output elides the middle rows and columns.
X_pca

array([[ -5.42329124, -12.2683582 ,   9.00111828, ...,   1.14077385,
          0.17201587,   2.48216327],
       [ -8.73275835, -12.59538796,   6.3317751 , ...,  -0.03229582,
         -0.91982159,  -0.86133693],
       [ -9.24502852,  -0.97764565,   8.71379403, ...,  -0.60658221,
         -0.67590603,  -0.4824903 ],
       ...,
       [-13.10002872,  -6.4571055 ,   2.56779968, ...,  -0.4417286 ,
         -0.29748162,  -0.41769577],
       [-14.68833824,   0.01919342,   1.31596615, ...,   0.2277238 ,
          0.19048803,   0.12284142],
       [-16.67453082,   0.2506692 ,   1.01034996, ...,   1.46861863,
         -0.97375503,   1.09870759]])

In [28]:
from sklearn.metrics import silhouette_score

# Grid-search DBSCAN hyper-parameters on the PCA projection and record the
# silhouette score of every configuration that can actually be scored.
list_eps = np.linspace(15, 20, 500)
list_min_samples = [2]
list_silhouette_avg = []  # (silhouette_avg, eps, min_samples) tuples
n_samples = len(X_pca)
for eps in list_eps:
    for min_samples in list_min_samples:
        dbscan = DBSCAN(eps=eps, min_samples=int(min_samples))
        dbscan_labels = dbscan.fit_predict(X_pca)
        # silhouette_score raises ValueError unless
        # 2 <= number of distinct labels <= n_samples - 1, so check both bounds
        # (the upper one is reachable if min_samples=1 is ever added to the grid).
        # NOTE(review): DBSCAN's noise label (-1) is passed through unfiltered,
        # so the score treats the noise points as one more cluster.
        n_labels = len(set(dbscan_labels))
        if 1 < n_labels < n_samples:
            silhouette_avg = silhouette_score(X_pca, dbscan_labels, random_state=23)
            list_silhouette_avg.append((silhouette_avg, eps, min_samples))

# max() raises ValueError on an empty sequence, so report explicitly when no
# configuration produced a scorable clustering instead of crashing the cell.
if list_silhouette_avg:
    print(max(list_silhouette_avg, key=lambda x: x[0]))
else:
    print("No (eps, min_samples) combination produced a scorable clustering.")

(0.354613029095344, 17.024048096192384, 2)


In [17]:
# Manual probe: cluster the PCA projection with eps=19.6 / min_samples=5 and
# show the resulting silhouette score as the cell output.
dbscan = DBSCAN(min_samples=5, eps=19.6).fit(X_pca)
dbscan_labels = dbscan.labels_
# random_state only affects silhouette_score when sample_size is given.
silhouette_avg = silhouette_score(
    X_pca,
    dbscan_labels,
    random_state=23,
)
silhouette_avg

0.30768958873033714

In [26]:
# Manual probe: cluster the PCA projection with eps=17.22 / min_samples=2 and
# show the resulting silhouette score as the cell output.
dbscan = DBSCAN(min_samples=2, eps=17.22).fit(X_pca)
dbscan_labels = dbscan.labels_
# random_state only affects silhouette_score when sample_size is given.
silhouette_avg = silhouette_score(
    X_pca,
    dbscan_labels,
    random_state=23,
)
silhouette_avg

0.35661316581887015

In [7]:
# Cluster the PCA projection with eps=14.2 / min_samples=5 and display the
# per-sample labels; DBSCAN marks noise points with the label -1.
dbscan = DBSCAN(min_samples=5, eps=14.2).fit(X_pca)
dbscan_labels = dbscan.labels_
dbscan_labels

array([-1, -1,  0, -1, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0, -1,  0,  1,
        1, -1,  1, -1, -1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2, -1, -1,  3, -1,  3,  3,  3, -1, -1,  3, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1,  4,  4, -1, -1, -1, -1,  4,  4,  4, -1, -1, -1,  4, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  5, -1,  5,  5,  5,
       -1, -1,  5,  6,  6,  6,  6,  6,  6,  6, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  2, -1,  2, -1, -1, -1, -1,
       -1, -1])

In [11]:
from sklearn.metrics import silhouette_score

# Score the current `dbscan_labels` only when more than one distinct label is
# present, since silhouette_score cannot be computed for a single label.
# NOTE(review): the noise label (-1) counts as a label here and is scored as
# if it were a cluster.
n_distinct_labels = np.unique(dbscan_labels).size
if n_distinct_labels > 1:
    silhouette_avg = silhouette_score(X_pca, dbscan_labels, random_state=23)
    print(f"Silhouette Score: {silhouette_avg}")


Silhouette Score: 0.12355715953656954
