In [1]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.cluster import AffinityPropagation
import pacmap

In [2]:
df_train = pd.read_csv('train_pca.csv', index_col=0)

In [3]:
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
666_benign_train,-319.387866,-417.589250,4530.513977,1587.812063,1724.613794,-836.265050,295.739737,4804.498124,-223.019630,-884.570469,...,-235.482328,-346.888501,-2.202963,-527.121643,-604.879421,-232.817520,-172.143389,-194.769381,-138.814817,214.659891
470_benign_train,3786.186871,7385.170320,-3423.374947,244.308580,-1237.968229,-641.818517,-6559.241272,-2172.843429,-71.508032,1424.216097,...,-1832.688215,-1147.440420,960.988836,1024.001084,985.578526,717.296412,111.221696,246.094942,459.125071,359.366255
1202_benign_train,1775.743097,-13197.367974,-315.281618,-2094.339135,-1365.167897,355.095235,1224.650013,1096.577469,-1675.182939,-1126.957577,...,71.713504,1596.150241,-816.822696,-4942.641970,1688.245246,990.559684,1335.109207,725.536428,1628.957493,-381.065329
1459_benign_train,1809.608105,8722.743196,1102.701736,-530.729117,-7573.707934,2186.368496,24.003336,-1885.511161,1357.574140,528.729628,...,699.068939,1726.878468,187.216435,107.687678,870.637589,-372.794292,-469.972698,-597.000455,-131.268854,75.211831
1333_benign_train,-284.255610,3422.974215,4944.635126,-3024.625020,406.108330,-2730.408195,-962.190251,844.318764,-3214.764537,462.795686,...,-331.891470,-53.540276,-87.874444,1513.821601,-211.438040,411.418638,659.894729,376.454372,-266.999270,-383.643699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
931_malignant_train,15048.262105,-3486.499095,-4523.064804,-1917.063302,563.111557,-811.493906,69.207960,4955.113346,-1146.926414,3249.838483,...,-343.786781,734.537678,729.498036,-482.508675,1047.611907,-847.478485,626.252040,-192.685402,364.304382,575.684121
86_malignant_train,19654.231934,-2400.977333,770.322474,-1308.765088,-763.000060,304.827885,22.257187,-2913.819852,1993.127968,2616.695575,...,316.926830,645.059006,865.266885,-810.991437,-38.891163,16.713776,-305.372573,-979.935952,-1122.071351,335.086426
904_malignant_train,-4607.591585,-4609.970209,-2277.840356,566.724144,2031.172192,2697.108441,-3320.172840,1187.728646,-706.789147,1016.015172,...,-958.514566,-1216.060846,-744.147924,-157.234682,-223.151643,103.474516,-933.654595,1240.385843,-625.034950,-31.006149
224_malignant_train,16014.270872,-2683.225868,-142.451798,-1093.745533,3680.334483,1824.080594,288.183350,-2551.887223,-2450.159063,-2235.294898,...,227.867422,47.949713,481.915880,167.413603,918.142602,448.822613,-139.240465,1514.817388,1305.536416,416.579857


In [4]:
# reduce to 3D
mapper = pacmap.PaCMAP(n_components=3)
X = df_train.values
X_3d = mapper.fit_transform(X)

df_train_3d = pd.DataFrame(X_3d, columns=['x', 'y', 'z'], index=df_train.index)
df_train_3d

Unnamed: 0,x,y,z
666_benign_train,-0.196607,4.759403,5.199811
470_benign_train,1.131784,0.648980,-0.592014
1202_benign_train,-0.132259,-6.247659,-5.685994
1459_benign_train,1.595588,4.218502,2.488258
1333_benign_train,0.147732,5.136787,4.476818
...,...,...,...
931_malignant_train,5.045954,-2.759437,-3.670108
86_malignant_train,6.032472,-2.193804,-2.652862
904_malignant_train,-2.143592,-2.695632,-2.791910
224_malignant_train,5.372513,-2.677425,-2.485770


In [5]:
# use affinity propagation to cluster
af = AffinityPropagation(max_iter=1000).fit(X_3d)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

df_train_3d['cluster'] = labels
df_train_3d['cluster'] = df_train_3d['cluster'].astype(str)

In [10]:
fig = px.scatter_3d(df_train_3d, x='x', y='y', z='z', color='cluster', height=800, width=1200, opacity=0.7,
                    template='plotly_white', color_discrete_sequence=px.colors.qualitative.Dark24, title='Clusters in 3D')
fig.show()

In [7]:
# import clustering metrics
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# silhouette score
silhouette = silhouette_score(X_3d, labels)
print(f"Silhouette Score: {silhouette}")

# davies bouldin score
davies_bouldin = davies_bouldin_score(X_3d, labels)
print(f"Davies Bouldin Score: {davies_bouldin}")

# calinski harabasz score
calinski_harabasz = calinski_harabasz_score(X_3d, labels)
print(f"Calinski Harabasz Score: {calinski_harabasz}")

Silhouette Score: 0.31926289200782776
Davies Bouldin Score: 0.960474711334908
Calinski Harabasz Score: 3084.3771674158133
