<a href="https://colab.research.google.com/github/Mahnazshamissa/Python/blob/main/SpectralClustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralClustering
from sklearn.preprocessing import LabelEncoder

In [18]:
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']

In [23]:
abo_path = '/content/iris.data'
p_df_raw = pd.read_csv(abo_path , names=column_names)

In [24]:
p_df_raw.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [25]:
p_df_raw.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
class           0
dtype: int64

In [26]:
df_X = p_df_raw.drop(['class'],axis=1)
df_y = LabelEncoder().fit_transform(list(p_df_raw["class"]))

In [27]:
df_y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [28]:
clustering = SpectralClustering(n_clusters=2, assign_labels="discretize", random_state=0, n_init=20).fit(df_X)

In [29]:
clustering.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [30]:
clustering.affinity_matrix_

array([[1.00000000e+00, 7.48263568e-01, 7.71051586e-01, ...,
        2.30082059e-09, 4.03840951e-10, 3.59908895e-08],
       [7.48263568e-01, 1.00000000e+00, 9.13931185e-01, ...,
        1.62136087e-09, 2.15082380e-10, 3.22418674e-08],
       [7.71051586e-01, 9.13931185e-01, 1.00000000e+00, ...,
        3.65410404e-10, 6.16221335e-11, 9.42405852e-09],
       ...,
       [2.30082059e-09, 1.62136087e-09, 3.65410404e-10, ...,
        1.00000000e+00, 6.83861409e-01, 6.63650250e-01],
       [4.03840951e-10, 2.15082380e-10, 6.16221335e-11, ...,
        6.83861409e-01, 1.00000000e+00, 5.54327285e-01],
       [3.59908895e-08, 3.22418674e-08, 9.42405852e-09, ...,
        6.63650250e-01, 5.54327285e-01, 1.00000000e+00]])

# **Internal Measures**

In [31]:
from sklearn import metrics

In [32]:
# Perfect labelings are both homogeneous and complete, hence have score 1.0:
# Labelings that assign all classes members to the same clusters are complete be not homogeneous
# Labelings that have pure clusters with members coming from the same classes are homogeneous but un-necessary splits harms completeness
# If classes members are completely split across different clusters, the assignment is totally incomplete

sill = metrics.silhouette_score(df_X, clustering.labels_, metric='euclidean', sample_size=None)  ### best score is 1
db = metrics.davies_bouldin_score(df_X, clustering.labels_) ### best score is 0
sill, db

(0.6863930543445408, 0.3835952094491398)

# **External Measures**

In [33]:
# A clustering result satisfies homogeneity if all of its clusters contain only data points which are members of a single class.
# A clustering result satisfies completeness if all the data points that are members of a given class are elements of the same cluster.
# Both scores have positive values between 0.0 and 1.0, larger values being desirable.
# The V-measure is the harmonic mean between homogeneity and completeness:

In [34]:
print('V_score')
print(metrics.homogeneity_completeness_v_measure(df_y, clustering.labels_))
print('Accuracy')
print(metrics.accuracy_score(df_y, clustering.labels_))
print('Confusion Matrix')
print(metrics.confusion_matrix(df_y, clustering.labels_))
print('Confusion Report')
print(metrics.classification_report(df_y, clustering.labels_))
print('f1 score')
print(metrics.f1_score(df_y, clustering.labels_, average='weighted'))

V_score
(0.5793801642856945, 0.9999999999999997, 0.7336804366512104)
Accuracy
0.0
Confusion Matrix
[[ 0 50  0]
 [50  0  0]
 [50  0  0]]
Confusion Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      50.0
           1       0.00      0.00      0.00      50.0
           2       0.00      0.00      0.00      50.0

    accuracy                           0.00     150.0
   macro avg       0.00      0.00      0.00     150.0
weighted avg       0.00      0.00      0.00     150.0

f1 score
0.0


  _warn_prf(average, modifier, msg_start, len(result))


# **Grid Parameter Search**

In [35]:
internal_measures = []
for k in range(2, 35):
    clustering = SpectralClustering(n_clusters=k, assign_labels="discretize", random_state=0, n_init=20).fit(df_X)
    v_measure = metrics.homogeneity_completeness_v_measure(df_y, clustering.labels_)[2]
    silhouette = metrics.silhouette_score(df_X, clustering.labels_, metric='euclidean', sample_size=None)
    internal_measures.append([k,v_measure,silhouette])

In [36]:
internal_measures

[[2, 0.7336804366512104, 0.6863930543445408],
 [3, 0.7660355440487252, 0.5509296349732906],
 [4, 0.6924615173878584, 0.4996636428548515],
 [5, 0.6939910028645424, 0.4794165195023188],
 [6, 0.636835549420609, 0.348964988694922],
 [7, 0.6234775202002705, 0.3299227793501307],
 [8, 0.6576810642193917, 0.3453464363967509],
 [9, 0.6352591349436703, 0.34578931440110533],
 [10, 0.6168057548200682, 0.33834722297210473],
 [11, 0.5736386908866958, 0.25643206047972517],
 [12, 0.5965101989376439, 0.3073868680464209],
 [13, 0.5530818524717108, 0.2601607439287929],
 [14, 0.5315213011413568, 0.29115540610886353],
 [15, 0.515277067299817, 0.2447288708644159],
 [16, 0.5135299594018452, 0.24039682401597826],
 [17, 0.49927062463674154, 0.23512643218503418],
 [18, 0.5110443590974424, 0.19832586685219544],
 [19, 0.4962968212499585, 0.24092544415415665],
 [20, 0.47327759652986257, 0.21787514662741414],
 [21, 0.49719572014961194, 0.23513462173633215],
 [22, 0.4721159897967986, 0.23206861442516621],
 [23, 0.47

In [37]:
max(internal_measures[:1])

[2, 0.7336804366512104, 0.6863930543445408]

In [38]:
max_v_measure =np.argmax(internal_measures[:1])
optimum_clusters = internal_measures[max_v_measure][0]
optimum_clusters
clustering = SpectralClustering(n_clusters=2, assign_labels="discretize", random_state=0, n_init=20).fit(df_X)

In [39]:
print('V_score')
print(metrics.homogeneity_completeness_v_measure(df_y, clustering.labels_))
print('Accuracy')
print(metrics.accuracy_score(df_y, clustering.labels_))
print('Confusion Matrix')
print(metrics.confusion_matrix(df_y, clustering.labels_))
print('Confusion Report')
print(metrics.classification_report(df_y, clustering.labels_))
print('f1 score')
print(metrics.f1_score(df_y, clustering.labels_, average='weighted'))

V_score
(0.5793801642856945, 0.9999999999999997, 0.7336804366512104)
Accuracy
0.0
Confusion Matrix
[[ 0 50  0]
 [50  0  0]
 [50  0  0]]
Confusion Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      50.0
           1       0.00      0.00      0.00      50.0
           2       0.00      0.00      0.00      50.0

    accuracy                           0.00     150.0
   macro avg       0.00      0.00      0.00     150.0
weighted avg       0.00      0.00      0.00     150.0

f1 score
0.0


  _warn_prf(average, modifier, msg_start, len(result))
