Połączenie z Google Drive

In [1]:
# Mount Google Drive into the Colab runtime at '/content/drive'; the CSV paths
# used later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Niezbędne importy

In [2]:
import sklearn
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import adjusted_rand_score, davies_bouldin_score
from scipy.spatial.distance import cdist

Wczytanie danych

In [3]:
# Feature matrix used for clustering (read as-is from the embeddings CSV).
embeddings_path = '/content/drive/My Drive/data_sety/normalized_word2vec_embeddings.csv'
data_embeddings = pd.read_csv(embeddings_path)

# Source data set; only its 'label' column is used, as the ground truth for ARI.
original_data_path = '/content/drive/My Drive/data_sety/ready_data_set.csv'
original_data = pd.read_csv(original_data_path)
labels_true = original_data['label']

# The labels are matched to the embeddings purely by row position, so a
# different number of rows would silently pair samples with the wrong labels.
if len(labels_true) != len(data_embeddings):
    raise ValueError(
        "Row count mismatch: %d embedding rows vs %d labels; the two files "
        "must describe the same samples in the same order."
        % (len(data_embeddings), len(labels_true))
    )


Funkcja obliczająca indeks Dunna

In [4]:
def dunn_index(X, labels):
    """Compute the Dunn index of a clustering.

    The Dunn index is the smallest Euclidean distance between two points that
    belong to different clusters, divided by the largest cluster diameter
    (the largest Euclidean distance between two points of the same cluster).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Sample coordinates (NumPy array, DataFrame or nested list).
    labels : array-like of shape (n_samples,)
        Cluster label assigned to each row of ``X``.

    Returns
    -------
    numpy.float64
        ``min inter-cluster distance / max cluster diameter``. If every
        cluster has zero diameter the division follows NumPy floating-point
        rules (``inf`` or ``nan``).

    Raises
    ------
    ValueError
        If fewer than two distinct clusters are present in ``labels``.
    """
    points = np.asarray(X)
    labels = np.asarray(labels)

    # Split the samples once per cluster instead of building the full
    # n x n distance matrix and slicing it repeatedly.
    clusters = [points[labels == cluster_id] for cluster_id in np.unique(labels)]
    if len(clusters) < 2:
        raise ValueError(
            "dunn_index requires at least 2 clusters, got %d" % len(clusters)
        )

    # Diameter of a cluster = largest pairwise distance among its members.
    diameters = [
        cdist(members, members, 'euclidean').max() for members in clusters
    ]

    # Distances are symmetric, so each unordered pair of clusters is visited once.
    separations = [
        cdist(clusters[a], clusters[b], 'euclidean').min()
        for a in range(len(clusters))
        for b in range(a + 1, len(clusters))
    ]

    return np.min(separations) / np.max(diameters)

Przygotowanie df do zbierania wyników i sklasteryzowanych danych

In [5]:
# Per-fold metric records; one dict is appended for every cross-validation fold.
results_list = list()
# Empty placeholder frame for the final results table.
results_df = pd.DataFrame()

In [6]:
# Accumulators for the metrics measured on the test split of each fold.
silhouette_scores_test, db_scores_test, ari_scores_test, dunn_scores_test = [], [], [], []

In [7]:
# Accumulators for the metrics measured on the training split of each fold.
silhouette_scores_train, db_scores_train, ari_scores_train, dunn_scores_train = [], [], [], []

Klasteryzacja z wykorzystaniem 5-krotnej walidacji krzyżowej

In [None]:
# Re-create every accumulator here so that re-running this cell on its own does
# not append a second copy of the fold results to lists left over in the kernel.
results_list = []
silhouette_scores_test, db_scores_test, ari_scores_test, dunn_scores_test = [], [], [], []
silhouette_scores_train, db_scores_train, ari_scores_train, dunn_scores_train = [], [], [], []

data_embeddings_array = data_embeddings.to_numpy()

# 5-fold split with shuffling; the fixed seed keeps the folds reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold_number, (train_index, test_index) in enumerate(kf.split(data_embeddings_array), start=1):
    X_train, X_test = data_embeddings.iloc[train_index], data_embeddings.iloc[test_index]
    # KFold yields positional indices, so the labels are selected positionally
    # too (plain [] indexing would look the integers up as index labels).
    y_train, y_test = labels_true.iloc[train_index], labels_true.iloc[test_index]

    # K-means clustering with two clusters, fitted on the training split only.
    # NOTE(review): n_init is left at the installed scikit-learn default, which
    # differs between releases - pin it if results must match across environments.
    kmeans = KMeans(n_clusters=2, random_state=42)
    kmeans.fit(X_train)

    # Assign every sample to the nearest learned cluster centre.
    labels_pred_train = kmeans.predict(X_train)
    labels_pred_test = kmeans.predict(X_test)

    # Metrics on the test split.
    silhouette_test = silhouette_score(X_test, labels_pred_test)
    db_index_test = davies_bouldin_score(X_test, labels_pred_test)
    ari_test = adjusted_rand_score(y_test, labels_pred_test)
    dunn_test = dunn_index(X_test, labels_pred_test)

    # Metrics on the training split.
    silhouette_train = silhouette_score(X_train, labels_pred_train)
    db_index_train = davies_bouldin_score(X_train, labels_pred_train)
    ari_train = adjusted_rand_score(y_train, labels_pred_train)
    dunn_train = dunn_index(X_train, labels_pred_train)

    # Keep the per-fold test values so they can be averaged after the loop.
    silhouette_scores_test.append(silhouette_test)
    db_scores_test.append(db_index_test)
    ari_scores_test.append(ari_test)
    dunn_scores_test.append(dunn_test)

    # Keep the per-fold training values so they can be averaged after the loop.
    silhouette_scores_train.append(silhouette_train)
    db_scores_train.append(db_index_train)
    ari_scores_train.append(ari_train)
    dunn_scores_train.append(dunn_train)

    # One record per fold; these become the rows of the saved results table.
    results_list.append({
        'Fold': fold_number,
        'Silhouette Score Test': silhouette_test,
        'DB score Test': db_index_test,
        'ARI Score Test': ari_test,
        'Dunn Index Test': dunn_test,
        'Silhouette Score Train': silhouette_train,
        'DB score Train': db_index_train,
        'ARI Score Train': ari_train,
        'Dunn Index Train': dunn_train
    })

    # Print the partial results of the current fold on a single line.
    print(
        f"Fold {fold_number}"
        f" Silhouette Score Test: {silhouette_test}"
        f" DB Score Test: {db_index_test}"
        f" ARI Score Test: {ari_test}"
        f" Dunn Index Test: {dunn_test}"
        f" Silhouette Score Train: {silhouette_train}"
        f" DB Score Train: {db_index_train}"
        f" ARI Score Train: {ari_train}"
        f" Dunn Index Train: {dunn_train}"
    )



Wyświetlenie podsumowania

In [None]:
# Mean of every test-split metric over all folds.
average_silhouette_test, average_db_test, average_ari_test, average_dunn_test = (
    np.mean(fold_scores)
    for fold_scores in (silhouette_scores_test, db_scores_test, ari_scores_test, dunn_scores_test)
)

# Mean of every training-split metric over all folds.
average_silhouette_train, average_db_train, average_ari_train, average_dunn_train = (
    np.mean(fold_scores)
    for fold_scores in (silhouette_scores_train, db_scores_train, ari_scores_train, dunn_scores_train)
)

# Report the averages, one metric per line: test split first, then training split.
print(
    f"Average Silhouette Score Test: {average_silhouette_test}",
    f"Average Davies-Bouldin Score Test: {average_db_test}",
    f"Average Adjusted Rand Index Test: {average_ari_test}",
    f"Average Dunn Index Test: {average_dunn_test}",
    f"Average Silhouette Score Train: {average_silhouette_train}",
    f"Average Davies-Bouldin Score Train: {average_db_train}",
    f"Average Adjusted Rand Index Train: {average_ari_train}",
    f"Average Dunn Index Train: {average_dunn_train}",
    sep="\n",
)

Zapisanie wyników

In [None]:
# Destination CSV on the mounted Drive for the per-fold clustering metrics.
results_file_path = '/content/drive/My Drive/data_sety/word2vec_kmeans_kfold_clustering_metrics.csv'

# Build a table with one row per collected fold record and write it to CSV
# without the DataFrame index column.
results_df = pd.DataFrame(results_list)
results_df.to_csv(results_file_path, index=False)