Połączenie z Google Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths used later in this notebook
# all live under '/content/drive/My Drive/'.
drive.mount('/content/drive')

Instalowanie RAPIDS

In [None]:
import os, sys, io
import subprocess
from pathlib import Path

# Install RAPIDS -- we're doing this in one file, for now, due to ease of use


def _run_shell(command):
  """Run `command` through the shell and echo its combined stdout/stderr line by line.

  Returns the exit status of the command. A non-zero status is reported with a
  warning line but does not raise, so the remaining install steps still run.
  """
  process = subprocess.Popen(command, shell=True, stderr=subprocess.STDOUT,
      stdout=subprocess.PIPE)
  # errors="replace" keeps the echo loop alive if the tool emits non-UTF-8 bytes;
  # the context manager closes the pipe once the child stops writing.
  with io.TextIOWrapper(process.stdout, encoding="utf-8", errors="replace") as stream:
    for line in stream:
      print(line.rstrip())
  # Reap the child so it does not linger as a zombie and so we can see its status.
  return_code = process.wait()
  if return_code != 0:
    print(f"WARNING: command exited with status {return_code}: {command}")
  return return_code


# pynvml is only needed to detect the GPU; install it on demand.
try:
  import pynvml
except ImportError:
  _run_shell("pip install pynvml")
  import pynvml

try:
  pynvml.nvmlInit()
except Exception as err:
  raise Exception("""
                  Unfortunately you're in a Colab instance that doesn't have a GPU.

                  Please make sure you've configured Colab to request a GPU Instance Type.

                  Go to 'Runtime -> Change Runtime Type --> under the Hardware Accelerator, select GPU', then try again."""
  ) from err

gpu_name = pynvml.nvmlDeviceGetName(pynvml.nvmlDeviceGetHandleByIndex(0))
# Some pynvml releases return bytes here; normalise to str so the substring
# test below does not raise TypeError and the banner prints a clean name.
if isinstance(gpu_name, bytes):
  gpu_name = gpu_name.decode("utf-8", errors="replace")
rapids_version = "24.4.*"

# A 'P' in the device name is treated as "Pascal generation", which gets an older RAPIDS.
if ('P' not in gpu_name):
  print('***********************************************************************')
  print('Woo! Your instance has a '+ str(gpu_name)+' GPU!')
  print(f'We will install the latest stable RAPIDS via pip {rapids_version}!  Please stand by, should be quick...')
  print('***********************************************************************')
  print()
else:
  print('***********************************************************************')
  print('Hey! Your instance has a Pascal GPU, a '+ str(gpu_name)+'!')
  print('We will install a compatible RAPIDS via pip (23.12)!  Please stand by, should be quick...')
  print('***********************************************************************')
  print()
  rapids_version = "23.12.*"


# Every RAPIDS wheel is pinned to the same version. The specifiers are quoted
# because they end in '*', which the shell would otherwise try to glob-expand.
rapids_packages = ["cudf", "cuml", "cugraph", "cuspatial", "cuproj", "cuxfilter",
    "cucim", "pylibraft", "raft-dask"]
rapids_requirements = " ".join(f"'{name}-cu12=={rapids_version}'" for name in rapids_packages)
_run_shell(f"pip install {rapids_requirements} aiohttp --extra-index-url=https://pypi.nvidia.com")

# Drop any cupy already present in dist-packages before installing the CUDA 12 build.
# The path follows the running interpreter's version instead of assuming python3.10.
python_dir = f"python{sys.version_info.major}.{sys.version_info.minor}"
_run_shell(f"rm -rf /usr/local/lib/{python_dir}/dist-packages/cupy*")
_run_shell("pip install cupy-cuda12x")

print("""
        ***********************************************************************
        The pip install of RAPIDS is complete.

        Please do not run any further installation from the conda based installation methods, as they may cause issues!

        Please ensure that you're pulling from the git repo to remain updated with the latest working install scripts.

        Troubleshooting:
            - If there is an installation failure, please check back on RAPIDSAI owned templates/notebooks to see how to update your personal files.
            - If an installation failure persists when using the latest script, please make an issue on https://github.com/rapidsai-community/rapidsai-csp-utils
        ***********************************************************************
        """
      )

Niezbędne importy

In [None]:
import cuml
from sklearn.cluster import Birch
from cuml.metrics.cluster import silhouette_score
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import adjusted_rand_score, davies_bouldin_score
from scipy.spatial.distance import cdist

Wczytanie danych

In [None]:
# Embedding table read from the mounted Drive.
embeddings_path = '/content/drive/My Drive/data_sety/normalized_fasttext_embeddings.csv'
data_embeddings = pd.read_csv(embeddings_path)

# Original data set; only its 'label' column is used, as the ground truth for ARI.
original_data_path = '/content/drive/My Drive/data_sety/ready_data_set.csv'
original_data = pd.read_csv(original_data_path)
labels_true = original_data['label']

# The cross-validation cell selects rows from both objects with the same positions,
# so a row-count mismatch would silently pair embeddings with the wrong labels.
if len(data_embeddings) != len(labels_true):
    raise ValueError(
        f"Embeddings ({len(data_embeddings)} rows) and labels ({len(labels_true)} rows) "
        "must have the same number of rows."
    )


Funkcja licząca indeks Dunna

In [None]:
def dunn_index(X, labels):
    """Compute the Dunn index of a clustering.

    The index is the smallest Euclidean distance between two points that belong
    to different clusters, divided by the largest Euclidean distance between two
    points of the same cluster (the largest cluster diameter).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data points; anything `numpy.asarray` can convert.
    labels : array-like of length n_samples
        Cluster label of every row of `X`.

    Returns
    -------
    numpy.float64
        Minimum inter-cluster distance divided by maximum cluster diameter.
        If every cluster has zero diameter the division is by zero and numpy
        yields inf (or nan) with a RuntimeWarning.

    Raises
    ------
    ValueError
        If `labels` does not have one entry per row of `X`, or if fewer than
        two distinct clusters are present (the index is undefined then).
    """
    X = np.asarray(X)
    # asarray makes `labels == value` an element-wise mask even for plain lists.
    labels = np.asarray(labels).ravel()
    if labels.shape[0] != X.shape[0]:
        raise ValueError(
            f"labels has {labels.shape[0]} entries but X has {X.shape[0]} rows"
        )

    unique_clusters = np.unique(labels)
    if unique_clusters.size < 2:
        raise ValueError("dunn_index requires at least two distinct clusters")

    # Full pairwise distance matrix; note this needs O(n_samples^2) memory.
    distances = cdist(X, X, 'euclidean')

    # Row positions of the members of each cluster.
    members = [np.flatnonzero(labels == cluster) for cluster in unique_clusters]

    # Largest distance between two points that share a cluster.
    max_diameter = max(distances[np.ix_(idx, idx)].max() for idx in members)

    # Smallest distance between points of two different clusters. The matrix is
    # symmetric, so each unordered pair of clusters is inspected only once.
    min_separation = min(
        distances[np.ix_(members[a], members[b])].min()
        for a in range(len(members))
        for b in range(a + 1, len(members))
    )

    return min_separation / max_diameter

Przygotowanie DataFrame do zbierania wyników i sklasteryzowanych danych

In [None]:
# results_list receives one dict of metrics per fold in the cross-validation cell;
# results_df starts as an empty placeholder and is rebuilt from that list before saving.
results_list = list()
results_df = pd.DataFrame()

In [None]:
# Per-fold scores on the test splits, one independent list per metric.
silhouette_scores_test, db_scores_test, ari_scores_test, dunn_scores_test = [], [], [], []

In [None]:
# Per-fold scores on the training splits, one independent list per metric.
silhouette_scores_train, db_scores_train, ari_scores_train, dunn_scores_train = [], [], [], []

Klasteryzacja z wykorzystaniem 5-krotnej walidacji krzyżowej

In [None]:
def _internal_metrics(X, labels):
    """Return (silhouette, Davies-Bouldin, Dunn) for the partition `labels` of `X`.

    All three metrics need at least two clusters. When `labels` contains fewer
    (possible for a test fold whose points all fall into one learned cluster),
    three NaNs are returned instead of raising.
    """
    if np.unique(labels).size < 2:
        return float("nan"), float("nan"), float("nan")
    return (
        silhouette_score(X, labels),
        davies_bouldin_score(X, labels),
        dunn_index(X, labels),
    )


# Start from empty accumulators so that re-running this cell does not append a
# second copy of every fold to results left over from a previous run.
results_list = []
silhouette_scores_test, db_scores_test, ari_scores_test, dunn_scores_test = [], [], [], []
silhouette_scores_train, db_scores_train, ari_scores_train, dunn_scores_train = [], [], [], []

data_embeddings_array = data_embeddings.to_numpy()

# Fixed seed so the five shuffled folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold_number = 0
for fold_number, (train_index, test_index) in enumerate(kf.split(data_embeddings_array), start=1):
    # KFold yields positional indices, so both objects are sliced with .iloc.
    X_train, X_test = data_embeddings.iloc[train_index], data_embeddings.iloc[test_index]
    y_train, y_test = labels_true.iloc[train_index], labels_true.iloc[test_index]

    # Clustering with Birch, two clusters.
    birch = Birch(n_clusters=2)

    # Fit on the training fold only; the test fold is assigned to the clusters
    # learned from training data, so test metrics describe held-out data.
    labels_pred_train = birch.fit_predict(X_train)
    labels_pred_test = birch.predict(X_test)

    # Metrics for the test fold.
    silhouette_test, db_index_test, dunn_test = _internal_metrics(X_test, labels_pred_test)
    ari_test = adjusted_rand_score(y_test, labels_pred_test)

    # Metrics for the training fold.
    silhouette_train, db_index_train, dunn_train = _internal_metrics(X_train, labels_pred_train)
    ari_train = adjusted_rand_score(y_train, labels_pred_train)

    # Keep per-fold test scores for the final averages.
    silhouette_scores_test.append(silhouette_test)
    db_scores_test.append(db_index_test)
    ari_scores_test.append(ari_test)
    dunn_scores_test.append(dunn_test)

    # Keep per-fold training scores for the final averages.
    silhouette_scores_train.append(silhouette_train)
    db_scores_train.append(db_index_train)
    ari_scores_train.append(ari_train)
    dunn_scores_train.append(dunn_train)

    # One record per fold; these become the rows of the saved results table.
    results_list.append({
        'Fold': fold_number,
        'Silhouette Score Test': silhouette_test,
        'DB score Test': db_index_test,
        'ARI Score Test': ari_test,
        'Dunn Index Test': dunn_test,
        'Silhouette Score Train': silhouette_train,
        'DB score Train': db_index_train,
        'ARI Score Train': ari_train,
        'Dunn Index Train': dunn_train
    })

    # Show the partial results for this fold.
    print(
        f"Fold {fold_number!s}"
        f" Silhouette Score Test: {silhouette_test!s}"
        f" DB Score Test: {db_index_test!s}"
        f" ARI Score Test: {ari_test!s}"
        f" Dunn Index Test: {dunn_test!s}"
        f" Silhouette Score Train: {silhouette_train!s}"
        f" DB Score Train: {db_index_train!s}"
        f" ARI Score Train: {ari_train!s}"
        f" Dunn Index Train: {dunn_train!s}"
    )

Wyświetlenie podsumowania

In [None]:
# Mean of every metric over the folds, test splits first and then training splits.
average_silhouette_test, average_db_test, average_ari_test, average_dunn_test = (
    np.mean(fold_scores)
    for fold_scores in (silhouette_scores_test, db_scores_test, ari_scores_test, dunn_scores_test)
)
average_silhouette_train, average_db_train, average_ari_train, average_dunn_train = (
    np.mean(fold_scores)
    for fold_scores in (silhouette_scores_train, db_scores_train, ari_scores_train, dunn_scores_train)
)

# Report the averages in a fixed order: the four test metrics, then the four train metrics.
for _caption, _average in (
    ("Average Silhouette Score Test:", average_silhouette_test),
    ("Average Davies-Bouldin Score Test:", average_db_test),
    ("Average Adjusted Rand Index Test:", average_ari_test),
    ("Average Dunn Index Test:", average_dunn_test),
    ("Average Silhouette Score Train:", average_silhouette_train),
    ("Average Davies-Bouldin Score Train:", average_db_train),
    ("Average Adjusted Rand Index Train:", average_ari_train),
    ("Average Dunn Index Train:", average_dunn_train),
):
    print(_caption, _average)

Zapisanie wyników

In [None]:
# Destination CSV on the mounted Drive for the per-fold metrics table.
results_file_path = '/content/drive/My Drive/data_sety/fasttext_birch_kfold_clustering_metrics.csv'

# Build the table from the per-fold dicts collected in results_list (one row per fold);
# index=False leaves the DataFrame's row index out of the written file.
results_df = pd.DataFrame(results_list)
results_df.to_csv(results_file_path, index=False)