Niestety klasteryzacja, ze względu na ograniczoną pamięć RAM, za każdym razem kończy się niepowodzeniem. Redukcja wymiarów nie rozwiązuje problemu. Podzielenie danych na mniejsze podzbiory radykalnie obniża jakość klasteryzacji (jest gorzej niż w k-means, a powinno być lepiej).

Połączenie z Google Drive

In [1]:
from google.colab import drive

# Attach the user's Google Drive to the Colab filesystem; the dataset paths
# used later in the notebook live under this directory.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


Instalowanie RAPIDS

In [2]:
import os, sys, io
import subprocess
from pathlib import Path

# Install RAPIDS -- we're doing this in one file, for now, due to ease of use


def _run_and_stream(command, shell=False):
    """Run *command*, echo its output line by line, and fail on a non-zero exit.

    Args:
        command: Argument list (run without a shell) or, when ``shell`` is
            true, a single shell command string.
        shell: Whether to hand ``command`` to the shell. Only needed when the
            command relies on shell features such as glob expansion.

    Raises:
        RuntimeError: If the child process exits with a non-zero status, so a
            failed step is not silently followed by the success banner.
    """
    # stderr is merged into stdout so messages appear in their original order.
    # The context manager closes the pipe and waits for the child to finish.
    with subprocess.Popen(command, shell=shell, stderr=subprocess.STDOUT,
                          stdout=subprocess.PIPE) as process:
        for line in io.TextIOWrapper(process.stdout, encoding="utf-8",
                                     errors="replace"):
            print(line.rstrip())
    if process.returncode != 0:
        raise RuntimeError(
            f"Command {command!r} failed with exit code {process.returncode}"
        )


# Invoke pip through the running interpreter so packages are installed into
# the environment this notebook kernel actually uses.
pip_install = [sys.executable, "-m", "pip", "install"]

try:
    import pynvml
except ImportError:
    _run_and_stream(pip_install + ["pynvml"])
    import pynvml

try:
    pynvml.nvmlInit()
except pynvml.NVMLError as err:
    raise Exception("""
                  Unfortunately you're in a Colab instance that doesn't have a GPU.

                  Please make sure you've configured Colab to request a GPU Instance Type.

                  Go to 'Runtime -> Change Runtime Type --> under the Hardware Accelerator, select GPU', then try again."""
    ) from err

gpu_name = pynvml.nvmlDeviceGetName(pynvml.nvmlDeviceGetHandleByIndex(0))
# Some pynvml releases return the device name as bytes; normalise to str so
# the substring test below cannot raise TypeError.
if isinstance(gpu_name, bytes):
    gpu_name = gpu_name.decode("utf-8")
rapids_version = "24.4.*"

# NOTE(review): a 'P' anywhere in the device name is used as the marker for a
# Pascal GPU; this is a name-based heuristic, not a compute-capability check.
if 'P' not in gpu_name:
    print('***********************************************************************')
    print('Woo! Your instance has a '+ str(gpu_name)+' GPU!')
    print(f'We will install the latest stable RAPIDS via pip {rapids_version}!  Please stand by, should be quick...')
    print('***********************************************************************')
    print()
else:
    print('***********************************************************************')
    print('Hey! Your instance has a Pascal GPU, a '+ str(gpu_name)+'!')
    print('We will install a compatible RAPIDS via pip (23.12)!  Please stand by, should be quick...')
    print('***********************************************************************')
    print()
    rapids_version = "23.12.*"

rapids_packages = [
    "cudf",
    "cuml",
    "cugraph",
    "cuspatial",
    "cuproj",
    "cuxfilter",
    "cucim",
    "pylibraft",
    "raft-dask",
]
# Passing an argument list (no shell) keeps the "*" in the version specifier
# from ever being treated as a shell glob.
_run_and_stream(
    pip_install
    + [f"{package}-cu12=={rapids_version}" for package in rapids_packages]
    + ["aiohttp", "--extra-index-url=https://pypi.nvidia.com"]
)

# Remove any existing CuPy installation before installing the CUDA 12 build.
# The glob needs a shell. NOTE(review): the path is hard-coded to Python 3.10.
_run_and_stream("rm -rf /usr/local/lib/python3.10/dist-packages/cupy*", shell=True)
_run_and_stream(pip_install + ["cupy-cuda12x"])

print("""
        ***********************************************************************
        The pip install of RAPIDS is complete.

        Please do not run any further installation from the conda based installation methods, as they may cause issues!

        Please ensure that you're pulling from the git repo to remain updated with the latest working install scripts.

        Troubleshooting:
            - If there is an installation failure, please check back on RAPIDSAI owned templates/notebooks to see how to update your personal files.
            - If an installation failure persists when using the latest script, please make an issue on https://github.com/rapidsai-community/rapidsai-csp-utils
        ***********************************************************************
        """
      )

Collecting pynvml
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.1/53.1 kB 1.8 MB/s eta 0:00:00
Installing collected packages: pynvml
Successfully installed pynvml-11.5.0
***********************************************************************
Woo! Your instance has a Tesla T4 GPU!
We will install the latest stable RAPIDS via pip 24.4.*!  Please stand by, should be quick...
***********************************************************************

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting cuml-cu12==24.4.*
  Downloading https://pypi.nvidia.com/cuml-cu12/cuml_cu12-24.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1200.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.2/1.2 GB 1.6 MB/s eta 0:00:00
Collecting cugraph-cu12==24.4.*
  Downloading https://pypi.nvidia.com/cugraph-cu12/cugraph_cu12-24.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1429.1 MB)
     ━━━━━

Niezbędne importy

In [3]:
import cuml
from cuml.cluster import DBSCAN
from cuml.metrics.cluster import silhouette_score

import pandas as pd
import numpy as np
from sklearn.metrics import adjusted_rand_score, davies_bouldin_score
from scipy.spatial.distance import cdist

import gc
from sklearn.decomposition import PCA

Wczytanie danych

In [4]:
# Embeddings: every column of the CSV is parsed as float32.
embeddings_path = '/content/drive/My Drive/data_sety/normalized_word2vec_embeddings.csv'
data_embeddings = pd.read_csv(filepath_or_buffer=embeddings_path, dtype=np.float32)

# Prepared dataset: only its 'label' column is extracted here, as a NumPy
# array of ground-truth labels.
original_data_path = '/content/drive/My Drive/data_sety/ready_data_set.csv'
original_data = pd.read_csv(filepath_or_buffer=original_data_path)
labels_true = original_data.loc[:, 'label'].to_numpy()


Funkcja licząca indeks Dunna

In [5]:
def _blockwise_extreme(a, b, block_reduce, combine, max_block_elements):
    """Reduce all Euclidean distances between rows of *a* and *b* in row blocks."""
    # Limit each cdist call to roughly max_block_elements entries so the
    # temporary distance matrix stays bounded regardless of cluster size.
    rows_per_block = max(1, max_block_elements // max(1, len(b)))
    return combine(
        block_reduce(cdist(a[start:start + rows_per_block], b, 'euclidean'))
        for start in range(0, len(a), rows_per_block)
    )


def dunn_index(X, labels, max_block_elements=2 ** 24):
    """Return the Dunn index of a clustering.

    The index is the smallest Euclidean distance between points of two
    different clusters divided by the largest distance between two points of
    the same cluster. Every distinct value in ``labels`` is treated as a
    cluster.

    Distances are computed per cluster pair and in row blocks instead of
    building the full ``len(X) x len(X)`` matrix, so peak memory is bounded by
    ``max_block_elements`` distances rather than growing quadratically.

    Args:
        X: 2-D array-like of points, one row per sample.
        labels: 1-D array-like with one cluster label per row of ``X``.
        max_block_elements: Approximate upper bound on the number of distances
            held in memory at once.

    Returns:
        The Dunn index as a float, or ``0.0`` when fewer than two distinct
        labels are present (the index is undefined in that case). If every
        cluster has zero diameter the division by zero is left to NumPy.
    """
    points = np.asarray(X)
    labels = np.asarray(labels)
    unique_clusters = np.unique(labels)
    if unique_clusters.size < 2:
        # No inter-cluster distance exists; avoid reducing an empty sequence.
        return 0.0

    members = [points[labels == cluster] for cluster in unique_clusters]

    max_diameter = max(
        _blockwise_extreme(cluster, cluster, np.max, max, max_block_elements)
        for cluster in members
    )
    # Euclidean distance is symmetric, so each unordered pair is visited once.
    min_separation = min(
        _blockwise_extreme(first, second, np.min, min, max_block_elements)
        for index, first in enumerate(members)
        for second in members[index + 1:]
    )
    return min_separation / max_diameter


Przygotowanie ramek danych do zbierania wyników i sklasteryzowanych danych

In [6]:
# Start from empty containers: one dict of metrics per run is collected in
# results_list, and the two frames are filled in by later cells.
results_list = list()
results_df, clustered_data = pd.DataFrame(), pd.DataFrame()

Klasteryzacja DBSCAN bez podziału na foldy, ponieważ algorytm ten nie zależy od losowego podziału danych na części treningowe i testowe.

In [None]:
# Cluster the embeddings with DBSCAN.
# NOTE(review): `data_embeddings_reduced` is not defined in any cell visible
# here (PCA is imported but never applied) -- confirm it is created before
# this cell runs, otherwise the next lines raise NameError.
dbscan = DBSCAN(eps=0.5, min_samples=5)
labels_pred = dbscan.fit_predict(data_embeddings_reduced)

# The internal validity metrics need at least two distinct labels. Compute the
# check once and apply it to all of them, including the Dunn index, which was
# previously called unguarded.
# NOTE(review): DBSCAN typically labels noise points as -1; here they count as
# one more cluster in every metric -- confirm that this is intended.
has_multiple_clusters = len(np.unique(labels_pred)) > 1

# Compute the metrics (0 is the fallback when only one label is present).
silhouette = silhouette_score(data_embeddings_reduced, labels_pred) if has_multiple_clusters else 0
db_index = davies_bouldin_score(data_embeddings_reduced, labels_pred) if has_multiple_clusters else 0
ari = adjusted_rand_score(labels_true, labels_pred)
dunn = dunn_index(data_embeddings_reduced, labels_pred) if has_multiple_clusters else 0

# Store the metrics
result = {
    'Silhouette Score': silhouette,
    'ARI Score': ari,
    'Davies-Bouldin Index': db_index,
    'Dunn Index': dunn
}

results_list.append(result)

# Store the clustered data: the original (non-reduced) embeddings together
# with the true label and the predicted cluster of every row.
clustered_data = pd.DataFrame(data_embeddings)
clustered_data['True Label'] = labels_true
clustered_data['Predicted Cluster'] = labels_pred

[W] [11:04:07.452392] Batch size limited by the chosen integer type (4 bytes). 13060 -> 11078. Using the larger integer type might result in better performance


Wyświetlenie podsumowania

In [None]:
# Turn the collected metric dicts into a table (one row per entry in
# results_list) and print it.
results_df = pd.DataFrame(data=results_list)
print(results_df)

Zapisanie wyników (metryki i sklasteryzowane dane)

In [None]:
# Destination files on the mounted Drive.
results_file_path = '/content/drive/My Drive/data_sety/word2vec_dbscan_clustering_metrics.csv'
clustered_data_file_path = '/content/drive/My Drive/data_sety/word2vec_dbscan_clustered_data.csv'

# Write the metrics first, then the clustered data; the row index is omitted
# from both CSV files.
for frame, destination in (
    (results_df, results_file_path),
    (clustered_data, clustered_data_file_path),
):
    frame.to_csv(destination, index=False)