In [1]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import adjusted_rand_score, confusion_matrix

import umap
import hdbscan

  from .autonotebook import tqdm as notebook_tqdm


### Component Reduction with UMAP (Chunk Size 150)

In [2]:
# Feature matrix and labels for the "150" data set (chunk size 150 according
# to the section heading). Paths are relative to the notebook's working dir.
X, y = np.load('Data/text_data150.npy'), np.load('Data/labels150.npy')

# Hold out 20 % of the rows as a test set. The split is stratified on the
# labels and seeded, so it is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=3,
)

In [3]:
# Grid experiment: for every target dimensionality and every fraction of the
# training set, fit a label-supervised UMAP mapper on that fraction, embed the
# whole training set, cluster the embedding with HDBSCAN and KMeans, and score
# both clusterings against y_train with the adjusted Rand index (ARI).
#
# Results are stored as [dimension index, size index].
sizes = np.arange(1, 11)             # tenths of X_train used to fit UMAP (1 -> 10 %, 10 -> all)
dimensions = 2 ** np.arange(1, 11)   # 2, 4, 8, ..., 1024
rand_scores_hdbscan = np.zeros((len(dimensions), len(sizes)))
rand_scores_kmeans = np.zeros((len(dimensions), len(sizes)))
# dtype=object so that each grid cell can hold a full label vector.
all_labels_hdbscan = np.zeros((len(dimensions), len(sizes)), dtype=object)
all_labels_kmeans = np.zeros((len(dimensions), len(sizes)), dtype=object)

for idy, dim in enumerate(dimensions):
    print(f'Dimension: {dim}')
    for idx, size in enumerate(sizes):
        if dim == 1024:
            # Baseline without UMAP: cluster the raw features directly.
            # NOTE(review): assumes 1024 is the native width of X - confirm.
            # The clustering input does not depend on `size` here, so every
            # column of this row is computed from the same data.
            embedding_train = X_train
        else:
            # The subset is only needed when a mapper is actually fitted, so
            # the split lives in this branch rather than running for the
            # baseline as well.
            if size == 10:
                X_val, y_val = X_train, y_train
            else:
                X_t2, X_val, y_t2, y_val = train_test_split(
                    X_train, y_train,
                    test_size=size/10, stratify=y_train, random_state=3,
                )

            # random_state seeds UMAP's random initialisation and layout
            # optimisation. Every other stochastic step in this notebook is
            # seeded with 3; without a seed here the saved labels and ARI
            # scores change from run to run. (umap-learn trades parallelism
            # for reproducibility when a seed is given, so fits may be slower.)
            mapper = umap.UMAP(n_neighbors=30,
                               min_dist=0.0,
                               init='random',
                               metric='euclidean',
                               n_components=dim,
                               random_state=3).fit(X_val, y=y_val)
            embedding_train = mapper.transform(X_train)

        labels_hdb = hdbscan.HDBSCAN(
            min_samples=10,
            min_cluster_size=500,
        ).fit_predict(embedding_train)

        kmeans = KMeans(n_clusters=5, random_state=3)
        kmeans.fit(embedding_train)
        labels_kmeans = kmeans.labels_

        rand_scores_hdbscan[idy, idx] = adjusted_rand_score(y_train, labels_hdb)
        rand_scores_kmeans[idy, idx] = adjusted_rand_score(y_train, labels_kmeans)

        all_labels_hdbscan[idy, idx] = labels_hdb
        all_labels_kmeans[idy, idx] = labels_kmeans

Dimension: 2
Dimension: 4
Dimension: 8
Dimension: 16
Dimension: 32
Dimension: 64
Dimension: 128
Dimension: 256
Dimension: 512
Dimension: 1024


In [5]:
# Persist the label grids built by the UMAP experiment. The arrays are created
# with dtype=object, so NumPy stores them via pickle; reading them back needs
# np.load(..., allow_pickle=True).
np.save(file='Results/labels_hdbscan_150.npy', arr=all_labels_hdbscan)
np.save(file='Results/labels_kmeans_150.npy', arr=all_labels_kmeans)

In [33]:
# Cross-tabulate the true training labels (rows) against the HDBSCAN cluster
# ids (columns) for the first grid cell, i.e. dimensions[0] and sizes[0].
# NOTE(review): cluster ids are arbitrary, and HDBSCAN presumably marks noise
# points as -1, which would explain a row with no true samples - confirm.
confusion_matrix(y_true=y_train, y_pred=all_labels_hdbscan[0][0])

array([[   0,    0,    0,    0,    0,    0],
       [   9,    0,    0,  644,    0,    0],
       [  29,    0,    0,    2,   20, 1029],
       [   1,  866,    0,    0,    0,    0],
       [  34,    1,    1,    0, 1023,    2],
       [   9,    1, 1061,    0,    0,    2]], dtype=int64)

### Component Reduction with PCA

In [3]:
# PCA counterpart of the UMAP experiment: project X_train onto `dim` principal
# components, cluster the projection with HDBSCAN and KMeans, and score both
# clusterings against y_train with the adjusted Rand index.
#
# NOTE: this cell rebinds rand_scores_* and all_labels_* as 1-D arrays (one
# entry per dimension), replacing the 2-D grids of the same names created by
# the UMAP cell. Save those first if they are still needed.
dimensions = 2 ** np.arange(1, 11)   # 2, 4, 8, ..., 1024
rand_scores_hdbscan = np.zeros(len(dimensions))
rand_scores_kmeans = np.zeros(len(dimensions))
# dtype=object so that each entry can hold a full label vector.
all_labels_hdbscan = np.zeros(len(dimensions), dtype=object)
all_labels_kmeans = np.zeros(len(dimensions), dtype=object)

for idx, dim in enumerate(dimensions):
    print(f'Dimension: {dim}')
    # With the default svd_solver='auto', scikit-learn may choose the
    # randomized SVD solver for large inputs. Seeding PCA keeps the projection,
    # and therefore the clusterings, identical across runs. The seed matches
    # the one used elsewhere in this notebook.
    X_train_red = PCA(n_components=dim, random_state=3).fit_transform(X_train)

    labels_hdb = hdbscan.HDBSCAN(
        min_samples=10,
        min_cluster_size=500,
    ).fit_predict(X_train_red)

    kmeans = KMeans(n_clusters=5, random_state=3)
    kmeans.fit(X_train_red)
    labels_kmeans = kmeans.labels_

    rand_scores_hdbscan[idx] = adjusted_rand_score(y_train, labels_hdb)
    rand_scores_kmeans[idx] = adjusted_rand_score(y_train, labels_kmeans)

    all_labels_hdbscan[idx] = labels_hdb
    all_labels_kmeans[idx] = labels_kmeans

Dimension: 2
Dimension: 4
Dimension: 8
Dimension: 16
Dimension: 32
Dimension: 64
Dimension: 128
Dimension: 256
Dimension: 512
Dimension: 1024


In [6]:
# Persist the per-dimension label vectors from the PCA experiment. The arrays
# are created with dtype=object, so NumPy stores them via pickle; reading them
# back needs np.load(..., allow_pickle=True).
np.save(file='Results/labels_hdbscan_PCA150.npy', arr=all_labels_hdbscan)
np.save(file='Results/labels_kmeans_PCA150.npy', arr=all_labels_kmeans)

## Sentence Embedding

### Component reduction with UMAP

In [8]:
# Feature matrix and labels for the sentence-level data set. From here on X, y
# and the train/test splits refer to this data, replacing the arrays loaded
# for the "150" experiments.
X, y = np.load('Data/text_data_sentences.npy'), np.load('Data/labels_sentences.npy')

# Hold out 20 % of the rows as a test set. The split is stratified on the
# labels and seeded, so it is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=3,
)

In [9]:
# Grid experiment on the sentence embeddings: for every target dimensionality
# and every fraction of the training set, fit a label-supervised UMAP mapper on
# that fraction, embed the whole training set, cluster the embedding with
# HDBSCAN and KMeans, and score both clusterings against y_train with the
# adjusted Rand index (ARI).
#
# Results are stored as [dimension index, size index].
sizes = np.arange(1, 11)             # tenths of X_train used to fit UMAP (1 -> 10 %, 10 -> all)
dimensions = 2 ** np.arange(1, 11)   # 2, 4, 8, ..., 1024
rand_scores_hdbscan = np.zeros((len(dimensions), len(sizes)))
rand_scores_kmeans = np.zeros((len(dimensions), len(sizes)))
# dtype=object so that each grid cell can hold a full label vector.
all_labels_hdbscan = np.zeros((len(dimensions), len(sizes)), dtype=object)
all_labels_kmeans = np.zeros((len(dimensions), len(sizes)), dtype=object)

for idy, dim in enumerate(dimensions):
    print(f'Dimension: {dim}')
    for idx, size in enumerate(sizes):
        if dim == 1024:
            # Baseline without UMAP: cluster the raw features directly.
            # NOTE(review): assumes 1024 is the native width of X - confirm.
            # The clustering input does not depend on `size` here, so every
            # column of this row is computed from the same data.
            embedding_train = X_train
        else:
            # The subset is only needed when a mapper is actually fitted, so
            # the split lives in this branch rather than running for the
            # baseline as well.
            if size == 10:
                X_val, y_val = X_train, y_train
            else:
                X_t2, X_val, y_t2, y_val = train_test_split(
                    X_train, y_train,
                    test_size=size/10, stratify=y_train, random_state=3,
                )

            # random_state seeds UMAP's random initialisation and layout
            # optimisation. Every other stochastic step in this notebook is
            # seeded with 3; without a seed here the saved labels and ARI
            # scores change from run to run. (umap-learn trades parallelism
            # for reproducibility when a seed is given, so fits may be slower.)
            mapper = umap.UMAP(n_neighbors=30,
                               min_dist=0.0,
                               init='random',
                               metric='euclidean',
                               n_components=dim,
                               random_state=3).fit(X_val, y=y_val)
            embedding_train = mapper.transform(X_train)

        labels_hdb = hdbscan.HDBSCAN(
            min_samples=10,
            min_cluster_size=500,
        ).fit_predict(embedding_train)

        kmeans = KMeans(n_clusters=5, random_state=3)
        kmeans.fit(embedding_train)
        labels_kmeans = kmeans.labels_

        rand_scores_hdbscan[idy, idx] = adjusted_rand_score(y_train, labels_hdb)
        rand_scores_kmeans[idy, idx] = adjusted_rand_score(y_train, labels_kmeans)

        all_labels_hdbscan[idy, idx] = labels_hdb
        all_labels_kmeans[idy, idx] = labels_kmeans

Dimension: 2
Dimension: 4
Dimension: 8
Dimension: 16
Dimension: 32
Dimension: 64
Dimension: 128
Dimension: 256
Dimension: 512
Dimension: 1024


In [10]:
# Persist the label grids from the sentence-embedding experiment. The arrays
# are created with dtype=object, so NumPy stores them via pickle; reading them
# back needs np.load(..., allow_pickle=True).
np.save(file='Results/labels_hdbscan_sentences.npy', arr=all_labels_hdbscan)
np.save(file='Results/labels_kmeans_sentences.npy', arr=all_labels_kmeans)