In [2]:
import matplotlib.pyplot as plt
import numpy as np

from datetime import datetime
from pathlib import Path

from cuml.neighbors import NearestNeighbors
from gensim.models.doc2vec import Doc2Vec
from kneed import KneeLocator

In [36]:
data_dir = Path(Path.cwd().parent, 'data/interim')
model_dir = Path(Path.cwd().parent, 'models')
nn_dir = Path(data_dir, 'nn')
paths = sorted([path for path in model_dir.glob('*.model')])

In [11]:
for path in paths:
    print(path)

/home/miguel/Projects/tfm-nlp/models/d2v_chains_ge_4_lt_10_50.model
/home/miguel/Projects/tfm-nlp/models/d2v_chains_ge_4_lt_10_300.model
/home/miguel/Projects/tfm-nlp/models/d2v_chains_eq_2_300.model
/home/miguel/Projects/tfm-nlp/models/d2v_chains_ge_10_300.model
/home/miguel/Projects/tfm-nlp/models/d2v_chains_ge_10_50.model
/home/miguel/Projects/tfm-nlp/models/d2v_chains_eq_3_50.model
/home/miguel/Projects/tfm-nlp/models/d2v_chains_eq_2_50.model
/home/miguel/Projects/tfm-nlp/models/d2v_chains_eq_3_300.model


In [13]:
path = paths[1]

In [12]:
filename = paths[0].name
nn_file = Path(nn_dir, f"nn_{'_'.join(paths[0].stem.split('_')[1:])}.npy")

In [13]:
nn_file

PosixPath('data/interim/nn/nn_chains_ge_4_lt_10_50.npy')

In [14]:
model = Doc2Vec.load(str(path))
data = model.dv.vectors.astype(np.float64)

In [37]:
paths

[PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_eq_2_300.model'),
 PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_eq_2_50.model'),
 PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_eq_3_300.model'),
 PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_eq_3_50.model'),
 PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_ge_10_300.model'),
 PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_ge_10_50.model'),
 PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_ge_4_lt_10_300.model'),
 PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_ge_4_lt_10_50.model')]

In [45]:
for path in paths:
    nn_file = Path(nn_dir, f"nn_{'_'.join(path.stem.split('_')[1:])}.npy")
    
    model = Doc2Vec.load(str(path))
    data = model.dv.vectors.astype(np.float64)

    ran = range(2,31)

    epsilons = np.empty((len(ran), 2))
    counter = 0
    for n in ran:
        nearest_neighbors = NearestNeighbors(n_neighbors=n, metric='l2')
        neighbors = nearest_neighbors.fit(data)

        distances, indices = neighbors.kneighbors(data)
        distances = np.sort(distances[:, distances.shape[1]-1], axis=0)
        i = np.arange(len(distances))
        knee = KneeLocator(i, distances, S=1, curve='convex', direction='increasing', interp_method='polynomial')
        epsilons[counter][0] = n
        epsilons[counter][1] = distances[knee.knee]
        counter += 1

    np.save(nn_file, epsilons)

In [76]:
dbscan_float = DBSCAN(eps = 19.19740677, min_samples = 4)
dbscan_float.fit(data)
labels = dbscan_float.labels_
count_clust = Counter(labels)
n_clusters = len([key for key in count_clust.keys() if key != -1])

# points = labels[~labels==-1]
clust_data = []
clust_labs = []
for i, label in enumerate(labels):
    if label != -1:
        clust_data.append(data[i])
        clust_labs.append(labels[i])

clust_data = np.asarray(clust_data)
clust_labs = np.asarray(clust_labs)
silscore(clust_data, clust_labs)

0.27074493497545216

In [None]:
labels[~labels==-1].shape

(9111,)

In [None]:
clust_data = np.empty(labels[~labels==-1].shape)
clust_labs = np.empty(labels[~labels==-1].shape)

array([4.66095379e-310, 4.66095377e-310, 4.66095379e-310, ...,
       1.18575755e-322, 3.95252517e-323, 2.37151510e-322])

In [None]:
silscore(clust_data, clust_labs) 

0.27074493497545216

In [None]:
count_clust = Counter(labels)
n_clusters = len([key for key in count_clust.keys() if key != -1])
results['n_clusters'] = n_clusters

if -1 in count_clust:
    n_noise = count_clust[-1]
    results['n_noise'] = n_noise

clust_data = []
clust_labs = []
for i, label in enumerate(labels):
    if label != -1:
        clust_data.append(data[i])
        clust_labs.append(labels[i])

Counter({0: 8992,
         1: 1,
         2: 1,
         3: 1,
         4: 1,
         5: 1,
         6: 1,
         7: 1,
         8: 1,
         9: 1,
         10: 1,
         11: 4,
         12: 1,
         13: 1,
         14: 1,
         15: 1,
         16: 1,
         17: 1,
         18: 1,
         19: 3,
         20: 1,
         21: 1,
         22: 1,
         23: 1,
         24: 1,
         25: 4,
         26: 2,
         27: 1,
         28: 1,
         29: 2,
         30: 1,
         31: 1,
         32: 1,
         33: 1,
         34: 1,
         35: 1,
         36: 1,
         37: 1,
         38: 1,
         39: 1,
         40: 1,
         41: 1,
         42: 1,
         43: 1,
         44: 1,
         45: 1,
         46: 2,
         47: 1,
         48: 1,
         49: 1,
         50: 1,
         51: 1,
         52: 1,
         53: 1,
         54: 1,
         55: 1,
         56: 2,
         57: 5,
         58: 1,
         59: 3,
         60: 1,
         61: 1,
         62: 1,