In [1]:
import csv
import pickle as pkl

import numpy as np
# import pandas as pd

from collections import Counter
from datetime import datetime
from pathlib import Path

from gensim.models.doc2vec import Doc2Vec
# from hdbscan import HDBSCAN
from cuml.cluster import HDBSCAN, DBSCAN, KMeans
# from sklearn.decomposition import TruncatedSVD 
# from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances, silhouette_score, calinski_harabasz_score, davies_bouldin_score
from cuml.metrics.cluster.silhouette_score import cython_silhouette_score
from cuml.metrics.cluster.entropy import cython_entropy


In [2]:
# Project folders, resolved relative to the parent of the current working directory.
data_dir = Path.cwd().parent / 'data/interim'
labels_dir = data_dir / 'labels'
models_dir = Path.cwd().parent / 'models'
# Sorted Doc2Vec model files; the first two entries of the sorted list are skipped.
mod_paths = sorted(models_dir.glob('d2v*.model'))[2:]
# mod_paths = sorted([str(mod_path) for mod_path in Path(models_dir).glob('*.model')])[1:] # removing 1e5 model

In [3]:
def export_results(path, data=None):
    """Append one row of results to a CSV file, writing the header when needed.

    Parameters
    ----------
    path : str or pathlib.Path
        Destination CSV file. It is created if it does not exist.
    data : dict, optional
        Mapping of column name -> value. The keys are written as the header
        row when the file is new (or empty); the values are written as one
        data row. When ``None`` there is nothing to write and the call is a
        no-op (no file is created).
    """
    if data is None:
        # Without a mapping there is neither a header nor a row to emit.
        return

    path = Path(path)
    # An existing but empty file still needs its header row.
    needs_header = not path.is_file() or path.stat().st_size == 0

    # newline='' lets the csv module control line endings itself.
    with open(path, 'a', newline='') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(data.keys())
        writer.writerow(data.values())
            

def load_labels(file):
    """Return the object stored in the pickle file ``file``.

    NOTE: unpickling can execute arbitrary code; only load files this
    notebook (or another trusted source) has written.
    """
    with open(file, 'rb') as stream:
        return pkl.load(stream)


def export_labels(labels, file):
    """Pickle ``labels`` into ``file`` using the highest available protocol."""
    with open(file, 'wb') as stream:
        pkl.dump(labels, stream, protocol=pkl.HIGHEST_PROTOCOL)


def get_kmeans(data, dataset, distance, n_clusters):
    """Return KMeans labels for ``data``, reusing a pickled result when present.

    The cache file lives in ``data_dir`` and is keyed by ``dataset``,
    ``n_clusters`` and ``distance``. ``distance`` only affects the cache file
    name here; it is not passed to ``KMeans``.

    Returns the labels as a list when freshly computed, or whatever object
    was pickled when read from the cache.
    """
    cache_path = Path(data_dir, f'labels_cuMLkm_{dataset}_{n_clusters:02d}_{distance}.pkl')
    if cache_path.is_file():
        return load_labels(cache_path)

    # Cache miss: fit the model and persist its labels for later runs.
    estimator = KMeans(n_clusters=n_clusters)
    estimator.fit(data)
    assignments = estimator.labels_.tolist()
    export_labels(assignments, cache_path)
    return assignments


def get_dbscan(data, dataset, distance, epsilon, min_pts):
    """Return DBSCAN labels for ``data``, reusing a pickled result when present.

    The cache file lives in ``data_dir`` and is keyed by ``dataset``,
    ``epsilon``, ``min_pts`` and ``distance``. On a cache miss the estimator
    is fitted with ``eps=epsilon``, ``min_samples=min_pts`` and
    ``metric=distance``, and its ``labels_`` attribute is pickled as-is.
    """
    cache_path = Path(data_dir, f'labels_cuMLdbscan_{dataset}_{epsilon}_{min_pts:02d}_{distance}.pkl')
    if cache_path.is_file():
        return load_labels(cache_path)

    # Cache miss: fit the model and persist its labels for later runs.
    estimator = DBSCAN(eps=epsilon, min_samples=min_pts, metric=distance)
    estimator = estimator.fit(data)
    assignments = estimator.labels_
    export_labels(assignments, cache_path)
    return assignments


def get_hdbscan(data, dataset, distance, min_clt_size, min_samples):
    """Return HDBSCAN labels for ``data``, reusing a pickled result when present.

    The cache file lives in ``data_dir`` and is keyed by ``dataset``,
    ``min_clt_size``, ``min_samples`` and ``distance``. On a cache miss the
    estimator is fitted with ``min_cluster_size=min_clt_size``,
    ``min_samples=min_samples`` and ``metric=distance``, and its ``labels_``
    attribute is pickled as-is.
    """
    cache_path = Path(data_dir, f'labels_cuMLhdbscan_{dataset}_{min_clt_size:02d}_{min_samples:02d}_{distance}.pkl')
    if cache_path.is_file():
        return load_labels(cache_path)

    # Cache miss: fit the model and persist its labels for later runs.
    estimator = HDBSCAN(min_cluster_size=min_clt_size, min_samples=min_samples, metric=distance)
    estimator = estimator.fit(data)
    assignments = estimator.labels_
    export_labels(assignments, cache_path)
    return assignments


def evaluate_cluster(data, labels, distance, method, n_clusters=None):
    """Compute internal validity scores for one clustering result.

    Parameters
    ----------
    data : array-like
        One row per point; converted with ``np.asarray``.
    labels : array-like
        One cluster label per point. For any ``method`` other than ``'km'``
        the label ``-1`` is treated as noise and those points are excluded
        from the scores.
    distance : str
        Metric name forwarded to ``cython_silhouette_score``.
    method : str
        ``'km'`` or anything else (handled as a density-based method).
    n_clusters : int, optional
        Number of clusters for ``'km'``. When omitted it is derived from the
        distinct values in ``labels``. Ignored for other methods, where it is
        always counted from the non-noise labels.

    Returns
    -------
    dict
        ``n_clusters``, then (non-``'km'`` only) ``n_noise``, then
        ``sl_score``, ``ch_score``, ``db_score`` and ``entropy``. ``n_noise``
        is always present for non-``'km'`` methods (0 when there is no noise)
        so that every row of a run has the same columns. All four scores are
        ``None`` when there are fewer than two clusters or every scored point
        is its own cluster; ``entropy`` is also ``None`` for ``'km'``.
    """
    points = np.asarray(data)
    assigned = np.asarray(labels)
    results = {}

    if method != 'km':
        # Noise points (label -1) are counted but excluded from scoring.
        is_clustered = assigned != -1
        clust_data = points[is_clustered]
        clust_labs = assigned[is_clustered]
        n_clusters = int(np.unique(clust_labs).size)
        results['n_clusters'] = n_clusters
        results['n_noise'] = int(np.count_nonzero(~is_clustered))
    else:
        clust_data = points
        clust_labs = assigned
        if n_clusters is None:
            n_clusters = int(np.unique(clust_labs).size)
        results['n_clusters'] = n_clusters

    # The scores are undefined for a single cluster or all-singleton clusters.
    if len(clust_labs) == n_clusters or n_clusters < 2:
        results.update({'sl_score': None, 'ch_score': None, 'db_score': None, 'entropy': None})
    else:
        results.update({
            'sl_score': cython_silhouette_score(clust_data, clust_labs, metric=distance),
            'ch_score': calinski_harabasz_score(clust_data, clust_labs),
            'db_score': davies_bouldin_score(clust_data, clust_labs),
            # Entropy is only reported for the non-KMeans methods.
            'entropy': cython_entropy(clust_labs) if method != 'km' else None,
        })

    return results


def get_results(data, filename, labels, dataset, distance, method, **kwargs):
    """Evaluate one clustering result and append it to ``<filename>.csv``.

    The row written to ``data_dir/<filename>.csv`` contains ``distance`` and
    ``dataset``, then the method's hyperparameters (for ``'dbscan'`` and
    ``'hdbscan'``), then the scores returned by ``evaluate_cluster``.

    Parameters
    ----------
    data, labels, distance
        Forwarded to ``evaluate_cluster``.
    filename : str
        CSV file name without the ``.csv`` extension.
    dataset : str
        Dataset identifier stored in the row.
    method : str
        ``'km'`` (requires ``n_clusters``), ``'dbscan'`` (requires
        ``epsilon`` and ``min_pts``) or ``'hdbscan'`` (requires
        ``min_clt_size`` and ``min_samples``).

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    KeyError
        If a keyword argument required by ``method`` is missing.
    """
    # Hyperparameters are read before scoring so a missing keyword fails fast.
    if method == 'km':
        n_clusters = kwargs['n_clusters']
        cls_res = evaluate_cluster(data, labels, distance, method, n_clusters)
    elif method == 'dbscan':
        params = {'epsilon': kwargs['epsilon'], 'min_pts': kwargs['min_pts']}
        cls_res = params | evaluate_cluster(data, labels, distance, method)
    elif method == 'hdbscan':
        params = {'min_clt_size': kwargs['min_clt_size'], 'min_samples': kwargs['min_samples']}
        cls_res = params | evaluate_cluster(data, labels, distance, method)
    else:
        raise ValueError(f"Unknown clustering method {method!r}; expected 'km', 'dbscan' or 'hdbscan'")

    results = {'distance': distance, 'dataset': dataset} | cls_res
    export_results(Path(data_dir, filename + '.csv'), results)


def clustering(path, method='km', **kwargs):
    """Run a hyperparameter sweep of one clustering method over one Doc2Vec model.

    The document vectors of the model at ``path`` are clustered for every
    hyperparameter combination, and each result is scored and appended to a
    CSV named ``eval_cuML_<method>_<unix timestamp>.csv`` (see ``get_results``).

    Parameters
    ----------
    path : str or pathlib.Path
        A single Doc2Vec model file. The dataset name is the file stem with
        its first ``_``-separated token removed.
    method : str
        ``'km'``, ``'dbscan'`` or ``'hdbscan'``.
    **kwargs
        ``distance`` (optional, default ``'euclidean'``) plus, depending on
        ``method``:
        ``'km'``: ``min_clust``, ``max_clust`` (inclusive range);
        ``'dbscan'``: ``eps_range``, ``min_pts_range``;
        ``'hdbscan'``: ``mcs_range``, ``min_samples_range``.

    Raises
    ------
    ValueError
        If ``method`` is not supported. Checked before the model is loaded so
        a typo does not cost a model load and then silently do nothing.
    """
    if method not in ('km', 'dbscan', 'hdbscan'):
        raise ValueError(f"Unknown clustering method {method!r}; expected 'km', 'dbscan' or 'hdbscan'")

    path = Path(path)
    distance = kwargs.get('distance', 'euclidean')

    # One results file per call, named after the method and the start time.
    filename = f'eval_cuML_{method}_{int(datetime.today().timestamp())}'

    model = Doc2Vec.load(str(path))
    data = model.dv.vectors
    dataset = '_'.join(path.stem.split('_')[1:])

    print(f'Performing {method} and evaluating for {dataset} points')

    if method == 'km':
        for n_clusters in range(kwargs['min_clust'], kwargs['max_clust'] + 1):
            labels = get_kmeans(data, dataset, distance, n_clusters)
            get_results(data, filename, labels, dataset, distance, method, n_clusters=n_clusters)
    elif method == 'dbscan':
        for epsilon in kwargs['eps_range']:
            for min_pts in kwargs['min_pts_range']:
                labels = get_dbscan(data, dataset, distance, epsilon, min_pts)
                get_results(data, filename, labels, dataset, distance, method, epsilon=epsilon, min_pts=min_pts)
    else:  # 'hdbscan'
        for mcs in kwargs['mcs_range']:
            for min_samples in kwargs['min_samples_range']:
                labels = get_hdbscan(data, dataset, distance, mcs, min_samples)
                get_results(data, filename, labels, dataset, distance, method, min_clt_size=mcs, min_samples=min_samples)



# HDBSCAN

In [4]:
# HDBSCAN sweep grid: both hyperparameters take the values 2..20 inclusive.
mcs_range = list(range(2, 21))
print(mcs_range)
min_samples_range = list(range(2, 21))
print(min_samples_range)

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


In [6]:
# clustering(mod_paths[:2], method='hdbscan', mcs_range=mcs_range, min_samples_range=min_samples_range)
# for i in range(len(mod_paths)):
# HDBSCAN sweep over every model from index 8 onward, timing each model.
for path in mod_paths[8:]:
    t0 = datetime.now()
    clustering(path, method='hdbscan', mcs_range=mcs_range, min_samples_range=min_samples_range)
    print(f'Took: {datetime.now() - t0}')

Performing hdbscan and evaluating for chains_split_2_300 points
Took: 0:31:51.766285
Performing hdbscan and evaluating for chains_split_2_50 points
Took: 0:51:33.349791
Performing hdbscan and evaluating for chains_split_3_300 points


KeyboardInterrupt: 

In [None]:
# clustering(mod_paths[2:4], method='hdbscan', mcs_range=mcs_range, min_samples_range=min_samples_range)
# HDBSCAN sweep over the model at index 2, timing it.
# Iterate the slice itself: indexing mod_paths with a counter that starts at 0
# would process mod_paths[0] instead of the selected model.
for path in mod_paths[2:3]:
    t0 = datetime.now()
    clustering(path, method='hdbscan', mcs_range=mcs_range, min_samples_range=min_samples_range)
    t1 = datetime.now()
    print(f'Took: {t1-t0}')

Performing hdbscan and evaluating for forward points
Took: 0:17:45.370655
Performing hdbscan and evaluating for no_duplicates points


In [None]:
# clustering(mod_paths[3:4], method='hdbscan', mcs_range=mcs_range, min_samples_range=min_samples_range)
# HDBSCAN sweep over the model at index 3, timing it.
# Iterate the slice itself: indexing mod_paths with a counter that starts at 0
# would process mod_paths[0] instead of the selected model.
for path in mod_paths[3:4]:
    t0 = datetime.now()
    clustering(path, method='hdbscan', mcs_range=mcs_range, min_samples_range=min_samples_range)
    t1 = datetime.now()
    print(f'Took: {t1-t0}')

Performing hdbscan and evaluating for no_duplicates points


In [5]:
# clustering(mod_paths[4:], method='hdbscan', mcs_range=mcs_range, min_samples_range=min_samples_range)
# HDBSCAN sweep over every model from index 4 onward, timing each model.
# Iterate the slice itself: indexing mod_paths with a counter that starts at 0
# would process the first models of the list instead of the selected ones.
for path in mod_paths[4:]:
    t0 = datetime.now()
    clustering(path, method='hdbscan', mcs_range=mcs_range, min_samples_range=min_samples_range)
    t1 = datetime.now()
    print(f'Took: {t1-t0}')

Performing hdbscan and evaluating for not_original points
Took: 0:59:09.411661
Performing hdbscan and evaluating for replies points
Took: 1:28:25.749069


# DBSCAN

In [4]:
# DBSCAN sweep grid: min_pts in 2..15 and epsilon in 0.20..1.00 (step 0.01).
min_pts_range = list(range(2, 16))
print(min_pts_range)
eps_range = [round(hundredths * 0.01, 3) for hundredths in range(20, 101)]
print(eps_range)

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
[0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0]


In [5]:
# Show the selected model paths; other cells pick models from this list by index/slice.
mod_paths

[PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_gt_1_300.model'),
 PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_gt_1_50.model'),
 PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_replies_300.model'),
 PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_replies_50.model'),
 PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_split_0_300.model'),
 PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_split_0_50.model'),
 PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_split_1_300.model'),
 PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_split_1_50.model'),
 PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_split_2_300.model'),
 PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_split_2_50.model'),
 PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_split_3_300.model'),
 PosixPath('/home/miguel/Projects/tfm-nlp/models/d2v_chains_split_3_50.model')]

In [7]:
# DBSCAN sweep (default distance) over the first six models, timing each model.
for path in mod_paths[:6]:
    t0 = datetime.now()
    clustering(path, method='dbscan', eps_range=eps_range, min_pts_range=min_pts_range)
    print(f'Took: {datetime.now() - t0}')

Performing dbscan and evaluating for chains_gt_1_300 points
Took: 0:06:48.698934
Performing dbscan and evaluating for chains_gt_1_50 points
Took: 0:03:31.620055
Performing dbscan and evaluating for chains_replies_300 points
Took: 0:23:15.766989
Performing dbscan and evaluating for chains_replies_50 points
Took: 0:08:22.964877
Performing dbscan and evaluating for chains_split_0_300 points
Took: 0:09:07.265597
Performing dbscan and evaluating for chains_split_0_50 points
Took: 0:04:16.051523


In [None]:
# DBSCAN sweep (default distance) over every model, timing each model.
for path in mod_paths:
    t0 = datetime.now()
    clustering(path, method='dbscan', eps_range=eps_range, min_pts_range=min_pts_range)
    print(f'Took: {datetime.now() - t0}')

In [5]:
# clustering(mod_paths[:3], method='dbscan', eps_range=eps_range, min_pts_range=min_pts_range)
# DBSCAN sweep (default distance) over the first three models, timing each model.
for path in mod_paths[:3]:
    t0 = datetime.now()
    clustering(path, method='dbscan', eps_range=eps_range, min_pts_range=min_pts_range)
    print(f'Took: {datetime.now() - t0}')

Performing dbscan and evaluating for chain_children points
Took: 0:27:42.338594
Performing dbscan and evaluating for chain_fathers points
Took: 1:01:47.539108
Performing dbscan and evaluating for forward points
Took: 0:01:20.729172


In [6]:
# clustering(mod_paths[4:], method='dbscan', eps_range=eps_range, min_pts_range=min_pts_range)
# DBSCAN sweep (default distance) over every model from index 4 onward.
# Iterate the slice itself: indexing mod_paths with a counter that starts at 0
# would process the first models of the list instead of the selected ones.
for path in mod_paths[4:]:
    t0 = datetime.now()
    clustering(path, method='dbscan', eps_range=eps_range, min_pts_range=min_pts_range)
    t1 = datetime.now()
    print(f'Took: {t1-t0}')

Performing dbscan and evaluating for not_original points
Took: 0:08:55.745006
Performing dbscan and evaluating for replies points
Took: 0:17:27.458527


In [None]:
# clustering(mod_paths[3:4], method='dbscan', eps_range=eps_range, min_pts_range=min_pts_range)
# DBSCAN sweep (default distance) over the model at index 3.
# Iterate the slice itself: indexing mod_paths with a counter that starts at 0
# would process mod_paths[0] instead of the selected model.
for path in mod_paths[3:4]:
    t0 = datetime.now()
    clustering(path, method='dbscan', eps_range=eps_range, min_pts_range=min_pts_range)
    t1 = datetime.now()
    print(f'Took: {t1-t0}')

In [7]:
# DBSCAN sweep grid: min_pts in 2..15 and epsilon in 0.10..1.00 (step 0.01).
min_pts_range = list(range(2, 16))
print(min_pts_range)
eps_range = [round(hundredths * 0.01, 3) for hundredths in range(10, 101)]
print(eps_range)

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
[0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0]


In [8]:
# DBSCAN sweep with cosine distance over the first three models, timing each model.
for path in mod_paths[:3]:
    t0 = datetime.now()
    clustering(path, method='dbscan', eps_range=eps_range, min_pts_range=min_pts_range, distance='cosine')
    print(f'Took: {datetime.now() - t0}')

Performing dbscan and evaluating for chain_children points
Took: 0:39:05.931020
Performing dbscan and evaluating for chain_fathers points
Took: 1:17:36.099185
Performing dbscan and evaluating for forward points
Took: 0:03:21.913214


In [9]:
# DBSCAN sweep with cosine distance over every model from index 4 onward.
# Iterate the slice itself: indexing mod_paths with a counter that starts at 0
# would process the first models of the list instead of the selected ones.
for path in mod_paths[4:]:
    t0 = datetime.now()
    clustering(path, method='dbscan', eps_range=eps_range, min_pts_range=min_pts_range, distance='cosine')
    t1 = datetime.now()
    print(f'Took: {t1-t0}')

Performing dbscan and evaluating for not_original points
Took: 0:21:03.090316
Performing dbscan and evaluating for replies points
Took: 0:25:28.512184


In [None]:
# clustering(mod_paths[3:4], method='dbscan', eps_range=eps_range, min_pts_range=min_pts_range, distance='cosine')
# DBSCAN sweep with cosine distance over the model at index 3.
# Iterate the slice itself: indexing mod_paths with a counter that starts at 0
# would process mod_paths[0] instead of the selected model.
for path in mod_paths[3:4]:
    t0 = datetime.now()
    clustering(path, method='dbscan', eps_range=eps_range, min_pts_range=min_pts_range, distance='cosine')
    t1 = datetime.now()
    print(f'Took: {t1-t0}')

# KMeans

In [8]:
# KMeans sweep (2..20 clusters, default distance) over the first six models.
for path in mod_paths[:6]:
    t0 = datetime.now()
    clustering(path, min_clust=2, max_clust=20)
    print(f'Took: {datetime.now() - t0}')

Performing km and evaluating for chains_gt_1_300 points
Took: 0:00:23.300622
Performing km and evaluating for chains_gt_1_50 points
Took: 0:00:13.242667
Performing km and evaluating for chains_replies_300 points
Took: 0:00:57.562967
Performing km and evaluating for chains_replies_50 points
Took: 0:00:37.871400
Performing km and evaluating for chains_split_0_300 points
Took: 0:00:26.950204
Performing km and evaluating for chains_split_0_50 points
Took: 0:00:16.945235


In [None]:
# KMeans sweep (2..20 clusters, default distance) over every model.
for path in mod_paths:
    t0 = datetime.now()
    clustering(path, min_clust=2, max_clust=20)
    print(f'Took: {datetime.now() - t0}')

In [13]:
# KMeans sweep (2..20 clusters) over every model, scored with cosine distance.
for path in mod_paths:
    t0 = datetime.now()
    clustering(path, min_clust=2, max_clust=20, distance='cosine')
    print(f'Took: {datetime.now() - t0}')

Performing km and evaluating for chain_children points
Took: 0:00:48.638613
Performing km and evaluating for chain_fathers points
Took: 0:01:40.050665
Performing km and evaluating for forward points
Took: 0:00:04.075850
Performing km and evaluating for no_duplicates points
Took: 0:04:44.697953
Performing km and evaluating for not_original points
Took: 0:00:19.473320
Performing km and evaluating for replies points
Took: 0:00:32.727606


# HDBSCAN for random sizes

In [7]:
# HDBSCAN sweep over the last two models, timing each model.
# clustering() handles a single model path, so the slice is iterated here.
for path in mod_paths[-2:]:
    t0 = datetime.now()
    clustering(path, method='hdbscan', mcs_range=mcs_range, min_samples_range=min_samples_range)
    t1 = datetime.now()
    print(f'Took: {t1-t0}')

Performing hdbscan and evaluating for 90000 points
Took: 0:52:48.133023
Performing hdbscan and evaluating for 95000 points
Took: 1:13:08.788950


In [4]:
# Reduced HDBSCAN sweep grid: both hyperparameters take the values 2..4 inclusive.
mcs_range = list(range(2, 5))
print(mcs_range)
min_samples_range = list(range(2, 5))
print(min_samples_range)

[2, 3, 4]
[2, 3, 4]


In [6]:
# HDBSCAN sweep over the models at indices 2 and 3, timing each model.
# clustering() handles a single model path, so the slice is iterated here.
for path in mod_paths[2:4]:
    t0 = datetime.now()
    clustering(path, method='hdbscan', mcs_range=mcs_range, min_samples_range=min_samples_range)
    t1 = datetime.now()
    print(f'Took: {t1-t0}')

Performing hdbscan and evaluating for 20000 points
Took: 0:00:28.531770
Performing hdbscan and evaluating for 25000 points
Took: 0:00:34.023335


In [7]:
# HDBSCAN sweep over the models at indices 4..11, timing each model.
# clustering() handles a single model path, so the slice is iterated here.
for path in mod_paths[4:12]:
    t0 = datetime.now()
    clustering(path, method='hdbscan', mcs_range=mcs_range, min_samples_range=min_samples_range)
    t1 = datetime.now()
    print(f'Took: {t1-t0}')

Performing hdbscan and evaluating for 30000 points
Took: 0:00:41.591004
Performing hdbscan and evaluating for 35000 points
Took: 0:00:50.358744
Performing hdbscan and evaluating for 40000 points
Took: 0:00:58.804751
Performing hdbscan and evaluating for 45000 points
Took: 0:01:07.808075
Performing hdbscan and evaluating for 50000 points
Took: 0:01:15.664892
Performing hdbscan and evaluating for 55000 points
Took: 0:01:28.587734
Performing hdbscan and evaluating for 60000 points
Took: 0:01:42.591604
Performing hdbscan and evaluating for 65000 points
Took: 0:01:47.774804


In [8]:
# HDBSCAN sweep over every model from index 12 onward, timing each model.
# clustering() handles a single model path, so the slice is iterated here.
for path in mod_paths[12:]:
    t0 = datetime.now()
    clustering(path, method='hdbscan', mcs_range=mcs_range, min_samples_range=min_samples_range)
    t1 = datetime.now()
    print(f'Took: {t1-t0}')

Performing hdbscan and evaluating for 70000 points
Took: 0:01:58.902803
Performing hdbscan and evaluating for 75000 points
Took: 0:02:09.473521
Performing hdbscan and evaluating for 80000 points
Took: 0:02:19.611257
Performing hdbscan and evaluating for 85000 points
Took: 0:02:26.379817
Performing hdbscan and evaluating for 90000 points
Took: 0:02:46.450749
Performing hdbscan and evaluating for 95000 points
Took: 0:02:52.979125


# DBSCAN for random sizes