In [1]:
import csv
import pickle as pkl

import numpy as np
# import pandas as pd

from collections import Counter
from datetime import datetime
from pathlib import Path

from gensim.models.doc2vec import Doc2Vec
# from hdbscan import HDBSCAN
from cuml.cluster import HDBSCAN, DBSCAN
# from sklearn.cluster import DBSCAN, KMeans
# from sklearn.decomposition import TruncatedSVD 
# from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances, silhouette_score, calinski_harabasz_score, davies_bouldin_score
from cuml.metrics.cluster.silhouette_score import cython_silhouette_score
from cuml.metrics.cluster.entropy import cython_entropy


In [2]:
# Project folders are resolved relative to the parent of the current working directory.
data_dir = Path.cwd().parent / 'data/interim'
models_dir = Path.cwd().parent / 'models'
# Every '*.model' file under models_dir as a sorted list of path strings;
# the first sorted entry is skipped (removing 1e5 model).
mod_paths = sorted(str(model_file) for model_file in models_dir.glob('*.model'))[1:]

In [3]:
def export_results(path, data=None):
    """Append one row of results to a CSV file, writing the header on first use.

    Parameters
    ----------
    path : pathlib.Path
        Destination CSV file. It is created if it does not exist yet.
    data : dict, optional
        Mapping of column name -> value. The keys become the header row when
        the file is created; the values are written as one data row. When
        ``None`` there is nothing to write and the call is a no-op.
    """
    # Previously `data.keys()` was evaluated before the None check, so the
    # default argument crashed (and left an empty file behind).
    if data is None:
        return

    needs_header = not path.is_file()
    # newline='' is what the csv module expects from the file object; without
    # it, extra blank lines can appear on platforms that translate '\n'.
    with open(path, 'a', newline='') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(data.keys())
        writer.writerow(data.values())
            

def load_labels(file):
    """Read a pickled labels object from ``file`` and return it.

    NOTE: unpickling can execute arbitrary code; only use this on label files
    produced by this notebook (see ``export_labels``).
    """
    with open(file, 'rb') as stream:
        return pkl.load(stream)


def export_labels(labels, file):
    """Pickle ``labels`` to ``file`` using the highest available pickle protocol."""
    with open(file, 'wb') as stream:
        pickler = pkl.Pickler(stream, protocol=pkl.HIGHEST_PROTOCOL)
        pickler.dump(labels)


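# NOTE: get_kmeans is disabled (commented out) together with the sklearn KMeans
# import, but clustering() still calls it for method='km' (its default), so that
# path raises NameError until this helper is restored.
# def get_kmeans(data, size, distance, n_clusters):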
#     labels_file = Path(data_dir, f'labels_km_{size}_{n_clusters:02d}_{distance}.pkl')
#     if labels_file.is_file():
#         labels = load_labels(labels_file)
#     else:
#         km = KMeans(n_clusters=n_clusters)
#         km.fit(data)
#         labels = km.labels_.tolist()
#         export_labels(labels, labels_file)
        
#     return labels


def get_dbscan(data, size, distance, epsilon, min_pts):
    """Return DBSCAN labels for ``data``, reusing a pickled result when one exists.

    The cache file lives in ``data_dir`` and its name encodes ``size``,
    ``epsilon``, ``min_pts`` (zero-padded to two digits) and ``distance``.
    On a cache miss the cuML ``DBSCAN`` estimator is fitted with
    ``eps=epsilon``, ``min_samples=min_pts`` and ``metric=distance``, and the
    resulting ``labels_`` are pickled before being returned.
    """
    cache_path = Path(data_dir, f'labels_cuMLdbscan_{size}_{epsilon}_{min_pts:02d}_{distance}.pkl')

    # Cache hit: skip the fit entirely.
    if cache_path.is_file():
        return load_labels(cache_path)

    estimator = DBSCAN(eps=epsilon, min_samples=min_pts, metric=distance)
    fitted = estimator.fit(data)
    labels = fitted.labels_
    export_labels(labels, cache_path)
    return labels


def get_hdbscan(data, size, distance, min_clt_size, min_samples):
    """Return HDBSCAN labels for ``data``, reusing a pickled result when one exists.

    The cache file lives in ``data_dir`` and its name encodes ``size``,
    ``min_clt_size`` and ``min_samples`` (both zero-padded to two digits) and
    ``distance``. On a cache miss the cuML ``HDBSCAN`` estimator is fitted with
    ``min_cluster_size=min_clt_size``, ``min_samples=min_samples`` and
    ``metric=distance``, and the resulting ``labels_`` are pickled before being
    returned.
    """
    cache_path = Path(
        data_dir,
        f'labels_cuMLhdbscan_{size}_{min_clt_size:02d}_{min_samples:02d}_{distance}.pkl',
    )

    # Cache hit: skip the fit entirely.
    if cache_path.is_file():
        return load_labels(cache_path)

    estimator = HDBSCAN(min_cluster_size=min_clt_size, min_samples=min_samples, metric=distance)
    fitted = estimator.fit(data)
    labels = fitted.labels_
    export_labels(labels, cache_path)
    return labels


def evaluate_cluster(data, labels, distance, method, n_clusters=None):
    """Compute cluster-quality metrics for one clustering result.

    Parameters
    ----------
    data : array-like
        One row per point; indexed in parallel with ``labels``.
    labels : array-like
        Cluster label per point. For every method other than ``'km'`` the
        label ``-1`` is treated as noise and those points are excluded from
        the scores.
    distance : str
        Metric name forwarded to the silhouette score only; the
        Calinski-Harabasz and Davies-Bouldin scores are called without it.
    method : str
        ``'km'`` uses ``data``/``labels`` as-is and the supplied
        ``n_clusters``; any other value derives the cluster count from the
        non-noise labels.
    n_clusters : int, optional
        Number of clusters; only read when ``method == 'km'``.

    Returns
    -------
    dict
        Keys, in order: ``n_clusters``, ``n_noise`` (non-``'km'`` methods
        only), ``sl_score``, ``ch_score``, ``db_score``, ``entropy``. The four
        scores are ``None`` when there are fewer than two clusters or every
        retained point is its own cluster.

    Notes
    -----
    The key set is the same for every call with a given ``method``. The rows
    are appended to a CSV whose header is written once, so a key that comes
    and goes (previously ``n_noise`` and ``entropy``) would shift columns.
    """
    results = {}
    data_arr = np.asarray(data)
    labels_arr = np.asarray(labels)

    if method != 'km':
        # Boolean mask instead of a per-point Python loop.
        is_clustered = labels_arr != -1
        clust_data = data_arr[is_clustered]
        clust_labs = labels_arr[is_clustered]

        n_clusters = len(np.unique(clust_labs))
        results['n_clusters'] = n_clusters
        # Always reported (0 when there is no noise) to keep the schema stable.
        results['n_noise'] = int(labels_arr.size - np.count_nonzero(is_clustered))
    else:
        clust_data = data_arr
        clust_labs = labels_arr
        results['n_clusters'] = n_clusters

    if len(clust_labs) == n_clusters or n_clusters < 2:
        # Scores are not computed for these degenerate clusterings.
        results.update({'sl_score': None, 'ch_score': None, 'db_score': None, 'entropy': None})
    else:
        results.update({
            'sl_score': cython_silhouette_score(clust_data, clust_labs, metric=distance),
            'ch_score': calinski_harabasz_score(clust_data, clust_labs),
            'db_score': davies_bouldin_score(clust_data, clust_labs),
            'entropy': cython_entropy(clust_labs),
        })

    return results


def get_results(data, filename, labels, size, distance, method, **kwargs):
    """Evaluate one clustering run and append its metrics to ``<filename>.csv``.

    Parameters
    ----------
    data, labels, distance
        Forwarded to ``evaluate_cluster``.
    filename : str
        CSV file name without extension; the file is placed in ``data_dir``.
    size
        Written to the ``size`` column of the row.
    method : {'km', 'dbscan', 'hdbscan'}
        Selects which hyper-parameters are read from ``kwargs``.
    **kwargs
        ``n_clusters`` for ``'km'``; ``epsilon`` and ``min_pts`` for
        ``'dbscan'``; ``min_clt_size`` and ``min_samples`` for ``'hdbscan'``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names (previously this
        surfaced as an UnboundLocalError on ``cls_res``).
    KeyError
        If a hyper-parameter required by ``method`` is missing from ``kwargs``.
    """
    if method == 'km':
        params = {}
        cls_res = evaluate_cluster(data, labels, distance, method, kwargs['n_clusters'])
    elif method == 'dbscan':
        cls_res = evaluate_cluster(data, labels, distance, method)
        params = {'epsilon': kwargs['epsilon'], 'min_pts': kwargs['min_pts']}
    elif method == 'hdbscan':
        cls_res = evaluate_cluster(data, labels, distance, method)
        params = {'min_clt_size': kwargs['min_clt_size'], 'min_samples': kwargs['min_samples']}
    else:
        raise ValueError(
            f"Unknown clustering method {method!r}; expected 'km', 'dbscan' or 'hdbscan'"
        )

    # Column order: run identifiers, then hyper-parameters, then metrics.
    results = {'distance': distance, 'size': size} | params | cls_res
    export_results(Path(data_dir, filename + '.csv'), results)

    
def clustering(path_list, method='km', **kwargs):
    """Run a hyper-parameter grid of one clustering method over several models.

    For each path in ``path_list`` the Doc2Vec model is loaded, its document
    vectors (``model.dv.vectors``) are clustered for every hyper-parameter
    combination, and each run is evaluated and appended to a CSV through
    ``get_results``. All rows of one call go to the same file, named
    ``eval_cuML_<method>_<unix timestamp>``.

    Parameters
    ----------
    path_list : list of str
        Paths of saved Doc2Vec models. The ``size`` recorded for a model is
        the third ``'_'``-separated field of the path string.
    method : {'km', 'dbscan', 'hdbscan'}
        Clustering method to run.
    **kwargs
        ``distance`` (optional, defaults to ``'euclidean'``) plus:
        ``min_clust`` and ``max_clust`` (inclusive) for ``'km'``;
        ``eps_range`` and ``min_pts_range`` for ``'dbscan'``;
        ``mcs_range`` and ``min_samples_range`` for ``'hdbscan'``.

    Raises
    ------
    ValueError
        If ``method`` is not supported. Previously an unknown method loaded
        every model and silently produced no results.
    """
    if method not in ('km', 'dbscan', 'hdbscan'):
        raise ValueError(
            f"Unknown clustering method {method!r}; expected 'km', 'dbscan' or 'hdbscan'"
        )

    distance = kwargs.get('distance', 'euclidean')
    filename = f'eval_cuML_{method}_{int(datetime.today().timestamp())}'

    for model_path in path_list:
        model = Doc2Vec.load(model_path)
        data = model.dv.vectors
        # NOTE(review): the split runs on the whole path string, so an
        # underscore in a parent directory shifts the index - confirm against
        # the actual model file locations.
        size = model_path.split('_')[2]

        print(f'Performing {method} and evaluating for {size} points')

        t0 = datetime.now()
        if method == 'km':
            # NOTE: get_kmeans is commented out in this notebook, so this
            # branch raises NameError until that helper is restored.
            for n_clusters in range(kwargs['min_clust'], kwargs['max_clust'] + 1):
                labels = get_kmeans(data, size, distance, n_clusters)
                get_results(data, filename, labels, size, distance, method, n_clusters=n_clusters)
        elif method == 'dbscan':
            for epsilon in kwargs['eps_range']:
                for min_pts in kwargs['min_pts_range']:
                    labels = get_dbscan(data, size, distance, epsilon, min_pts)
                    get_results(data, filename, labels, size, distance, method,
                                epsilon=epsilon, min_pts=min_pts)
        else:  # 'hdbscan'
            for mcs in kwargs['mcs_range']:
                for min_samples in kwargs['min_samples_range']:
                    labels = get_hdbscan(data, size, distance, mcs, min_samples)
                    get_results(data, filename, labels, size, distance, method,
                                min_clt_size=mcs, min_samples=min_samples)
        t1 = datetime.now()
        print(f'Took: {t1-t0}')
           

# HDBSCAN

In [4]:
# HDBSCAN grid: minimum cluster sizes and minimum samples, both 5 through 20.
mcs_range = list(range(5, 21))
print(mcs_range)
min_samples_range = list(range(5, 21))
print(min_samples_range)

[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


In [None]:
# HDBSCAN grid search over mod_paths[4:] (the fifth listed model onward).
# NOTE(review): this cell carries no execution count and no output in the saved
# notebook - presumably it was never run to completion; confirm before re-running.
clustering(mod_paths[4:], method='hdbscan', mcs_range=mcs_range, min_samples_range=min_samples_range)

In [7]:
# HDBSCAN grid search over the last two entries of mod_paths.
# NOTE(review): mcs_range / min_samples_range are assigned in two different cells
# of this notebook; which values this run used depends on execution order - confirm.
clustering(mod_paths[-2:], method='hdbscan', mcs_range=mcs_range, min_samples_range=min_samples_range)

Performing hdbscan and evaluating for 90000 points
Took: 0:52:48.133023
Performing hdbscan and evaluating for 95000 points
Took: 1:13:08.788950


In [4]:
# Second HDBSCAN grid: minimum cluster sizes and minimum samples, both 2 through 4.
# This rebinds the names used by the earlier 5-20 grid.
mcs_range = list(range(2, 5))
print(mcs_range)
min_samples_range = list(range(2, 5))
print(min_samples_range)

[2, 3, 4]
[2, 3, 4]


In [6]:
# HDBSCAN grid search over mod_paths[2:4] (third and fourth listed models).
clustering(mod_paths[2:4], method='hdbscan', mcs_range=mcs_range, min_samples_range=min_samples_range)

Performing hdbscan and evaluating for 20000 points
Took: 0:00:28.531770
Performing hdbscan and evaluating for 25000 points
Took: 0:00:34.023335


In [7]:
# HDBSCAN grid search over mod_paths[4:12] (eight models).
clustering(mod_paths[4:12], method='hdbscan', mcs_range=mcs_range, min_samples_range=min_samples_range)

Performing hdbscan and evaluating for 30000 points
Took: 0:00:41.591004
Performing hdbscan and evaluating for 35000 points
Took: 0:00:50.358744
Performing hdbscan and evaluating for 40000 points
Took: 0:00:58.804751
Performing hdbscan and evaluating for 45000 points
Took: 0:01:07.808075
Performing hdbscan and evaluating for 50000 points
Took: 0:01:15.664892
Performing hdbscan and evaluating for 55000 points
Took: 0:01:28.587734
Performing hdbscan and evaluating for 60000 points
Took: 0:01:42.591604
Performing hdbscan and evaluating for 65000 points
Took: 0:01:47.774804


In [8]:
# HDBSCAN grid search over the remaining models, mod_paths[12:].
clustering(mod_paths[12:], method='hdbscan', mcs_range=mcs_range, min_samples_range=min_samples_range)

Performing hdbscan and evaluating for 70000 points
Took: 0:01:58.902803
Performing hdbscan and evaluating for 75000 points
Took: 0:02:09.473521
Performing hdbscan and evaluating for 80000 points
Took: 0:02:19.611257
Performing hdbscan and evaluating for 85000 points
Took: 0:02:26.379817
Performing hdbscan and evaluating for 90000 points
Took: 0:02:46.450749
Performing hdbscan and evaluating for 95000 points
Took: 0:02:52.979125


# DBSCAN

In [5]:
# DBSCAN grid: min_pts 5 through 10, epsilon 0.50 through 1.00 in steps of 0.01.
min_pts_range = list(range(5, 11))
print(min_pts_range)
# Built from integer steps and rounded so the values print as clean two-decimal floats.
eps_range = [round(step * 0.01, 3) for step in range(50, 101)]
print(eps_range)

[5, 6, 7, 8, 9, 10]
[0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0]


In [6]:
# DBSCAN grid search over the first four entries of mod_paths.
clustering(mod_paths[:4], method='dbscan', eps_range=eps_range, min_pts_range=min_pts_range)

Performing dbscan and evaluating for 10000 points
Performing dbscan and evaluating for 15000 points
Performing dbscan and evaluating for 20000 points
Performing dbscan and evaluating for 25000 points


In [7]:
# DBSCAN grid search over mod_paths[4:12] (eight models).
clustering(mod_paths[4:12], method='dbscan', eps_range=eps_range, min_pts_range=min_pts_range)

Performing dbscan and evaluating for 30000 points
Performing dbscan and evaluating for 35000 points
Performing dbscan and evaluating for 40000 points
Performing dbscan and evaluating for 45000 points
Performing dbscan and evaluating for 50000 points
Performing dbscan and evaluating for 55000 points
Performing dbscan and evaluating for 60000 points
Performing dbscan and evaluating for 65000 points


In [8]:
# DBSCAN grid search over the remaining models, mod_paths[12:].
clustering(mod_paths[12:], method='dbscan', eps_range=eps_range, min_pts_range=min_pts_range)

Performing dbscan and evaluating for 70000 points
Performing dbscan and evaluating for 75000 points
Performing dbscan and evaluating for 80000 points
Performing dbscan and evaluating for 85000 points
Performing dbscan and evaluating for 90000 points
Performing dbscan and evaluating for 95000 points
