### Naive Approach

In [None]:
import sys
sys.path.append(r"D:\COURS\A4\S8 - ESILV\Stage\Work\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\myLibraries")

from MARScore.utils import * 
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import hdbscan
from sklearn.metrics import make_scorer
from random import seed
from datasets_loaders.loaders import load_billsum
import numbers

In [2]:
billsum = load_billsum()
subset = billsum.iloc[:10, :]

In [3]:
def DBCV(model, X, y=None):
    preds = model.fit_predict(X)
    return hdbscan.validity.validity_index(X, preds) if len(set(preds)) > 1 else float('nan')

def DBCV2(model, X, y=None):
    preds = [model.fit_predict(x) for x in X]
    score = np.mean(hdbscan.validity.validity_index(x, pred) for x, pred in zip(X, preds))
    return score if score != 0 else float('nan')

def HDBScanFinetune(v_texts, 
                    min_samples=[10,30,50,60,100], 
                    min_cluster_size=[100,200,300,400,500,600],
                    cluster_selection_method=['eom','leaf'],
                    seed_num=0, 
                    verbose=True):
    
    #model setup
    hdb = hdbscan.HDBSCAN(gen_min_span_tree=True)

    # specify parameters and distributions to sample from
    param_dist = {'min_samples': min_samples,
                  'min_cluster_size': min_cluster_size,  
                  'cluster_selection_method': cluster_selection_method
                 }

    #validity_scroer = "hdbscan__hdbscan___HDBSCAN__validity_index"
    #validity_scorer = make_scorer(DBCV, greater_is_better=True)

    #parameters research
    """
    random_search = GridSearchCV(hdb,
                                 param_grid=param_dist,
                                 scoring=validity_scorer)
    """
    
    n_iter_search = 2
    random_search = RandomizedSearchCV(hdb,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       scoring=DBCV,
                                       random_state=seed(seed_num))
    best_params = {}
    best_scores = []
    first_lap = True
    for vectors in v_texts:
        random_search.fit(vectors)
        if first_lap:
            for k in random_search.best_params_.keys():
                best_params[k] = []
            first_lap = False
        for k, v in random_search.best_params_.items():
            best_params[k].append(v)
        best_scores.append(random_search.best_estimator_.relative_validity_)
    
    final_params = {k: None for k in best_params.keys()}
    final_score = np.mean(best_scores)
    for k, v in best_params.items():
        if type(best_params[k][0]) == str:
            final_params[k] = max(set(best_params[k]), key = best_params[k].count)
        elif isinstance(best_params[k][0], numbers.Number):
            final_params[k] = np.mean(best_params[k])
        else:
            final_params[k] = best_params[k][0]    

    if verbose:
        print(f"Best Parameters {final_params}")
        print(f"DBCV score :{final_score}")
    return {"best_params": final_params, "dbcv_score": final_score}

In [4]:
def to_shape(a, shape):
    x_, y_ = shape
    x, y = len(a), len(a[0])
    x_pad = (x_-x)
    y_pad = (y_-y)
    return np.pad(a,((0, x_pad),
                    (0, y_pad)),
                mode = 'constant')

In [5]:
#creation of embeddings
all_v = []
for indiv in subset["text"].to_list():
    o, l = tokenizeCorpus(indiv)
    v = vectorizeCorpus(o)
    v, l = cleanAll(v, l)
    all_v.append(v)

Token indices sequence length is longer than the specified maximum sequence length for this model (699 > 512). Running this sequence through the model will result in indexing errors


In [6]:
max_dim_1 = np.max([len(x) for x in all_v])
all_v3 = [to_shape(cur_v, (max_dim_1, len(all_v[0][0]))) for cur_v in all_v]

In [None]:
DBCV2(hdbscan.HDBSCAN(), all_v3)

In [7]:
HDBScanFinetune(all_v3,
                min_samples=[3, 5, 7],
                min_cluster_size=[7, 10, 20],
                cluster_selection_method=['eom','leaf'])



Best Parameters {'min_samples': 5.6, 'min_cluster_size': 11.4, 'cluster_selection_method': 'leaf'}
DBCV score :0.227490788949255


{'best_params': {'min_samples': 5.6,
  'min_cluster_size': 11.4,
  'cluster_selection_method': 'leaf'},
 'dbcv_score': 0.227490788949255}

### Iterative approach

In [3]:
import sys
sys.path.append(r"D:\COURS\A4\S8 - ESILV\Stage\Work\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\myLibraries")

from MARScore.utils import * 
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import hdbscan
from sklearn.metrics import make_scorer
from random import seed, randint
from datasets_loaders.loaders import load_billsum
import numbers
import numpy as np
from datetime import timedelta, datetime

In [4]:
billsum = load_billsum()
subset = billsum.iloc[:50, :]

In [5]:
def DBCV(model, X, y=None):
    preds = model.fit_predict(X)
    return hdbscan.validity.validity_index(X, preds) if len(set(preds)) > 1 else float('nan')

def HDBScanFinetune(v_texts, 
                    start = 1,
                    stop = 3,
                    epsilon = 1,
                    epsilon_reduction_factor = 0.3,
                    delta = 0.5,
                    n_elements = 4,
                    n_iter_search = 2,
                    early_stopping = 3,
                    seed_num=0,
                    max_compile_time = timedelta(hours=2),
                    verbose=True):
    
    
    #params
    first_iter = True
    stable = 0
    start_time = datetime.now()
    compile_time = datetime.now() - start_time

    #model setup
    hdb = hdbscan.HDBSCAN(gen_min_span_tree=True)
    
    #tuning loop
    while((start + delta) < stop or compile_time >= max_compile_time):
        #search
        min_samples = np.rint(np.linspace(np.exp(start), np.exp(stop), n_elements)).astype(int)
        min_cluster_size = np.rint(np.linspace(np.exp(start), np.exp(stop), n_elements)).astype(int)
        cluster_selection_method = ['eom', 'leaf']


        #specify parameters and distributions to sample from
        param_dist = {'min_samples': min_samples,
                      'min_cluster_size': min_cluster_size,
                      'cluster_selection_method': cluster_selection_method
                     }

        random_search = RandomizedSearchCV(hdb,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       scoring=DBCV,
                                       random_state=seed(seed_num))
        best_params = {}
        best_scores = []
        first_lap = True
        better = False
        for vectors in v_texts:
            random_search.fit(vectors)
            if first_lap:
                for k in random_search.best_params_.keys():
                    best_params[k] = []
                first_lap = False
            for k, v in random_search.best_params_.items():
                best_params[k].append(v)
            best_scores.append(random_search.best_estimator_.relative_validity_)
        
        cur_params = {k: None for k in best_params.keys()}
        cur_score = np.mean(best_scores)
        for k, v in best_params.items():
            if type(best_params[k][0]) == str:
                cur_params[k] = max(set(best_params[k]), key = best_params[k].count)
            elif isinstance(best_params[k][0], numbers.Number):
                cur_params[k] = round(np.mean(best_params[k]))
            else:
                cur_params[k] = best_params[k][0]
        if first_iter:
            global_params = cur_params
            global_score = cur_score
            first_iter = False
        elif cur_score > global_score:
            global_params = cur_params
            global_score = cur_score
            better = True

        #adapting ranges
        if better:
            for k, v in cur_params.items():
                if cur_params[k] == locals()[k][0]:
                    start -= epsilon
                    stop -= epsilon
                elif cur_params[k] == locals()[k][-1]:
                    start += epsilon
                    stop += epsilon
                else:
                    side = np.round(randint(0, 1))
                    if side:
                        stop -= epsilon
                    else:
                        start += epsilon
                    epsilon -= epsilon*epsilon_reduction_factor
        elif not(better) and not(first_iter):
            stable += 1
        if stable >= early_stopping:
            break
        compile_time = datetime.now() - start_time

    if verbose:
        print(f"Best Parameters {global_params}")
        print(f"DBCV score :{global_score}")
    return {"best_params": global_params, "dbcv_score": global_score}

In [6]:
def to_shape(a, shape):
    x_, y_ = shape
    x, y = len(a), len(a[0])
    x_pad = (x_-x)
    y_pad = (y_-y)
    return np.pad(a,((0, x_pad),
                    (0, y_pad)),
                mode = 'constant')

#creation of embeddings
all_v = []
for indiv in subset["text"].to_list():
    o, l = tokenizeCorpus(indiv)
    v = vectorizeCorpus(o)
    v, l = cleanAll(v, l)
    all_v.append(v)

max_dim_1 = np.max([len(x) for x in all_v])
all_v3 = [to_shape(cur_v, (max_dim_1, len(all_v[0][0]))) for cur_v in all_v]

Token indices sequence length is longer than the specified maximum sequence length for this model (699 > 512). Running this sequence through the model will result in indexing errors


In [7]:
HDBScanFinetune(all_v3)

Traceback (most recent call last):
  File "d:\COURS\A4\S8 - ESILV\Stage\Work\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\orteg\AppData\Local\Temp\ipykernel_7312\1983354121.py", line 3, in DBCV
    return hdbscan.validity.validity_index(X, preds) if len(set(preds)) > 1 else float('nan')
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\COURS\A4\S8 - ESILV\Stage\Work\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\.venv\Lib\site-packages\hdbscan\validity.py", line 386, in validity_index
    density_sep[i, j] = density_separation(
                        ^^^^^^^^^^^^^^^^^^^
  File "d:\COURS\A4\S8 - ESILV\Stage\Work\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\.venv\Lib\s

ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "d:\COURS\A4\S8 - ESILV\Stage\Work\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 684, in _fit_and_score
    estimator.fit(X_train, **fit_params)
  File "d:\COURS\A4\S8 - ESILV\Stage\Work\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\.venv\Lib\site-packages\hdbscan\hdbscan_.py", line 1190, in fit
    ) = hdbscan(clean_data, **kwargs)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\COURS\A4\S8 - ESILV\Stage\Work\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\.venv\Lib\site-packages\hdbscan\hdbscan_.py", line 687, in hdbscan
    raise ValueError("Min cluster size must be greater than one")
ValueError: Min cluster size must be greater than one
