## Experimenting with Random and grid search code from Will Koehrsen
code can be found here https://www.kaggle.com/code/willkoehrsen/intro-to-model-tuning-grid-and-random-search/notebook

In [1]:
import os
os.chdir("T:/laupodteam/AIOS/Chontira/CellDynClustering")
import random
import numpy as np
import pandas as pd
import time
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import RandomizedSearchCV
from data.load_data import *
from random import randint
from sklearn.metrics import f1_score
from evaluation.util import *
from tabulate import tabulate
from collections import defaultdict
import itertools
import math

closed = load_data().read_from_path("data/toy_dense.csv", True)
seperated = load_data().read_from_path("data/toy_seperated.csv",True)

closed.predicted = GaussianMixture(n_components=3).fit_predict(closed.X)
seperated.predicted = GaussianMixture(n_components=3).fit_predict(seperated.X)

evaluators= {'silhouette_score': silhouette_score,
            'davies_bouldin_score': davies_bouldin_score}
            

metrics_scores_dict(closed.X, closed.predicted, evaluators, verbose= True)
metrics_scores_dict(seperated.X, seperated.predicted, evaluators, verbose= True)

ascending_met = [False, True]

Metric                   Score
--------------------  --------
silhouette_score      0.756181
davies_bouldin_score  0.352836


Metric                   Score
--------------------  --------
silhouette_score      0.851333
davies_bouldin_score  0.211788




## Gaussian Mixture model parameters

In [2]:

params = {'n_components': list(range(2, 6, 1)), 
        'n_init': list(range(2, 10, 2)),
        'covariance_type' : ('full', 'diag'),
        'init_params':('k-means++', "kmeans")}

keys, values = zip(*params.items())

In [3]:

def roundup(x):
    """
    From https://stackoverflow.com/questions/26454649/python-round-up-to-the-nearest-ten
    """
    return int(math.ceil(x / 10.0)) * 10

In [4]:
def numpy_sampling(X, subsampling):  
    
      
    n_data = len(X) 
    idx = np.arange(n_data) 
    np.random.shuffle(idx) 
    return X[idx[: subsampling],:] 

numpy_sampling(closed.X, 1000).shape

(1000, 6)

## Code for the random search

In [24]:

def random_seach_optimization(X, model, evaluators, param_grid ,ascending,subsampling = False, max_evals = 300, num_iter = 5, random_state = None):
   
    """
    Random hyperparameter optimization for clustering algorithm. Measuring the results using multiple evaluators
    Adapted from Will Koehrsen for randomized search of hyperpameters for clustering algorithm.

    Paramters
    ---------
    X: numpy array
        the data that needs to be fitted to the model.
    
    model: function
        the model that needs to be hyperparameter tunned.
    
    evaluators: dict
        the dictionary that stores all the evaluators for measuring the performace of the clusters

    param_grid: dict
        dictionary of the model's parameters with range of values to be seleted by the search.
    
    ascending: list of bool
        list length equals to the number of evaluator
        whether the scores should be displayed in ascending order or descending order. True = ascending

    subsampling: optional, bool or int 
        needs to be an int if subsampling is to be used with a certain number of rows, otherwise the full dataset
        will be used.
    
    max_evals: optional, int
        max number of evaluation to seach for the ideal hyperparameter.
    
    num_iters: optional, int
        number of iterations to run for each hyperparameter setting.
    
    random_state: optional, int
        set to None. If int is used e.g. 0, the randomness is deterministic
    
    Returns
    ---------
    results: DataFrame
        containing the scores of the evaluations, the time it takes to run the model on average (in second),
        the hyperparameter settings, and order with which hyperparameter was ran.
        
    """
    method_start = time.time()
    precent_range = list(range(10,110,10))
    
    #setting random state
    random.seed(random_state)

    # Dataframe for values other then the results
    records = pd.DataFrame(columns = ['iteration', 'params', 'duration_in_second', 'num_labels', 'num_labels_std'],
                                  index = list(range(max_evals)))

    # Dataframe for results
    results = pd.DataFrame(columns = evaluators.keys(),
                                  index = list(range(max_evals)))
    std_words = '_std'

    std_results = pd.DataFrame(columns = [s+std_words for s in list(evaluators.keys())],
                            index= list(range(max_evals)))
    
    # Keep searching until reach max evaluations
    for i in range(max_evals):
        
        #picked  hyperparameters
        hyperparameters = {k: random.sample(v, 1)[0] for k, v in param_grid.items()}

        # dictionary for storing average results
        num_labels = [] 
        scores = defaultdict(list)
        times = []

        #Run evaluation for each hyperparameter setting per num_iter
        for num in range(num_iter):
            sub = X

            start = time.time()
            if(type(subsampling) == int):
                sub = numpy_sampling(sub, subsampling)

            # Evaluate randomly selected hyperparameters
            predicted = model(**hyperparameters).fit_predict(sub)
            times.append(time.time()-start)

            num_labels.append(len(np.unique(predicted)))

            metric_results = metrics_scores_dict(sub, predicted, evaluators, return_dict= True)

            for metric, score in metric_results.items():
                scores[metric].append(score)

        for metric, score_ls in scores.items():
            results.at[i, metric]= np.mean(score_ls)
            std_results.at[i, metric+std_words] = np.std(score_ls)
            

        records.loc[i, :] = [i,hyperparameters,np.mean(times), np.mean(num_labels), np.std(num_labels)]
        
        #Print the percentage until finish
        percent_done = roundup(int((i/max_evals)*100))

        if(percent_done in precent_range):
            print(f"Around {percent_done}% done." )
            precent_range.remove(percent_done)

    #combine all three of the tables for sco
    records  = pd.concat([records, results, std_results], axis=1)

    # Sort with best score on top
    records.sort_values(list(evaluators.keys()), ascending = ascending, inplace = True)
    records.reset_index(inplace = True)

    print("Hyperpameter tuning is done and the best scores are:")
    print(records.loc[0][list(evaluators.keys())]) 
    print("Number of unique labels", int(records['num_labels'][0]))
    print("with parameter:", records['params'][0])
    print("Finish tuning in ",time.time() - method_start, "seconds.")

    return records 

In [25]:
results = random_seach_optimization(seperated.X,GaussianMixture , evaluators, params ,[False, True],subsampling = False, max_evals = 30, num_iter = 5)

Around 10% done.
Around 20% done.
Around 30% done.
Around 40% done.
Around 50% done.
Around 60% done.
Around 70% done.
Around 80% done.
Around 90% done.
Around 100% done.
Hyperpameter tuning is done and the best scores are:
silhouette_score        0.851333
davies_bouldin_score    0.211788
Name: 0, dtype: object
Number of unique labels 3
with parameter: {'n_components': 3, 'n_init': 4, 'covariance_type': 'full', 'init_params': 'k-means++'}
Finish tuning in  73.66451454162598 seconds.


In [12]:
results

Unnamed: 0,index,iteration,params,duration_in_second,num_labels,std_num_labels,silhouette_score,davies_bouldin_score,silhouette_score_std,davies_bouldin_score_std
0,10,10,"{'n_components': 3, 'n_init': 6, 'covariance_t...",0.040967,3.0,0.0,0.851333,0.211788,0.0,0.0
1,17,17,"{'n_components': 3, 'n_init': 4, 'covariance_t...",0.032523,3.0,0.0,0.851333,0.211788,0.0,0.0
2,18,18,"{'n_components': 3, 'n_init': 4, 'covariance_t...",0.091629,3.0,0.0,0.851333,0.211788,0.0,0.0
3,20,20,"{'n_components': 3, 'n_init': 4, 'covariance_t...",0.112811,3.0,0.0,0.851333,0.211788,0.0,0.0
4,22,22,"{'n_components': 3, 'n_init': 6, 'covariance_t...",0.048346,3.0,0.0,0.851333,0.211788,0.0,0.0
5,24,24,"{'n_components': 3, 'n_init': 8, 'covariance_t...",0.084998,3.0,0.0,0.851333,0.211788,0.0,0.0
6,25,25,"{'n_components': 3, 'n_init': 6, 'covariance_t...",0.170446,3.0,0.0,0.851333,0.211788,0.0,0.0
7,28,28,"{'n_components': 3, 'n_init': 8, 'covariance_t...",0.093747,3.0,0.0,0.851333,0.211788,0.0,0.0
8,29,29,"{'n_components': 3, 'n_init': 2, 'covariance_t...",0.102724,3.0,0.0,0.851333,0.211788,0.0,0.0
9,3,3,"{'n_components': 2, 'n_init': 2, 'covariance_t...",0.056349,2.0,0.0,0.741693,0.360219,0.0,0.0


## Code for grid search

In [22]:
def grid_seach_optimization(X, model, evaluators, param_grid ,ascending,subsampling = False, num_iter = 5, random_state = None):
   
    """
    Grid hyperparameter optimization for clustering algorithm. Measuring the results using multiple evaluators
    Adapted from Will Koehrsen for grid searching of hyperpameters for clustering algorithm.

    Paramters
    ---------
    X: numpy array
        the data that needs to be fitted to the model.
    
    model: function
        the model that needs to be hyperparameter tunned.
    
    evaluators: dict
        the dictionary that stores all the evaluators for measuring the performace of the clusters

    param_grid: dict
        dictionary of the model's parameters with range of values to be seleted by the search.
    
    ascending: list of bool
        list length equals to the number of evaluator
        whether the scores should be displayed in ascending order or descending order. True = ascending

    subsampling: optional, bool or int 
        needs to be an int if subsampling is to be used with a certain number of rows, otherwise the full dataset
        will be used.
    
    num_iters: optional, int
        number of iterations to run for each hyperparameter setting.
    
    random_state: optional, int
        set to None. If int is used e.g. 0, the randomness is deterministic.
    
    Returns
    ---------
    results: DataFrame
        containing the scores of the evaluations, the time it takes to run the model on average (in second),
        the hyperparameter settings, and order with which hyperparameter was ran.
        
    """
    # https://codereview.stackexchange.com/questions/171173/list-all-possible-permutations-from-a-python-dictionary-of-lists
    keys, values = zip(*param_grid.items())

    # Keep the length of all set of the parameter combinations
    param_len = 1
    for v in values:
        param_len=param_len*len(v)

    param_len-=1

    method_start = time.time()
    precent_range = list(range(10,110,10))
    
    #setting random state
    random.seed(random_state)

    # Dataframe for recording things other then the results
    records = pd.DataFrame(columns = ['iteration', 'params', 'duration_in_second', 'num_labels', 'num_labels_std'],
                                  index = list(range(param_len)))

    # Dataframe for results
    results = pd.DataFrame(columns = evaluators.keys(),
                                  index = list(range(param_len)))
    std_words = '_std'

    std_results = pd.DataFrame(columns = [s+std_words for s in list(evaluators.keys())],
                            index= list(range(param_len)))

    
    counter = 0
    for i in itertools.product(*values):
        
         # Retrieving the parameter set for a given value i
        hyperparameters = dict(zip(keys, i))
        
        
        # dictionary for storing average results 
        num_labels = [] 
        scores = defaultdict(list)
        times = []

        #Run evaluation for each hyperparameter setting per num_iter
        for num in range(num_iter):
            sub = X

            start = time.time()
            if(type(subsampling) == int):
                sub = numpy_sampling(sub, subsampling)

            # Evaluate randomly selected hyperparameters
            predicted = model(**hyperparameters).fit_predict(sub)
            times.append(time.time()-start)

            num_labels.append(len(np.unique(predicted)))

            metric_results = metrics_scores_dict(sub, predicted, evaluators, return_dict= True)

            for metric, score in metric_results.items():
                scores[metric].append(score)

        for metric, score_ls in scores.items():
            results.at[counter, metric]= np.mean(score_ls)
            std_results.at[counter, metric+std_words] = np.std(score_ls)
            

        records.loc[counter, :] = [counter,hyperparameters,np.mean(times), np.mean(num_labels), np.std(num_labels)]
        
        #Print the percentage until finish
        percent_done = roundup(int((counter/param_len)*100))

        if(percent_done in precent_range):
            print(f"Around {percent_done}% done." )
            precent_range.remove(percent_done)
        
        counter+=1

    records  = pd.concat([records, results, std_results], axis=1)

    # Sort with best score on top
    records.sort_values(list(evaluators.keys()), ascending = ascending, inplace = True)
    records.reset_index(inplace = True)

    print("Hyperpameter tuning is done and the best scores are:")
    print(records.loc[0][list(evaluators.keys())])
    print("Number of unique labels", int(records['num_labels'][0]))
    print("with parameter:", records['params'][0])
    print("Finish tuning in ",time.time() - method_start, "seconds.")

    return records 

In [23]:
seperated_grid_results = grid_seach_optimization(seperated.X, GaussianMixture, evaluators, params,ascending_met, random_state=0,num_iter = 5)


Around 10% done.
Around 20% done.
Around 30% done.
Around 40% done.
Around 50% done.
Around 60% done.
Around 70% done.
Around 80% done.
Around 90% done.
Around 100% done.
Hyperpameter tuning is done and the best scores are:
silhouette_score        0.851333
davies_bouldin_score    0.211788
Name: 0, dtype: object
Number of unique labels 3
with parameter: {'n_components': 3, 'n_init': 2, 'covariance_type': 'full', 'init_params': 'k-means++'}
Finish tuning in  149.5449047088623 seconds.


In [18]:
seperated_grid_results

Unnamed: 0,index,iteration,params,duration_in_second,num_labels,num_labels_std,silhouette_score,davies_bouldin_score,silhouette_score_std,davies_bouldin_score_std
0,16,16,"{'n_components': 3, 'n_init': 2, 'covariance_t...",0.020702,3.0,0.0,0.851333,0.211788,0.0,0.0
1,17,17,"{'n_components': 3, 'n_init': 2, 'covariance_t...",0.0637,3.0,0.0,0.851333,0.211788,0.0,0.0
2,18,18,"{'n_components': 3, 'n_init': 2, 'covariance_t...",0.013194,3.0,0.0,0.851333,0.211788,0.0,0.0
3,19,19,"{'n_components': 3, 'n_init': 2, 'covariance_t...",0.06046,3.0,0.0,0.851333,0.211788,0.0,0.0
4,20,20,"{'n_components': 3, 'n_init': 4, 'covariance_t...",0.035922,3.0,0.0,0.851333,0.211788,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
59,62,62,"{'n_components': 5, 'n_init': 8, 'covariance_t...",0.149589,5.0,0.0,0.397077,1.805195,0.092379,0.321908
60,49,49,"{'n_components': 5, 'n_init': 2, 'covariance_t...",0.130623,5.0,0.0,0.354889,2.193312,0.006024,0.021414
61,55,55,"{'n_components': 5, 'n_init': 4, 'covariance_t...",0.179534,5.0,0.0,0.353793,2.207787,0.007072,0.022433
62,63,63,"{'n_components': 5, 'n_init': 8, 'covariance_t...",0.304292,5.0,0.0,0.353361,2.218777,0.007347,0.02556


In [36]:
seperated_grid_results[seperated_grid_results["iteration"]==seperated_grid_results.shape[0]-1]

Unnamed: 0,index,iteration,params,duration_in_second,silhouette_score,davies_bouldin_score,silhouette_score_std,davies_bouldin_score_std
55,63,63,"{'n_components': 5, 'n_init': 8, 'covariance_t...",0.277553,0.407462,2.087124,0.099149,0.29097


In [25]:
seperated_grid_results["silhouette_score"].loc[seperated_grid_results.shape[0]-2] = 6

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seperated_grid_results["silhouette_score"].iloc[seperated_grid_results.shape[0]-2] = 6
