Notebook for loading and inspecting the `result.json` (and `config.json`) produced by a random-search / grid-search hyperparameter selection run.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import json
from pathlib import Path
import sys
import os
import re

sys.path.append('../')
from eval_metrics import METRICS

In [2]:
# Name of the search run to inspect; it doubles as the (relative) directory
# that holds the run's JSON artefacts.
run_name = 'lda_multicore_random_search_20240119_114453'
run_dir = Path(run_name)
run_result_json_path = run_dir / 'result.json'
run_config_json_path = run_dir / 'config.json'

# `.value` of every METRICS member; used further down to select the metric
# columns of the log-history DataFrame.
metrics_names = [metric.value for metric in METRICS]

In [3]:
# Read both JSON artefacts of the run. The encoding is passed explicitly so
# the load does not depend on the platform's locale default (JSON files are
# conventionally UTF-8).
with open(run_result_json_path, encoding='utf-8') as f:
    run_result = json.load(f)

with open(run_config_json_path, encoding='utf-8') as f:
    run_config = json.load(f)

# Display the raw result dictionary as the cell output.
run_result

{'best_metric': -0.03381356620954536,
 'best_model_checkpoint': 'lda_multicore_random_search_20240119_114453/lda_multicore_offset_16_num_topics_10_decay_0.7',
 'best_hyperparameters': {'num_topics': 30,
  'workers': 3,
  'chunksize': 2000,
  'passes': 10,
  'alpha': 'symmetric',
  'eta': None,
  'decay': 0.9,
  'offset': 16,
  'eval_every': 10,
  'iterations': 50,
  'gamma_threshold': 0.001,
  'minimum_probability': 0.01,
  'random_state': 42,
  'minimum_phi_value': 0.01,
  'per_word_topics': False,
  'dtype': "<class 'numpy.float32'>"},
 'monitor_type': 'c_npmi',
 'log_history': [{'c_npmi': -0.14408378239825823,
   'c_v': 0.4467070662402987,
   'u_mass': -12.384271956401005,
   'c_uci': -4.9745392889314894,
   'topic_diversity': 0.8933333333333333,
   'inverted_rbo': 0.9587820470646962,
   'pairwise_jaccard_similarity': 0.029854809437386474,
   'hyperparameters': {'num_topics': 30,
    'workers': 3,
    'chunksize': 2000,
    'passes': 10,
    'alpha': 'symmetric',
    'eta': None,
  

In [4]:
# Focus on the log history: the 'log_history' entry of the loaded result.json.
# Later cells flatten its entries and turn them into a DataFrame.

log_history = run_result['log_history']

# from: https://www.freecodecamp.org/news/how-to-flatten-a-dictionary-in-python-in-4-different-ways/
from collections.abc import MutableMapping

def _flatten_dict_gen(d, parent_key, sep):
    """Yield ``(flattened_key, value)`` pairs for every non-mapping value in ``d``.

    Args:
        d: Mapping to walk; nested ``MutableMapping`` values are descended into.
        parent_key: Prefix for the keys of ``d``; a falsy value means "no prefix".
        sep: Separator placed between the prefix and each key.

    Yields:
        Tuples ``(key, value)`` where ``key`` is the prefix, ``sep`` and the
        original key joined together (or the original key itself when there
        is no prefix). An empty nested mapping yields nothing.
    """
    for k, v in d.items():
        # Format instead of concatenating so that non-string nested keys
        # (e.g. ints) are joined rather than raising TypeError. Keys without
        # a prefix are passed through unchanged, as before.
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, MutableMapping):
            # Recurse through the generator directly; this avoids building a
            # throwaway intermediate dict at every nesting level.
            yield from _flatten_dict_gen(v, new_key, sep)
        else:
            yield new_key, v


def flatten_dict(d: MutableMapping, parent_key: str = '', sep: str = '.'):
    """Flatten a nested mapping into a single-level ``dict``.

    Nested keys are joined with ``sep``, e.g. ``{'a': {'b': 1}}`` becomes
    ``{'a.b': 1}``.

    Args:
        d: The (possibly nested) mapping to flatten.
        parent_key: Optional prefix prepended to every key.
        sep: Separator used when joining nested keys.

    Returns:
        A new ``dict`` mapping flattened keys to the non-mapping values, in
        the order they are encountered.
    """
    return dict(_flatten_dict_gen(d, parent_key, sep))

# Flatten every entry of log_history so that nested keys become dotted keys,
# collecting the results in a new list (one flat dict per entry).
log_history_flattened = [
    flatten_dict(trial_log, sep='.')
    for trial_log in log_history
]
log_history_flattened


[{'c_npmi': -0.14408378239825823,
  'c_v': 0.4467070662402987,
  'u_mass': -12.384271956401005,
  'c_uci': -4.9745392889314894,
  'topic_diversity': 0.8933333333333333,
  'inverted_rbo': 0.9587820470646962,
  'pairwise_jaccard_similarity': 0.029854809437386474,
  'hyperparameters.num_topics': 30,
  'hyperparameters.workers': 3,
  'hyperparameters.chunksize': 2000,
  'hyperparameters.passes': 10,
  'hyperparameters.alpha': 'symmetric',
  'hyperparameters.eta': None,
  'hyperparameters.decay': 0.7,
  'hyperparameters.offset': 128,
  'hyperparameters.eval_every': 10,
  'hyperparameters.iterations': 50,
  'hyperparameters.gamma_threshold': 0.001,
  'hyperparameters.minimum_probability': 0.01,
  'hyperparameters.random_state': 42,
  'hyperparameters.minimum_phi_value': 0.01,
  'hyperparameters.per_word_topics': False,
  'hyperparameters.dtype': "<class 'numpy.float32'>"},
 {'c_npmi': -0.12066801423313349,
  'c_v': 0.4695268507787126,
  'u_mass': -10.097817853049417,
  'c_uci': -4.3000276296

In [5]:
# Turn the flattened log entries into a table: one row per entry, one column
# per flattened key.
log_history_df = pd.DataFrame(data=log_history_flattened)
log_history_df

Unnamed: 0,c_npmi,c_v,u_mass,c_uci,topic_diversity,inverted_rbo,pairwise_jaccard_similarity,hyperparameters.num_topics,hyperparameters.workers,hyperparameters.chunksize,...,hyperparameters.decay,hyperparameters.offset,hyperparameters.eval_every,hyperparameters.iterations,hyperparameters.gamma_threshold,hyperparameters.minimum_probability,hyperparameters.random_state,hyperparameters.minimum_phi_value,hyperparameters.per_word_topics,hyperparameters.dtype
0,-0.144084,0.446707,-12.384272,-4.974539,0.893333,0.958782,0.029855,30,3,2000,...,0.7,128,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>
1,-0.120668,0.469527,-10.097818,-4.300028,0.835,0.946205,0.037282,20,3,2000,...,0.8,64,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>
2,-0.050303,0.534362,-6.049482,-2.594367,0.78,0.885862,0.056947,10,3,2000,...,0.8,16,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>
3,-0.099818,0.456655,-7.652754,-3.448683,0.76,0.921964,0.051122,20,3,2000,...,0.9,16,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>
4,-0.033814,0.557811,-5.359241,-2.179838,0.79,0.905221,0.048964,10,3,2000,...,0.7,16,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>
5,-0.077491,0.463247,-6.110928,-2.87069,0.74,0.893998,0.085308,10,3,2000,...,0.8,128,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>
6,-0.14147,0.417424,-10.890116,-4.769389,0.816667,0.924801,0.050146,30,3,2000,...,0.8,64,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>
7,-0.122213,0.372327,-7.868889,-3.975141,0.606667,0.806936,0.121393,30,3,2000,...,0.8,128,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>
8,-0.129421,0.52191,-8.912569,-4.32542,0.83,0.946658,0.036871,20,3,2000,...,0.8,16,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>
9,-0.145979,0.44966,-10.800261,-4.811866,0.836667,0.93104,0.04388,30,3,2000,...,0.9,16,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>


In [11]:
# Rank the logged trials from highest to lowest c_npmi.
aaa = log_history_df.sort_values('c_npmi', ascending=False)
aaa

Unnamed: 0,c_npmi,c_v,u_mass,c_uci,topic_diversity,inverted_rbo,pairwise_jaccard_similarity,model_name,hyperparameters.sbert_params.model_name_or_path,hyperparameters.vocab_tokenizer_params.ngram_range,...,hyperparameters.umap_params.low_memory,hyperparameters.umap_params.random_state,hyperparameters.hdbscan_params.metric,hyperparameters.hdbscan_params.prediction_data,hyperparameters.hdbscan_params.min_samples,hyperparameters.hdbscan_params.min_cluster_size,hyperparameters.bertopic_params.language,hyperparameters.bertopic_params.calculate_probabilities,hyperparameters.bertopic_params.top_n_words,hyperparameters.bertopic_params.nr_topics
11,0.15936,0.736042,-0.215378,1.572884,0.68,0.971925,0.029161,sb_model_name_or_path_all-mpnet-base-v1_hs_min...,all-mpnet-base-v1,"[1, 2]",...,False,,euclidean,True,40,180,english,True,10,71
17,0.148927,0.714929,-0.218321,1.494394,0.67875,0.976178,0.02313,sb_model_name_or_path_all-mpnet-base-v1_hs_min...,all-mpnet-base-v1,"[1, 2]",...,False,,euclidean,True,10,120,english,True,10,81
15,0.143404,0.704631,-0.222557,1.398259,0.688,0.968158,0.027177,sb_model_name_or_path_all-mpnet-base-v1_hs_min...,all-mpnet-base-v1,"[1, 2]",...,False,,euclidean,True,30,90,english,True,10,51
4,0.137412,0.713508,-0.226645,1.23854,0.718421,0.961122,0.035787,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,10,90,english,True,10,41
2,0.132052,0.719587,-0.083229,1.092262,0.66,0.886554,0.11351,sb_model_name_or_path_all-mpnet-base-v1_hs_min...,all-mpnet-base-v1,"[1, 2]",...,False,,euclidean,True,40,200,english,True,10,11
1,0.111151,0.744181,-0.152728,0.891481,0.605085,0.945372,0.060758,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,40,180,english,True,20,61
10,0.110444,0.719416,-0.227431,0.941981,0.680769,0.977346,0.024298,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,30,120,english,True,20,81
16,0.101151,0.724493,-0.21662,0.738231,0.64,0.946919,0.055351,sb_model_name_or_path_all-mpnet-base-v1_hs_min...,all-mpnet-base-v1,"[1, 2]",...,False,,euclidean,True,50,120,english,True,20,31
9,0.100998,0.761304,-0.16013,0.824367,0.674359,0.971981,0.031359,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,50,180,english,True,30,81
23,0.100161,0.719013,-0.165695,0.755407,0.636735,0.953011,0.04874,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,50,120,english,True,20,51


In [12]:
# Keep only the trials that used the 'all-MiniLM-L12-v1' SBERT model.
# The SBERT column only exists when the logged hyperparameters contain
# `sbert_params`; for any other run (e.g. the LDA run selected via `run_name`)
# show an empty frame instead of raising KeyError, so the notebook still
# survives Restart & Run All.
sbert_model_col = 'hyperparameters.sbert_params.model_name_or_path'
(
    aaa[aaa[sbert_model_col] == 'all-MiniLM-L12-v1']
    if sbert_model_col in aaa.columns
    else aaa.iloc[0:0]
)

Unnamed: 0,c_npmi,c_v,u_mass,c_uci,topic_diversity,inverted_rbo,pairwise_jaccard_similarity,model_name,hyperparameters.sbert_params.model_name_or_path,hyperparameters.vocab_tokenizer_params.ngram_range,...,hyperparameters.umap_params.low_memory,hyperparameters.umap_params.random_state,hyperparameters.hdbscan_params.metric,hyperparameters.hdbscan_params.prediction_data,hyperparameters.hdbscan_params.min_samples,hyperparameters.hdbscan_params.min_cluster_size,hyperparameters.bertopic_params.language,hyperparameters.bertopic_params.calculate_probabilities,hyperparameters.bertopic_params.top_n_words,hyperparameters.bertopic_params.nr_topics
4,0.137412,0.713508,-0.226645,1.23854,0.718421,0.961122,0.035787,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,10,90,english,True,10,41
1,0.111151,0.744181,-0.152728,0.891481,0.605085,0.945372,0.060758,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,40,180,english,True,20,61
10,0.110444,0.719416,-0.227431,0.941981,0.680769,0.977346,0.024298,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,30,120,english,True,20,81
9,0.100998,0.761304,-0.16013,0.824367,0.674359,0.971981,0.031359,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,50,180,english,True,30,81
23,0.100161,0.719013,-0.165695,0.755407,0.636735,0.953011,0.04874,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,50,120,english,True,20,51
7,0.099775,0.740856,-0.177363,0.68602,0.575862,0.913845,0.091492,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,20,200,english,True,20,31
22,0.092475,0.629878,-0.465147,0.468985,0.84,0.984671,0.013068,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,10,30,english,True,10,31
0,0.092217,0.753588,-0.150548,0.645419,0.60625,0.951025,0.053354,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,30,180,english,True,30,51
5,0.091976,0.753699,-0.143123,0.690079,0.657303,0.976477,0.025502,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,50,150,english,True,30,91
20,0.091074,0.745403,-0.15265,0.678093,0.668116,0.968834,0.033474,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,50,150,english,True,30,71


In [6]:
# Restrict attention to the hyperparameters that were part of the search space.
search_space = run_config['search_space']

# Nest the search space under 'hyperparameters' before flattening, so the
# resulting keys carry the 'hyperparameters.' prefix.
wrapped_search_space = {'hyperparameters': search_space}
search_space_flattened = flatten_dict(wrapped_search_space, sep='.')
search_space_flattened

{'hyperparameters.num_topics': [10, 20, 30],
 'hyperparameters.decay': [0.7, 0.8, 0.9],
 'hyperparameters.offset': [16, 64, 128]}

In [8]:
# Narrow the frame to the metric columns followed by the searched
# hyperparameter columns.
selected_columns = [*metrics_names, *search_space_flattened]
bbb = log_history_df[selected_columns]
bbb

Unnamed: 0,u_mass,c_v,c_uci,c_npmi,topic_diversity,inverted_rbo,pairwise_jaccard_similarity,hyperparameters.num_topics,hyperparameters.decay,hyperparameters.offset
0,-12.384272,0.446707,-4.974539,-0.144084,0.893333,0.958782,0.029855,30,0.7,128
1,-10.097818,0.469527,-4.300028,-0.120668,0.835,0.946205,0.037282,20,0.8,64
2,-6.049482,0.534362,-2.594367,-0.050303,0.78,0.885862,0.056947,10,0.8,16
3,-7.652754,0.456655,-3.448683,-0.099818,0.76,0.921964,0.051122,20,0.9,16
4,-5.359241,0.557811,-2.179838,-0.033814,0.79,0.905221,0.048964,10,0.7,16
5,-6.110928,0.463247,-2.87069,-0.077491,0.74,0.893998,0.085308,10,0.8,128
6,-10.890116,0.417424,-4.769389,-0.14147,0.816667,0.924801,0.050146,30,0.8,64
7,-7.868889,0.372327,-3.975141,-0.122213,0.606667,0.806936,0.121393,30,0.8,128
8,-8.912569,0.52191,-4.32542,-0.129421,0.83,0.946658,0.036871,20,0.8,16
9,-10.800261,0.44966,-4.811866,-0.145979,0.836667,0.93104,0.04388,30,0.9,16


In [9]:
# Show the reduced table ordered from highest to lowest c_npmi.
bbb.sort_values('c_npmi', ascending=False)

Unnamed: 0,u_mass,c_v,c_uci,c_npmi,topic_diversity,inverted_rbo,pairwise_jaccard_similarity,hyperparameters.num_topics,hyperparameters.decay,hyperparameters.offset
4,-5.359241,0.557811,-2.179838,-0.033814,0.79,0.905221,0.048964,10,0.7,16
2,-6.049482,0.534362,-2.594367,-0.050303,0.78,0.885862,0.056947,10,0.8,16
5,-6.110928,0.463247,-2.87069,-0.077491,0.74,0.893998,0.085308,10,0.8,128
3,-7.652754,0.456655,-3.448683,-0.099818,0.76,0.921964,0.051122,20,0.9,16
1,-10.097818,0.469527,-4.300028,-0.120668,0.835,0.946205,0.037282,20,0.8,64
7,-7.868889,0.372327,-3.975141,-0.122213,0.606667,0.806936,0.121393,30,0.8,128
8,-8.912569,0.52191,-4.32542,-0.129421,0.83,0.946658,0.036871,20,0.8,16
6,-10.890116,0.417424,-4.769389,-0.14147,0.816667,0.924801,0.050146,30,0.8,64
0,-12.384272,0.446707,-4.974539,-0.144084,0.893333,0.958782,0.029855,30,0.7,128
9,-10.800261,0.44966,-4.811866,-0.145979,0.836667,0.93104,0.04388,30,0.9,16
