Notebook to load the `result.json` (and the accompanying `config.json`) produced by a random search / grid search hyperparameter selection session, and to inspect the logged results.

In [14]:
import pandas as pd
import numpy as np
import seaborn as sns

import json
from pathlib import Path
import sys
import os
import re

# Add the parent directory to the import search path so that the project-local
# eval_metrics module can be imported (presumably it lives one level up — confirm).
sys.path.append('../')
from eval_metrics import METRICS

In [18]:
# Name of the hyperparameter-search run to inspect; it is also used as the
# (relative) directory that holds the run's output files.
run_name = 'bertopic_random_search_20240124_153050'
run_dir = Path(run_name)

# Files read from the run directory: the search results and the search configuration.
run_result_json_path = run_dir / 'result.json'
run_config_json_path = run_dir / 'config.json'

# The `.value` of every METRICS member; used further down to select dataframe columns.
metrics_names = [metric.value for metric in METRICS]

In [5]:
# Load the search results and the configuration that produced them.
# The files are decoded explicitly as UTF-8 (the standard JSON encoding) so that
# reading does not depend on the platform's default locale encoding.
with open(run_result_json_path, encoding='utf-8') as f:
    run_result = json.load(f)

with open(run_config_json_path, encoding='utf-8') as f:
    run_config = json.load(f)

# Show the raw result dictionary for a first look at its structure.
run_result

{'best_metric': 0.15936011386090257,
 'best_model_checkpoint': 'bertopic_random_search_20240124_153050/bertopic_sb_model_name_or_path_all-mpnet-base-v1_hs_min_samples_40_hs_min_cluster_size_180_bt_top_n_words_10_bt_nr_topics_70',
 'best_hyperparameters': {'sbert_params': {'model_name_or_path': 'all-mpnet-base-v1'},
  'vocab_tokenizer_params': {'ngram_range': [1, 2]},
  'umap_params': {'n_neighbors': 15,
   'n_components': 5,
   'metric': 'cosine',
   'min_dist': 0.0,
   'n_epochs': None,
   'low_memory': False,
   'random_state': None},
  'hdbscan_params': {'metric': 'euclidean',
   'prediction_data': True,
   'min_samples': 40,
   'min_cluster_size': 180},
  'bertopic_params': {'language': 'english',
   'calculate_probabilities': True,
   'top_n_words': 10,
   'nr_topics': 71}},
 'monitor_type': 'c_npmi',
 'log_history': [{'c_npmi': 0.09221685949766062,
   'c_v': 0.7535877155377385,
   'u_mass': -0.15054789138925634,
   'c_uci': 0.6454193389350572,
   'topic_diversity': 0.60625,
   'i

In [7]:
# Focus on the log history stored under the 'log_history' key of the result.
# This cell only extracts and flattens it; the dataframe is built in a later cell.

log_history = run_result['log_history']

# from: https://www.freecodecamp.org/news/how-to-flatten-a-dictionary-in-python-in-4-different-ways/
from collections.abc import MutableMapping

def _flatten_dict_gen(d, parent_key, sep):
    """Yield ``(flat_key, value)`` pairs for every non-mapping value in ``d``.

    Args:
        d: Mapping to walk; values that are ``MutableMapping`` instances are
            descended into recursively.
        parent_key: Prefix for the keys of ``d``. When falsy (e.g. ``''``),
            keys are emitted without a prefix.
        sep: Separator placed between the prefix and each key.

    Yields:
        Tuples ``(flat_key, value)``. An empty nested mapping yields nothing,
        so its key does not appear in the result.
    """
    for k, v in d.items():
        # Format instead of concatenating so that non-string nested keys
        # (e.g. ints) do not raise TypeError; identical result for str keys.
        new_key = f'{parent_key}{sep}{k}' if parent_key else k
        if isinstance(v, MutableMapping):
            # Recurse on the generator directly: no intermediate dict per level.
            yield from _flatten_dict_gen(v, new_key, sep)
        else:
            yield new_key, v


def flatten_dict(d: MutableMapping, parent_key: str = '', sep: str = '.'):
    """Flatten a nested mapping into a single-level ``dict``.

    Nested keys are joined with ``sep``, e.g. ``{'a': {'b': 1}}`` becomes
    ``{'a.b': 1}``. Values that are not ``MutableMapping`` instances (lists,
    scalars, ``None``, ...) are kept unchanged.

    Args:
        d: The (possibly nested) mapping to flatten.
        parent_key: Optional prefix prepended to every resulting key.
        sep: Separator inserted between key levels.

    Returns:
        A new ``dict`` mapping flattened keys to the leaf values, in traversal
        order. If two paths produce the same flattened key, the later value wins.
    """
    return dict(_flatten_dict_gen(d, parent_key, sep))

# Flatten every entry of log_history into a single-level dictionary with
# '.'-joined keys, and collect the results in a list (same order as the input).
log_history_flattened = list(map(lambda entry: flatten_dict(entry, sep='.'), log_history))
log_history_flattened


[{'c_npmi': 0.09221685949766062,
  'c_v': 0.7535877155377385,
  'u_mass': -0.15054789138925634,
  'c_uci': 0.6454193389350572,
  'topic_diversity': 0.60625,
  'inverted_rbo': 0.9510249551573594,
  'pairwise_jaccard_similarity': 0.053353917705327825,
  'model_name': 'sb_model_name_or_path_all-MiniLM-L12-v1_hs_min_samples_30_hs_min_cluster_size_180_bt_top_n_words_30_bt_nr_topics_50',
  'hyperparameters.sbert_params.model_name_or_path': 'all-MiniLM-L12-v1',
  'hyperparameters.vocab_tokenizer_params.ngram_range': [1, 2],
  'hyperparameters.umap_params.n_neighbors': 15,
  'hyperparameters.umap_params.n_components': 5,
  'hyperparameters.umap_params.metric': 'cosine',
  'hyperparameters.umap_params.min_dist': 0.0,
  'hyperparameters.umap_params.n_epochs': None,
  'hyperparameters.umap_params.low_memory': False,
  'hyperparameters.umap_params.random_state': None,
  'hyperparameters.hdbscan_params.metric': 'euclidean',
  'hyperparameters.hdbscan_params.prediction_data': True,
  'hyperparameter

In [8]:
# Create a dataframe from the list of flattened dictionaries:
# each flattened log entry becomes one row and each dotted key one column.
log_history_df = pd.DataFrame(log_history_flattened)
log_history_df

Unnamed: 0,c_npmi,c_v,u_mass,c_uci,topic_diversity,inverted_rbo,pairwise_jaccard_similarity,model_name,hyperparameters.sbert_params.model_name_or_path,hyperparameters.vocab_tokenizer_params.ngram_range,...,hyperparameters.umap_params.low_memory,hyperparameters.umap_params.random_state,hyperparameters.hdbscan_params.metric,hyperparameters.hdbscan_params.prediction_data,hyperparameters.hdbscan_params.min_samples,hyperparameters.hdbscan_params.min_cluster_size,hyperparameters.bertopic_params.language,hyperparameters.bertopic_params.calculate_probabilities,hyperparameters.bertopic_params.top_n_words,hyperparameters.bertopic_params.nr_topics
0,0.092217,0.753588,-0.150548,0.645419,0.60625,0.951025,0.053354,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,30,180,english,True,30,51
1,0.111151,0.744181,-0.152728,0.891481,0.605085,0.945372,0.060758,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,40,180,english,True,20,61
2,0.132052,0.719587,-0.083229,1.092262,0.66,0.886554,0.11351,sb_model_name_or_path_all-mpnet-base-v1_hs_min...,all-mpnet-base-v1,"[1, 2]",...,False,,euclidean,True,40,200,english,True,10,11
3,0.071528,0.648377,-0.303749,0.202981,0.768571,0.987691,0.011415,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,10,30,english,True,20,71
4,0.137412,0.713508,-0.226645,1.23854,0.718421,0.961122,0.035787,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,10,90,english,True,10,41
5,0.091976,0.753699,-0.143123,0.690079,0.657303,0.976477,0.025502,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,50,150,english,True,30,91
6,0.071857,0.700117,-0.217763,0.289676,0.75,0.98308,0.01691,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,40,30,english,True,30,51
7,0.099775,0.740856,-0.177363,0.68602,0.575862,0.913845,0.091492,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,20,200,english,True,20,31
8,0.078202,0.709542,-0.189626,0.435725,0.702899,0.97837,0.019305,sb_model_name_or_path_all-mpnet-base-v1_hs_min...,all-mpnet-base-v1,"[1, 2]",...,False,,euclidean,True,40,90,english,True,30,71
9,0.100998,0.761304,-0.16013,0.824367,0.674359,0.971981,0.031359,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,50,180,english,True,30,81


In [11]:
# Sort the logged runs by c_npmi, highest score first.
# NOTE(review): `aaa` is a throwaway name that a later cell also reads; consider
# giving it a descriptive name and renaming both places together.
aaa = log_history_df.sort_values(by='c_npmi', ascending=False)
aaa

Unnamed: 0,c_npmi,c_v,u_mass,c_uci,topic_diversity,inverted_rbo,pairwise_jaccard_similarity,model_name,hyperparameters.sbert_params.model_name_or_path,hyperparameters.vocab_tokenizer_params.ngram_range,...,hyperparameters.umap_params.low_memory,hyperparameters.umap_params.random_state,hyperparameters.hdbscan_params.metric,hyperparameters.hdbscan_params.prediction_data,hyperparameters.hdbscan_params.min_samples,hyperparameters.hdbscan_params.min_cluster_size,hyperparameters.bertopic_params.language,hyperparameters.bertopic_params.calculate_probabilities,hyperparameters.bertopic_params.top_n_words,hyperparameters.bertopic_params.nr_topics
11,0.15936,0.736042,-0.215378,1.572884,0.68,0.971925,0.029161,sb_model_name_or_path_all-mpnet-base-v1_hs_min...,all-mpnet-base-v1,"[1, 2]",...,False,,euclidean,True,40,180,english,True,10,71
17,0.148927,0.714929,-0.218321,1.494394,0.67875,0.976178,0.02313,sb_model_name_or_path_all-mpnet-base-v1_hs_min...,all-mpnet-base-v1,"[1, 2]",...,False,,euclidean,True,10,120,english,True,10,81
15,0.143404,0.704631,-0.222557,1.398259,0.688,0.968158,0.027177,sb_model_name_or_path_all-mpnet-base-v1_hs_min...,all-mpnet-base-v1,"[1, 2]",...,False,,euclidean,True,30,90,english,True,10,51
4,0.137412,0.713508,-0.226645,1.23854,0.718421,0.961122,0.035787,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,10,90,english,True,10,41
2,0.132052,0.719587,-0.083229,1.092262,0.66,0.886554,0.11351,sb_model_name_or_path_all-mpnet-base-v1_hs_min...,all-mpnet-base-v1,"[1, 2]",...,False,,euclidean,True,40,200,english,True,10,11
1,0.111151,0.744181,-0.152728,0.891481,0.605085,0.945372,0.060758,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,40,180,english,True,20,61
10,0.110444,0.719416,-0.227431,0.941981,0.680769,0.977346,0.024298,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,30,120,english,True,20,81
16,0.101151,0.724493,-0.21662,0.738231,0.64,0.946919,0.055351,sb_model_name_or_path_all-mpnet-base-v1_hs_min...,all-mpnet-base-v1,"[1, 2]",...,False,,euclidean,True,50,120,english,True,20,31
9,0.100998,0.761304,-0.16013,0.824367,0.674359,0.971981,0.031359,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,50,180,english,True,30,81
23,0.100161,0.719013,-0.165695,0.755407,0.636735,0.953011,0.04874,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,50,120,english,True,20,51


In [12]:
# Keep only the rows (still ordered by c_npmi) whose sentence-embedding model
# is 'all-MiniLM-L12-v1'.
aaa.loc[aaa['hyperparameters.sbert_params.model_name_or_path'].eq('all-MiniLM-L12-v1')]

Unnamed: 0,c_npmi,c_v,u_mass,c_uci,topic_diversity,inverted_rbo,pairwise_jaccard_similarity,model_name,hyperparameters.sbert_params.model_name_or_path,hyperparameters.vocab_tokenizer_params.ngram_range,...,hyperparameters.umap_params.low_memory,hyperparameters.umap_params.random_state,hyperparameters.hdbscan_params.metric,hyperparameters.hdbscan_params.prediction_data,hyperparameters.hdbscan_params.min_samples,hyperparameters.hdbscan_params.min_cluster_size,hyperparameters.bertopic_params.language,hyperparameters.bertopic_params.calculate_probabilities,hyperparameters.bertopic_params.top_n_words,hyperparameters.bertopic_params.nr_topics
4,0.137412,0.713508,-0.226645,1.23854,0.718421,0.961122,0.035787,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,10,90,english,True,10,41
1,0.111151,0.744181,-0.152728,0.891481,0.605085,0.945372,0.060758,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,40,180,english,True,20,61
10,0.110444,0.719416,-0.227431,0.941981,0.680769,0.977346,0.024298,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,30,120,english,True,20,81
9,0.100998,0.761304,-0.16013,0.824367,0.674359,0.971981,0.031359,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,50,180,english,True,30,81
23,0.100161,0.719013,-0.165695,0.755407,0.636735,0.953011,0.04874,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,50,120,english,True,20,51
7,0.099775,0.740856,-0.177363,0.68602,0.575862,0.913845,0.091492,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,20,200,english,True,20,31
22,0.092475,0.629878,-0.465147,0.468985,0.84,0.984671,0.013068,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,10,30,english,True,10,31
0,0.092217,0.753588,-0.150548,0.645419,0.60625,0.951025,0.053354,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,30,180,english,True,30,51
5,0.091976,0.753699,-0.143123,0.690079,0.657303,0.976477,0.025502,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,50,150,english,True,30,91
20,0.091074,0.745403,-0.15265,0.678093,0.668116,0.968834,0.033474,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,all-MiniLM-L12-v1,"[1, 2]",...,False,,euclidean,True,50,150,english,True,30,71


In [20]:
# Restrict the view to the hyperparameters that were part of the search space.
search_space = run_config['search_space']

# Nest the search space under a 'hyperparameters' key before flattening, so the
# resulting dotted keys carry the same 'hyperparameters.' prefix as the
# flattened log-history keys.
search_space_flattened = flatten_dict(dict(hyperparameters=search_space), sep='.')
search_space_flattened

{'hyperparameters.sbert_params.model_name_or_path': ['all-MiniLM-L12-v1',
  'all-mpnet-base-v1'],
 'hyperparameters.hdbscan_params.min_cluster_size': [30,
  60,
  90,
  120,
  150,
  180,
  200],
 'hyperparameters.hdbscan_params.min_samples': [10, 20, 30, 40, 50],
 'hyperparameters.bertopic_params.top_n_words': [10, 20, 30],
 'hyperparameters.bertopic_params.nr_topics': [10,
  20,
  30,
  40,
  50,
  60,
  70,
  80,
  90,
  100]}

In [23]:
# Narrow the dataframe to the model name, the metric columns and the searched
# hyperparameters (iterating the flattened search space yields its keys).
# NOTE(review): the logged nr_topics values shown in the outputs (11, 51, 71, ...)
# are one higher than the search-space values (10, 50, 70, ...) — presumably an
# offset applied upstream; confirm before comparing against the search space.
bbb = log_history_df[['model_name', *metrics_names, *search_space_flattened]]
bbb

Unnamed: 0,model_name,u_mass,c_v,c_uci,c_npmi,topic_diversity,inverted_rbo,pairwise_jaccard_similarity,hyperparameters.sbert_params.model_name_or_path,hyperparameters.hdbscan_params.min_cluster_size,hyperparameters.hdbscan_params.min_samples,hyperparameters.bertopic_params.top_n_words,hyperparameters.bertopic_params.nr_topics
0,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,-0.150548,0.753588,0.645419,0.092217,0.60625,0.951025,0.053354,all-MiniLM-L12-v1,180,30,30,51
1,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,-0.152728,0.744181,0.891481,0.111151,0.605085,0.945372,0.060758,all-MiniLM-L12-v1,180,40,20,61
2,sb_model_name_or_path_all-mpnet-base-v1_hs_min...,-0.083229,0.719587,1.092262,0.132052,0.66,0.886554,0.11351,all-mpnet-base-v1,200,40,10,11
3,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,-0.303749,0.648377,0.202981,0.071528,0.768571,0.987691,0.011415,all-MiniLM-L12-v1,30,10,20,71
4,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,-0.226645,0.713508,1.23854,0.137412,0.718421,0.961122,0.035787,all-MiniLM-L12-v1,90,10,10,41
5,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,-0.143123,0.753699,0.690079,0.091976,0.657303,0.976477,0.025502,all-MiniLM-L12-v1,150,50,30,91
6,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,-0.217763,0.700117,0.289676,0.071857,0.75,0.98308,0.01691,all-MiniLM-L12-v1,30,40,30,51
7,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,-0.177363,0.740856,0.68602,0.099775,0.575862,0.913845,0.091492,all-MiniLM-L12-v1,200,20,20,31
8,sb_model_name_or_path_all-mpnet-base-v1_hs_min...,-0.189626,0.709542,0.435725,0.078202,0.702899,0.97837,0.019305,all-mpnet-base-v1,90,40,30,71
9,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,-0.16013,0.761304,0.824367,0.100998,0.674359,0.971981,0.031359,all-MiniLM-L12-v1,180,50,30,81


In [24]:
# Display the reduced table ordered by c_npmi, highest score first
# (the sorted frame is only displayed, not assigned).
bbb.sort_values(by='c_npmi', ascending=False)

Unnamed: 0,model_name,u_mass,c_v,c_uci,c_npmi,topic_diversity,inverted_rbo,pairwise_jaccard_similarity,hyperparameters.sbert_params.model_name_or_path,hyperparameters.hdbscan_params.min_cluster_size,hyperparameters.hdbscan_params.min_samples,hyperparameters.bertopic_params.top_n_words,hyperparameters.bertopic_params.nr_topics
11,sb_model_name_or_path_all-mpnet-base-v1_hs_min...,-0.215378,0.736042,1.572884,0.15936,0.68,0.971925,0.029161,all-mpnet-base-v1,180,40,10,71
17,sb_model_name_or_path_all-mpnet-base-v1_hs_min...,-0.218321,0.714929,1.494394,0.148927,0.67875,0.976178,0.02313,all-mpnet-base-v1,120,10,10,81
15,sb_model_name_or_path_all-mpnet-base-v1_hs_min...,-0.222557,0.704631,1.398259,0.143404,0.688,0.968158,0.027177,all-mpnet-base-v1,90,30,10,51
4,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,-0.226645,0.713508,1.23854,0.137412,0.718421,0.961122,0.035787,all-MiniLM-L12-v1,90,10,10,41
2,sb_model_name_or_path_all-mpnet-base-v1_hs_min...,-0.083229,0.719587,1.092262,0.132052,0.66,0.886554,0.11351,all-mpnet-base-v1,200,40,10,11
1,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,-0.152728,0.744181,0.891481,0.111151,0.605085,0.945372,0.060758,all-MiniLM-L12-v1,180,40,20,61
10,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,-0.227431,0.719416,0.941981,0.110444,0.680769,0.977346,0.024298,all-MiniLM-L12-v1,120,30,20,81
16,sb_model_name_or_path_all-mpnet-base-v1_hs_min...,-0.21662,0.724493,0.738231,0.101151,0.64,0.946919,0.055351,all-mpnet-base-v1,120,50,20,31
9,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,-0.16013,0.761304,0.824367,0.100998,0.674359,0.971981,0.031359,all-MiniLM-L12-v1,180,50,30,81
23,sb_model_name_or_path_all-MiniLM-L12-v1_hs_min...,-0.165695,0.719013,0.755407,0.100161,0.636735,0.953011,0.04874,all-MiniLM-L12-v1,120,50,20,51
