Cross model evaluation

Given three training-output folders (one per model: LDA, BERTopic and CTM) produced with a similar setup on the same dataset, evaluate and compare the models' performance.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from pathlib import Path
import json
from datetime import datetime
import sys

In [2]:
%load_ext autoreload

In [3]:
from eval_metrics import SEARCH_BEHAVIOUR, METRICS, COHERENCE_MODEL_METRICS
from dataset_loader import GENRES

In [4]:
search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
dataset_name = GENRES.INDIE

# create the folder name for each model
# (the datetime of each training run is part of its output folder name)
training_datetime_lda = datetime(2024, 1, 31, 13, 13, 10)
training_datetime_bertopic = datetime(2024, 1, 31, 20, 47, 46)
training_datetime_ctm = datetime(2024, 1, 31, 23, 8, 4)

lda_folder = Path('lda_dev')
bertopic_folder = Path('bertopic_dev')
ctm_folder = Path('ctm_dev')

training_folder_lda = lda_folder / f'lda_multicore_genre_{str(dataset_name)}_{search_behaviour.value}_{training_datetime_lda.strftime("%Y%m%d_%H%M%S")}'
training_folder_bertopic = bertopic_folder / f'bertopic_genre_{str(dataset_name)}_{search_behaviour.value}_{training_datetime_bertopic.strftime("%Y%m%d_%H%M%S")}'
training_folder_ctm = ctm_folder / f'ctm_genre_{str(dataset_name)}_{search_behaviour.value}_{training_datetime_ctm.strftime("%Y%m%d_%H%M%S")}'

# check each model folder exists
# Report every missing folder at once, then fail with a regular exception:
# sys.exit() inside a notebook kernel only surfaces as a bare SystemExit.
missing_folders = [
    folder
    for folder in (training_folder_lda, training_folder_bertopic, training_folder_ctm)
    if not folder.exists()
]
for folder in missing_folders:
    print(f"Folder {folder} does not exist")
if missing_folders:
    raise FileNotFoundError(
        "Missing training folder(s): " + ", ".join(str(folder) for folder in missing_folders)
    )

In [6]:
# Read config.json and result.json from every training folder and index
# the parsed contents by model type.
config_dicts, result_dicts = {}, {}

for model_type, training_folder in (
    ('lda', training_folder_lda),
    ('bertopic', training_folder_bertopic),
    ('ctm', training_folder_ctm),
):
    config_file = training_folder / 'config.json'
    result_file = training_folder / 'result.json'

    # Parse both files first; the dictionaries are only updated once
    # both reads have succeeded.
    with config_file.open('r') as f:
        config = json.load(f)
    with result_file.open('r') as f:
        result = json.load(f)

    config_dicts[model_type], result_dicts[model_type] = config, result

config_dicts

{'lda': {'model': 'lda_multicore',
  'workers': 3,
  'chunksize': 2024,
  'passes': 5,
  'alpha': 'symmetric',
  'eta': None,
  'decay': 0.5,
  'offset': 1.0,
  'eval_every': 10,
  'iterations': 50,
  'gamma_threshold': 0.001,
  'minimum_probability': 0.01,
  'random_state': 42,
  'minimum_phi_value': 0.01,
  'per_word_topics': False,
  'dtype': "<class 'numpy.float32'>",
  'search_behaviour': 'grid_search',
  'search_space': {'num_topics': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]},
  'metrics': ['c_npmi',
   'c_v',
   'u_mass',
   'c_uci',
   'topic_diversity',
   'inverted_rbo',
   'pairwise_jaccard_similarity'],
  'monitor': 'c_npmi',
  'gensim_version': '4.3.2'},
 'bertopic': {'model': 'bertopic',
  'sbert_params': {'model_name_or_path': 'all-MiniLM-L6-v2'},
  'vocab_tokenizer_params': {'n_frequency': 70, 'ngram_range': [1, 1]},
  'umap_params': {'n_neighbors': 15,
   'n_components': 5,
   'metric': 'cosine',
   'min_dist': 0.1,
   'n_epochs': None,
   'low_memory': False,
   'ran

---

View result of each model

In [12]:
# from: https://www.freecodecamp.org/news/how-to-flatten-a-dictionary-in-python-in-4-different-ways/
from collections.abc import MutableMapping

def _flatten_dict_gen(d, parent_key, sep):
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, MutableMapping):
            yield from flatten_dict(v, new_key, sep=sep).items()
        else:
            yield new_key, v


def flatten_dict(d: MutableMapping, parent_key: str = '', sep: str = '.'):
    return dict(_flatten_dict_gen(d, parent_key, sep))

def get_log_history_df(model_type, results=None):
    """Build a DataFrame from the log history of one model type.

    Every entry of ``results[model_type]['log_history']`` is flattened with
    ``flatten_dict`` (nested keys joined by '.') and becomes one row.

    Args:
        model_type: one of 'lda', 'bertopic' or 'ctm'.
        results: mapping from model type to its loaded result dictionary.
            Defaults to the notebook-level ``result_dicts``.

    Returns:
        The log history as a DataFrame with an extra ``model_type`` column,
        or None when ``model_type`` is not one of the supported values.
    """
    if model_type not in ['lda', 'bertopic', 'ctm']:
        return None

    # Fall back to the notebook-level results when none are passed in.
    if results is None:
        results = result_dicts

    log_history = results[model_type]['log_history']

    # Flatten each log entry so nested values become dotted column names.
    log_history_flattened = [flatten_dict(log, sep='.') for log in log_history]
    log_history_df = pd.DataFrame(log_history_flattened)

    # append a column to the dataframe for the model type
    log_history_df['model_type'] = model_type

    return log_history_df

In [13]:
# LDA log history: flattened entries of the LDA result's 'log_history', as a DataFrame

log_history_lda_df = get_log_history_df('lda')
log_history_lda_df

Unnamed: 0,c_npmi,c_v,u_mass,c_uci,topic_diversity,inverted_rbo,pairwise_jaccard_similarity,model_name,hyperparameters.num_topics,hyperparameters.workers,...,hyperparameters.offset,hyperparameters.eval_every,hyperparameters.iterations,hyperparameters.gamma_threshold,hyperparameters.minimum_probability,hyperparameters.random_state,hyperparameters.minimum_phi_value,hyperparameters.per_word_topics,hyperparameters.dtype,model_type
0,0.028072,0.441637,-2.567145,0.187376,0.79,0.922509,0.049419,num_topics_10,10,3,...,1.0,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>,lda
1,0.029617,0.430911,-3.050097,0.111961,0.795,0.964144,0.02561,num_topics_20,20,3,...,1.0,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>,lda
2,0.016055,0.410309,-3.510934,-0.196823,0.803333,0.978585,0.016541,num_topics_30,30,3,...,1.0,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>,lda
3,0.006415,0.391909,-3.731234,-0.389883,0.8,0.984359,0.013185,num_topics_40,40,3,...,1.0,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>,lda
4,-0.005462,0.372949,-4.009474,-0.635204,0.832,0.986784,0.010724,num_topics_50,50,3,...,1.0,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>,lda
5,-0.012428,0.360629,-4.106303,-0.790823,0.863333,0.993053,0.005261,num_topics_60,60,3,...,1.0,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>,lda
6,-0.026427,0.339959,-4.323799,-1.095375,0.862857,0.990434,0.007968,num_topics_70,70,3,...,1.0,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>,lda
7,-0.033008,0.326944,-4.371363,-1.221434,0.865,0.991463,0.007147,num_topics_80,80,3,...,1.0,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>,lda
8,-0.044946,0.316953,-4.539877,-1.508052,0.886667,0.993834,0.00564,num_topics_90,90,3,...,1.0,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>,lda
9,-0.053376,0.313269,-4.597848,-1.709517,0.882,0.993633,0.005934,num_topics_100,100,3,...,1.0,10,50,0.001,0.01,42,0.01,False,<class 'numpy.float32'>,lda


In [14]:
# BERTopic log history: flattened entries of the BERTopic result's 'log_history', as a DataFrame

log_history_bertopic_df = get_log_history_df('bertopic')
log_history_bertopic_df

Unnamed: 0,c_npmi,c_v,u_mass,c_uci,topic_diversity,inverted_rbo,pairwise_jaccard_similarity,model_name,hyperparameters.sbert_params.model_name_or_path,hyperparameters.vocab_tokenizer_params.ngram_range,...,hyperparameters.umap_params.random_state,hyperparameters.hdbscan_params.min_cluster_size,hyperparameters.hdbscan_params.min_samples,hyperparameters.hdbscan_params.metric,hyperparameters.hdbscan_params.prediction_data,hyperparameters.bertopic_params.language,hyperparameters.bertopic_params.top_n_words,hyperparameters.bertopic_params.calculate_probabilities,hyperparameters.bertopic_params.nr_topics,model_type
0,0.077081,0.528683,-0.203793,0.151528,0.91,0.969984,0.023805,bt_nr_topics_10,all-MiniLM-L6-v2,"[1, 1]",...,,15,5,euclidean,True,english,10,True,11,bertopic
1,0.04682,0.475084,-0.195191,-0.528989,0.89,0.977722,0.018713,bt_nr_topics_20,all-MiniLM-L6-v2,"[1, 1]",...,,15,5,euclidean,True,english,10,True,21,bertopic
2,0.050121,0.485897,-0.226812,-0.33314,0.856667,0.980851,0.018655,bt_nr_topics_30,all-MiniLM-L6-v2,"[1, 1]",...,,15,5,euclidean,True,english,10,True,31,bertopic
3,0.054031,0.504927,-0.284736,-0.37137,0.8575,0.98468,0.012906,bt_nr_topics_40,all-MiniLM-L6-v2,"[1, 1]",...,,15,5,euclidean,True,english,10,True,41,bertopic
4,0.056782,0.503027,-0.324919,-0.17552,0.834,0.981808,0.016836,bt_nr_topics_50,all-MiniLM-L6-v2,"[1, 1]",...,,15,5,euclidean,True,english,10,True,51,bertopic
5,0.049155,0.495749,-0.292939,-0.321552,0.811667,0.987319,0.012328,bt_nr_topics_60,all-MiniLM-L6-v2,"[1, 1]",...,,15,5,euclidean,True,english,10,True,61,bertopic
6,0.035656,0.494164,-0.355744,-0.730663,0.854286,0.990005,0.009103,bt_nr_topics_70,all-MiniLM-L6-v2,"[1, 1]",...,,15,5,euclidean,True,english,10,True,71,bertopic
7,0.026905,0.471419,-0.368478,-0.799957,0.84625,0.989786,0.009218,bt_nr_topics_80,all-MiniLM-L6-v2,"[1, 1]",...,,15,5,euclidean,True,english,10,True,81,bertopic
8,0.039652,0.494436,-0.354966,-0.606937,0.834444,0.993317,0.006657,bt_nr_topics_90,all-MiniLM-L6-v2,"[1, 1]",...,,15,5,euclidean,True,english,10,True,91,bertopic
9,0.035192,0.48498,-0.405524,-0.654158,0.823,0.994207,0.005076,bt_nr_topics_100,all-MiniLM-L6-v2,"[1, 1]",...,,15,5,euclidean,True,english,10,True,101,bertopic


In [15]:
# CTM log history: flattened entries of the CTM result's 'log_history', as a DataFrame

log_history_ctm_df = get_log_history_df('ctm')
log_history_ctm_df

Unnamed: 0,c_npmi,c_v,u_mass,c_uci,topic_diversity,inverted_rbo,pairwise_jaccard_similarity,model_name,hyperparameters.sbert_params.model_name_or_path,hyperparameters.countvect_params.max_features,...,hyperparameters.ctm_params.hidden_sizes,hyperparameters.ctm_params.dropout,hyperparameters.ctm_params.lr,hyperparameters.ctm_params.momentum,hyperparameters.ctm_params.solver,hyperparameters.ctm_params.num_epochs,hyperparameters.ctm_params.n_components,hyperparameters.ctm_params.bow_size,hyperparameters.ctm_params.contextual_size,model_type
0,-0.020749,0.411303,-0.005676,-0.882963,0.88,0.973116,0.020723,ctm_n_components_10_sb_model_name_or_path_all-...,all-MiniLM-L6-v2,2000,...,"[100, 100]",0.2,0.002,0.99,adam,25,10,2000,384,ctm
1,-0.027462,0.428263,-0.002675,-1.057831,0.84,0.96512,0.025531,ctm_n_components_10_sb_model_name_or_path_all-...,all-mpnet-base-v2,2000,...,"[100, 100]",0.2,0.002,0.99,adam,25,10,2000,768,ctm
2,0.01007,0.474674,-0.042634,-0.449961,0.825,0.972084,0.018925,ctm_n_components_20_sb_model_name_or_path_all-...,all-MiniLM-L6-v2,2000,...,"[100, 100]",0.2,0.002,0.99,adam,25,20,2000,384,ctm
3,-0.00204,0.467539,-0.055554,-0.773707,0.82,0.973696,0.017878,ctm_n_components_20_sb_model_name_or_path_all-...,all-mpnet-base-v2,2000,...,"[100, 100]",0.2,0.002,0.99,adam,25,20,2000,768,ctm
4,0.020888,0.492191,-0.04092,-0.35211,0.79,0.977195,0.013935,ctm_n_components_30_sb_model_name_or_path_all-...,all-MiniLM-L6-v2,2000,...,"[100, 100]",0.2,0.002,0.99,adam,25,30,2000,384,ctm
5,0.027265,0.478484,-0.040654,-0.164879,0.77,0.976139,0.017264,ctm_n_components_30_sb_model_name_or_path_all-...,all-mpnet-base-v2,2000,...,"[100, 100]",0.2,0.002,0.99,adam,25,30,2000,768,ctm
6,0.02392,0.495583,-0.045026,-0.294131,0.725,0.979828,0.014193,ctm_n_components_40_sb_model_name_or_path_all-...,all-MiniLM-L6-v2,2000,...,"[100, 100]",0.2,0.002,0.99,adam,25,40,2000,384,ctm
7,0.015132,0.484001,-0.04997,-0.469364,0.6875,0.975694,0.017803,ctm_n_components_40_sb_model_name_or_path_all-...,all-mpnet-base-v2,2000,...,"[100, 100]",0.2,0.002,0.99,adam,25,40,2000,768,ctm
8,0.013462,0.468815,-0.064452,-0.465961,0.646,0.974424,0.018273,ctm_n_components_50_sb_model_name_or_path_all-...,all-MiniLM-L6-v2,2000,...,"[100, 100]",0.2,0.002,0.99,adam,25,50,2000,384,ctm
9,0.014382,0.470818,-0.045673,-0.413583,0.602,0.965784,0.023317,ctm_n_components_50_sb_model_name_or_path_all-...,all-mpnet-base-v2,2000,...,"[100, 100]",0.2,0.002,0.99,adam,25,50,2000,768,ctm


Then combine the results of all the models into one table, keeping only the metrics, the model type and the model name.

The model name can then be used as the key to look up the hyperparameters of a model in its own log history table.

In [16]:
# Column names of the metrics are the values of the METRICS enum members.
metrics_names = [metric.value for metric in METRICS]

interested_columns = [*metrics_names, 'model_type', 'model_name']

# Stack the three per-model log histories, then keep only the columns of interest.
log_history_df = pd.concat(
    (log_history_lda_df, log_history_bertopic_df, log_history_ctm_df)
)
log_history_df = log_history_df[interested_columns]
log_history_df

Unnamed: 0,u_mass,c_v,c_uci,c_npmi,topic_diversity,inverted_rbo,pairwise_jaccard_similarity,model_type,model_name
0,-2.567145,0.441637,0.187376,0.028072,0.79,0.922509,0.049419,lda,num_topics_10
1,-3.050097,0.430911,0.111961,0.029617,0.795,0.964144,0.02561,lda,num_topics_20
2,-3.510934,0.410309,-0.196823,0.016055,0.803333,0.978585,0.016541,lda,num_topics_30
3,-3.731234,0.391909,-0.389883,0.006415,0.8,0.984359,0.013185,lda,num_topics_40
4,-4.009474,0.372949,-0.635204,-0.005462,0.832,0.986784,0.010724,lda,num_topics_50
5,-4.106303,0.360629,-0.790823,-0.012428,0.863333,0.993053,0.005261,lda,num_topics_60
6,-4.323799,0.339959,-1.095375,-0.026427,0.862857,0.990434,0.007968,lda,num_topics_70
7,-4.371363,0.326944,-1.221434,-0.033008,0.865,0.991463,0.007147,lda,num_topics_80
8,-4.539877,0.316953,-1.508052,-0.044946,0.886667,0.993834,0.00564,lda,num_topics_90
9,-4.597848,0.313269,-1.709517,-0.053376,0.882,0.993633,0.005934,lda,num_topics_100


In [17]:
# With the combined table, the runs of all models can be ranked by any metric.

# Sort by c_v, highest first.
log_history_df.sort_values(by='c_v', ascending=False)

Unnamed: 0,u_mass,c_v,c_uci,c_npmi,topic_diversity,inverted_rbo,pairwise_jaccard_similarity,model_type,model_name
0,-0.203793,0.528683,0.151528,0.077081,0.91,0.969984,0.023805,bertopic,bt_nr_topics_10
3,-0.284736,0.504927,-0.37137,0.054031,0.8575,0.98468,0.012906,bertopic,bt_nr_topics_40
4,-0.324919,0.503027,-0.17552,0.056782,0.834,0.981808,0.016836,bertopic,bt_nr_topics_50
5,-0.292939,0.495749,-0.321552,0.049155,0.811667,0.987319,0.012328,bertopic,bt_nr_topics_60
6,-0.045026,0.495583,-0.294131,0.02392,0.725,0.979828,0.014193,ctm,ctm_n_components_40_sb_model_name_or_path_all-...
8,-0.354966,0.494436,-0.606937,0.039652,0.834444,0.993317,0.006657,bertopic,bt_nr_topics_90
6,-0.355744,0.494164,-0.730663,0.035656,0.854286,0.990005,0.009103,bertopic,bt_nr_topics_70
4,-0.04092,0.492191,-0.35211,0.020888,0.79,0.977195,0.013935,ctm,ctm_n_components_30_sb_model_name_or_path_all-...
2,-0.226812,0.485897,-0.33314,0.050121,0.856667,0.980851,0.018655,bertopic,bt_nr_topics_30
9,-0.405524,0.48498,-0.654158,0.035192,0.823,0.994207,0.005076,bertopic,bt_nr_topics_100


In [18]:
# Sort by c_npmi, highest first.

log_history_df.sort_values(by='c_npmi', ascending=False)

Unnamed: 0,u_mass,c_v,c_uci,c_npmi,topic_diversity,inverted_rbo,pairwise_jaccard_similarity,model_type,model_name
0,-0.203793,0.528683,0.151528,0.077081,0.91,0.969984,0.023805,bertopic,bt_nr_topics_10
4,-0.324919,0.503027,-0.17552,0.056782,0.834,0.981808,0.016836,bertopic,bt_nr_topics_50
3,-0.284736,0.504927,-0.37137,0.054031,0.8575,0.98468,0.012906,bertopic,bt_nr_topics_40
2,-0.226812,0.485897,-0.33314,0.050121,0.856667,0.980851,0.018655,bertopic,bt_nr_topics_30
5,-0.292939,0.495749,-0.321552,0.049155,0.811667,0.987319,0.012328,bertopic,bt_nr_topics_60
1,-0.195191,0.475084,-0.528989,0.04682,0.89,0.977722,0.018713,bertopic,bt_nr_topics_20
8,-0.354966,0.494436,-0.606937,0.039652,0.834444,0.993317,0.006657,bertopic,bt_nr_topics_90
6,-0.355744,0.494164,-0.730663,0.035656,0.854286,0.990005,0.009103,bertopic,bt_nr_topics_70
9,-0.405524,0.48498,-0.654158,0.035192,0.823,0.994207,0.005076,bertopic,bt_nr_topics_100
1,-3.050097,0.430911,0.111961,0.029617,0.795,0.964144,0.02561,lda,num_topics_20


In [19]:
# Sort by inverted RBO (an indicator of diversity), highest first; the higher the better.

log_history_df.sort_values(by='inverted_rbo', ascending=False)

Unnamed: 0,u_mass,c_v,c_uci,c_npmi,topic_diversity,inverted_rbo,pairwise_jaccard_similarity,model_type,model_name
9,-0.405524,0.48498,-0.654158,0.035192,0.823,0.994207,0.005076,bertopic,bt_nr_topics_100
8,-4.539877,0.316953,-1.508052,-0.044946,0.886667,0.993834,0.00564,lda,num_topics_90
9,-4.597848,0.313269,-1.709517,-0.053376,0.882,0.993633,0.005934,lda,num_topics_100
8,-0.354966,0.494436,-0.606937,0.039652,0.834444,0.993317,0.006657,bertopic,bt_nr_topics_90
5,-4.106303,0.360629,-0.790823,-0.012428,0.863333,0.993053,0.005261,lda,num_topics_60
7,-4.371363,0.326944,-1.221434,-0.033008,0.865,0.991463,0.007147,lda,num_topics_80
6,-4.323799,0.339959,-1.095375,-0.026427,0.862857,0.990434,0.007968,lda,num_topics_70
6,-0.355744,0.494164,-0.730663,0.035656,0.854286,0.990005,0.009103,bertopic,bt_nr_topics_70
7,-0.368478,0.471419,-0.799957,0.026905,0.84625,0.989786,0.009218,bertopic,bt_nr_topics_80
5,-0.292939,0.495749,-0.321552,0.049155,0.811667,0.987319,0.012328,bertopic,bt_nr_topics_60


In [21]:
# Sort by pairwise Jaccard similarity, highest first. This is an indicator of
# similarity, i.e. the inverse of diversity, so higher values mean more similar
# topics; hence it is not that useful for ranking here.

log_history_df.sort_values(by='pairwise_jaccard_similarity', ascending=False)

Unnamed: 0,u_mass,c_v,c_uci,c_npmi,topic_diversity,inverted_rbo,pairwise_jaccard_similarity,model_type,model_name
0,-2.567145,0.441637,0.187376,0.028072,0.79,0.922509,0.049419,lda,num_topics_10
19,-0.093074,0.454698,-0.418214,0.011204,0.38,0.943371,0.038724,ctm,ctm_n_components_100_sb_model_name_or_path_all...
18,-0.104318,0.453353,-0.490329,0.008264,0.362,0.942139,0.035605,ctm,ctm_n_components_100_sb_model_name_or_path_all...
17,-0.092806,0.454922,-0.491434,0.008228,0.397778,0.94459,0.035509,ctm,ctm_n_components_90_sb_model_name_or_path_all-...
16,-0.07928,0.46454,-0.297793,0.017141,0.414444,0.947884,0.035019,ctm,ctm_n_components_90_sb_model_name_or_path_all-...
14,-0.069809,0.464751,-0.321359,0.0156,0.45625,0.950133,0.033186,ctm,ctm_n_components_80_sb_model_name_or_path_all-...
15,-0.065462,0.47381,-0.370981,0.016224,0.45625,0.956212,0.030803,ctm,ctm_n_components_80_sb_model_name_or_path_all-...
13,-0.071685,0.475107,-0.321426,0.017227,0.494286,0.961957,0.028028,ctm,ctm_n_components_70_sb_model_name_or_path_all-...
11,-0.081013,0.474235,-0.478883,0.01448,0.51,0.963509,0.026739,ctm,ctm_n_components_60_sb_model_name_or_path_all-...
12,-0.079787,0.463851,-0.522707,0.010283,0.484286,0.959116,0.026629,ctm,ctm_n_components_70_sb_model_name_or_path_all-...
