In [66]:
import pandas as pd
import json
import os

# Result directories to aggregate; a file named `json_file` is read from each.
paths = [
    "./metrics_20231201_1820",
    "metrics_20231201_2122",
    "./metrics_20231202_0043",
    "./metrics_20231202_0115",
    "./metrics_20231202_0137",
]
# Name of the report file looked up inside every directory in `paths`.
json_file = "BERTopic.json"

# Report file name without its extension (loop-invariant, so computed once).
model_name = os.path.splitext(os.path.basename(json_file))[0]

# Not populated in this cell; kept so the name stays defined as before.
All_results = pd.DataFrame()

# Collect one flat record per result entry and build the DataFrame once at the
# end. Growing a frame row by row copies it on every iteration, and
# `DataFrame._append` is a private pandas method that may change or disappear.
records = []
for path in paths:
    with open(os.path.join(path, json_file), encoding="utf-8") as f:
        data = json.load(f)

    # The first element of each report is metadata, not a result entry.
    for entry in data[1:]:
        metrics = entry.get('metrics', {})
        params = entry.get('params', {})
        # On a key clash the value from `params` overrides the one from `metrics`.
        records.append({**metrics, **params})

# One row per result entry, with a fresh 0..n-1 index.
df = pd.DataFrame(records)


In [70]:

from sklearn.preprocessing import MinMaxScaler

def rank_models(df, weights=None):
    """Score each row by a weighted sum of min-max normalized metrics and rank.

    Every metric column is rescaled with ``MinMaxScaler``, multiplied by its
    weight and summed into ``norm_score``. Larger metric values are treated as
    better for every metric, and rows are returned best-first.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain one column per key of ``weights``. A ``norm_score``
        column is written into ``df`` itself (the input is modified in place).
    weights : dict, optional
        Maps metric column name to its weight. By default 'cv', 'npmi', 'uci'
        and 'umass' get 12.5% each and 'diversity' gets 50%.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` sorted by ``norm_score`` in descending order.
    """
    if weights is None:
        weights = {'cv': 0.125, 'npmi': 0.125, 'uci': 0.125, 'umass': 0.125, 'diversity': 0.5}
    metrics = list(weights)

    scaler = MinMaxScaler()
    # Reuse df's index: the scaler returns a bare array, and a fresh 0..n-1
    # index would make the assignment below align scores to the wrong rows
    # (or to NaN) whenever df is filtered or otherwise re-indexed.
    df_normalized = pd.DataFrame(
        scaler.fit_transform(df[metrics]),
        columns=metrics,
        index=df.index,
    )

    # Weighted sum across the metric columns (weights align on column name).
    df['norm_score'] = (df_normalized * pd.Series(weights)).sum(axis=1)
    return df.sort_values('norm_score', ascending=False)

# Score and rank every row of `df`; the result is sorted best-first.
# Alternative kept for reference: rank only rows with result_num_topic > 2.
# ranked_df = rank_models(df[df['result_num_topic']>2])
ranked_df = rank_models(df)


In [71]:
# Keep the first row for each distinct norm_score (in the current row order),
# then retain only rows whose result_num_topic is greater than 5.
ranked_df = (
    ranked_df
    .drop_duplicates(subset=['norm_score'], keep='first')
    .loc[lambda frame: frame['result_num_topic'] > 5]
)

In [72]:
# Summary statistics of the combined score.
ranked_df['norm_score'].describe()

count    113.000000
mean       0.312566
std        0.053565
min        0.220721
25%        0.276536
50%        0.309969
75%        0.345145
max        0.524284
Name: norm_score, dtype: float64

In [73]:
# Summary statistics of the 'cv' metric.
ranked_df['cv'].describe()

count    113.000000
mean       0.545578
std        0.098593
min        0.374615
25%        0.450662
50%        0.548598
75%        0.630324
max        0.881067
Name: cv, dtype: float64

In [74]:
# Summary statistics of the 'diversity' metric.
ranked_df['diversity'].describe()

count    113.000000
mean       0.377391
std        0.095788
min        0.162626
25%        0.312245
50%        0.381250
75%        0.420202
max        0.721429
Name: diversity, dtype: float64

In [75]:
# Keep rows with cv >= 0.63 or diversity >= 0.42 that also reach
# norm_score >= 0.34, then show the first ten.
# NOTE(review): the cut-offs look like rounded 75th-percentile values of the
# describe() outputs for these columns; confirm before reusing on new data.
third_qs = ranked_df[
    (ranked_df['cv'].ge(0.63) | ranked_df['diversity'].ge(0.42))
    & ranked_df['norm_score'].ge(0.34)
]
third_qs.head(10)

Unnamed: 0,cv,npmi,uci,umass,diversity,result_num_topic,nr_topics,min_topic_size,embedding_model,UMAP_PARAMS,representation_model,n_gram_range,norm_score
120,0.620206,0.105419,0.58692,-2.215467,0.712245,49,50,20,paraphrase-multilingual-MiniLM-L12-v2,,KeyBERT,"[1, 1]",0.524284
121,0.881067,0.746184,3.63506,-0.77336,0.17551,49,50,20,paraphrase-multilingual-MiniLM-L12-v2,,KeyBERT,"[1, 3]",0.507693
125,0.721687,0.492682,1.784781,-1.672897,0.367857,28,50,50,paraphrase-multilingual-MiniLM-L12-v2,,KeyBERT,"[1, 3]",0.459008
113,0.444603,-0.013678,-1.294874,-2.736177,0.721429,28,100,50,/home/kw215/Documents/research_codes/Topic-mod...,,KeyBERT,"[1, 1]",0.399934
37,0.671368,0.409213,0.956615,-1.634041,0.342857,49,50,20,paraphrase-multilingual-MiniLM-L12-v2,"{'n_neighbors': 10, 'n_components': 15}",KeyBERT,"[1, 3]",0.399612
17,0.671449,0.39622,0.992131,-1.752581,0.334694,49,50,10,paraphrase-multilingual-MiniLM-L12-v2,"{'n_neighbors': 10, 'n_components': 8}",KeyBERT,"[1, 3]",0.388436
13,0.68047,0.421106,1.146711,-1.695362,0.312245,49,50,10,paraphrase-multilingual-MiniLM-L12-v2,"{'n_neighbors': 10, 'n_components': 15}",KeyBERT,"[1, 3]",0.387534
65,0.714617,0.481751,1.241273,-1.650036,0.269697,99,100,10,paraphrase-multilingual-MiniLM-L12-v2,"{'n_neighbors': 10, 'n_components': 8}",KeyBERT,"[1, 3]",0.384332
107,0.450662,-0.002483,-0.979668,-2.826241,0.674194,31,50,50,/home/kw215/Documents/research_codes/Topic-mod...,,KeyBERT,"[1, 1]",0.379007
118,0.56156,0.241548,0.339896,-2.045126,0.452,25,100,50,/home/kw215/Documents/research_codes/Topic-mod...,,KeyBERT,"[1, 2]",0.37867


In [42]:
# Print the untruncated embedding_model value of every selected row, one per line.
for embedding_name in third_qs['embedding_model']:
    print(embedding_name)

paraphrase-multilingual-MiniLM-L12-v2
/home/kw215/Documents/research_codes/Topic-modeling-evaluations/evaluation/tsdae_all_MiniLM-v12-v2-drug-submissions
/home/kw215/Documents/research_codes/Topic-modeling-evaluations/evaluation/tsdae_all_MiniLM-v12-v2-drug-submissions
all-MiniLM-L12-v2
paraphrase-multilingual-MiniLM-L12-v2
paraphrase-multilingual-MiniLM-L12-v2
all-MiniLM-L12-v2
/home/kw215/Documents/research_codes/Topic-modeling-evaluations/evaluation/tsdae_all-MiniLM-L12-v2-3epoch
paraphrase-multilingual-MiniLM-L12-v2
/home/kw215/Documents/research_codes/Topic-modeling-evaluations/evaluation/tsdae_roberta_base_drug_submissions
paraphrase-multilingual-MiniLM-L12-v2
/home/kw215/Documents/research_codes/Topic-modeling-evaluations/evaluation/tsdae_all-MiniLM-L12-v2-3epoch
paraphrase-multilingual-MiniLM-L12-v2
/home/kw215/Documents/research_codes/Topic-modeling-evaluations/evaluation/tsdae_all-MiniLM-L12-v2-3epoch
/home/kw215/Documents/research_codes/Topic-modeling-evaluations/evaluation/

In [8]:
# Write the ranked table to 'formatted_report.csv' in the current working
# directory. The row index is written as the first column (to_csv default).
ranked_df.to_csv('formatted_report.csv')

In [21]:
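# Displayed output is from the 20231201_0329 run; the call is commented out,
# presumably so that re-running the cell does not replace that output.
# ranked_df[:20]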

Unnamed: 0,cv,npmi,uci,umass,diversity,nr_topics,min_topic_size,embedding_model,UMAP_PARAMS,representation_model,n_gram_range,norm_score
441,0.664816,0.384094,1.526866,-1.055159,0.6,100.0,20.0,all-mpnet-base-v2,"{'n_neighbors': 10, 'n_components': 8, 'random...",KeyBERT,"[1, 2]",0.858036
153,0.664816,0.384094,1.526866,-1.055159,0.6,50.0,20.0,all-mpnet-base-v2,"{'n_neighbors': 10, 'n_components': 8, 'random...",KeyBERT,"[1, 2]",0.858036
9,0.664816,0.384094,1.526866,-1.055159,0.6,30.0,20.0,all-mpnet-base-v2,"{'n_neighbors': 10, 'n_components': 8, 'random...",KeyBERT,"[1, 2]",0.858036
297,0.664816,0.384094,1.526866,-1.055159,0.6,70.0,20.0,all-mpnet-base-v2,"{'n_neighbors': 10, 'n_components': 8, 'random...",KeyBERT,"[1, 2]",0.858036
449,0.685239,0.418587,0.793103,-1.783475,0.276404,100.0,20.0,paraphrase-multilingual-MiniLM-L12-v2,"{'n_neighbors': 15, 'n_components': 8, 'random...",KeyBERT,"[1, 2]",0.736071
167,0.582348,0.356075,0.658621,-2.644791,0.6,50.0,20.0,paraphrase-multilingual-MiniLM-L12-v2,"{'n_neighbors': 15, 'n_components': 5, 'random...",MMR,"[1, 2]",0.71409
455,0.582348,0.356075,0.658621,-2.644791,0.6,100.0,20.0,paraphrase-multilingual-MiniLM-L12-v2,"{'n_neighbors': 15, 'n_components': 5, 'random...",MMR,"[1, 2]",0.71409
23,0.582348,0.356075,0.658621,-2.644791,0.6,30.0,20.0,paraphrase-multilingual-MiniLM-L12-v2,"{'n_neighbors': 15, 'n_components': 5, 'random...",MMR,"[1, 2]",0.71409
311,0.582348,0.356075,0.658621,-2.644791,0.6,70.0,20.0,paraphrase-multilingual-MiniLM-L12-v2,"{'n_neighbors': 15, 'n_components': 5, 'random...",MMR,"[1, 2]",0.71409
305,0.648289,0.364139,0.5016,-2.045688,0.328986,70.0,20.0,paraphrase-multilingual-MiniLM-L12-v2,"{'n_neighbors': 15, 'n_components': 8, 'random...",KeyBERT,"[1, 2]",0.688053


In [35]:
from IPython.display import display

# Assumes ranked_df (the scored DataFrame built in an earlier cell) is in scope

def highlight_max(s):
    """Return one CSS string per element of ``s``, bolding the maximum.

    Elements equal to ``s.max()`` get 'font-weight: bold'; all others get an
    empty string. The result is a plain list with the same length as ``s``.
    """
    peak = s.max()
    styles = []
    for is_peak in (s == peak):
        styles.append('font-weight: bold' if is_peak else '')
    return styles

# Bold the maximum of each metric column (Styler.apply works column-wise by
# default); columns outside `subset` are left unstyled.
styled_df = ranked_df.style.apply(
    highlight_max,
    subset=['cv', 'npmi', 'uci', 'umass', 'diversity'],
)

# Render the styled table as the cell output.
display(styled_df)


Unnamed: 0,Model,Num_Topics,cv,npmi,uci,umass,diversity,norm_score
9,BERTopic,50.0,0.473774,-0.001526,-0.74721,-1.97399,0.9,0.978676
7,BERTopic,30.0,0.425856,-0.003743,-0.377035,-1.687519,0.9,0.925084
11,BERTopic,100.0,0.414853,-0.028862,-0.684031,-1.993915,0.9,0.861184
8,BERTopic,50.0,0.465695,-0.023856,-1.933982,-4.457895,0.716327,0.761289
10,BERTopic,100.0,0.462193,-0.028288,-2.227761,-4.797577,0.622222,0.69577
6,BERTopic,30.0,0.416512,-0.037993,-2.040508,-4.750124,0.772414,0.680437
3,CTM,30.0,0.432272,-0.098422,-3.527642,-7.010648,0.783333,0.525453
1,LDA,50.0,0.356134,-0.056055,-1.59115,-3.08321,0.458,0.513601
4,CTM,50.0,0.443918,-0.112409,-3.725134,-6.77687,0.734,0.508461
5,CTM,100.0,0.430465,-0.084084,-3.137435,-5.680304,0.553,0.502998
