In [1]:
import numpy as np
import pandas as pd
from functools import reduce
import pathlib
import matplotlib.pyplot as plt
import gzip
import scipy.sparse as sparse

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) 

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams['xtick.labelsize'] = 24
plt.rcParams['ytick.labelsize'] = 24
plt.rcParams['font.size'] = 22
plt.rcParams['axes.titlesize'] = 24
plt.rcParams['axes.labelsize'] = 24
plt.rcParams['legend.fontsize'] = 24
plt.rcParams['lines.markersize'] = 13
plt.style.use('seaborn-white')
plt.rcParams['lines.linewidth'] = 4

### Paths to HTMs

In [2]:
path_models_ctm_cordis = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Datasets/CORDIS/models_htm_ctm")
path_models_mallet_cordis = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Datasets/CORDIS/models_htm")
path_models_ctm_s2cs = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Datasets/S2CS/models_htm_ctm")
path_models_mallet_s2cs = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Datasets/S2CS/models_htm")
path_models = path_models_mallet
if path_models == path_models_ctm_cordis:
    corpus = "cordis"
    model_type = "ctm"
elif path_models == path_models_mallet_cordis:
    corpus = "cordis"
    model_type = "mallet"
elif path_models == path_models_ctm_s2cs:
    corpus = "s2cs"
    model_type = "ctm"
elif path_models == path_models_mallet_s2cs:
    corpus = "s2cs"
    model_type = "mallet"

NameError: name 'path_models_mallet' is not defined

### Get root models

In [None]:
dfs = []
for entry in path_models.iterdir():
    # check if it is a root model
    if "root" in entry.as_posix():
        # Path to the root model
        path = entry
        
        # Thr and exp_tpc do not apply for the root model
        thr = -1
        exp_tpc = -1
        
        # Experiment iteration
        iter_ = int(entry.as_posix().split("model_")[1].split("_")[0])
        
        # Size of the topics
        alphas = np.load(path.joinpath('TMmodel/alphas.npy')).tolist()
        
        # Coherences (CV and NPMI)
        cohrs = np.load(path.joinpath('TMmodel/topic_coherence.npy')).tolist()
        if len(cohrs) > len(alphas):
            cohrs_cv = cohrs[0:len(alphas)]
            cohrs_npmi = cohrs[len(alphas):]
        elif len(cohrs) == len(alphas):
            cohrs_cv = cohrs
            cohrs_npmi = [0] * len(alphas)
        
        # Topics' entropies
        entropies = np.load(path.joinpath('TMmodel/topic_entropy.npy')).tolist()
        
        # Ids of the topics
        tpc_ids = np.arange(0,len(alphas),1)
        
        # Corpus size
        if path.joinpath('corpus.txt').is_file():
            corpus = [line.rsplit(' 0 ')[1].strip() for line in open(
                path.joinpath('corpus.txt'), encoding="utf-8").readlines()]
            size = len(corpus)
        elif path.joinpath('corpus.parquet').is_dir():
            dfc = pd.read_parquet(path.joinpath('corpus.parquet'))
            size = len(dfc)
            
        # Create dataframe for the root model
        root_tpc_df = pd.DataFrame(
            {'iter': [iter_] * len(alphas),
             'path': [path] * len(alphas),
             'cohrs_cv': cohrs_cv,
             'cohrs_npmi': cohrs_npmi,
             'entropies': entropies,
             'alphas': alphas,
             'tpc_ids': tpc_ids,
             'thr': [thr] * len(alphas),
             'exp_tpc': [exp_tpc] * len(alphas),
             'size': [size] * len(alphas),
             'tr_tpcs': [0] * len(alphas),
            })
        
        # Append to the list of dataframes to concatenate them
        dfs.append(root_tpc_df)
df = pd.concat(dfs)
df = df.sort_values(by=['iter'])
df

In [None]:
if df.iloc[0].path.joinpath('corpus.txt').is_file():
    corpus = [line.rsplit(' 0 ')[1].strip() for line in open(
                df.iloc[0].path.joinpath('corpus.txt'), encoding="utf-8").readlines()]
    root_size = len(corpus)
elif df.iloc[0].path.joinpath('corpus.parquet').is_dir():
    dfc = pd.read_parquet(df.iloc[0].path.joinpath('corpus.parquet'))
    root_size = len(dfc)
root_size

### Get submodels

In [None]:
# Iter over each root model (according to its corresponding iteration, iter)
concats = [df]
not_finished = []
for el in df.iter.unique():
    path_root = df[df.iter == el].iloc[0].path
    for entry in path_root.iterdir():
        if entry.joinpath('TMmodel/topic_coherence.npy').is_file():
        
            if "ws" in entry.as_posix():
                thr = 0
                size = 0
            else:
                thr = float(entry.as_posix().split("thr_")[1].split("_")[0])
                
                if entry.joinpath('corpus.txt').is_file():
                    corpus = [line.rsplit(' 0 ')[1].strip() for line in open(
                                entry.joinpath('corpus.txt'), encoding="utf-8").readlines()]
                    size = len(corpus)
                elif entry.joinpath('corpus.parquet').is_dir():
                    dfc = pd.read_parquet(entry.joinpath('corpus.parquet'))
                    size = len(dfc)
                size = size * 100 / root_size

            # get topic from which the submodel is generated
            exp_tpc = int(entry.as_posix().split("from_topic_")[1].split("_")[0])
            
            
            # Size of the topics
            alphas = np.load(entry.joinpath('TMmodel/alphas.npy')).tolist()
        
            # Alphas submodel is the mean of the cohr of its topics
            alpha = np.mean(alphas)
            
            # Coherences (CV and NPMI)
            cohrs = np.load(entry.joinpath('TMmodel/topic_coherence.npy')).tolist()
            if len(cohrs) > len(alphas):
                cohrs_cv = cohrs[0:len(alphas)]
                cohrs_npmi = cohrs[len(alphas):]
            elif len(cohrs) == len(alphas):
                cohrs_cv = cohrs
                cohrs_npmi = [0] * len(alphas)
            
            # cohr submodel is the mean of the cohr of its topics
            cohr_cv = np.mean(cohrs_cv)
            cohr_npmi = np.mean(cohrs_npmi)
            
            # Topics' entropies
            entropy = np.mean(np.load(entry.joinpath('TMmodel/topic_entropy.npy')).tolist())
            
            tr_tpcs = int(entry.as_posix().split("train_with_")[1].split("_")[0])
            
            # add entry of submodel to dataframe
            root_tpc_df = pd.DataFrame(
            {'iter': [el],
             'path': [entry],
             'cohrs_cv': [cohr_cv],
             'cohrs_npmi': [cohr_npmi],
             'entropies': [entropy],
             'alphas': [alpha],
             'tpc_ids': [exp_tpc],
             'thr': [thr],
             'exp_tpc': [exp_tpc],
             'size': [size],
             'tr_tpcs': [tr_tpcs]
            })
            concats.append(root_tpc_df)
        else:
            not_finished.append(entry)
df = pd.concat(concats)

### Generate graphs for root models

In [None]:
df_root = df[df.thr==-1]
df_root

In [None]:
df1 = df_root.groupby('tpc_ids')[['cohrs_cv', 'cohrs_npmi', 'entropies', 'alphas']].mean()
df1 = df1.rename(columns={'cohrs_cv': 'cohrs_cv_mean',
                          'cohrs_npmi': 'cohrs_npmi_mean',
                          'entropies': 'entropies_mean',
                          'alphas': 'alphas_mean',
                         })

df2 = df_root.groupby('tpc_ids')[['cohrs_cv', 'cohrs_npmi', 'entropies']].var()
df2 = df2.rename(columns={'cohrs_cv': 'cohrs_cv_var',
                          'cohrs_npmi': 'cohrs_npmi_var',
                          'entropies': 'entropies_var',
                         })

df_root_plot = pd.concat([df1, df2], axis=1, join='inner')

df_root_plot['tpc_ids'] = np.arange(0,len(df_root_plot),1)
df_root_plot

In [None]:
fig, axs = plt.subplots(nrows=1, ncols=3, figsize = (55, 10),  dpi=120)
y_repr = ['cohrs_cv', 'cohrs_npmi', 'entropies']
y_labels = ['Coherence CV', 'Coherence NPMI', 'Entropy']
colors = ["#2D6187", "#28ABB9", "#387838"]
maxs_mins = [(0.98,1.015),(0.95,1.04),(0.99,1.003)]

labels_all = []
handles_all = []
for yrepr, ylabel, color, max_min, ax in zip(y_repr, y_labels, colors, maxs_mins, axs.flat):
    
    df_root_plot.plot.bar(
        ax=ax,
        y=yrepr + "_mean",
        yerr = yrepr + "_var",
        label=ylabel, use_index=True, stacked=True,
        color = color,
        capsize=4)
    ax2 = df_root_plot.plot(
            ax=ax, 
            y='alphas_mean', kind = 'line', linewidth=4, label='Size', color="#A9294F", use_index=True, secondary_y=True)

    ax.grid()
    ax.set_xlabel('Topic ID')
    ax.set_ylabel(ylabel)
    ax.set_ylim([df_root_plot[yrepr + "_mean"].values.min()*max_min[0], df_root_plot[yrepr + "_mean"].values.max()*max_min[1]])
    ax2.set_ylabel('Size')
    ax.get_legend().remove()
    handles, labels = ax.get_legend_handles_labels()
    labels_all += labels
    handles_all += handles

handles2, labels2 = ax2.get_legend_handles_labels()

fig.legend(handles_all+handles2, labels_all+labels2, loc='upper center', ncol=4, bbox_to_anchor=(0.5, 1.03),
           frameon=True, shadow=True)

save_fig = "Images/root_model_" + corpus + "_" + model_type + ".png"
fig.savefig(save_fig, dpi='figure', bbox_inches='tight')

### Get graphs for HTM-WS submodels

In [None]:
df_ws = df[df.thr==0]

concat = []
for el in df_ws.tr_tpcs.unique():
    
    df1 = df_ws[df_ws.tr_tpcs==el].groupby('exp_tpc')[['cohrs_cv', 'cohrs_npmi', 'entropies']].mean()
    df1 = df1.rename(columns={'cohrs_cv': 'cohrs_cv_mean' + "_" + str(el),
                              'cohrs_npmi': 'cohrs_npmi_mean' + "_" + str(el),
                              'entropies': 'entropies_mean' + "_" + str(el)
                             })
    concat.append(df1)

    df2 = df_ws[df_ws.tr_tpcs==el].groupby('exp_tpc')[['cohrs_cv', 'cohrs_npmi', 'entropies']].var()
    df2 = df2.rename(columns={'cohrs_cv': 'cohrs_cv_var' + "_" + str(el),
                              'cohrs_npmi': 'cohrs_npmi_var' + "_" + str(el),
                              'entropies': 'entropies_var' + "_" + str(el)
                             })
    concat.append(df2)

df_ws_plot = pd.concat(concat, axis=1, join='inner').reset_index()

df_ws_plot

In [None]:
fig, axs = plt.subplots(nrows=1, ncols=3, figsize = (50, 8), dpi=120)
y_repr = ['cohrs_cv', 'cohrs_npmi', 'entropies']
y_labels = ['Coherence CV', 'Coherence NPMI', 'Entropy']
colors = [['#2D6187','#3573A0','#83B3D6'],
          ['#28ABB9','#2dc1d0','#8adde6'],
          ['#387838', '#82AB82','#ABBEAB']]
maxs_mins = [(0.98,1.015),(0.95,1.02),(0.99,1.003)]

for yrepr, ylabel, color, max_min, ax in zip(y_repr, y_labels, colors, maxs_mins, axs.flat):
    aux = [df_ws_plot[yrepr + "_var_6"].values,
           df_ws_plot[yrepr + "_var_8"].values, 
           df_ws_plot[yrepr + "_var_10"].values]
    y_aux = [yrepr + "_mean_6", yrepr + "_mean_8", yrepr + "_mean_10"]
    df_ws_plot.plot.bar(
        x='exp_tpc',
        ax=ax,
        y= y_aux,
        yerr = aux,
        label=['6 tpcs', '8 tpcs', '10 tpcs'],
        color=color,
        capsize=4)
    
    ax.grid()
    ax.set_xlabel('Topic ID')
    ax.set_ylabel(ylabel)
    ax.set_ylim([df_ws_plot[y_aux].values.min()*max_min[0], df_ws_plot[y_aux].values.max()*max_min[1]])
    ax.set_xticklabels(ax.get_xticks(), rotation = 0)
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.2),
          frameon=True, shadow=True, ncol=3)
    
plt.show()

save_fig = "Images/htm_ws_" + corpus + "_" + model_type + ".png"
fig.savefig(save_fig, dpi='figure', bbox_inches='tight')

In [None]:
prop_min = (600*100)/root_size
prop_max = 30

### Get graphs for HTM-DS submodels

In [None]:
df_ds = df[df.thr>0]
df_ds

In [None]:
metrics_display = ['cohrs_cv', 'cohrs_npmi', 'entropies']
y_labels = ['Coherence CV', 'Coherence NPMI', 'Entropy']
colors = [['#2D6187','#3573A0','#83B3D6'],
          ['#28ABB9','#2dc1d0','#8adde6'],
          ['#387838', '#82AB82','#ABBEAB']]
maxs_mins = [(0.98,1.05),(-0.98,1.2),(0.98,1.02)]

for metric,label,color,max_min in zip(metrics_display,y_labels,colors,maxs_mins):
    
    fig, axs = plt.subplots(nrows=5, ncols=2, figsize = (50, 50), sharex=True, sharey=True,  dpi=120)

    for tpc, ax in zip(sorted(df_ds.exp_tpc.unique()), axs.flat):
        aux_df_ds = df_ds[df_ds.exp_tpc==tpc]
        concat = []
        for el in aux_df_ds.tr_tpcs.unique():

            df1 = aux_df_ds[aux_df_ds.tr_tpcs==el].groupby('thr')[[metric,'size']].mean()
            df1 = df1.rename(columns={metric: metric + '_mean_' + str(el),
                                      'size':  'size_mean' + "_" + str(el)})
            concat.append(df1)

            df2 = aux_df_ds[aux_df_ds.tr_tpcs==el].groupby('thr')[[metric,'size']].var()
            df2 = df2.rename(columns={metric: metric + '_var_' + str(el),
                                      'size':  'size_var' + "_" + str(el)})
            concat.append(df2)

        aux_df_ds_plot = pd.concat(concat, axis=1, join='inner').reset_index()
                
        aux = [aux_df_ds_plot[metric + "_var_6"].values,
               aux_df_ds_plot[metric + "_var_8"].values, 
               aux_df_ds_plot[metric + "_var_10"].values]
            
        y_aux = [metric + "_mean_6", metric + "_mean_8", metric + "_mean_10"]

        aux_df_ds_plot.plot.bar(
                ax=ax,
                x='thr',
                y=y_aux,#['cohrs_mean_6','cohrs_mean_8','cohrs_mean_10'],
                yerr = aux,
                label=['6 tpcs', '8 tpcs', '10 tpcs'], use_index=True,
                color=color,
                capsize=4, rot=0)

        ax2 = aux_df_ds_plot.plot(
                ax=ax, 
                y='size_mean_6', 
                kind = 'line',
                color='#A9294F',
                label='Nr docs', 
                secondary_y=True,
                linewidth=3)

        ax2.hlines(y=[prop_max,prop_min], xmin=-1, xmax=len(aux_df_ds_plot.thr.unique()),
                   colors='purple', linestyles='--', lw=3,
                   label='Nr docs max / Nr docs min')
        ax2.set_ylim([0, 32])
        ax.set_ylim([aux_df_ds_plot[y_aux].values.min()*(max_min[0]), aux_df_ds_plot[y_aux].values.max()*max_min[1]])
        ax.set_xlabel('Threshold')
        ax.set_title(f'Submodels generated from topic {tpc}')
        ax.get_legend().remove()
        ax.grid()

    handles, labels = ax.get_legend_handles_labels()
    handles2, labels2 = ax2.get_legend_handles_labels()

    fig.legend(handles+handles2, labels+labels2, loc='upper center', ncol=6, bbox_to_anchor=(0.5, 1.02),
              frameon=True, shadow=True)
    
    
    fig.text(-0.01, 0.5, label, va='center', rotation='vertical')
    fig.text(1.01, 0.5,'% of docs in the original corpus', va='center', rotation='vertical')
    fig.tight_layout()