In [1]:
import os
import pathlib

import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Render matplotlib figures inline, below the cell that creates them.
%matplotlib inline
# Ask the inline backend for 'retina' (high-resolution) figure output.
%config InlineBackend.figure_format = 'retina'

In [3]:
# Default figure size and resolution for every figure created in this notebook.
fig_width = 6.9  # inches
fig_height = 3.5  # inches
fig_dpi = 350

# Global matplotlib style. Each key appears exactly once: the original literal
# listed 'axes.labelsize' and 'axes.titlesize' twice, and since the last
# occurrence of a duplicated dict key wins, the effective values were 10 and 12.
# Those effective values are the ones kept here.
plt.rcParams.update({
    # Figure
    'figure.figsize': (fig_width, fig_height),
    'figure.dpi': fig_dpi,

    # Fonts
    'font.size': 12,

    # Axes and grid
    'axes.labelsize': 10,
    'axes.titlesize': 12,
    'axes.linewidth': 1,
    'axes.grid': True,
    'grid.linestyle': ':',
    'grid.linewidth': 1,
    'grid.color': 'gray',

    # Ticks
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,

    # Legend
    'legend.fontsize': 8,
    'legend.frameon': True,
    'legend.framealpha': 0.8,
    'legend.fancybox': False,
    'legend.edgecolor': 'gray',
    'legend.facecolor': 'white',
    'legend.borderaxespad': 0.5,
    'legend.borderpad': 0.4,
    'legend.labelspacing': 0.5,

    # Lines
    'lines.linewidth': 2.0,
    'lines.markersize': 2,
})

In [4]:
# Root folder holding the datasets. The original absolute path only exists on one
# machine, so it can be overridden with the DATASETS_DIR environment variable;
# when the variable is unset the original location is used unchanged.
datasets_dir = pathlib.Path(
    os.environ.get("DATASETS_DIR", "/export/usuarios_ml4ds/lbartolome/Datasets")
)

# Raw validation results of the CORDIS models.
df_cordis = pd.read_csv(datasets_dir / "CORDIS" / "models_val_mallet" / "val_results.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/export/usuarios_ml4ds/lbartolome/Datasets/CORDIS/models_val_mallet/val_results.csv'

In [None]:
# Mean and standard deviation of cohr and disp_perc for each value of ntopics.
#
# The summary is stored back into df_cordis (the plotting cell reads that name),
# so the raw 'cohr' / 'disp_perc' columns no longer exist afterwards. The guard
# makes re-running this cell a no-op instead of a KeyError.
if 'cohr_mean' not in df_cordis.columns:
    # Named aggregation produces the flat column names directly, so there is no
    # order-dependent manual renaming of a MultiIndex.
    df_cordis = (
        df_cordis
        .groupby('ntopics')
        .agg(
            cohr_mean=('cohr', 'mean'),
            cohr_std=('cohr', 'std'),
            disp_perc_mean=('disp_perc', 'mean'),
            disp_perc_std=('disp_perc', 'std'),
        )
        .reset_index()
    )

df_cordis

In [None]:
# Root folder holding the datasets. The original absolute path only exists on one
# machine, so it can be overridden with the DATASETS_DIR environment variable;
# when the variable is unset the original location is used unchanged.
datasets_dir = pathlib.Path(
    os.environ.get("DATASETS_DIR", "/export/usuarios_ml4ds/lbartolome/Datasets")
)

# Raw validation results of the Cancer models.
df_cancer = pd.read_csv(datasets_dir / "Cancer" / "models_val_mallet" / "val_results.csv")

In [None]:
# Mean and standard deviation of cohr and disp_perc for each value of ntopics.
#
# The summary is stored back into df_cancer (the plotting cell reads that name),
# so the raw 'cohr' / 'disp_perc' columns no longer exist afterwards. The guard
# makes re-running this cell a no-op instead of a KeyError.
if 'cohr_mean' not in df_cancer.columns:
    # Named aggregation produces the flat column names directly, so there is no
    # order-dependent manual renaming of a MultiIndex.
    df_cancer = (
        df_cancer
        .groupby('ntopics')
        .agg(
            cohr_mean=('cohr', 'mean'),
            cohr_std=('cohr', 'std'),
            disp_perc_mean=('disp_perc', 'mean'),
            disp_perc_std=('disp_perc', 'std'),
        )
        .reset_index()
    )

df_cancer

In [None]:
# Root folder holding the datasets. The original absolute path only exists on one
# machine, so it can be overridden with the DATASETS_DIR environment variable;
# when the variable is unset the original location is used unchanged.
datasets_dir = pathlib.Path(
    os.environ.get("DATASETS_DIR", "/export/usuarios_ml4ds/lbartolome/Datasets")
)

# Raw validation results of the S2CS-AI models.
df_ai = pd.read_csv(datasets_dir / "S2CS-AI" / "models_val_mallet" / "val_results.csv")

In [None]:
# Mean and standard deviation of cohr and disp_perc for each value of ntopics.
#
# The summary is stored back into df_ai (the plotting cell reads that name),
# so the raw 'cohr' / 'disp_perc' columns no longer exist afterwards. The guard
# makes re-running this cell a no-op instead of a KeyError.
if 'cohr_mean' not in df_ai.columns:
    # Named aggregation produces the flat column names directly, so there is no
    # order-dependent manual renaming of a MultiIndex.
    df_ai = (
        df_ai
        .groupby('ntopics')
        .agg(
            cohr_mean=('cohr', 'mean'),
            cohr_std=('cohr', 'std'),
            disp_perc_mean=('disp_perc', 'mean'),
            disp_perc_std=('disp_perc', 'std'),
        )
        .reset_index()
    )

df_ai

In [None]:
# Three-panel figure, one panel per dataset. Each panel shows, against ntopics,
# the mean of cohr on the left y-axis and the mean of disp_perc on a twin right
# y-axis, both with +/- one standard deviation error bars.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(20.5, 4.8))
dfs = [df_cordis, df_cancer, df_ai]
titles = ["CORDIS", "S2-Cancer", "S2-AI"]

# NOTE(review): the three lists below are not used by this figure (the series
# colours are fixed in the errorbar calls). They are kept only in case another
# cell reads them -- confirm and delete if not.
y_repr = ['cohrs_cv', 'cohrs_npmi', 'entropies']
y_labels = ['Coherence CV', 'Coherence NPMI', 'Entropy']
colors = ["#2D6187", "#28ABB9", "#387838"]

for df, ax, title in zip(dfs, axs.flat, titles):

    # Left axis: cohr mean +/- std.
    ax.errorbar(
        df['ntopics'],
        df['cohr_mean'],
        yerr=df['cohr_std'],
        fmt='x-',
        ecolor='black',
        capsize=4,
        color='#36AE7C',
        label='$C_{NPMI}$')

    # Right (twin) axis: disp_perc mean +/- std.
    ax2 = ax.twinx()
    ax2.errorbar(
        df['ntopics'],
        df['disp_perc_mean'],
        yerr=df['disp_perc_std'],
        fmt='x-',
        ecolor='black',
        capsize=4,
        color='#187498',
        label='DP')

    ax.grid(True)
    # The x values come from the 'ntopics' column (presumably the number of
    # topics of each model), so the previous 'Topic ID' label was misleading.
    ax.set_xlabel('Number of topics')
    ax.set_title(title)
    ax2.grid(True)

# Every panel draws the same two series, so the handles of the last panel are
# enough to build one legend shared by the whole figure.
handles_all, labels_all = ax.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()

fig.legend(handles_all+handles2, labels_all+labels2, loc='upper center', ncol=2, bbox_to_anchor=(0.511, 1.001),
           frameon=True, shadow=False)

save_fig = "images/var_mallets.png"
# savefig does not create missing folders; make sure the target one exists.
pathlib.Path(save_fig).parent.mkdir(parents=True, exist_ok=True)
fig.savefig(save_fig, dpi='figure', bbox_inches='tight')