In [1]:
import pandas as pd

In [2]:
def format_res(df: pd.DataFrame, n_splits: int, cluster_outliers: str):
    """Summarise one results frame as a flat row of means and standard errors.

    Args:
        df: Frame with the columns 'precision', 'recall', 'f1' and 'time'.
        n_splits: Passed through unchanged as the first element of the row.
        cluster_outliers: Passed through unchanged as the second element.

    Returns:
        A 10-tuple: (n_splits, cluster_outliers) followed by the
        (mean, standard error of the mean) pair of each metric, in the order
        precision, recall, f1, time.
    """
    row = [n_splits, cluster_outliers]
    for metric in ('precision', 'recall', 'f1', 'time'):
        values = df[metric]
        # Series.sem() is the standard error of the mean.
        row.extend((values.mean(), values.sem()))
    return tuple(row)

In [3]:
def get_res(n_splits, cluster_outliers):
    """Load the results CSV for one configuration and summarise it.

    The file name is built from both arguments and resolved relative to the
    current working directory; the loaded frame is handed to ``format_res``,
    whose return value is returned unchanged.
    """
    csv_name = f'res_newout_{n_splits}_{cluster_outliers}.csv'
    results = pd.read_csv(csv_name)
    return format_res(results, n_splits, cluster_outliers)

In [4]:
# Column layout of the summary table; the order matches the tuple returned by
# format_res.
result_columns = [
    'n_splits', 'cluster_outliers',
    'precision_mean', 'precision_se',
    'recall_mean', 'recall_se',
    'f1_mean', 'f1_se',
    'time_mean', 'time_se',
]

# Collect one summary row per configuration and build the frame in a single
# step, rather than enlarging an empty frame row by row with .loc (which
# re-allocates on every insert and relies on dtype inference from the first
# inserted row).
result_rows = []
for n_splits in [50, 10, 100]:
    for cluster_outliers in ['all']:
        result_rows.append(get_res(n_splits, cluster_outliers))

df_res = pd.DataFrame(result_rows, columns=result_columns)

In [5]:
# Rank the configurations: highest mean precision first, mean f1 as tie-breaker.
df_res.sort_values(
    by=['precision_mean', 'f1_mean'],
    ascending=False,
)

Unnamed: 0,n_splits,cluster_outliers,precision_mean,precision_se,recall_mean,recall_se,f1_mean,f1_se,time_mean,time_se
1,10,all,0.969811,0.001007,0.824517,0.001719,0.89127,0.001063,11.789393,0.014898
0,50,all,0.968683,0.000624,0.898456,0.001658,0.932239,0.000823,11.351364,0.022142
2,100,all,0.967663,0.000138,0.909846,0.001504,0.937858,0.000777,11.266368,0.035009


In [6]:
# Single source of truth for the split sizes, keyed by n_splits. Each value is
# the pair (first term, second term) of the "A+B" label built by get_splits.
# NOTE(review): presumably A is the size of the initial set and B the number of
# samples added per split -- confirm against the experiment setup.
_SPLIT_SIZES = {
    10: (30372, 3375),
    50: (33072, 675),
    100: (33409, 338),
}


def get_n_added(n_splits):
    """Return the second term of the split label for ``n_splits``.

    Returns None when ``n_splits`` is not one of the known configurations
    (10, 50, 100).
    """
    sizes = _SPLIT_SIZES.get(n_splits)
    if sizes is None:
        return None
    return sizes[1]


def get_splits(n_splits):
    """Return the split label 'A+B' (e.g. '30372+3375') for ``n_splits``.

    Returns None when ``n_splits`` is not one of the known configurations
    (10, 50, 100).
    """
    sizes = _SPLIT_SIZES.get(n_splits)
    if sizes is None:
        return None
    n_base, n_added = sizes
    return f'{n_base}+{n_added}'


def get_splits_percentage(n_splits):
    """Return the ratio B / A of the split sizes for ``n_splits``.

    The value is a fraction (not yet multiplied by 100). Returns None when
    ``n_splits`` is not one of the known configurations (10, 50, 100).
    """
    sizes = _SPLIT_SIZES.get(n_splits)
    if sizes is None:
        return None
    n_base, n_added = sizes
    return n_added / n_base

# Attach the split description columns derived from n_splits.
df_res['splits'] = df_res['n_splits'].apply(get_splits)
# The ratio is converted to a percentage string with two decimals, e.g. '11.11%'.
df_res['splits_percentage'] = (
    df_res['n_splits']
    .apply(get_splits_percentage)
    .apply(lambda ratio: round(ratio * 100, 2))
    .astype(str)
    + '%'
)
df_res['n_added'] = df_res['n_splits'].apply(get_n_added)

In [7]:
# Build the "mean+/-se" display strings: the three quality metrics are shown
# with 4 decimals, the timing with 2.
for metric, decimals in (("precision", 4), ("recall", 4), ("f1", 4), ("time", 2)):
    mean_text = df_res[f"{metric}_mean"].round(decimals).astype(str)
    se_text = df_res[f"{metric}_se"].round(decimals).astype(str)
    df_res[metric] = mean_text + "+/-" + se_text

In [8]:
# Show the enriched table in the same ranking: mean precision, then mean f1, descending.
df_res.sort_values(
    by=["precision_mean", "f1_mean"],
    ascending=False,
)

Unnamed: 0,n_splits,cluster_outliers,precision_mean,precision_se,recall_mean,recall_se,f1_mean,f1_se,time_mean,time_se,splits,splits_percentage,n_added,precision,recall,f1,time
1,10,all,0.969811,0.001007,0.824517,0.001719,0.89127,0.001063,11.789393,0.014898,30372+3375,11.11%,3375,0.9698+/-0.001,0.8245+/-0.0017,0.8913+/-0.0011,11.79+/-0.01
0,50,all,0.968683,0.000624,0.898456,0.001658,0.932239,0.000823,11.351364,0.022142,33072+675,2.04%,675,0.9687+/-0.0006,0.8985+/-0.0017,0.9322+/-0.0008,11.35+/-0.02
2,100,all,0.967663,0.000138,0.909846,0.001504,0.937858,0.000777,11.266368,0.035009,33409+338,1.01%,338,0.9677+/-0.0001,0.9098+/-0.0015,0.9379+/-0.0008,11.27+/-0.04


In [9]:
# Switch to the table headers used in the export: 'cluster_outliers' becomes
# 'OB' and 'time' becomes 'time [s]'.
df_res.rename(columns={'cluster_outliers': 'OB', 'time': 'time [s]'}, inplace=True)

# Print the ranked results as a LaTeX tabular, limited to the display columns
# and without the index.
latex_columns = ['splits', 'OB', 'precision', 'recall', 'f1', 'time [s]']
ranked = df_res.sort_values(by=["precision_mean", "f1_mean"], ascending=False)
print(ranked.to_latex(index=False, columns=latex_columns))

\begin{tabular}{llllll}
\toprule
    splits &  OB &       precision &          recall &              f1 &     time [s] \\
\midrule
30372+3375 & all &  0.9698+/-0.001 & 0.8245+/-0.0017 & 0.8913+/-0.0011 & 11.79+/-0.01 \\
 33072+675 & all & 0.9687+/-0.0006 & 0.8985+/-0.0017 & 0.9322+/-0.0008 & 11.35+/-0.02 \\
 33409+338 & all & 0.9677+/-0.0001 & 0.9098+/-0.0015 & 0.9379+/-0.0008 & 11.27+/-0.04 \\
\bottomrule
\end{tabular}



  print(df_res.sort_values(by=["precision_mean", "f1_mean"], ascending=False).to_latex(index=False, columns=['splits', 'OB', 'precision', 'recall', 'f1', 'time [s]']))
