In [1]:
import pandas as pd

In [2]:
def format_res(df: pd.DataFrame, n_splits: int, cluster_outliers: str):
    """Summarise one result frame as a flat tuple of column means.

    Args:
        df: Frame holding the ``precision``, ``recall``, ``f1`` and ``time``
            columns.
        n_splits: Split count of the run; passed through unchanged.
        cluster_outliers: Outlier setting of the run; passed through unchanged.

    Returns:
        ``(n_splits, cluster_outliers, precision_mean, recall_mean, f1_mean,
        time_mean)``.

    Raises:
        KeyError: If one of the four metric columns is missing from ``df``.
    """
    # Order matters: it fixes the layout of the returned tuple.
    metric_columns = ('precision', 'recall', 'f1', 'time')
    averages = [df[column].mean() for column in metric_columns]
    return (n_splits, cluster_outliers, *averages)

In [3]:
def get_res(n_splits, cluster_outliers):
    """Load the CSV for one configuration and return its summary tuple.

    The file ``res_dmdate_{n_splits}_{cluster_outliers}.csv`` is read relative
    to the current working directory and handed to ``format_res``.
    """
    csv_path = f'res_dmdate_{n_splits}_{cluster_outliers}.csv'
    return format_res(pd.read_csv(csv_path), n_splits, cluster_outliers)

In [4]:
# Gather one summary tuple per (n_splits, cluster_outliers) configuration and
# build the frame in a single call. Growing an empty frame with
# ``df.loc[len(df)] = row`` re-allocates on every row and relies on dtype
# inference during enlargement; constructing from records avoids both.
records = []
for n_splits in range(2, 11):
    for cluster_outliers in ['all', 'skip']:
        records.append(get_res(n_splits, cluster_outliers))

df_res = pd.DataFrame(
    records,
    columns=['n_splits', 'cluster_outliers', 'precision_mean', 'recall_mean', 'f1_mean', 'time_mean'],
)

In [5]:
# Rank the configurations: highest mean precision first, mean F1 breaks ties.
df_res.sort_values(
    by=['precision_mean', 'f1_mean'],
    ascending=False,
)

Unnamed: 0,n_splits,cluster_outliers,precision_mean,recall_mean,f1_mean,time_mean
2,3,all,0.98445,0.794402,0.879274,19.899144
4,4,all,0.983759,0.818533,0.893572,18.237953
8,6,all,0.981982,0.841699,0.906445,17.228561
0,2,all,0.981417,0.815637,0.89088,21.586592
6,5,all,0.980813,0.838803,0.904266,17.352589
10,7,all,0.979459,0.874517,0.924018,16.919664
12,8,all,0.976369,0.877413,0.92425,16.451477
16,10,all,0.974468,0.88417,0.927126,16.798816
14,9,all,0.972558,0.855212,0.910118,16.48013
1,2,skip,0.002807,0.824324,0.005594,8.253551


For reference:

Baseline + 1by1 | t_s for 1by1 

* n_splits = 10 --> 30372 + 3375 | 61.696611
* n_splits = 50 --> 33072 + 675 | 14.362957
* n_splits = 100 --> 33409 + 338 | 7.342449

In [6]:
def get_splits(n_splits):
    """Return the hard-coded split label for ``n_splits``, or None if unmapped.

    Only 2, 3 and 4 have a label; every other value yields None.
    NOTE(review): the labels look like "<rows per split>x<number of splits>";
    confirm before extending the table to more split counts.
    """
    labels = {
        2: '16874x2',
        3: '11249x3',
        4: '8437x4',
    }
    return labels.get(n_splits)

# Label every row with its split layout; counts without a label become None.
df_res['splits'] = df_res['n_splits'].apply(get_splits)

In [7]:
# String versions of the averaged metrics for the printed table: scores are
# rounded to 4 decimals, the runtime to 2.
# NOTE: astype(str) drops trailing zeros (e.g. 0.982 rather than 0.9820), so
# the rendered values do not all have the same number of decimals.
for metric, decimals in {"precision": 4, "recall": 4, "f1": 4, "time": 2}.items():
    df_res[metric] = df_res[f"{metric}_mean"].round(decimals).astype(str)

In [8]:
# Same ranking as before, now including the split labels and string columns.
df_res.sort_values(
    by=["precision_mean", "f1_mean"],
    ascending=False,
)

Unnamed: 0,n_splits,cluster_outliers,precision_mean,recall_mean,f1_mean,time_mean,splits,precision,recall,f1,time
2,3,all,0.98445,0.794402,0.879274,19.899144,11249x3,0.9844,0.7944,0.8793,19.9
4,4,all,0.983759,0.818533,0.893572,18.237953,8437x4,0.9838,0.8185,0.8936,18.24
8,6,all,0.981982,0.841699,0.906445,17.228561,,0.982,0.8417,0.9064,17.23
0,2,all,0.981417,0.815637,0.89088,21.586592,16874x2,0.9814,0.8156,0.8909,21.59
6,5,all,0.980813,0.838803,0.904266,17.352589,,0.9808,0.8388,0.9043,17.35
10,7,all,0.979459,0.874517,0.924018,16.919664,,0.9795,0.8745,0.924,16.92
12,8,all,0.976369,0.877413,0.92425,16.451477,,0.9764,0.8774,0.9243,16.45
16,10,all,0.974468,0.88417,0.927126,16.798816,,0.9745,0.8842,0.9271,16.8
14,9,all,0.972558,0.855212,0.910118,16.48013,,0.9726,0.8552,0.9101,16.48
1,2,skip,0.002807,0.824324,0.005594,8.253551,16874x2,0.0028,0.8243,0.0056,8.25


Observation:

* 1by1 works better with a small splits_percentage
* re-clustering all the outliers significantly increases the precision 
* clustering only the "new" outliers has no noticeable effect on the precision

Conclusion:

The LOW PRECISION is mainly driven by one of two cases: either an identity is
not detected and therefore ends up in the outliers (-1) cluster, or the identity
is "detected" but falls into a "to be discarded" cluster, i.e., a large cluster
with obvious outliers / multiple identities.


* 1by1 is a good solution to add elements to the baseline only if a small number
  of pictures are added to the dataset
* clustering only the new outliers is not useful in this case; however, we
  believe that, depending on the situation, it could be useful, e.g., if we add
  10+ pictures of a new identity to the dataset
* This situation may not have been given enough attention in our test bench, and
  this could reflect a real life situation 
* Re-clustering the outliers is necessary if we want to maintain a high precision
  in the baseline

In [9]:
# Give the table its final headers: "cluster_outliers" becomes "OB" and the
# formatted "time" column gains its unit. The rename is done in place, so
# df_res keeps the new names after this cell.
df_res.rename(columns={'cluster_outliers': 'OB', 'time': 'time [s]'}, inplace=True)

# Print the ranked results as a LaTeX table, restricted to the display columns.
print(
    df_res
    .sort_values(by=["precision_mean", "f1_mean"], ascending=False)
    .to_latex(
        index=False,
        columns=['n_splits', 'OB', 'precision', 'recall', 'f1', 'time [s]'],
    )
)

\begin{tabular}{rlllll}
\toprule
 n\_splits &   OB & precision & recall &     f1 & time [s] \\
\midrule
        3 &  all &    0.9844 & 0.7944 & 0.8793 &     19.9 \\
        4 &  all &    0.9838 & 0.8185 & 0.8936 &    18.24 \\
        6 &  all &     0.982 & 0.8417 & 0.9064 &    17.23 \\
        2 &  all &    0.9814 & 0.8156 & 0.8909 &    21.59 \\
        5 &  all &    0.9808 & 0.8388 & 0.9043 &    17.35 \\
        7 &  all &    0.9795 & 0.8745 &  0.924 &    16.92 \\
        8 &  all &    0.9764 & 0.8774 & 0.9243 &    16.45 \\
       10 &  all &    0.9745 & 0.8842 & 0.9271 &     16.8 \\
        9 &  all &    0.9726 & 0.8552 & 0.9101 &    16.48 \\
        2 & skip &    0.0028 & 0.8243 & 0.0056 &     8.25 \\
        3 & skip &    0.0016 & 0.7847 & 0.0033 &     5.04 \\
        4 & skip &     0.001 & 0.8205 & 0.0021 &     3.99 \\
        5 & skip &    0.0009 &  0.778 & 0.0018 &     3.25 \\
        6 & skip &    0.0008 & 0.7654 & 0.0016 &      2.9 \\
        7 & skip &    0.0007 & 0.8031 & 0.

  print(df_res.sort_values(by=["precision_mean", "f1_mean"], ascending=False).to_latex(index=False, columns=['n_splits', 'OB', 'precision', 'recall', 'f1', 'time [s]']))
