In [None]:
import plotnine as p9
import pandas as pd

In [None]:
# Read one metrics CSV per diarization pipeline; the frames are unpacked in the
# same order as the paths are listed.
nemo, pyannote, pyaudio, sd, whisperx = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"\research_data\metrics\NeMo.csv",
        r"\research_data\metrics\pyannote.csv",
        r"\research_data\metrics\pyAudioAnalysis.csv",
        r"\research_data\metrics\simple_diarizer.csv",
        r"\research_data\metrics\WhisperX.csv",
    )
)

In [None]:
# Swap simple_diarizer's short spec codes for the display labels used later on.
sd['Specs'] = sd['Specs'].replace({
    'ecapa_ahc': 'ECAPA + AHC',
    'ecapa_sc': 'ECAPA + SC',
    'xvec_ahc': 'x-vectors + AHC',
    'xvec_sc': 'x-vectors + SC',
})
# Use the lower-case spelling of the pyannote name.
pyannote['Method'] = pyannote['Method'].replace({'PyAnnote': 'pyannote'})

In [None]:
# Give every frame the same 'Pipeline' / 'Version' / 'Method' columns before stacking.

# NeMo and pyAudioAnalysis: empty version string plus a fixed pipeline name.
nemo['Version'] = ''
nemo['Pipeline'] = 'NeMo'

pyaudio['Version'] = ''
pyaudio['Pipeline'] = 'pyAudioAnalysis'

# simple_diarizer: same as above, and its 'Specs' values become the 'Method'.
sd['Version'] = ''
sd['Pipeline'] = 'SD'
sd['Method'] = sd['Specs']
sd = sd.drop(columns=['Specs'])

# pyannote and WhisperX: their 'Method' column holds the pipeline name, and
# 'Version' is cast to text so it can be string-joined later.
pyannote = pyannote.rename(columns={'Method': 'Pipeline'})
pyannote['Version'] = pyannote['Version'].astype(str)

whisperx = whisperx.rename(columns={'Method': 'Pipeline'})
whisperx['Version'] = whisperx['Version'].astype(str)

In [None]:
# Stack the five per-pipeline frames row-wise. ignore_index=True gives the result
# a fresh 0..n-1 index; without it each frame keeps its own row labels, so the
# combined frame would carry duplicated labels.
df_all = pd.concat(
    [nemo, pyaudio, sd, pyannote, whisperx],
    axis=0,
    ignore_index=True,
)
# Shorten the run-together spelling of the This American Life corpus name.
df_all['Dataset'] = df_all['Dataset'].replace('ThisAmericanLife', 'TAL')

In [None]:
# Missing 'Set' / 'Method' entries become a single space so the string joins
# further down never meet a NaN.
df_all = df_all.fillna({'Set': " ", 'Method': " "})

In [None]:
# Replace the literal text 'Test' in the subset label with a single space
# (plain substring replacement, no regular expression needed).
df_all['Set'] = df_all['Set'].str.replace('Test', ' ', regex=False)

In [None]:
# Unify the remaining dataset spellings.
df_all['Dataset'] = df_all['Dataset'].replace({
    'This American Life': 'TAL',
    'MSDWILD': 'MSDWild',
})
df_all

In [None]:
# 'DataSet' = '<Dataset> <Set>'. Rows whose Set is a single space end up with two
# trailing spaces (e.g. 'AMI  '); later filters match on exactly those strings.
df_all['DataSet'] = [
    ' '.join(name_and_set)
    for name_and_set in zip(df_all['Dataset'], df_all['Set'])
]

In [None]:
# Scale the metric values by 100 and keep three decimals.
# Re-running this cell multiplies by 100 again.
df_all['Value'] = (df_all['Value'] * 100).round(3)

In [None]:
# 'Pipeline Specs' = '<Pipeline> <Version><Method>': pipeline and version are
# separated by one space, and the method is appended with no separator.
df_all['Pipeline Specs'] = [
    ''.join([' '.join([pipeline, version]), method])
    for pipeline, version, method in zip(
        df_all['Pipeline'], df_all['Version'], df_all['Method']
    )
]

In [None]:
# Make sure the combined label column is stored as text.
df_all = df_all.astype({'Pipeline Specs': str})

In [None]:
# Store 'Metric' as an ordered categorical so the metrics always appear in the
# order DER, JER, Coverage, Purity. Values outside this list become missing.
df_all['Metric'] = df_all['Metric'].astype(
    pd.CategoricalDtype(
        categories=["DER", "JER", "Coverage", "Purity"],
        ordered=True,
    )
)

In [None]:
# Everything below works only on rows whose 'Approach' is 'Baseline'.
df_all = df_all.loc[df_all['Approach'].eq('Baseline')]

## pyannote

In [None]:
# Spot check: pyannote rows for the 'AliMeeting Near' data set only.
# df_py is reassigned to all pyannote rows in the next cell.
df_py = df_all.loc[
    df_all['Pipeline'].eq('pyannote') & df_all['DataSet'].eq('AliMeeting Near')
]
df_py

In [None]:
# All pyannote rows, across every data set and version.
df_py = df_all.loc[df_all['Pipeline'].eq('pyannote')]

In [None]:
# Box plots of DER and JER per data set for pyannote, filled by version, with one
# facet row per metric and independent y-axes.
plot = (
    p9.ggplot(
        df_py[df_py['Metric'].isin(['DER', 'JER'])],
        p9.aes(x='DataSet', y='Value', fill='Version'),
    )
    + p9.theme_bw()
    + p9.geom_boxplot()
    + p9.xlab('Data Set')
    + p9.ylab('Value (%)')
    + p9.scale_fill_manual(values=['#ef8a62', '#67a9cf'])
    + p9.facet_wrap(facets='~Metric', ncol=1, scales='free_y')
    + p9.theme(
        legend_position='top',
        figure_size=(12, 8),
        legend_box_margin=-10,
        # NOTE(review): hjust=15 is carried over unchanged; confirm the intended justification.
        axis_text_x=p9.element_text(angle=20, hjust=15),
        text=p9.element_text(size=12, weight='bold'),
    )
)

# Plain string: the path has no placeholders, so no f-string is needed.
save_file = '\\research_data\\metrics\\DER_JER_pyannote.png'
plot.save(filename=save_file, dpi=300)

# Only a cell's last expression is rendered, so the figure goes last.
plot

## WhisperX

In [None]:
# All WhisperX rows, across every data set and version.
df_w = df_all.loc[df_all['Pipeline'].eq('WhisperX')]

In [None]:
# Box plots of DER and JER per data set for WhisperX, filled by version, with one
# facet row per metric and independent y-axes.
plot = (
    p9.ggplot(
        df_w[df_w['Metric'].isin(['DER', 'JER'])],
        p9.aes(x='DataSet', y='Value', fill='Version'),
    )
    + p9.theme_bw()
    + p9.geom_boxplot()
    + p9.xlab('Data Set')
    + p9.ylab('Value (%)')
    + p9.scale_fill_manual(values=['#ef8a62', '#67a9cf'])
    + p9.facet_wrap(facets='~Metric', ncol=1, scales='free_y')
    + p9.theme(
        legend_position='top',
        figure_size=(12, 8),
        legend_box_margin=-10,
        # NOTE(review): hjust=15 is carried over unchanged; confirm the intended justification.
        axis_text_x=p9.element_text(angle=20, hjust=15),
        text=p9.element_text(size=12, weight='bold'),
    )
)

# Plain string: the path has no placeholders, so no f-string is needed.
save_file = '\\research_data\\metrics\\DER_JER_WhisperX.png'
plot.save(filename=save_file, dpi=300)

# Only a cell's last expression is rendered, so the figure goes last.
plot

## simple_diarizer

In [None]:
# All simple_diarizer rows (pipeline label 'SD').
df_sd = df_all.loc[df_all['Pipeline'].eq('SD')]

### Ranked analysis

In [None]:
# simple_diarizer rows for the two error metrics, cut down to the columns the
# ranking needs. The copy lets a 'rank' column be added without touching df_all.
df_sd = df_all.loc[df_all['Pipeline'].eq('SD')]
ranked_performance = df_sd.loc[
    df_sd['Metric'].isin(['DER', 'JER']),
    ['DataSet', 'Method', 'File', 'Metric', 'Value'],
].copy()
ranked_performance

In [None]:
# Rank the rows inside each (data set, file, metric) group. The smallest value
# gets rank 1 and tied values share the lowest rank of the tie (method='min').
# observed=True is passed explicitly because 'Metric' is categorical; it does not
# change the ranks but avoids relying on the deprecated implicit default.
ranked_performance['rank'] = (
    ranked_performance
    .groupby(['DataSet', 'File', 'Metric'], observed=True)['Value']
    .rank(method='min', ascending=True)
    .astype(int)
)
ranked_performance

In [None]:
# Mean and standard deviation of the rank per (method, metric), rounded to three
# decimals. observed=True limits the groups to metrics that actually occur, so
# the categorical 'Metric' no longer produces empty groups for dropna() to remove.
# dropna() still removes any group whose std is undefined.
average_rank = (
    ranked_performance[['DataSet', 'Method', 'File', 'Metric', 'rank']]
    .groupby(['Method', 'Metric'], observed=True)
    .agg(rank_mean=('rank', 'mean'), rank_sd=('rank', 'std'))
    .round(3)
    .dropna()
    .reset_index()
)
# Text label of the form '<mean> ± <sd>' with both numbers rounded to two decimals.
average_rank['label'] = [
    str(round(mean_rank, 2)) + ' ± ' + str(round(rank_spread, 2))
    for mean_rank, rank_spread in zip(average_rank['rank_mean'], average_rank['rank_sd'])
]
average_rank

In [None]:
# Same selection as for the error metrics, now for Coverage and Purity.
ranked_performance = df_sd.loc[
    df_sd['Metric'].isin(['Coverage', 'Purity']),
    ['DataSet', 'Method', 'File', 'Metric', 'Value'],
].copy()
ranked_performance

In [None]:
# Rank the rows inside each (data set, file, metric) group with the largest value
# first (ascending=False). Tied values share the highest rank number of the tie
# (method='max').
# NOTE(review): the DER/JER ranking uses method='min'; confirm the different tie rule is intended.
# observed=True is passed explicitly because 'Metric' is categorical; it does not
# change the ranks but avoids relying on the deprecated implicit default.
ranked_performance['rank'] = (
    ranked_performance
    .groupby(['DataSet', 'File', 'Metric'], observed=True)['Value']
    .rank(method='max', ascending=False)
    .astype(int)
)
ranked_performance

In [None]:
# Spot check: the ranks for one AISHELL-4 recording. The data set label carries
# two trailing spaces, matching how the 'DataSet' column was built.
ranked_performance.loc[
    lambda frame: frame['DataSet'].eq('AISHELL-4  ') & frame['File'].eq('S_R004S01C01')
]

In [None]:
# Mean and standard deviation of the rank per (method, metric), rounded to three
# decimals. observed=True limits the groups to metrics that actually occur, so
# the categorical 'Metric' no longer produces empty groups for dropna() to remove.
# dropna() still removes any group whose std is undefined.
average_rank = (
    ranked_performance[['DataSet', 'Method', 'File', 'Metric', 'rank']]
    .groupby(['Method', 'Metric'], observed=True)
    .agg(rank_mean=('rank', 'mean'), rank_sd=('rank', 'std'))
    .round(3)
    .dropna()
    .reset_index()
)
# Text label of the form '<mean> ± <sd>' with both numbers rounded to two decimals.
average_rank['label'] = [
    str(round(mean_rank, 2)) + ' ± ' + str(round(rank_spread, 2))
    for mean_rank, rank_spread in zip(average_rank['rank_mean'], average_rank['rank_sd'])
]
average_rank

## NeMo

In [None]:
# NeMo rows, leaving out the two joint methods ('Joint' and 'Joint (ASR-based TS)').
df_nemo = df_all.loc[
    df_all['Pipeline'].eq('NeMo')
    & ~df_all['Method'].isin(['Joint', 'Joint (ASR-based TS)'])
]

In [None]:
# NeMo rows for DER and JER, cut down to the columns the ranking needs.
ranked_performance = df_nemo.loc[
    df_nemo['Metric'].isin(['DER', 'JER']),
    ['DataSet', 'Method', 'File', 'Metric', 'Value'],
].copy()

In [None]:
# Rank the rows inside each (data set, file, metric) group. The smallest value
# gets rank 1 and tied values share the lowest rank of the tie (method='min').
# observed=True is passed explicitly because 'Metric' is categorical; it does not
# change the ranks but avoids relying on the deprecated implicit default.
ranked_performance['rank'] = (
    ranked_performance
    .groupby(['DataSet', 'File', 'Metric'], observed=True)['Value']
    .rank(method='min', ascending=True)
    .astype(int)
)

In [None]:
# Mean and standard deviation of the rank per (method, metric), rounded to three
# decimals. observed=True limits the groups to metrics that actually occur, so
# the categorical 'Metric' no longer produces empty groups for dropna() to remove.
# dropna() still removes any group whose std is undefined.
average_rank = (
    ranked_performance[['DataSet', 'Method', 'File', 'Metric', 'rank']]
    .groupby(['Method', 'Metric'], observed=True)
    .agg(rank_mean=('rank', 'mean'), rank_sd=('rank', 'std'))
    .round(3)
    .dropna()
    .reset_index()
)
# Text label of the form '<mean> ± <sd>' with both numbers rounded to two decimals.
average_rank['label'] = [
    str(round(mean_rank, 2)) + ' ± ' + str(round(rank_spread, 2))
    for mean_rank, rank_spread in zip(average_rank['rank_mean'], average_rank['rank_sd'])
]
average_rank

In [None]:
# NeMo rows for Coverage and Purity, cut down to the columns the ranking needs.
ranked_performance = df_nemo.loc[
    df_nemo['Metric'].isin(['Coverage', 'Purity']),
    ['DataSet', 'Method', 'File', 'Metric', 'Value'],
].copy()

In [None]:
# Rank the rows inside each (data set, file, metric) group with the largest value
# first (ascending=False). Tied values share the highest rank number of the tie
# (method='max').
# NOTE(review): the DER/JER ranking uses method='min'; confirm the different tie rule is intended.
# observed=True is passed explicitly because 'Metric' is categorical; it does not
# change the ranks but avoids relying on the deprecated implicit default.
ranked_performance['rank'] = (
    ranked_performance
    .groupby(['DataSet', 'File', 'Metric'], observed=True)['Value']
    .rank(method='max', ascending=False)
    .astype(int)
)

In [None]:
# Mean and standard deviation of the rank per (method, metric), rounded to three
# decimals. observed=True limits the groups to metrics that actually occur, so
# the categorical 'Metric' no longer produces empty groups for dropna() to remove.
# dropna() still removes any group whose std is undefined.
average_rank = (
    ranked_performance[['DataSet', 'Method', 'File', 'Metric', 'rank']]
    .groupby(['Method', 'Metric'], observed=True)
    .agg(rank_mean=('rank', 'mean'), rank_sd=('rank', 'std'))
    .round(3)
    .dropna()
    .reset_index()
)
# Text label of the form '<mean> ± <sd>' with both numbers rounded to two decimals.
average_rank['label'] = [
    str(round(mean_rank, 2)) + ' ± ' + str(round(rank_spread, 2))
    for mean_rank, rank_spread in zip(average_rank['rank_mean'], average_rank['rank_sd'])
]
average_rank

In [None]:
# Data set labels kept for the NeMo summary. The trailing double spaces are part
# of the labels as built in the 'DataSet' column.
sets = [
    'AMI  ',
    'AliMeeting Near',
    'MSDWild Many',
    'RAMC  ',
    'TAL  ',
    'VoxConverse  ',
]
df_nemo_sub = df_nemo.loc[df_nemo['DataSet'].isin(sets)]
df_nemo_sub

In [None]:
# Mean and standard deviation of 'Value' per (data set, method, metric), rounded
# to three decimals. observed=True keeps only combinations that occur in the data;
# with the categorical 'Metric' and the implicit default, combinations that never
# occur would appear as all-NaN rows.
average_df = (
    df_nemo_sub
    .groupby(['DataSet', 'Method', 'Metric'], observed=True)
    .agg(Value_mean=('Value', 'mean'), Value_sd=('Value', 'std'))
    .round(3)
)
average_df

In [None]:
# Emit average_df through print(), i.e. as plain text rather than as the
# notebook's rich table rendering.
print(average_df)

## Comparison

In [None]:
# One configuration per pipeline for the cross-pipeline comparison. The labels
# must match 'Pipeline Specs' exactly, including the trailing spaces.
# .copy() makes this an independent frame, so the relabel below writes to it
# directly and not to a filtered slice of df_all (which triggers pandas'
# SettingWithCopyWarning).
df_all_comp = df_all[
    df_all['Pipeline Specs'].isin([
        'pyannote 3.1 ',
        'WhisperX 3.1 ',
        'NeMo Clustering',
        'SD ECAPA + AHC',
        'pyAudioAnalysis  ',
        'NeMo Joint (ASR-based TS)',
    ])
].copy()

# DER-only rows. This is taken before the relabel below, so it still carries the
# long 'NeMo Joint (ASR-based TS)' label.
df_all_der = df_all_comp[df_all_comp['Metric'] == 'DER']

# Shorten the joint NeMo label for the comparison tables.
df_all_comp.loc[
    df_all_comp['Pipeline Specs'] == 'NeMo Joint (ASR-based TS)',
    'Pipeline Specs',
] = 'NeMo Joint'

In [None]:
# Comparison rows for DER and JER, cut down to the columns the ranking needs.
ranked_performance = df_all_comp.loc[
    df_all_comp['Metric'].isin(['DER', 'JER']),
    ['DataSet', 'Pipeline Specs', 'File', 'Metric', 'Value'],
].copy()
ranked_performance

In [None]:
# Rank the rows inside each (data set, file, metric) group. The smallest value
# gets rank 1 and tied values share the lowest rank of the tie (method='min').
# observed=True is passed explicitly because 'Metric' is categorical; it does not
# change the ranks but avoids relying on the deprecated implicit default.
ranked_performance['rank'] = (
    ranked_performance
    .groupby(['DataSet', 'File', 'Metric'], observed=True)['Value']
    .rank(method='min', ascending=True)
    .astype(int)
)
ranked_performance

In [None]:
# Mean and standard deviation of the rank per (pipeline configuration, metric),
# rounded to three decimals. observed=True limits the groups to metrics that
# actually occur, so the categorical 'Metric' no longer produces empty groups for
# dropna() to remove. dropna() still removes any group whose std is undefined.
average_rank = (
    ranked_performance[['DataSet', 'Pipeline Specs', 'File', 'Metric', 'rank']]
    .groupby(['Pipeline Specs', 'Metric'], observed=True)
    .agg(rank_mean=('rank', 'mean'), rank_sd=('rank', 'std'))
    .round(3)
    .dropna()
    .reset_index()
)
# Text label of the form '<mean> ± <sd>' with both numbers rounded to two decimals.
average_rank['label'] = [
    str(round(mean_rank, 2)) + ' ± ' + str(round(rank_spread, 2))
    for mean_rank, rank_spread in zip(average_rank['rank_mean'], average_rank['rank_sd'])
]
average_rank

In [None]:
# Comparison rows for Coverage and Purity, cut down to the columns the ranking needs.
ranked_performance = df_all_comp.loc[
    df_all_comp['Metric'].isin(['Coverage', 'Purity']),
    ['DataSet', 'Pipeline Specs', 'File', 'Metric', 'Value'],
].copy()
ranked_performance

In [None]:
# Rank the rows inside each (data set, file, metric) group with the largest value
# first (ascending=False). Tied values share the highest rank number of the tie
# (method='max').
# NOTE(review): the DER/JER ranking uses method='min'; confirm the different tie rule is intended.
# observed=True is passed explicitly because 'Metric' is categorical; it does not
# change the ranks but avoids relying on the deprecated implicit default.
ranked_performance['rank'] = (
    ranked_performance
    .groupby(['DataSet', 'File', 'Metric'], observed=True)['Value']
    .rank(method='max', ascending=False)
    .astype(int)
)
ranked_performance

In [None]:
# Mean and standard deviation of the rank per (pipeline configuration, metric),
# rounded to three decimals. observed=True limits the groups to metrics that
# actually occur, so the categorical 'Metric' no longer produces empty groups for
# dropna() to remove. dropna() still removes any group whose std is undefined.
average_rank = (
    ranked_performance[['DataSet', 'Pipeline Specs', 'File', 'Metric', 'rank']]
    .groupby(['Pipeline Specs', 'Metric'], observed=True)
    .agg(rank_mean=('rank', 'mean'), rank_sd=('rank', 'std'))
    .round(3)
    .dropna()
    .reset_index()
)
# Text label of the form '<mean> ± <sd>' with both numbers rounded to two decimals.
average_rank['label'] = [
    str(round(mean_rank, 2)) + ' ± ' + str(round(rank_spread, 2))
    for mean_rank, rank_spread in zip(average_rank['rank_mean'], average_rank['rank_sd'])
]
average_rank

### Composite metric

In [None]:
# Composite diarization score per pipeline: the four per-metric numbers (DER, JER,
# Coverage, Purity) are combined with the same weight each. The same weighting is
# applied to the four standard deviations to get the reported spread.
# NOTE(review): a weighted sum of standard deviations is not in general the standard
# deviation of the weighted sum; confirm this is the intended spread measure.
METRIC_WEIGHT = 0.25


def _weighted_sum(der, jer, cov, pur):
    """Return the METRIC_WEIGHT-weighted sum of four per-metric numbers."""
    return (
        der * METRIC_WEIGHT
        + jer * METRIC_WEIGHT
        + cov * METRIC_WEIGHT
        + pur * METRIC_WEIGHT
    )


# Hard-coded inputs per pipeline: (means, stds), each ordered DER, JER, Coverage, Purity.
# NOTE(review): presumably copied by hand from the average_rank tables of the
# comparison section; re-check them whenever those tables change.
RANK_SUMMARY = {
    "NeMo Clustering": ((2.86, 3.16, 2.60, 3.23), (1.23, 1.40, 0.97, 1.54)),
    "NeMo Joint": ((3.32, 3.43, 2.23, 2.31), (1.21, 1.31, 1.05, 1.68)),
    "SD ECAPA + AHC": ((3.29, 3.57, 2.96, 3.93), (1.45, 1.57, 1.40, 1.59)),
    "pyannote 3.1": ((1.70, 1.63, 4.22, 3.70), (1.12, 1.02, 0.97, 1.51)),
    "pyAudioAnalysis": ((5.69, 4.91, 5.84, 4.58), (0.66, 1.42, 0.37, 1.58)),
    "WhisperX 3.1": ((3.92, 3.90, 3.23, 3.36), (1.27, 1.38, 1.31, 1.94)),
}

# pipeline name -> (composite mean, composite std)
composite_scores = {
    pipeline_name: (_weighted_sum(*means), _weighted_sum(*stds))
    for pipeline_name, (means, stds) in RANK_SUMMARY.items()
}

# Report in table order; every entry after the first is preceded by a blank line.
for position, (pipeline_name, (score_mean, score_std)) in enumerate(composite_scores.items()):
    print(pipeline_name if position == 0 else "\n" + pipeline_name)
    print(score_mean, "+-", score_std)

# Keep the individual result names available for any cell that reads them.
diarization_score_mean_clust, diarization_score_std_clust = composite_scores["NeMo Clustering"]
diarization_score_mean_joint, diarization_score_std_joint = composite_scores["NeMo Joint"]
diarization_score_mean_sd, diarization_score_std_sd = composite_scores["SD ECAPA + AHC"]
diarization_score_mean_pya, diarization_score_std_pya = composite_scores["pyannote 3.1"]
diarization_score_mean_pyAA, diarization_score_std_pyAA = composite_scores["pyAudioAnalysis"]
diarization_score_mean_w, diarization_score_std_w = composite_scores["WhisperX 3.1"]

## pyAudioAnalysis

In [None]:
# All pyAudioAnalysis rows.
df_pyaud = df_all.loc[df_all['Pipeline'].eq('pyAudioAnalysis')]
df_pyaud

In [None]:
# Box plots of every metric per data set for pyAudioAnalysis, with one facet row
# per metric and independent y-axes. The fill is mapped to the single pipeline
# value and the legend is hidden.
plot = (
    p9.ggplot(
        df_pyaud,
        p9.aes(x='DataSet', y='Value', fill='Pipeline'),
    )
    + p9.theme_bw()
    + p9.geom_boxplot()
    + p9.xlab('Data Set')
    + p9.ylab('Value (%)')
    + p9.facet_wrap(facets='~Metric', ncol=1, scales='free_y')
    + p9.scale_fill_manual(values=['#ef8a62'])
    + p9.theme(
        legend_position='none',
        figure_size=(8, 10),
        legend_box_margin=-10,
        # NOTE(review): hjust=15 is carried over unchanged; confirm the intended justification.
        axis_text_x=p9.element_text(angle=30, hjust=15),
        text=p9.element_text(size=12, weight='bold'),
    )
)

# Plain string: the path has no placeholders, so no f-string is needed.
save_file = '\\research_data\\metrics\\pyaud.png'
plot.save(filename=save_file, dpi=300)

# Only a cell's last expression is rendered, so the figure goes last.
plot