In [None]:
import os
import re
import numpy as np
import pandas as pd
import plotnine as p9

In [None]:
# Run configuration: both values are interpolated into the input/output
# file and directory names used by the cells below.
APPROACH: str = 'Baseline'
VERSION: float = 3.1

In [None]:
def _caren_distribution_as_vector(string: str) -> np.ndarray:
  """Expand a serialized ``value/count`` distribution into a flat vector.

  A leading ``<letters>={`` prefix (plus one whitespace character) and the
  trailing whitespace + ``}`` are stripped; what remains is split on ``,``
  into ``value/count`` items, and every value is repeated ``count`` times.
  """
  body = re.sub(r'\s+\}', '', re.sub(r'[a-zA-Z]+\=\{\s', '', string))
  pairs = [item.split('/') for item in body.split(',')]
  values = [float(pair[0]) for pair in pairs]
  counts = [int(pair[1]) for pair in pairs]
  # np.repeat avoids building a ragged nested list, which np.array() rejects
  # on recent NumPy versions when the counts differ.
  return np.repeat(values, counts)


def _subgroup_label(conditions, n_subgroup: int, n_total: int) -> str:
  """Build the multi-line axis label of one subgroup (conditions + size)."""
  if isinstance(conditions, list):
    with_units = []
    for condition in conditions:
      # Substring tests, first match wins.
      # NOTE(review): "D" matches any condition containing a capital D --
      # confirm no other feature name contains that letter.
      if "OS" in condition:
        condition += ' %'
      elif "SNR" in condition:
        condition += ' dB'
      elif "D" in condition:
        condition += ' min'
      with_units.append(condition)
  else:
    with_units = [conditions]
  with_units.sort()
  share = np.round((n_subgroup / n_total) * 100, decimals = 3)
  return ',\n '.join(with_units) + f'\n({n_subgroup} instances : {share}%)'


def get_df_per_method(error_rules, metric):
  """Turn one rule table into a long-format frame with one row per value.

  The last row of ``error_rules`` is used as the whole-data distribution;
  every preceding row is a subgroup.

  Parameters
  ----------
  error_rules : pd.DataFrame
    Rule table with exactly 11 columns in the order assigned below (the
    last one being the method name). It is modified in place: columns are
    renamed, a ``Metric`` column is added, ``feature_conditions`` becomes a
    list of condition strings, ``values`` becomes a NumPy vector and the
    summary statistics are rounded to 3 decimals.
  metric : str
    Metric name; stored in the ``metric`` column and used in the name of
    the output directory that is created.

  Returns
  -------
  (pd.DataFrame, str)
    The long frame with columns ``feature_conditions``, ``values``,
    ``method`` and ``metric`` (whole-data rows first, then the subgroups in
    table order), and the label used for the whole-data rows.

  Raises
  ------
  ValueError
    If ``error_rules`` has no rows.
  """
  if error_rules.shape[0] == 0:
    raise ValueError('error_rules must contain at least the whole-data row')

  error_rules.columns = ['antecedent_support', 'p_value', 'kurtosis', 'skewness',
                        'mean', 'mode', 'median', 'standard_deviation', 'values',
                        'feature_conditions', 'Method']
  error_rules['Metric'] = metric

  error_rules['feature_conditions'] = (
    error_rules['feature_conditions'].astype(str).str.split(r'\s+&\s+')
  )
  error_rules['values'] = error_rules['values'].apply(_caren_distribution_as_vector)
  for statistic in ('standard_deviation', 'mean', 'mode', 'median'):
    error_rules[statistic] = error_rules[statistic].round(3)

  all_values = error_rules['values'].iloc[-1]
  whole_data_label = f'All data\n({all_values.shape[0]} : 100%)'
  method = error_rules['Method'].iloc[0]

  os.makedirs(f'./output/{APPROACH}_{metric}_{VERSION}', exist_ok=True)

  # Collect one frame per group and concatenate once instead of growing the
  # result inside the loop.
  frames = [pd.DataFrame({
    'feature_conditions' : [whole_data_label] * all_values.shape[0],
    'values' : all_values,
    'method' : method,
    'metric' : metric,
  })]
  subgroups = zip(error_rules['values'].iloc[:-1],
                  error_rules['feature_conditions'].iloc[:-1])
  for subgroup_values, subgroup_conditions in subgroups:
    label = _subgroup_label(subgroup_conditions,
                            subgroup_values.shape[0],
                            all_values.shape[0])
    frames.append(pd.DataFrame({
      'feature_conditions' : [label] * subgroup_values.shape[0],
      'values' : subgroup_values,
      'method' : method,
      'metric' : metric,
    }))

  return pd.concat(frames), whole_data_label

## DER (Diarization Error Rate)

In [None]:
# Load the DER rule table of each method, tag it with the method name and
# expand it into long format; the three results are stacked in `error_rules`.
# Paths are built with os.path.join so they resolve on every platform, not
# only where "\" is the path separator.
edr_dir = os.path.join('research_data', 'edr', 'edrs')

file_py_der = os.path.join(edr_dir, f'{APPROACH}_DER_{VERSION}_pyannote.csv')
error_rules_py_der = pd.read_csv(file_py_der, sep = ';')
error_rules_py_der['Method'] = 'pyannote 3.1'
error_rules_py_der_2, label_py_der = get_df_per_method(error_rules_py_der, "DER")

file_nemo_der_clust = os.path.join(edr_dir, f'{APPROACH}_DER_NeMo_Clustering.csv')
error_rules_nemo_der_clust = pd.read_csv(file_nemo_der_clust, sep = ';')
error_rules_nemo_der_clust['Method'] = 'NeMo Clustering'
error_rules_nemo_der_2_clust, label_nemo_der_clust = get_df_per_method(error_rules_nemo_der_clust, 'DER')

file_nemo_der_joint = os.path.join(edr_dir, f'{APPROACH}_DER_NeMo_Joint.csv')
error_rules_nemo_der_joint = pd.read_csv(file_nemo_der_joint, sep = ';')
error_rules_nemo_der_joint['Method'] = 'NeMo Joint'
error_rules_nemo_der_2_joint, label_nemo_der_joint = get_df_per_method(error_rules_nemo_der_joint, 'DER')

error_rules = pd.concat([error_rules_py_der_2, error_rules_nemo_der_2_clust, error_rules_nemo_der_2_joint])

In [None]:

# Order the axis categories: subgroup labels in descending string order,
# followed by the whole-data label(s).
# Whole-data labels are excluded by value rather than by position, so the
# cell also works when the methods report different totals (which would
# otherwise yield duplicate categories), and it can be re-run after the
# column has already been converted to a categorical.
whole_data_labels = list(dict.fromkeys(
  [label_py_der, label_nemo_der_clust, label_nemo_der_joint]
))
observed_labels = error_rules['feature_conditions'].astype(str).unique()
subgroup_labels = sorted(
  (label for label in observed_labels if label not in whole_data_labels),
  reverse = True,
)
labels = np.array(subgroup_labels + whole_data_labels, dtype = object)
error_rules['feature_conditions'] = pd.Categorical(
  error_rules['feature_conditions'].astype(str), labels
)


In [None]:
# Show the distinct feature-condition labels present in the stacked frame.
feature_condition_column = error_rules['feature_conditions']
feature_condition_column.unique()

In [None]:
# Summary statistics of the pyannote values on the whole-data rows.
is_pyannote = error_rules['method'] == 'pyannote 3.1'
is_all_data = error_rules['feature_conditions'] == 'All data\n(1015 : 100%)'
error_rules.loc[is_pyannote & is_all_data, 'values'].describe()

In [None]:
# Print the distinct labels that occur for the NeMo Clustering rows.
is_nemo_clustering = error_rules['method'] == 'NeMo Clustering'
print(error_rules.loc[is_nemo_clustering, 'feature_conditions'].unique())

In [None]:
# Summary statistics of the NeMo Clustering values for one subgroup label.
is_nemo_clustering = error_rules['method'] == 'NeMo Clustering'
is_spk_1_2 = error_rules['feature_conditions'] == 'Spk=[1 - 2]\n(389 instances : 38.325%)'
error_rules.loc[is_nemo_clustering & is_spk_1_2, 'values'].describe()

In [None]:
# Scratch arithmetic on two hand-entered numbers.
# NOTE(review): presumably copied from the describe() outputs above -- confirm
# which statistics these are and compute the difference from the data instead.
20.737000 - 20.508000

In [None]:
# Scratch arithmetic on two hand-entered numbers.
# NOTE(review): presumably copied from the describe() outputs above -- confirm
# which statistics these are and compute the difference from the data instead.
41.905000 -  34.430000

In [None]:
# Box plots of the values per feature-condition label, one box per method,
# drawn horizontally (coord_flip) and faceted by metric.
plot = (
  p9.ggplot(
    error_rules,
    p9.aes(x = 'feature_conditions', y = 'values', fill = 'method'),
  )
  + p9.theme_bw()
  + p9.geom_boxplot(position = p9.position_dodge(-0.85))
  + p9.scale_fill_manual(values = ['#ef8a62', '#f7f7f7', '#67a9cf'])
  + p9.facet_wrap(facets = '~metric', ncol = 1)
  + p9.xlab('Feature Conditions')
  + p9.ylab('Value (%)')
  + p9.coord_flip()
  + p9.theme(
    legend_position = 'top',
    figure_size = (8, 14),
    legend_title = p9.element_blank(),
    text = p9.element_text(size = 17, weight = 'bold'),
    legend_direction = "vertical",
  )
)

# Build the path portably and make sure the target directory exists before
# saving; nothing else in this notebook creates it.
save_file = os.path.join(
  'research_data', 'edr', 'output', f'{APPROACH}_all_{VERSION}',
  'all_der_subgroups_rq3_paper_new.png',
)
os.makedirs(os.path.dirname(save_file), exist_ok = True)
plot.save(filename = save_file, dpi = 300)

# Last expression of the cell, so the figure is rendered in the notebook.
plot

## JER (Jaccard Error Rate)

In [None]:
# Load the JER rule table of each method, tag it with the method name and
# expand it into long format; the three results are stacked in `error_rules`.
# Paths are built with os.path.join so they resolve on every platform, not
# only where "\" is the path separator.
# NOTE(review): the NeMo file names use "nemo_" here but "NeMo_" in the DER
# cell -- confirm the casing matches the files on disk (it matters on
# case-sensitive file systems).
edr_dir = os.path.join('research_data', 'edr', 'edrs')

file_py_jer = os.path.join(edr_dir, f'{APPROACH}_JER_{VERSION}_pyannote.csv')
error_rules_py_jer = pd.read_csv(file_py_jer, sep = ';')
error_rules_py_jer['Method'] = 'pyannote 3.1'
error_rules_py_jer_2, label_py_jer = get_df_per_method(error_rules_py_jer, "JER")

file_nemo_jer_clust = os.path.join(edr_dir, f'{APPROACH}_JER_nemo_Clustering.csv')
error_rules_nemo_jer_clust = pd.read_csv(file_nemo_jer_clust, sep = ';')
error_rules_nemo_jer_clust['Method'] = 'NeMo Clustering'
error_rules_nemo_jer_2_clust, label_nemo_jer_clust = get_df_per_method(error_rules_nemo_jer_clust, 'JER')

file_nemo_jer_joint = os.path.join(edr_dir, f'{APPROACH}_JER_nemo_Joint.csv')
error_rules_nemo_jer_joint = pd.read_csv(file_nemo_jer_joint, sep = ';')
error_rules_nemo_jer_joint['Method'] = 'NeMo Joint'
error_rules_nemo_jer_2_joint, label_nemo_jer_joint = get_df_per_method(error_rules_nemo_jer_joint, 'JER')

error_rules = pd.concat([error_rules_py_jer_2, error_rules_nemo_jer_2_clust, error_rules_nemo_jer_2_joint])

In [None]:

# Order the axis categories: subgroup labels in descending string order,
# followed by the whole-data label(s).
# Whole-data labels are excluded by value rather than by position, so the
# cell also works when the methods report different totals (which would
# otherwise yield duplicate categories), and it can be re-run after the
# column has already been converted to a categorical.
whole_data_labels = list(dict.fromkeys(
  [label_py_jer, label_nemo_jer_clust, label_nemo_jer_joint]
))
observed_labels = error_rules['feature_conditions'].astype(str).unique()
subgroup_labels = sorted(
  (label for label in observed_labels if label not in whole_data_labels),
  reverse = True,
)
labels = np.array(subgroup_labels + whole_data_labels, dtype = object)
error_rules['feature_conditions'] = pd.Categorical(
  error_rules['feature_conditions'].astype(str), labels
)


In [None]:
# Display the stacked JER frame (pandas truncates the rendering of long frames).
error_rules

In [None]:
# Show the distinct labels that occur for the pyannote rows.
is_pyannote = error_rules['method'] == 'pyannote 3.1'
error_rules.loc[is_pyannote, 'feature_conditions'].unique()

In [None]:
# Print the median value of two speaker-count subgroups for two methods,
# in the order: NeMo Joint (3 spk, 2 spk), then pyannote 3.1 (3 spk, 2 spk).
spk_subgroup_labels = {
  '3 spk': 'Spk=(2 - 3]\n(181 instances : 17.833%)',
  '2 spk': 'Spk=[1 - 2]\n(389 instances : 38.325%)',
}
for method_name in ('NeMo Joint', 'pyannote 3.1'):
  for spk_tag, subgroup_label in spk_subgroup_labels.items():
    print(f'{method_name} - {spk_tag}')
    selected = (error_rules['method'] == method_name) & \
               (error_rules['feature_conditions'] == subgroup_label)
    print(error_rules.loc[selected, 'values'].median())

In [None]:
# Box plots of the values per feature-condition label, one box per method,
# drawn horizontally (coord_flip) and faceted by metric.
plot = (
  p9.ggplot(
    error_rules,
    p9.aes(x = 'feature_conditions', y = 'values', fill = 'method'),
  )
  + p9.theme_bw()
  + p9.geom_boxplot(position = p9.position_dodge(-0.85))
  + p9.scale_fill_manual(values = ['#ef8a62', '#f7f7f7', '#67a9cf'])
  + p9.facet_wrap(facets = '~metric', ncol = 1)
  + p9.xlab('Feature Conditions')
  + p9.ylab('Value (%)')
  + p9.coord_flip()
  + p9.theme(
    legend_position = 'top',
    figure_size = (8, 14),
    legend_title = p9.element_blank(),
    text = p9.element_text(size = 17, weight = 'bold'),
    legend_direction = "vertical",
  )
)

# Build the path portably and make sure the target directory exists before
# saving; nothing else in this notebook creates it.
save_file = os.path.join(
  'research_data', 'edr', 'output', f'{APPROACH}_all_{VERSION}',
  'all_jer_subgroups_rq3_paper_new.png',
)
os.makedirs(os.path.dirname(save_file), exist_ok = True)
plot.save(filename = save_file, dpi = 300)

# Last expression of the cell, so the figure is rendered in the notebook.
plot