In [None]:
import os
import re
import numpy as np
import pandas as pd
import plotnine as p9

In [None]:
# Experiment identifier; interpolated into the input CSV names and output directory names.
APPROACH = 'Baseline'
# Run version; stored as a float and rendered as "3.1" when formatted into output paths.
VERSION = 3.1

In [None]:
def _caren_distribution_as_vector(string: str) -> np.ndarray:
  """Expand a ``name={ v1/c1,v2/c2,... }`` distribution string into a flat 1-D vector.

  Each ``value/count`` pair contributes ``count`` copies of ``float(value)``.
  """
  stripped = re.sub(r'\s+\}', '', re.sub(r'[a-zA-Z]+\=\{\s', '', string))
  pairs = [item.split('/') for item in stripped.split(',')]
  # Stack the per-pair lists directly: wrapping them in np.array first builds a
  # ragged array whenever the counts differ, which NumPy >= 1.24 rejects.
  return np.hstack([[float(pair[0])] * int(pair[1]) for pair in pairs])


def _conditions_with_units(feature_conditions) -> list:
  """Return a rule's conditions, unit-suffixed where a marker is recognised, sorted."""
  if type(feature_conditions) is not list:
    return [feature_conditions]
  labelled = []
  for condition in feature_conditions:
    # First matching marker wins. NOTE(review): "D" matches any condition that
    # contains a capital D, not only a dedicated duration feature -- confirm.
    if "OS" in condition:
      condition += ' %'
    elif "SNR" in condition:
      condition += ' dB'
    elif "D" in condition:
      condition += ' min'
    labelled.append(condition)
  labelled.sort()
  return labelled


def get_df_per_method(error_rules, metric):
  """Turn a table of distribution rules into one long frame with a row per value.

  Parameters
  ----------
  error_rules : pandas.DataFrame
    Rule table with exactly eleven columns, in the order they are renamed to
    below. The last row is treated as the whole-data distribution; every other
    row is a subgroup. The frame is copied, so the caller's object is left
    unchanged and the calling cell can be re-run.
  metric : str
    Metric name, written to the ``metric`` column and used in the name of the
    output directory that is created.

  Returns
  -------
  (pandas.DataFrame, str)
    A frame with columns ``feature_conditions`` (display label), ``values``,
    ``method`` and ``metric``, with the whole-data rows first followed by the
    subgroups in rule order, and the label used for the whole-data rows.

  Side effects
  ------------
  Creates ``./output/{APPROACH}_{metric}_{VERSION}`` if it does not exist.
  """
  # Copy first: renaming columns and parsing 'values' on the caller's frame
  # would make a second call with the same object fail.
  error_rules = error_rules.copy()
  error_rules.columns = ['antecedent_support', 'p_value', 'kurtosis', 'skewness',
                        'mean', 'mode', 'median', 'standard_deviation', 'values',
                        'feature_conditions', 'Method']
  error_rules['Metric'] = metric

  error_rules['feature_conditions'] = (
    error_rules['feature_conditions'].astype(str).str.split(r'\s+&\s+'))
  error_rules['values'] = error_rules['values'].apply(_caren_distribution_as_vector)
  # The summary-statistic columns are not part of the returned frame, so they
  # are left as read (the copy made above is discarded on return).

  all_values = error_rules.iloc[-1]['values']
  total = all_values.shape[0]
  whole_data_label = f'All data\n({total} : 100%)'
  method = error_rules.iloc[0]['Method']
  metric_value = error_rules.iloc[0]['Metric']

  os.makedirs(f'./output/{APPROACH}_{metric}_{VERSION}', exist_ok=True)

  # Collect the per-rule frames and concatenate once instead of growing the
  # result inside the loop.
  frames = [pd.DataFrame({
    'feature_conditions' : [whole_data_label] * total,
    'values' : all_values,
    'method' : method,
    'metric' : metric_value,
  })]

  for i in range(error_rules.shape[0] - 1):
    rule = error_rules.iloc[i]
    subgroup_values = rule['values']
    size = subgroup_values.shape[0]
    label = ',\n '.join(_conditions_with_units(rule['feature_conditions'])) +\
        f'\n({size} instances : {np.round((size / total)*100, decimals = 3)}%)'
    frames.append(pd.DataFrame({
      'feature_conditions' : [label] * size,
      'values' : subgroup_values,
      'method' : method,
      'metric' : metric_value,
    }))

  # No ignore_index: each piece keeps its own 0..n-1 index, as before.
  return pd.concat(frames), whole_data_label

pyaudio

In [None]:
# Semicolon-separated rule table for the DER metric; an empty 'Method' column is
# appended so the frame has the column count get_df_per_method expects.
file_pyaud_der = f'\\research_data\\edrs\\{APPROACH}_DER_pyaudio.csv'
error_rules_pyaud_der = pd.read_csv(file_pyaud_der, sep=';').assign(Method='')

In [None]:
error_rules_pyaud_der_2, label_pyaud_der = get_df_per_method(error_rules_pyaud_der, 'DER')
error_rules_der = error_rules_pyaud_der_2

# Category order for the label axis: subgroup labels in descending order, with
# the whole-data label (the first unique value, dropped by [1:]) appended last.
labels_der = np.append(
    np.sort(error_rules_der['feature_conditions'].unique()[1:])[::-1],
    label_pyaud_der,
)
error_rules_der['feature_conditions'] = pd.Categorical(
    error_rules_der['feature_conditions'], categories=labels_der
)

In [None]:
# Alias only: `error_rules` and `error_rules_der` name the same DataFrame object.
error_rules = error_rules_der

In [None]:
def colour_for_der_label(label):
    """Pick the hex code for a label: one value for the whole-data label, another otherwise."""
    if label.startswith('All data'):
        return '#ef8a62'
    return '#67a9cf'

# NOTE(review): the plots pass this column through scale_fill_manual, which
# assigns its own colour list to the column's levels, so the hex code stored
# here is not necessarily the colour that gets drawn -- confirm.
error_rules_der['Colour'] = error_rules_der['feature_conditions'].map(colour_for_der_label)
error_rules_der

In [None]:
# sort_values returns a new frame, so the result has to be bound. Without this
# the rows stay in rule order, and the re-categorisation below (order of first
# appearance) silently replaces the label order established earlier.
error_rules_der = error_rules_der.sort_values(by='feature_conditions').reset_index(drop = True)
# Keep the alias pointing at the frame that is actually plotted.
error_rules = error_rules_der
# Rebuild the categorical from the labels in their (now sorted) order of appearance.
error_rules_der['feature_conditions'] = pd.Categorical(error_rules_der.feature_conditions, categories=pd.unique(error_rules_der.feature_conditions))

In [None]:
# Make 'metric' an ordered categorical with the single level "DER" (it is the
# facet variable of the plots below).
error_rules_der['metric'] = pd.Categorical(
    error_rules_der['metric'],
    dtype=pd.CategoricalDtype(categories=["DER"], ordered=True),
)

In [None]:
# Inline preview: horizontal box plots of the DER values per feature-condition
# label (coord_flip puts the labels on the vertical axis), legend hidden.
(
    p9.ggplot(
        error_rules_der,
        p9.aes(x='feature_conditions', y='values', fill='Colour'),
    )
    + p9.theme_bw()
    + p9.geom_boxplot(position=p9.position_dodge(-0.85))
    + p9.scale_fill_manual(values=['#ef8a62', '#67a9cf'])
    + p9.facet_wrap(facets='~metric', ncol=1)
    + p9.xlab('Feature Conditions')
    + p9.ylab('Value (%)')
    + p9.coord_flip()
    + p9.theme(
        legend_position='none',
        figure_size=(6, 6),
        legend_title=p9.element_blank(),
        text=p9.element_text(size=12, weight='bold'),
    )
)

In [None]:

# Same figure as the preview cell, with a slightly larger font, written to disk.
plot = (
    p9.ggplot(
        error_rules_der,
        p9.aes(x='feature_conditions', y='values', fill='Colour'),
    )
    + p9.theme_bw()
    + p9.geom_boxplot(position=p9.position_dodge(-0.85))
    + p9.scale_fill_manual(values=['#ef8a62', '#67a9cf'])
    + p9.facet_wrap(facets='~metric', ncol=1)
    + p9.xlab('Feature Conditions')
    + p9.ylab('Value (%)')
    + p9.coord_flip()
    + p9.theme(
        legend_position='none',
        figure_size=(6, 6),
        legend_title=p9.element_blank(),
        text=p9.element_text(size=13, weight='bold'),
    )
)
save_file = f'\\research_data\\output/{APPROACH}_all_{VERSION}/all_subgroups_pyaudio_der_outliers.png'
# Nothing else in the notebook creates the "{APPROACH}_all_{VERSION}" directory,
# so make sure it exists before saving instead of failing on a fresh machine.
os.makedirs(os.path.dirname(save_file), exist_ok=True)
plot.save(filename = save_file, dpi = 300)

In [None]:
# Coverage counterpart of the DER preparation: load the rule table, expand it
# to long format, order the labels, colour them and type the facet column.
file_pyaud_cov = f'\\research_data\\edrs\\{APPROACH}_Coverage_pyaudio.csv'
error_rules_pyaud_cov = pd.read_csv(file_pyaud_cov,  sep = ';')
error_rules_pyaud_cov['Method'] = ''
error_rules_pyaud_cov_2, label_pyaud_cov = get_df_per_method(error_rules_pyaud_cov, 'Coverage')
error_rules_cov = error_rules_pyaud_cov_2

# Category order: subgroup labels in descending order, whole-data label last
# ([1:] drops the whole-data label, which is the first unique value).
labels_cov = error_rules_cov['feature_conditions'].unique()
labels_cov = labels_cov[1:]
labels_cov.sort()
labels_cov = labels_cov[::-1]
labels_cov = np.append(labels_cov, label_pyaud_cov)
error_rules_cov['feature_conditions'] = pd.Categorical(error_rules_cov['feature_conditions'], labels_cov)
# NOTE(review): the plots pass 'Colour' through scale_fill_manual, which assigns
# its own colour list to the column's levels, so the hex code stored here is
# not necessarily the colour that gets drawn -- confirm.
error_rules_cov['Colour'] = error_rules_cov['feature_conditions'].map(lambda x: '#ef8a62' if x.startswith('All data') else '#67a9cf')
# sort_values returns a new frame, so the result has to be bound. Without this
# the re-categorisation below (order of first appearance) silently replaces the
# label order established above.
error_rules_cov = error_rules_cov.sort_values(by='feature_conditions').reset_index(drop = True)
error_rules_cov['feature_conditions'] = pd.Categorical(error_rules_cov.feature_conditions, categories=pd.unique(error_rules_cov.feature_conditions))
error_rules_cov['metric'] = pd.Categorical(error_rules_cov['metric'], 
                             ordered=True,
                             categories=["Coverage"])
# Alias assigned last so it refers to the finished frame.
error_rules = error_rules_cov

In [None]:

# Inline preview: horizontal box plots of the Coverage values per
# feature-condition label, legend hidden.
(
    p9.ggplot(
        error_rules_cov,
        p9.aes(x='feature_conditions', y='values', fill='Colour'),
    )
    + p9.theme_bw()
    + p9.geom_boxplot(position=p9.position_dodge(-0.85))
    + p9.scale_fill_manual(values=['#ef8a62', '#67a9cf'])
    + p9.facet_wrap(facets='~metric', ncol=1)
    + p9.xlab('Feature Conditions')
    + p9.coord_flip()
    + p9.ylab('Value (%)')
    + p9.theme(
        legend_position='none',
        figure_size=(6, 6),
        legend_title=p9.element_blank(),
        text=p9.element_text(size=12, weight='bold'),
    )
)

In [None]:
# Same figure as the Coverage preview cell, with a slightly larger font, written to disk.
plot = (
    p9.ggplot(
        error_rules_cov,
        p9.aes(x='feature_conditions', y='values', fill='Colour'),
    )
    + p9.theme_bw()
    + p9.geom_boxplot(position=p9.position_dodge(-0.85))
    + p9.scale_fill_manual(values=['#ef8a62', '#67a9cf'])
    + p9.facet_wrap(facets='~metric', ncol=1)
    + p9.xlab('Feature Conditions')
    + p9.coord_flip()
    + p9.ylab('Value (%)')
    + p9.theme(
        legend_position='none',
        figure_size=(6, 6),
        legend_title=p9.element_blank(),
        text=p9.element_text(size=13, weight='bold'),
    )
)
save_file = f'\\research_data\\output/{APPROACH}_all_{VERSION}/all_subgroups_pyaudio_cov_outliers.png'
# Nothing else in the notebook creates the "{APPROACH}_all_{VERSION}" directory,
# so make sure it exists before saving instead of failing on a fresh machine.
os.makedirs(os.path.dirname(save_file), exist_ok=True)
plot.save(filename = save_file, dpi = 300)

In [None]:
# Stack the DER and Coverage long-format frames row-wise; each keeps its own
# row index (no ignore_index), so index values repeat in the result.
full_pyaudio_df = pd.concat([error_rules_der, error_rules_cov])
full_pyaudio_df

In [None]:
# Unordered categorical with "DER" listed before "Coverage"; 'metric' is the
# facet variable of the combined plot.
full_pyaudio_df['metric'] = pd.Categorical(
    full_pyaudio_df['metric'],
    dtype=pd.CategoricalDtype(categories=["DER", "Coverage"], ordered=False),
)

In [None]:
# Combined figure: one facet per metric, side by side, each with its own scales.
plot = (
    p9.ggplot(
        full_pyaudio_df,
        p9.aes(x='feature_conditions', y='values', fill='Colour'),
    )
    + p9.theme_bw()
    + p9.geom_boxplot()
    + p9.scale_fill_manual(values=['#ef8a62', '#67a9cf'])
    + p9.facet_wrap(facets='~metric', ncol=2, scales='free')
    + p9.xlab('Feature Conditions')
    + p9.ylab('Value (%)')
    + p9.coord_flip()
    + p9.theme(
        legend_position='none',
        figure_size=(16, 9),
        legend_title=p9.element_blank(),
        axis_text_x=p9.element_text(angle=40, hjust=3),
        text=p9.element_text(size=16, weight='bold'),
    )
)
plot
# Saving of the combined figure is currently disabled:
# save_file = f'\\research_data\\output/{APPROACH}_all_{VERSION}/all_subgroups_pyaudio_outliers.png'
# plot.save(filename = save_file, dpi = 300)

In [None]:


# Wide, un-faceted version of the Coverage box plot with rotated value-axis
# tick labels, written to disk.
plot = (
    p9.ggplot(
        error_rules_cov,
        p9.aes(x='feature_conditions', y='values', fill='Colour'),
    )
    + p9.theme_bw()
    + p9.geom_boxplot(position=p9.position_dodge(-0.85))
    + p9.scale_fill_manual(values=['#ef8a62', '#67a9cf'])
    + p9.xlab('Feature Conditions')
    + p9.ylab('Value (%)')
    + p9.coord_flip()
    + p9.theme(
        legend_position='none',
        figure_size=(10, 6),
        legend_title=p9.element_blank(),
        axis_text_x=p9.element_text(angle=35, hjust=3),
        text=p9.element_text(size=12, weight='bold'),
    )
)
save_file = f'\\research_data\\output/{APPROACH}_all_{VERSION}/all_subgroups_pyaudio_coverage_outliers.png'
# Nothing else in the notebook creates the "{APPROACH}_all_{VERSION}" directory,
# so make sure it exists before saving instead of failing on a fresh machine.
os.makedirs(os.path.dirname(save_file), exist_ok=True)
plot.save(filename = save_file, dpi = 300)