In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import sqlite3
import os

In [3]:
# SQLite file that holds the experiment results (read by the fetch_* helpers).
DB_PATH = "./run.db"
# Directory that receives every generated PDF figure.
PLOT_PATH = "./plots/results"
# Metric columns that are averaged, compared and plotted throughout the notebook.
METRICS = ["accuracy", "f1_score", "g_mean", "roc_auc"]

In [4]:
# Make sure the figure output directory exists before anything is saved.
# exist_ok=True already makes this a no-op for an existing directory, so a
# separate os.path.exists() check is redundant; calling makedirs unconditionally
# also fails right away if PLOT_PATH is an existing regular file instead of
# deferring the error to the first savefig call.
os.makedirs(PLOT_PATH, exist_ok=True)

In [5]:
def fetch_data_without_dataset(db_path=None):
  """Load metric averages per (model, scaling, level) from the results database.

  The averaging is done by SQLite over every row of the ``result`` table that
  shares the same model, scaling and level.

  Args:
    db_path: Path of the SQLite file to read. When omitted, the module-level
      ``DB_PATH`` is used.

  Returns:
    DataFrame with columns ``model``, ``scaling``, ``level``, ``accuracy``,
    ``f1_score``, ``g_mean`` and ``roc_auc``. ``level`` is converted to ``int``
    from the second whitespace-separated token of the stored text (a value
    such as ``"level 3"`` becomes ``3``).
  """
  query = """
    select
      model,
      scaling,
      level,
      avg(accuracy) as accuracy,
      avg(f1_score) as f1_score,
      avg(g_mean) as g_mean,
      avg(roc_auc) as roc_auc
    from result
    group by
      model,
      scaling,
      level;
    """
  connection = sqlite3.connect(DB_PATH if db_path is None else db_path)
  try:
    result_df = pd.read_sql_query(query, connection)
  finally:
    # Always release the database handle, even when the query fails.
    connection.close()
  result_df['level'] = result_df['level'].str.split().str[1].astype(int)
  return result_df

In [6]:
def fetch_all_data(db_path=None):
  """Load metric averages per (model, scaling, level, dataset) from the database.

  The averaging is done by SQLite over every row of the ``result`` table that
  shares the same model, scaling, level and dataset.

  Args:
    db_path: Path of the SQLite file to read. When omitted, the module-level
      ``DB_PATH`` is used.

  Returns:
    DataFrame with columns ``model``, ``scaling``, ``level``, ``dataset``,
    ``accuracy``, ``f1_score``, ``g_mean`` and ``roc_auc``. ``level`` and
    ``dataset`` are converted to ``int`` from the second whitespace-separated
    token of the stored text (a value such as ``"dataset 7"`` becomes ``7``).
  """
  query = """
    select
      model,
      scaling,
      level,
      dataset,
      avg(accuracy) as accuracy,
      avg(f1_score) as f1_score,
      avg(g_mean) as g_mean,
      avg(roc_auc) as roc_auc
    from result
    group by
      model,
      scaling,
      level,
      dataset;
    """
  connection = sqlite3.connect(DB_PATH if db_path is None else db_path)
  try:
    result_df = pd.read_sql_query(query, connection)
  finally:
    # Always release the database handle, even when the query fails.
    connection.close()
  for column in ('level', 'dataset'):
    result_df[column] = result_df[column].str.split().str[1].astype(int)
  return result_df

In [7]:
def plot_metric_by_level():
  """Plot every metric against the level, one line chart per (scaling, model).

  The models are spread over up to three figures. Each figure is a grid with
  one row per scaling and one column per model and is saved as
  ``level_by_performance_<n>.pdf`` inside ``PLOT_PATH``.
  """
  result_df = fetch_data_without_dataset()
  models = result_df['model'].unique()
  # array_split keeps the remainder when the model count is not a multiple of
  # three (a floor-division slice would silently drop those models); empty
  # pages are skipped so fewer than three models still produce valid figures.
  model_pages = [page for page in np.array_split(models, 3) if len(page)]
  for page_number, curr_models in enumerate(model_pages, start=1):
    plot_data = result_df[result_df['model'].isin(curr_models)]
    long_df = plot_data.melt(id_vars=['model', 'scaling', 'level'])
    grid = sns.FacetGrid(long_df, row='scaling', col='model', margin_titles=True, sharex=False, sharey=False)
    grid.map_dataframe(sns.lineplot, 'level', 'value', hue='variable')
    grid.add_legend(fontsize=15)
    grid.set_titles(col_template='{col_name}', row_template="", size=15)
    grid.tick_params(axis='y', labelsize=12)
    grid.set_xlabels(size=12)
    # Label each row with the scaling the grid actually drew there, instead of
    # assuming the grid's row order matches the order in the full result table.
    for row_axes, scaling in zip(grid.axes, grid.row_names):
      row_axes[0].set_ylabel(f"{scaling}", size=15)
    grid.tight_layout()
    grid.savefig(f"{PLOT_PATH}/level_by_performance_{page_number}.pdf")

In [8]:
def show_average_diff_table(metric, abs=True, result_df=None):
  """Build a table of the mean difference between each scaling and 'original'.

  For every model and every scaling other than ``'original'``, the metric of
  the scaled run is compared with the metric of the ``'original'`` run for the
  same (level, dataset) pair, and the differences are averaged.

  Args:
    metric: Name of the metric column to compare (e.g. one of ``METRICS``).
    abs: When true, average the absolute differences; otherwise average the
      signed differences (scaled minus original).
    result_df: Optional pre-loaded frame with the columns returned by
      ``fetch_all_data``. When omitted, ``fetch_all_data()`` is called.

  Returns:
    DataFrame with one row per model and one column per non-original scaling.
    A cell is NaN when a model has no (level, dataset) pair in common between
    that scaling and ``'original'``.
  """
  if result_df is None:
    result_df = fetch_all_data()
  models = result_df['model'].unique()
  methods = [scaling for scaling in result_df['scaling'].unique() if scaling != 'original']
  pair_keys = ['level', 'dataset']
  table = {}
  for model in models:
    # Boolean masks instead of string-built queries: model/scaling names that
    # contain quotes cannot break the expression.
    model_df = result_df[result_df['model'] == model]
    baseline = model_df.loc[model_df['scaling'] == 'original', pair_keys + [metric]]
    row = []
    for method in methods:
      scaled = model_df.loc[model_df['scaling'] == method, pair_keys + [metric]]
      # Pair rows explicitly by (level, dataset) rather than relying on both
      # selections happening to come back in the same order and length.
      paired = scaled.merge(baseline, on=pair_keys, suffixes=('_scaled', '_original'))
      diff = (paired[f'{metric}_scaled'] - paired[f'{metric}_original']).to_numpy()
      if abs:
        diff = np.abs(diff)
      row.append(diff.mean() if diff.size else np.nan)
    table[model] = row
  return pd.DataFrame.from_dict(table, orient='index', columns=methods)


In [9]:
def plot_average_diff_by_dataset():
  """Plot, per dataset, the mean difference between each scaling and 'original'.

  One set of figures is produced for every metric in ``METRICS``. The models
  are spread over up to three figures per metric; each figure is a grid with
  one row per non-original scaling and one column per model, and every cell is
  a bar chart with one bar per dataset (the difference averaged over levels).
  Figures are saved as ``datasets_<metric>_<n>.pdf`` inside ``PLOT_PATH`` and
  closed afterwards.
  """
  result_df = fetch_all_data()
  models = result_df['model'].unique()
  methods = [method for method in result_df['scaling'].unique() if method != 'original']
  if not methods:
    # Nothing to compare against 'original'.
    return
  datasets = sorted(result_df['dataset'].unique())
  # Derive the page layout from the data instead of assuming exactly
  # 3 pages x 7 models x 5 scalings x 100 datasets.
  model_pages = [page for page in np.array_split(models, 3) if len(page)]
  for metric in METRICS:
    # One column per scaling, indexed by (model, dataset, level), so scaled and
    # original values are paired by label rather than by row position.
    paired = result_df.set_index(['model', 'dataset', 'level', 'scaling'])[metric].unstack('scaling')
    level_diffs = paired[methods].sub(paired['original'], axis=0)
    diffs = level_diffs.groupby(level=['model', 'dataset']).mean()
    for page_number, page_models in enumerate(model_pages, start=1):
      n_rows, n_cols = len(methods), len(page_models)
      # Same cell size as the original 5x7 grid on a 30x15 inch figure.
      fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 30 / 7, n_rows * 3), squeeze=False)
      for col, model in enumerate(page_models):
        model_diffs = diffs.xs(model, level='model').reindex(datasets)
        for row, method in enumerate(methods):
          ax = axes[row][col]
          sns.barplot(model_diffs[method].tolist(), ax=ax)
          ax.set_ylabel("")
          ax.set_xlabel("")
          ax.set_xticks([])
          ax.tick_params(axis='y', labelsize=12)
          ax.set(ylim=(-1, 1))
          if row == 0:
            ax.set_title(model, size=15)
          if col == 0:
            ax.set_ylabel(method, size=15)
      fig.tight_layout()
      fig.savefig(f"{PLOT_PATH}/datasets_{metric}_{page_number}.pdf")
      # The figure is already on disk; close it so the metric/page loop does
      # not keep a dozen large figures alive at once.
      plt.close(fig)

In [10]:
def plot_range():
  """Plot, per model and level, the spread of each metric across scalings.

  For every (model, level) pair the spread is the highest minus the lowest
  value of the metric among the scalings. The result is drawn as one bar chart
  per model (three charts per row) and saved as ``range.pdf`` in ``PLOT_PATH``.
  """
  result_df = fetch_data_without_dataset()
  # A single grouped aggregation replaces one string-built query per
  # (model, level) and covers whatever levels are present in the data.
  # sort=False keeps the models in their order of first appearance.
  grouped = result_df.groupby(['model', 'level'], sort=False)[METRICS]
  plot_df = (grouped.max() - grouped.min()).reset_index()
  grid = sns.catplot(plot_df.melt(id_vars=['level', 'model']), kind='bar', col='model', x='level', y='value', hue='variable', col_wrap=3, sharex=False, sharey=False)
  grid.set_titles(col_template='{col_name}', row_template='', size=15)
  grid.set(ylim=(0, 1.1))
  grid.set_xlabels(size=12)
  grid.tick_params(axis='both', labelsize=12)
  sns.move_legend(grid, "center right", fontsize=15, bbox_to_anchor=(1.05, 0.5))
  for ax in grid.axes.flat:
    ax.set_ylabel("")
  grid.tight_layout()
  grid.savefig(f"{PLOT_PATH}/range.pdf")

In [11]:
def plot_range_for_each_dataset():
  """Plot the per-level metric spread across scalings, per dataset and model.

  For every model one figure is produced with one bar chart per dataset (ten
  charts per row). Within a chart, each level shows, for every metric, the
  highest minus the lowest value among the scalings. Figures are saved as
  ``range_dataset/range_datasets_<model>.pdf`` inside ``PLOT_PATH`` and closed
  afterwards.
  """
  result_df = fetch_all_data()
  output_dir = f"{PLOT_PATH}/range_dataset"
  # savefig does not create missing directories, and nothing else in the
  # notebook creates this sub-directory.
  os.makedirs(output_dir, exist_ok=True)
  # sort=False keeps the models in their order of first appearance.
  for model, model_df in result_df.groupby('model', sort=False):
    # One grouped aggregation per model replaces a string-built query for
    # every (dataset, level) pair and covers whatever ids exist in the data.
    grouped = model_df.groupby(['dataset', 'level'])[METRICS]
    plot_df = (grouped.max() - grouped.min()).reset_index()
    grid = sns.catplot(plot_df.melt(id_vars=['level', 'dataset']), kind='bar', col='dataset', x='level', y='value', hue='variable', sharex=False, sharey=False, col_wrap=10, width=0.9)
    grid.set_titles(col_template='dataset {col_name}', row_template='', size=15)
    grid.set(ylim=(0, 1.1))
    grid.tick_params(axis='both', labelsize=12)
    sns.move_legend(grid, "center right", fontsize=20, title=None, bbox_to_anchor=(1.05, 0.5))
    for ax in grid.axes.flat:
      ax.set_xlabel("")
      ax.set_ylabel("")
    grid.tight_layout()
    grid.savefig(f"{output_dir}/range_datasets_{model}.pdf")
    # Each figure holds one subplot per dataset; close it once saved so the
    # per-model loop does not accumulate them in memory.
    plt.close(grid.figure)