# SVM Kernel Comparison

In [None]:
import os
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import pandas as pd
import re

In [None]:
def kernel_barplot(data_type: str, input_folder: str, output_folder: str, metric: str) -> None:
    """
    Barplot showing the mean performance of the different ls-gkm kernels.

    data_type: one of the following ["dnase-shuffle", "shuffle-dnase", "dnase", "shuffle"]
    input_folder: root folder containing one sub-folder per data_type
    output_folder: root folder under which the barplot is written, structured based on the data_type
    metric: one of the following ["auroc", "auprc", "f1score"]

    For every kernel the table
    "{input_folder}/{data_type}/svm-kernel-comparison/summary_table_full_{kernel}.tsv"
    is read. The mean and the standard deviation (rounded to 3 decimals) of its
    column for the selected metric are computed over the rows of the table and
    drawn as one annotated bar per kernel.

    The figure is saved as PNG and SVG inside the
    "{output_folder}/{data_type}/svm_kernel_comparison_barplot" folder (created
    when missing) and is then displayed.

    Raises ValueError when data_type or metric is not one of the accepted values.
    """

    possible_data_types = ["dnase-shuffle", "shuffle-dnase", "dnase", "shuffle"]
    if data_type not in possible_data_types:
        raise ValueError(f"data_type must be one of the following: {possible_data_types}")

    possible_metrics = ["auroc", "auprc", "f1score"]
    if metric not in possible_metrics:
        raise ValueError(f"metric must be one of the following: {possible_metrics}")

    # Bar colour of each kernel, looked up with the label parsed from the
    # column names of the summary tables (see below).
    color = {
        "est_lmer": "#191970ff",
        "gapped_kmer": "#800080ff",
        "gkm": "#666666ff",
        "wgkm": "#008000ff",
        "wgkmrbf": "#ffc0cbff",
        "gkmrbf": "#ffa500ff",
    }
    kernels = ["wgkm", "wgkmrbf", "est_lmer", "gkm", "gkmrbf", "gapped_kmer"]

    # Set the input and output folders
    data_folder = f"{input_folder}/{data_type}/svm-kernel-comparison"
    output_folder = f"{output_folder}/{data_type}/svm_kernel_comparison_barplot"

    # Create the output folder(s)
    os.makedirs(output_folder, exist_ok=True)

    # Load every summary table up front so that a missing file is reported
    # before any statistic is computed.
    tables = [
        pd.read_csv(f"{data_folder}/summary_table_full_{kernel}.tsv", sep="\t")
        for kernel in kernels
    ]

    # Column prefix used in the summary tables for the selected metric
    metric_col_name = "F1" if metric == "f1score" else metric.upper()

    # Compute, for each kernel, the mean and standard deviation of the metric
    metric_means = {}
    metric_stds = {}
    for table in tables:
        kept_cols = [col for col in table.columns if col.startswith((metric_col_name, "EXP"))]

        # The first retained column is skipped as the identifier column
        # (presumably "EXP..." -- TODO confirm against the table layout).
        value_cols = kept_cols[1:]

        # The bar label is whatever follows the third underscore of the column name.
        labels = [col.split("_", 3)[3] for col in value_cols]
        label = labels[0]

        # When several columns share the label of the first one, the last of
        # them provides the values (mapping semantics: later entries win).
        source_col = dict(zip(labels, value_cols))[label]
        metric_means[label] = table[source_col].mean().round(3)
        metric_stds[label] = table[source_col].std().round(3)

    # Plot
    fig, ax = plt.subplots(figsize=(20, 15))
    bar_labels = list(metric_means)
    bar_heights = [metric_means[label] for label in bar_labels]
    bar_colors = [color[label] for label in bar_labels]

    # zorder=3 keeps the bars above the grid lines (zorder=0)
    ax.bar(bar_labels, bar_heights, color=bar_colors, edgecolor="grey", zorder=3)

    # Stack "std / value / mean / value" just above every bar
    for label in bar_labels:
        bar_top = metric_means[label]
        annotations = (
            (0.075, "std"),
            (0.055, f"{metric_stds[label]:.3f}"),
            (0.03, "mean"),
            (0.01, f"{bar_top:.3f}"),
        )
        for offset, text in annotations:
            ax.text(label, bar_top + offset, text, horizontalalignment="center", fontsize=15)

    ax.tick_params(axis="y", labelsize=15)
    ax.tick_params(axis="x", labelsize=20)
    ax.grid(axis="y", zorder=0, alpha=0.3)

    ylabel = "F1-Score" if metric == "f1score" else metric.upper()
    ax.set_ylabel(ylabel, fontsize=30)
    ax.set_xlabel("Kernels", fontsize=30)
    ax.set_title(f"{data_type} {ylabel} Mean of Kernels", fontsize=30)

    # Upper limit above 1 leaves room for the annotations of the tallest bars
    ax.set_ylim([0, 1.1])

    for extension in ("png", "svg"):
        fig.savefig(f"{output_folder}/{data_type}_{ylabel}_Mean_Kernels.{extension}", dpi=300)
    plt.show()

In [None]:
# Placeholder invocation: replace the empty strings with the actual data type,
# folders and metric before running this cell.
kernel_barplot(
    data_type="",
    input_folder="",
    output_folder="",
    metric="",
)