In [1]:
import ast
import os
import re

import numpy as np
import pandas as pd
import seaborn as sns

In [232]:
# Load the results table (first CSV column is the index) and keep only the
# model label and its raw hyper-parameter string.
df = pd.read_csv(
    "mejores_modelos/resultados_definitivos.csv", index_col=0
).loc[:, ["Modelo", "Hiper parámetros"]]

In [233]:
def args_string_to_dict(arguments):
    """Parse the textual form of a hyper-parameter dict back into a dict.

    The input is expected to look like
    ``"{'FeaExt__max_df': 1.0, 'Pred__activation': 'tanh'}"``: a
    brace-wrapped list of ``'key': value`` pairs separated by ``", '"``.

    Parameters
    ----------
    arguments : str
        Text of the dict, including the surrounding braces.

    Returns
    -------
    dict
        Keys (with every single quote removed) mapped to decoded values.
        ``None``/``True``/``False`` become the Python constants, any other
        value that starts with an ASCII letter is kept as a plain string,
        and everything else (numbers, tuples, nested dicts, ...) is parsed
        as a Python literal. An empty dict text (``"{}"``) yields ``{}``.

    Raises
    ------
    ValueError
        If an item has no ``": "`` separator, or a non-letter value is not
        a valid Python literal.
    SyntaxError
        If a non-letter value cannot be parsed at all.
    """
    keywords = {"None": None, "True": True, "False": False}

    def eval_obj(string):
        # Constants first, so that "None" is not mistaken for a word.
        if string in keywords:
            return keywords[string]
        # Anything else starting with a letter is a bare string value
        # (its quotes were stripped before this point).
        if re.match("[a-zA-Z]+", string):
            return string
        # literal_eval only accepts literals, unlike eval, so text coming
        # from the CSV can never execute arbitrary code.
        return ast.literal_eval(string)

    body = arguments[1:-1]  # drop the surrounding braces
    if not body.strip():
        return {}

    result = {}
    for item in body.split(", '"):
        item = item.replace("'", "")
        # Split on the first ": " only: values such as {0: 1, 1: 2} contain
        # the separator themselves.
        key, value = item.split(": ", 1)
        result[key] = eval_obj(value)

    return result

def separate_args(args_dict):
    """Split a parameter dict into feature-extraction and model arguments.

    Keys starting with ``"FeaExt"`` go into the first dict and keys starting
    with ``"Pred"`` into the second. In both, each key is shortened to the
    text after its last ``"__"``. Keys with any other prefix are dropped.

    Returns
    -------
    tuple of (dict, dict)
        ``(fe_args, model_args)``.
    """
    fe_args = {}
    model_args = {}

    for full_name, value in args_dict.items():
        short_name = full_name.split("__")[-1]
        if full_name.startswith("FeaExt"):
            fe_args[short_name] = value
        elif full_name.startswith("Pred"):
            model_args[short_name] = value

    return fe_args, model_args

In [234]:
def limpieza(string):
    """Rewrite a raw model label into its display form.

    Applies a fixed list of substring substitutions, in order: balancing and
    dataset suffixes first, then classifier names, then feature-extractor
    names. Returns the rewritten string.
    """
    substitutions = (
        # balancing / dataset markers
        ("- not balanced", ""),
        ("- balanced", " (PC)"),
        (" - cargos limpio", ""),
        ("oversampling", "SM"),
        # classifiers
        ("mlp", "MLP"),
        ("rf", "RF"),
        ("svm", "SVM"),
        # feature extraction
        ("ngramas", "N-gramas"),
        ("tf-idf", "TF-IDF"),
    )

    for old, new in substitutions:
        string = string.replace(old, new)

    return string

# Replace each raw model label with its display form, then show the column.
df["Modelo"] = df["Modelo"].map(limpieza)
df["Modelo"]

19         N-gramas + MLP 
0      N-gramas + RF  (PC)
8           N-gramas + RF 
33         N-gramas + SVM 
35    N-gramas + SVM  (PC)
19    SM + N-gramas + MLP 
32     SM + N-gramas + RF 
17    SM + N-gramas + SVM 
1            TF-IDF + MLP 
0        TF-IDF + RF  (PC)
25            TF-IDF + RF 
8            TF-IDF + SVM 
30      TF-IDF + SVM  (PC)
Name: Modelo, dtype: object

In [235]:
"""
default_mlp_args = {
    'Pred__random_state': [0],
    'Pred__max_iter': [200],
    'Pred__tol': [0.001],
    'Pred__learning_rate_init': [0.001],
}

default_rf_args = {
  'Pred__random_state': [0], 
  'Pred__criterion': ["entropy",], 
  'Pred__n_estimators': [100],
}

default_svm_args ={
    'Pred__random_state': [0], 
}
"""

def filter_default_arguments(args_dic):
    """Drop fixed hyper-parameters and fold ``max_samples`` into ``bootstrap``.

    Parameters
    ----------
    args_dic : dict
        Hyper-parameters keyed by their full name (e.g. ``"Pred__tol"``).
        The input dict is not modified.

    Returns
    -------
    dict
        A new dict without any key that contains one of the invariable
        parameter names as a substring. If ``"Pred__bootstrap"`` is present,
        ``"Pred__max_samples"`` is always removed; when bootstrap is truthy
        and ``max_samples`` was present, the bootstrap value becomes the
        string ``"<bootstrap> (<max_samples>)"``.
    """
    invariable_args = ("random_state", "max_iter", "tol", "learning_rate_init",
                       "criterion", "n_estimators", "class_weight")

    # Substring match on purpose: keys carry a step prefix such as "Pred__".
    new_dic = {
        k: v
        for k, v in args_dic.items()
        if not any(name in k for name in invariable_args)
    }

    if "Pred__bootstrap" in new_dic:
        # max_samples may be absent; pop with a default instead of del so a
        # missing key no longer raises KeyError. Presence is recorded first
        # so that an explicit None is still rendered as "(None)".
        has_max_samples = "Pred__max_samples" in new_dic
        max_samples = new_dic.pop("Pred__max_samples", None)
        if new_dic["Pred__bootstrap"] and has_max_samples:
            new_dic["Pred__bootstrap"] = (
                str(new_dic["Pred__bootstrap"]) + " (" + str(max_samples) + ")"
            )

    return new_dic

    



def clean_param_args(params):
    """Return a copy of ``params`` keyed by the text after each key's last ``"__"``.

    If two keys shorten to the same name, the value of the later one wins.
    """
    result = {}
    for full_name, value in params.items():
        short_name = full_name.split("__")[-1]
        result[short_name] = value
    return result

In [236]:
# English hyper-parameter name -> Spanish label used in the report table.
hp_eng_to_spanish = dict(
    # feature extraction
    max_df="freq_max",
    min_df="freq_min",
    ngram_range="rango_ngramas",
    # random forest
    min_samples_leaf="min_obvs_hoja",
    max_samples="prop_bootstrap",
    # multilayer perceptron
    activation="fun_act",
    hidden_layer_sizes="arq_red",
    batch_size="tam_lote",
    # class balancing
    class_weight="pesos_de_clase",
)


def process_hp_names(string):
    """Replace every English hyper-parameter name in ``string`` with its Spanish label.

    Substitutions are plain substring replacements applied one after another
    in the order of ``hp_eng_to_spanish``.
    """
    for english_name in hp_eng_to_spanish:
        string = string.replace(english_name, hp_eng_to_spanish[english_name])
    return string

In [260]:
# Build the display text of the tuned hyper-parameters from the raw string:
# parse -> drop fixed arguments -> shorten keys -> render as text ->
# translate names -> strip quotes -> mark a line break after the first
# "), " -> strip the outer braces.
df["Hiperparámetros óptimos"] = (
    df["Hiper parámetros"]
    .apply(args_string_to_dict)
    .apply(filter_default_arguments)
    .apply(clean_param_args)
    .apply(str)
    .apply(process_hp_names)
    .str.replace("'", "")
    .str.replace("), ", "), newline ", regex=False, n=1)
    .apply(lambda x: x[1:-1])
)

# Spot-check one formatted entry.
df.iloc[6, 2]

'freq_max: 1.0, freq_min: 0.0, rango_ngramas: (1, 2), newline bootstrap: True (0.95), min_obvs_hoja: 1'

In [264]:
# Render the model / hyper-parameter table as LaTeX and adapt the markup:
# a rule after every row, no booktabs rules, fixed-width paragraph columns,
# and the "newline" marker turned into a real \newline command.
temp_df = df[["Modelo", "Hiperparámetros óptimos"]]

with pd.option_context("max_colwidth", 1000):
    string = temp_df.to_latex(index=False)

    # Applied in order; each pair is (text to find, replacement).
    for old, new in (
        (r"\\", r"\\ \hline"),
        (r"\toprule", ""),
        (r"\midrule", ""),
        (r"\bottomrule", ""),
        (r"\begin{tabular}{ll}",
         r"\begin{tabular}{|p{0.2\textwidth}|p{0.80\textwidth}|} \hline"),
        (r"newline", r"\newline"),
    ):
        string = string.replace(old, new)

    print(string)

\begin{tabular}{|p{0.2\textwidth}|p{0.80\textwidth}|} \hline

              Modelo &                                                                                      Hiperparámetros óptimos \\ \hline

     N-gramas + MLP  & freq\_max: 1.0, freq\_min: 0.0, rango\_ngramas: (1, 2), \newline fun\_act: tanh, tam\_lote: auto, arq\_red: (10, 5) \\ \hline
 N-gramas + RF  (PC) &        freq\_max: 1.0, freq\_min: 0.0, rango\_ngramas: (1, 1), \newline bootstrap: True (0.95), min\_obvs\_hoja: 1 \\ \hline
      N-gramas + RF  &        freq\_max: 1.0, freq\_min: 0.0, rango\_ngramas: (1, 1), \newline bootstrap: True (0.95), min\_obvs\_hoja: 1 \\ \hline
     N-gramas + SVM  &                              freq\_max: 1.0, freq\_min: 0.0, rango\_ngramas: (1, 2), \newline C: 10, kernel: rbf \\ \hline
N-gramas + SVM  (PC) &                              freq\_max: 1.0, freq\_min: 0.0, rango\_ngramas: (1, 2), \newline C: 10, kernel: rbf \\ \hline
SM + N-gramas + MLP  & freq\_max: 1.0, freq\_min: 0.0, ran