In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import re

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from imblearn.pipeline import Pipeline as PipelineIL
from imblearn.over_sampling import RandomOverSampler

from sklearn.pipeline import Pipeline

In [3]:
# leer todos los archivos de resultados

df_resultados = pd.DataFrame()
for file in os.listdir("resultados"):
    temp_df = pd.read_csv(f"resultados/{file}",  index_col=0)
    df_resultados = df_resultados.append(temp_df)

df_resultados

Unnamed: 0,params,model_name,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_test_f1,std_test_f1,mean_test_precision,std_test_precision,mean_test_recall,std_test_recall
0,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + mlp - cargos limpio,0.997792,0.003122,0.735556,0.038158,0.824219,0.026801,0.757728,0.016955,0.903846,0.041543
1,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + mlp - cargos limpio,0.997792,0.003122,0.740117,0.021980,0.829625,0.017618,0.754142,0.012020,0.923077,0.041543
2,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + mlp - cargos limpio,0.997792,0.003122,0.726667,0.039377,0.821469,0.026650,0.744681,0.018563,0.916667,0.045327
3,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + mlp - cargos limpio,0.995585,0.003122,0.744444,0.033174,0.831855,0.025019,0.757576,0.010714,0.923077,0.047106
4,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + mlp - cargos limpio,0.997792,0.003122,0.735497,0.047987,0.826262,0.032197,0.752564,0.025188,0.916667,0.047970
...,...,...,...,...,...,...,...,...,...,...,...,...
139,"{'FeaExt__max_df': 0.9, 'FeaExt__min_df': 0.1,...",tf-idf + svm - cargos limpio,0.964767,0.006183,0.678596,0.029277,0.773772,0.023808,0.748271,0.017093,0.801282,0.032686
140,"{'FeaExt__max_df': 0.9, 'FeaExt__min_df': 0.1,...",tf-idf + svm - cargos limpio,0.867797,0.019749,0.678713,0.052636,0.769045,0.041186,0.758025,0.035068,0.782051,0.059446
141,"{'FeaExt__max_df': 0.9, 'FeaExt__min_df': 0.1,...",tf-idf + svm - cargos limpio,1.000000,0.000000,0.735848,0.037735,0.822994,0.029522,0.760704,0.019200,0.897436,0.050474
142,"{'FeaExt__max_df': 0.9, 'FeaExt__min_df': 0.1,...",tf-idf + svm - cargos limpio,0.845794,0.022642,0.581988,0.079462,0.659983,0.073804,0.740616,0.058029,0.596154,0.083086


In [4]:
def is_balanced(string):
    result = "'Pred__class_weight': 'balanced'" in string
    if result:
        return "balanced"
    else:
        return "not balanced"

In [5]:
df_resultados["model_name"] = df_resultados["model_name"] + " - " + df_resultados["params"].apply(is_balanced)

In [6]:
df_resultados

Unnamed: 0,params,model_name,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_test_f1,std_test_f1,mean_test_precision,std_test_precision,mean_test_recall,std_test_recall
0,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + mlp - cargos limpio - not balanced,0.997792,0.003122,0.735556,0.038158,0.824219,0.026801,0.757728,0.016955,0.903846,0.041543
1,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + mlp - cargos limpio - not balanced,0.997792,0.003122,0.740117,0.021980,0.829625,0.017618,0.754142,0.012020,0.923077,0.041543
2,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + mlp - cargos limpio - not balanced,0.997792,0.003122,0.726667,0.039377,0.821469,0.026650,0.744681,0.018563,0.916667,0.045327
3,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + mlp - cargos limpio - not balanced,0.995585,0.003122,0.744444,0.033174,0.831855,0.025019,0.757576,0.010714,0.923077,0.047106
4,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + mlp - cargos limpio - not balanced,0.997792,0.003122,0.735497,0.047987,0.826262,0.032197,0.752564,0.025188,0.916667,0.047970
...,...,...,...,...,...,...,...,...,...,...,...,...
139,"{'FeaExt__max_df': 0.9, 'FeaExt__min_df': 0.1,...",tf-idf + svm - cargos limpio - balanced,0.964767,0.006183,0.678596,0.029277,0.773772,0.023808,0.748271,0.017093,0.801282,0.032686
140,"{'FeaExt__max_df': 0.9, 'FeaExt__min_df': 0.1,...",tf-idf + svm - cargos limpio - not balanced,0.867797,0.019749,0.678713,0.052636,0.769045,0.041186,0.758025,0.035068,0.782051,0.059446
141,"{'FeaExt__max_df': 0.9, 'FeaExt__min_df': 0.1,...",tf-idf + svm - cargos limpio - not balanced,1.000000,0.000000,0.735848,0.037735,0.822994,0.029522,0.760704,0.019200,0.897436,0.050474
142,"{'FeaExt__max_df': 0.9, 'FeaExt__min_df': 0.1,...",tf-idf + svm - cargos limpio - balanced,0.845794,0.022642,0.581988,0.079462,0.659983,0.073804,0.740616,0.058029,0.596154,0.083086


In [7]:
# para cada modelo
# identificar el mayor cv_f1
# identificar los hiperparámetros que generaron ese score

top_result_by_model = pd.DataFrame(columns=df_resultados.columns)

for model_name in df_resultados["model_name"].unique():
    model_results = df_resultados[df_resultados["model_name"] == model_name]
    max_test_f1 = model_results["mean_test_f1"].max()
    
    best_model_results = model_results[model_results["mean_test_f1"] == max_test_f1].head(1)
    top_result_by_model = top_result_by_model.append(best_model_results)


print(len(top_result_by_model))
top_result_by_model.sort_values(by="mean_test_accuracy", ascending=False)

13


Unnamed: 0,params,model_name,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_test_f1,std_test_f1,mean_test_precision,std_test_precision,mean_test_recall,std_test_recall
17,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",oversampling + ngramas + svm - cargos limpio -...,0.997792,0.003122,0.757661,0.007774,0.841321,0.007427,0.764895,0.01064,0.935897,0.032686
35,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + svm - cargos limpio - balanced,0.997792,0.003122,0.757602,0.017982,0.839286,0.01458,0.770398,0.009416,0.923077,0.041543
33,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + svm - cargos limpio - not balanced,0.997792,0.003122,0.753216,0.014059,0.837845,0.012047,0.76362,0.010109,0.929487,0.039515
1,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",tf-idf + mlp - cargos limpio - not balanced,1.0,0.0,0.744444,0.022878,0.837847,0.015576,0.742464,0.009096,0.961538,0.027196
19,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",oversampling + ngramas + mlp - cargos limpio -...,0.995585,0.003122,0.744386,0.041367,0.833889,0.028328,0.752497,0.019974,0.935897,0.04797
8,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",tf-idf + svm - cargos limpio - not balanced,1.0,0.0,0.740117,0.011699,0.828011,0.007603,0.759476,0.010132,0.910256,0.009065
19,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + mlp - cargos limpio - not balanced,0.995585,0.003122,0.74,0.01776,0.837351,0.011845,0.734257,0.006185,0.974359,0.023985
0,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",tf-idf + rf - cargos limpio - balanced,1.0,0.0,0.735673,0.02155,0.837974,0.01113,0.72471,0.016619,0.99359,0.009065
25,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",tf-idf + rf - cargos limpio - not balanced,0.993392,0.005407,0.731287,0.005563,0.831978,0.001916,0.729633,0.008006,0.967949,0.009065
30,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",tf-idf + svm - cargos limpio - balanced,1.0,0.0,0.73117,0.01814,0.831902,0.011137,0.72943,0.007594,0.967949,0.018131


In [8]:
# extraer características adicionales de los modelos

def get_model(string):
    string, _, _ = string.split(" - ")
    string = string.split(" + ")[-1]
    return string

top_result_by_model["oversampled"] = top_result_by_model["model_name"].str.contains("oversampling")
top_result_by_model["ngramas"] = top_result_by_model["model_name"].str.contains("ngramas")
top_result_by_model["class_weights"] = top_result_by_model["params"].str.contains("balanced")

top_result_by_model["modelo"] = top_result_by_model["model_name"].apply(get_model)

top_result_by_model

Unnamed: 0,params,model_name,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_test_f1,std_test_f1,mean_test_precision,std_test_precision,mean_test_recall,std_test_recall,oversampled,ngramas,class_weights,modelo
19,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + mlp - cargos limpio - not balanced,0.995585,0.003122,0.74,0.01776,0.837351,0.011845,0.734257,0.006185,0.974359,0.023985,False,True,False,mlp
0,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + rf - cargos limpio - balanced,1.0,0.0,0.722222,0.040726,0.828186,0.025238,0.720244,0.019322,0.974359,0.036262,False,True,True,rf
8,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + rf - cargos limpio - not balanced,1.0,0.0,0.713626,0.007285,0.827598,0.003113,0.705911,0.004538,1.0,0.0,False,True,False,rf
33,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + svm - cargos limpio - not balanced,0.997792,0.003122,0.753216,0.014059,0.837845,0.012047,0.76362,0.010109,0.929487,0.039515,False,True,False,svm
35,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + svm - cargos limpio - balanced,0.997792,0.003122,0.757602,0.017982,0.839286,0.01458,0.770398,0.009416,0.923077,0.041543,False,True,True,svm
19,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",oversampling + ngramas + mlp - cargos limpio -...,0.995585,0.003122,0.744386,0.041367,0.833889,0.028328,0.752497,0.019974,0.935897,0.04797,True,True,False,mlp
32,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",oversampling + ngramas + rf - cargos limpio - ...,1.0,0.0,0.726725,0.024215,0.827427,0.017691,0.730255,0.005212,0.955128,0.039515,True,True,False,rf
17,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",oversampling + ngramas + svm - cargos limpio -...,0.997792,0.003122,0.757661,0.007774,0.841321,0.007427,0.764895,0.01064,0.935897,0.032686,True,True,False,svm
1,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",tf-idf + mlp - cargos limpio - not balanced,1.0,0.0,0.744444,0.022878,0.837847,0.015576,0.742464,0.009096,0.961538,0.027196,False,False,False,mlp
0,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",tf-idf + rf - cargos limpio - balanced,1.0,0.0,0.735673,0.02155,0.837974,0.01113,0.72471,0.016619,0.99359,0.009065,False,False,True,rf


In [9]:
top_result_by_model.iloc[1, 0]

"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0, 'FeaExt__ngram_range': (1, 1), 'Pred__bootstrap': True, 'Pred__class_weight': 'balanced', 'Pred__criterion': 'entropy', 'Pred__max_samples': 0.95, 'Pred__min_samples_leaf': 1, 'Pred__n_estimators': 100, 'Pred__random_state': 0}"

In [10]:
# leer el conjunto de datos

data = pd.read_csv("res_completo.csv", index_col=0)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 284 entries, 0 to 283
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id_resol             284 non-null    object
 1   año                  284 non-null    int64 
 2   cargos               284 non-null    object
 3   códigos              283 non-null    object
 4   Empresa denunciada   284 non-null    object
 5   Multas               284 non-null    object
 6   Medidas correctivas  284 non-null    object
 7   dec_fav_denunciante  284 non-null    int64 
 8   cargos limpio        284 non-null    object
 9   cargos limpio stem   284 non-null    object
dtypes: int64(2), object(8)
memory usage: 24.4+ KB


In [11]:
# Separación del conjunto de datos en entrenamiento y test
# Separar el 20% para el test
# estratificar de acuerdo con dec_fav_denunciante

X_train, X_test, Y_train, Y_test = train_test_split(data.drop("dec_fav_denunciante", 
                                                            axis=1),
                                                    data["dec_fav_denunciante"],
                                                    test_size=0.2,
                                                    stratify=data["dec_fav_denunciante"],
                                                    random_state=0)

print("training set size: ", X_train.shape[0])
print("test set size: ", X_test.shape[0])

# Crear un generador de cross-validation: 3-fold
kf = StratifiedKFold(n_splits=3, random_state=1, shuffle=True);

training set size:  227
test set size:  57


In [12]:
def args_string_to_dict(arguments):
    arguments = arguments[1:-1]
    arguments = arguments.split(", '")
    arguments = [ x.replace("'", "") for x in arguments ]
    arguments = [ tuple(x.split(": ")) for x in arguments ]

    def eval_obj(string):
        if re.match("[a-zA-Z]+", string) and string != "None":
            if string == "None":
                return None
            elif string == "True":
                return True
            elif string == "False":
                return False
            else:
                return string
        else: 
            return eval(string)

    arguments = [ (x, eval_obj(y)) for x, y in arguments]
    arguments = {k:v for k,v in arguments}

    return arguments

In [13]:
def separate_args(args_dict):
    # extract FE and model arguments to pass to the 
    fe_args = {k: v for k,v in args_dict.items() if k.startswith("FeaExt")}
    fe_args = {k.split("__")[-1]: v for k,v in fe_args.items()}

    model_args = {k: v for k,v in args_dict.items() if k.startswith("Pred")}
    model_args = {k.split("__")[-1]: v for k,v in model_args.items()}
    
    return fe_args, model_args

In [14]:
def get_best_arg(arg_desc_string):
    #arg_desc_string = top_result_by_model[top_result_by_model["model_name"]==model].iloc[indx,0]
    dict_rgs = args_string_to_dict(arg_desc_string)
    fe_args, model_args = separate_args(dict_rgs)
    
    return fe_args, model_args

In [15]:
def create_pipeline(pipeline_descr, fe_args={}, pred_args={}):
    # dada la descripción de un modelo, reconstruir el pipeline

    model_desc, variable, _ = pipeline_descr.split(" - ")
    steps = model_desc.split(" + ")

    pipeline_steps = []

    if "ngramas" in steps:
        pipeline_steps.append(("FeaExt", CountVectorizer(**fe_args)))
    if "tf-idf" in steps:
        pipeline_steps.append(("FeaExt", TfidfVectorizer(**fe_args)))
    
    
    if "oversampling" in steps:
        pipeline_steps.append(("Oversampling", RandomOverSampler(random_state=0, sampling_strategy='not majority')))

    
    if "mlp" in steps:
        pipeline_steps.append(('Pred', MLPClassifier(**pred_args)))
    if "rf" in steps:
        pipeline_steps.append(('Pred', RandomForestClassifier(**pred_args)))
    if "svm" in steps:
        pipeline_steps.append(('Pred', SVC(**pred_args)))                  
    
    pipeline = PipelineIL(pipeline_steps)
    
    return pipeline

In [16]:
def evaluate_model_on_test(row, debug=False):
    # extraer las características del modelo
    model_name = row["model_name"]
    #print(f"model desc: {model_name}")
    
    # extraer la columna usada en la predicción
    _, columna, _ = row["model_name"].split(" - ")
    #print(f"input col: {columna}")

    # extraer los hiperparametros utilizados para entrenar el modelo
    fe_args, model_args = get_best_arg(row["params"])
    #print(fe_args)
    #print(model_args)

    # crear el pipeline
    pipeline = create_pipeline(model_name, fe_args, model_args)
    #print(pipeline)
    
    if debug:
        print(row["params"])
        print()
        print(model_name)
        print(columna)
        print(fe_args)
        print(model_args)
        print(pipeline)
        return None

    # entrenar el modelo
    X_train_temp = X_train.loc[:, [columna]].values.reshape((X_train.shape[0]))
    X_test_temp = X_test.loc[:, [columna]].values.reshape((X_test.shape[0]))
    pipeline.fit(X_train_temp, Y_train)
    
    # evaluar el modelo en test
    #print("test predictions")
    # keys: ['0', '1', 'accuracy', 'macro avg', 'weighted avg']    
    Y_test_pred = pipeline.predict(X_test_temp)
    
    acc_test_score = accuracy_score(y_true = Y_test, y_pred = Y_test_pred)
    f1_test_score = f1_score(y_true = Y_test, y_pred = Y_test_pred)
    prec_test_score = precision_score(y_true = Y_test, y_pred = Y_test_pred)
    rec_test_score = recall_score(y_true = Y_test, y_pred = Y_test_pred)


    return acc_test_score, f1_test_score, prec_test_score, rec_test_score

In [17]:
# testeo de la función

for indx, row in top_result_by_model.sample(1).iterrows():
    print(evaluate_model_on_test(row, debug=False))

(0.7368421052631579, 0.8314606741573033, 0.74, 0.9487179487179487)


In [18]:
# para cada row en top_result_by_model reconstruir el modelo, entrenarlo y evaluarlo en el test
top_result_by_model["test_metrics"] =  top_result_by_model.apply(evaluate_model_on_test, axis=1)

# extraer las métricas de evaluación en el conjunto de datos de test
top_result_by_model["test_accuracy"] = top_result_by_model["test_metrics"].apply(lambda x: x[0])
top_result_by_model["test_f1"] = top_result_by_model["test_metrics"].apply(lambda x: x[1])
top_result_by_model["test_precision"] = top_result_by_model["test_metrics"].apply(lambda x: x[2])
top_result_by_model["test_recall"] = top_result_by_model["test_metrics"].apply(lambda x: x[3])

top_result_by_model

Unnamed: 0,params,model_name,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_test_f1,std_test_f1,mean_test_precision,std_test_precision,...,std_test_recall,oversampled,ngramas,class_weights,modelo,test_metrics,test_accuracy,test_f1,test_precision,test_recall
19,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + mlp - cargos limpio - not balanced,0.995585,0.003122,0.74,0.01776,0.837351,0.011845,0.734257,0.006185,...,0.023985,False,True,False,mlp,"(0.7192982456140351, 0.813953488372093, 0.7446...",0.719298,0.813953,0.744681,0.897436
0,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + rf - cargos limpio - balanced,1.0,0.0,0.722222,0.040726,0.828186,0.025238,0.720244,0.019322,...,0.036262,False,True,True,rf,"(0.7543859649122807, 0.8409090909090908, 0.755...",0.754386,0.840909,0.755102,0.948718
8,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + rf - cargos limpio - not balanced,1.0,0.0,0.713626,0.007285,0.827598,0.003113,0.705911,0.004538,...,0.0,False,True,False,rf,"(0.7192982456140351, 0.8222222222222223, 0.725...",0.719298,0.822222,0.72549,0.948718
33,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + svm - cargos limpio - not balanced,0.997792,0.003122,0.753216,0.014059,0.837845,0.012047,0.76362,0.010109,...,0.039515,False,True,False,svm,"(0.7192982456140351, 0.8095238095238095, 0.755...",0.719298,0.809524,0.755556,0.871795
35,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + svm - cargos limpio - balanced,0.997792,0.003122,0.757602,0.017982,0.839286,0.01458,0.770398,0.009416,...,0.041543,False,True,True,svm,"(0.7017543859649122, 0.7951807228915662, 0.75,...",0.701754,0.795181,0.75,0.846154
19,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",oversampling + ngramas + mlp - cargos limpio -...,0.995585,0.003122,0.744386,0.041367,0.833889,0.028328,0.752497,0.019974,...,0.04797,True,True,False,mlp,"(0.7192982456140351, 0.7948717948717948, 0.794...",0.719298,0.794872,0.794872,0.794872
32,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",oversampling + ngramas + rf - cargos limpio - ...,1.0,0.0,0.726725,0.024215,0.827427,0.017691,0.730255,0.005212,...,0.039515,True,True,False,rf,"(0.7368421052631579, 0.8314606741573033, 0.74,...",0.736842,0.831461,0.74,0.948718
17,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",oversampling + ngramas + svm - cargos limpio -...,0.997792,0.003122,0.757661,0.007774,0.841321,0.007427,0.764895,0.01064,...,0.032686,True,True,False,svm,"(0.7192982456140351, 0.8095238095238095, 0.755...",0.719298,0.809524,0.755556,0.871795
1,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",tf-idf + mlp - cargos limpio - not balanced,1.0,0.0,0.744444,0.022878,0.837847,0.015576,0.742464,0.009096,...,0.027196,False,False,False,mlp,"(0.6842105263157895, 0.7954545454545455, 0.714...",0.684211,0.795455,0.714286,0.897436
0,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",tf-idf + rf - cargos limpio - balanced,1.0,0.0,0.735673,0.02155,0.837974,0.01113,0.72471,0.016619,...,0.009065,False,False,True,rf,"(0.7543859649122807, 0.8409090909090908, 0.755...",0.754386,0.840909,0.755102,0.948718


In [19]:
top_result_by_model = top_result_by_model.rename(
    columns={'mean_test_f1': 'CV F1',
             'mean_test_precision': 'CV Prec',
             'mean_test_recall': 'CV Rec',
             'mean_test_accuracy': 'CV Acc',
             'model_name': 'Modelo',
             'params': 'Hiper parámetros'
            })

In [20]:
top_result_by_model

Unnamed: 0,Hiper parámetros,Modelo,mean_train_accuracy,std_train_accuracy,CV Acc,std_test_accuracy,CV F1,std_test_f1,CV Prec,std_test_precision,...,std_test_recall,oversampled,ngramas,class_weights,modelo,test_metrics,test_accuracy,test_f1,test_precision,test_recall
19,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + mlp - cargos limpio - not balanced,0.995585,0.003122,0.74,0.01776,0.837351,0.011845,0.734257,0.006185,...,0.023985,False,True,False,mlp,"(0.7192982456140351, 0.813953488372093, 0.7446...",0.719298,0.813953,0.744681,0.897436
0,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + rf - cargos limpio - balanced,1.0,0.0,0.722222,0.040726,0.828186,0.025238,0.720244,0.019322,...,0.036262,False,True,True,rf,"(0.7543859649122807, 0.8409090909090908, 0.755...",0.754386,0.840909,0.755102,0.948718
8,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + rf - cargos limpio - not balanced,1.0,0.0,0.713626,0.007285,0.827598,0.003113,0.705911,0.004538,...,0.0,False,True,False,rf,"(0.7192982456140351, 0.8222222222222223, 0.725...",0.719298,0.822222,0.72549,0.948718
33,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + svm - cargos limpio - not balanced,0.997792,0.003122,0.753216,0.014059,0.837845,0.012047,0.76362,0.010109,...,0.039515,False,True,False,svm,"(0.7192982456140351, 0.8095238095238095, 0.755...",0.719298,0.809524,0.755556,0.871795
35,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + svm - cargos limpio - balanced,0.997792,0.003122,0.757602,0.017982,0.839286,0.01458,0.770398,0.009416,...,0.041543,False,True,True,svm,"(0.7017543859649122, 0.7951807228915662, 0.75,...",0.701754,0.795181,0.75,0.846154
19,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",oversampling + ngramas + mlp - cargos limpio -...,0.995585,0.003122,0.744386,0.041367,0.833889,0.028328,0.752497,0.019974,...,0.04797,True,True,False,mlp,"(0.7192982456140351, 0.7948717948717948, 0.794...",0.719298,0.794872,0.794872,0.794872
32,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",oversampling + ngramas + rf - cargos limpio - ...,1.0,0.0,0.726725,0.024215,0.827427,0.017691,0.730255,0.005212,...,0.039515,True,True,False,rf,"(0.7368421052631579, 0.8314606741573033, 0.74,...",0.736842,0.831461,0.74,0.948718
17,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",oversampling + ngramas + svm - cargos limpio -...,0.997792,0.003122,0.757661,0.007774,0.841321,0.007427,0.764895,0.01064,...,0.032686,True,True,False,svm,"(0.7192982456140351, 0.8095238095238095, 0.755...",0.719298,0.809524,0.755556,0.871795
1,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",tf-idf + mlp - cargos limpio - not balanced,1.0,0.0,0.744444,0.022878,0.837847,0.015576,0.742464,0.009096,...,0.027196,False,False,False,mlp,"(0.6842105263157895, 0.7954545454545455, 0.714...",0.684211,0.795455,0.714286,0.897436
0,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",tf-idf + rf - cargos limpio - balanced,1.0,0.0,0.735673,0.02155,0.837974,0.01113,0.72471,0.016619,...,0.009065,False,False,True,rf,"(0.7543859649122807, 0.8409090909090908, 0.755...",0.754386,0.840909,0.755102,0.948718


In [21]:
# guardar los resultados en una tabla

top_result_by_model.to_csv("mejores_modelos/resultados_definitivos.csv")

## Tabla de evaluación de modelos finales


EXPERIMENTOS
- ngramas vs ngramas + td-idf
- no oversampling vs oversampling

In [22]:
df = top_result_by_model

In [23]:
model_names_dic = {"mlp": "Redes Neuronales", "svm": "SVM", "rf": "Random Forest"}
df["modelo"] = df["modelo"].map(model_names_dic)

In [75]:
# Experimentos 2: resultados en entrenamiento

temp_df = df[df["ngramas"]==True]

temp_df = temp_df.loc[:, ["CV Acc", "CV F1", "CV Prec", "CV Rec", "ngramas", "modelo", "oversampled", "class_weights"]]

def model_type(row):
    if row["oversampled"]:
        return "oversampled"
    elif row["class_weights"]:
        return "weighted"
    else:
        return "none"

temp_df["model_type"] = temp_df.apply(model_type, axis=1)

temp_df = temp_df.drop(["ngramas", "oversampled", "class_weights"], axis=1)

temp_df = temp_df.melt(id_vars=['model_type', "modelo"], var_name='metric', value_name='value')

temp_df["model_type"] = temp_df["model_type"].map(
    {"none": "Ninguno", 
     "weighted": "Pesos de Clase", 
     "oversampled": "Sobremuestreo"})

temp_df = temp_df.rename(columns={"metric": "Métrica", "model_type": "Tipo", "modelo": "Modelo"})

temp_df = pd.pivot_table(temp_df, 
                         values='value', 
                         index=['Tipo'],
                         columns=['Modelo', 'Métrica'],)

#temp_df
for var in ["Random Forest", "Redes Neuronales", "SVM"]:
    print(r"""
\begin{table}[h!]
\centering"""
         )
    print(temp_df.loc[:, [var]].round(3).to_latex(na_rep="-"))
    print("""
\\caption{Resultados del modelo \\textit{Random Forest} en la etapa de entrenamiento}
\\label{Exp 1 - entrenamiento - rn}
\\end{table}
    """)
    print()


\begin{table}[h!]
\centering
\begin{tabular}{lrrrr}
\toprule
Modelo & \multicolumn{4}{l}{Random Forest} \\
Métrica &        CV Acc &  CV F1 & CV Prec & CV Rec \\
Tipo           &               &        &         &        \\
\midrule
Ninguno        &         0.714 &  0.828 &   0.706 &  1.000 \\
Pesos de Clase &         0.722 &  0.828 &   0.720 &  0.974 \\
Sobremuestreo  &         0.727 &  0.827 &   0.730 &  0.955 \\
\bottomrule
\end{tabular}


\caption{Resultados del modelo \textit{Random Forest} en la etapa de entrenamiento}
\label{Exp 1 - entrenamiento - rn}
\end{table}
    


\begin{table}[h!]
\centering
\begin{tabular}{lrrrr}
\toprule
Modelo & \multicolumn{4}{l}{Redes Neuronales} \\
Métrica &           CV Acc &  CV F1 & CV Prec & CV Rec \\
Tipo           &                  &        &         &        \\
\midrule
Ninguno        &            0.740 &  0.837 &   0.734 &  0.974 \\
Pesos de Clase &                - &      - &       - &      - \\
Sobremuestreo  &            0.744 &  0.834

In [67]:
# Experimentos 2: resultados en prueba

temp_df = df[df["ngramas"]==True]

temp_df = temp_df.loc[:, ["test_accuracy", "test_f1", "test_precision", "test_recall", "ngramas", "modelo", "oversampled", "class_weights"]]

def model_type(row):
    if row["oversampled"]:
        return "oversampled"
    elif row["class_weights"]:
        return "weighted"
    else:
        return "none"

temp_df["model_type"] = temp_df.apply(model_type, axis=1)

temp_df = temp_df.drop(["ngramas", "oversampled", "class_weights"], axis=1)

temp_df = temp_df.melt(id_vars=['model_type', "modelo"], var_name='metric', value_name='value')

temp_dict = {"test_accuracy": "Acc Prueba", 
             "test_f1": "F1 Prueba", 
             "test_recall": "Rec Prueba",
             "test_precision": "Prec Prueba"}

temp_df["metric"] = temp_df["metric"].map(temp_dict)

temp_df["model_type"] = temp_df["model_type"].map(
    {"none": "Ninguno", 
     "weighted": "Pesos de Clase", 
     "oversampled": "Sobremuestreo"})

temp_df = temp_df.rename(columns={"metric": "Métrica", "model_type": "Tipo", "modelo": "Modelo"})


temp_df = pd.pivot_table(temp_df, 
                         values='value', 
                         index=['Tipo'],
                         columns=['Modelo', 'Métrica'],)

#temp_df.round(3)

for var in ["Random Forest", "Redes Neuronales", "SVM"]:
    print(temp_df.loc[:, [var]].round(3).to_latex(na_rep="-"))
    print()

\begin{tabular}{lrrrr}
\toprule
Modelo & \multicolumn{4}{l}{Random Forest} \\
Métrica &    Acc Prueba & F1 Prueba & Prec Prueba & Rec Prueba \\
Tipo           &               &           &             &            \\
\midrule
Ninguno        &         0.719 &     0.822 &       0.725 &      0.949 \\
Pesos de Clase &         0.754 &     0.841 &       0.755 &      0.949 \\
Sobremuestreo  &         0.737 &     0.831 &       0.740 &      0.949 \\
\bottomrule
\end{tabular}


\begin{tabular}{lrrrr}
\toprule
Modelo & \multicolumn{4}{l}{Redes Neuronales} \\
Métrica &       Acc Prueba & F1 Prueba & Prec Prueba & Rec Prueba \\
Tipo           &                  &           &             &            \\
\midrule
Ninguno        &            0.719 &     0.814 &       0.745 &      0.897 \\
Pesos de Clase &                - &         - &           - &          - \\
Sobremuestreo  &            0.719 &     0.795 &       0.795 &      0.795 \\
\bottomrule
\end{tabular}


\begin{tabular}{lrrrr}
\toprule
Mode

In [65]:
# Experimentos 1: resultados en entrenamiento

temp_df = df[(df["oversampled"]==False) & (df["class_weights"]==False)]

temp_df = temp_df.loc[:, ["CV Acc", "CV F1", "CV Prec", "CV Rec", "ngramas", "modelo"]]

temp_df["ngramas"] = temp_df["ngramas"].apply(lambda x: "n-gramas" if x else "tf-idf")

temp_df = temp_df.melt(id_vars=['ngramas', 'modelo'], var_name='metric', value_name='value')

temp_df = temp_df.rename(columns={"metric": "Métrica", "ngramas": "Tipo", "modelo": "Modelo"})

temp_df = pd.pivot_table(temp_df, 
                         values='value', 
                         index=['Tipo'],
                         columns=['Modelo', 'Métrica'])

#temp_df.round(3)
for var in ["Random Forest", "Redes Neuronales", "SVM"]:
    print(temp_df.loc[:, [var]].round(3).to_latex(na_rep="-"))
    print()

\begin{tabular}{lrrrr}
\toprule
Modelo & \multicolumn{4}{l}{Random Forest} \\
Métrica &        CV Acc &  CV F1 & CV Prec & CV Rec \\
Tipo     &               &        &         &        \\
\midrule
n-gramas &         0.714 &  0.828 &   0.706 &  1.000 \\
tf-idf   &         0.731 &  0.832 &   0.730 &  0.968 \\
\bottomrule
\end{tabular}


\begin{tabular}{lrrrr}
\toprule
Modelo & \multicolumn{4}{l}{Redes Neuronales} \\
Métrica &           CV Acc &  CV F1 & CV Prec & CV Rec \\
Tipo     &                  &        &         &        \\
\midrule
n-gramas &            0.740 &  0.837 &   0.734 &  0.974 \\
tf-idf   &            0.744 &  0.838 &   0.742 &  0.962 \\
\bottomrule
\end{tabular}


\begin{tabular}{lrrrr}
\toprule
Modelo & \multicolumn{4}{l}{SVM} \\
Métrica & CV Acc &  CV F1 & CV Prec & CV Rec \\
Tipo     &        &        &         &        \\
\midrule
n-gramas &  0.753 &  0.838 &   0.764 &  0.929 \\
tf-idf   &  0.740 &  0.828 &   0.759 &  0.910 \\
\bottomrule
\end{tabular}




In [66]:
# Experimentos 1: resultados en prueba

temp_df = df[(df["oversampled"]==False) & (df["class_weights"]==False)]

temp_df = temp_df.loc[:, ["test_accuracy", "test_f1", "test_precision", "test_recall", "ngramas", "modelo"]]

temp_df["ngramas"] = temp_df["ngramas"].apply(lambda x: "n-gramas" if x else "tf-idf")

temp_df = temp_df.melt(id_vars=['ngramas', 'modelo'], var_name='metric', value_name='value')

temp_dict = {"test_accuracy": "Acc Prueba", 
             "test_f1": "F1 Prueba", 
             "test_recall": "Rec Prueba",
             "test_precision": "Prec Prueba"}

temp_df["metric"] = temp_df["metric"].map(temp_dict)

temp_df = temp_df.rename(columns={"metric": "Métrica", "ngramas": "Tipo", "modelo": "Modelo"})

temp_df = pd.pivot_table(temp_df, 
                         values='value', 
                         index=['Tipo'],
                         columns=['Modelo', 'Métrica'])

#temp_df.round(3)

for var in ["Random Forest", "Redes Neuronales", "SVM"]:
    print(temp_df.loc[:, [var]].round(3).to_latex(na_rep="-"))
    print()

\begin{tabular}{lrrrr}
\toprule
Modelo & \multicolumn{4}{l}{Random Forest} \\
Métrica &    Acc Prueba & F1 Prueba & Prec Prueba & Rec Prueba \\
Tipo     &               &           &             &            \\
\midrule
n-gramas &         0.719 &     0.822 &       0.725 &      0.949 \\
tf-idf   &         0.737 &     0.824 &       0.761 &      0.897 \\
\bottomrule
\end{tabular}


\begin{tabular}{lrrrr}
\toprule
Modelo & \multicolumn{4}{l}{Redes Neuronales} \\
Métrica &       Acc Prueba & F1 Prueba & Prec Prueba & Rec Prueba \\
Tipo     &                  &           &             &            \\
\midrule
n-gramas &            0.719 &     0.814 &       0.745 &      0.897 \\
tf-idf   &            0.684 &     0.795 &       0.714 &      0.897 \\
\bottomrule
\end{tabular}


\begin{tabular}{lrrrr}
\toprule
Modelo & \multicolumn{4}{l}{SVM} \\
Métrica & Acc Prueba & F1 Prueba & Prec Prueba & Rec Prueba \\
Tipo     &            &           &             &            \\
\midrule
n-gramas &      0

In [17]:
# Comparación n-gramas vs t-idf: No hay diferencia aparentemente
df[df["oversampled"]==False].loc[:, ["model_name", "test_accuracy", "test_f1", "test_precision", "test_recall"]]

Unnamed: 0,model_name,test_accuracy,test_f1,test_precision,test_recall
19,ngramas + mlp - cargos limpio,0.719298,0.813953,0.744681,0.897436
0,ngramas + rf - cargos limpio,0.754386,0.840909,0.755102,0.948718
35,ngramas + svm - cargos limpio,0.701754,0.795181,0.75,0.846154
1,tf-idf + mlp - cargos limpio,0.684211,0.795455,0.714286,0.897436
0,tf-idf + rf - cargos limpio,0.754386,0.840909,0.755102,0.948718
30,tf-idf + svm - cargos limpio,0.719298,0.809524,0.755556,0.871795


In [138]:
# Comparación no oevrsampling vs oversampling
df[df["ngramas"]==True].loc[:, ["model_name", "test_accuracy", "test_f1", "test_precision", "test_recall"]]

Unnamed: 0,model_name,test_accuracy,test_f1,test_precision,test_recall
19,ngramas + mlp - cargos limpio,0.719298,0.813953,0.744681,0.897436
0,ngramas + rf - cargos limpio,0.754386,0.840909,0.755102,0.948718
35,ngramas + svm - cargos limpio,0.701754,0.795181,0.75,0.846154
19,oversampling + ngramas + mlp - cargos limpio,0.719298,0.794872,0.794872,0.794872
32,oversampling + ngramas + rf - cargos limpio,0.736842,0.831461,0.74,0.948718
17,oversampling + ngramas + svm - cargos limpio,0.719298,0.809524,0.755556,0.871795
