In [1]:
!pip install imbalanced-learn==0.8.1

Collecting imbalanced-learn==0.8.1
  Downloading imbalanced_learn-0.8.1-py3-none-any.whl (189 kB)
[?25l[K     |█▊                              | 10 kB 21.6 MB/s eta 0:00:01[K     |███▌                            | 20 kB 24.5 MB/s eta 0:00:01[K     |█████▏                          | 30 kB 25.6 MB/s eta 0:00:01[K     |███████                         | 40 kB 27.1 MB/s eta 0:00:01[K     |████████▋                       | 51 kB 27.3 MB/s eta 0:00:01[K     |██████████▍                     | 61 kB 26.3 MB/s eta 0:00:01[K     |████████████                    | 71 kB 24.3 MB/s eta 0:00:01[K     |█████████████▉                  | 81 kB 23.3 MB/s eta 0:00:01[K     |███████████████▌                | 92 kB 24.3 MB/s eta 0:00:01[K     |█████████████████▎              | 102 kB 25.3 MB/s eta 0:00:01[K     |███████████████████             | 112 kB 25.3 MB/s eta 0:00:01[K     |████████████████████▊           | 122 kB 25.3 MB/s eta 0:00:01[K     |██████████████████████▌        

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime as dt

In [2]:
#df_cargos_desc = pd.read_excel("cargos_categorias.xlsx", sheet_name=0)
#cargos_list = list(df_cargos_desc["CÓDIGO"])

In [3]:
# Load res_completo.csv; the first CSV column is used as the row index.
data = pd.read_csv("res_completo.csv", index_col=0)
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 284 entries, 0 to 283
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id_resol             284 non-null    object
 1   año                  284 non-null    int64 
 2   cargos               284 non-null    object
 3   códigos              283 non-null    object
 4   Empresa denunciada   284 non-null    object
 5   Multas               284 non-null    object
 6   Medidas correctivas  284 non-null    object
 7   dec_fav_denunciante  284 non-null    int64 
 8   cargos limpio        284 non-null    object
 9   cargos limpio stem   284 non-null    object
dtypes: int64(2), object(8)
memory usage: 24.4+ KB


In [4]:
def _split_codes(raw):
    """Turn one raw "códigos" cell into a list of charge codes.

    Parameters
    ----------
    raw : str, list or other
        A comma-separated string of codes, an already-parsed list, or a
        missing value (anything that is neither a string nor a list).

    Returns
    -------
    list
        The codes obtained by splitting on ",", the list itself when the
        value was already parsed, or an empty list for missing values.
    """
    # Already parsed: returning it unchanged makes the cell safe to re-run.
    # Without this guard a second execution would replace every list by [].
    if isinstance(raw, list):
        return raw
    if isinstance(raw, str):
        return raw.split(",")
    return []


# Build the multi-valued list of charge codes for every row.
data["códigos"] = data["códigos"].apply(_split_codes)

In [5]:
# Preview the identifier, both cleaned text variants and the target label.
data[["id_resol", "cargos limpio", "cargos limpio stem", "dec_fav_denunciante"]]

Unnamed: 0,id_resol,cargos limpio,cargos limpio stem,dec_fav_denunciante
0,0002-2020/SPC-INDECOPI,luego contratar servicios denunciada 26 octubr...,luego contratar servicios denunciada 26 octubr...,0
1,0894-2020/SPC-INDECOPI,cláusula 11 incluida boletos viaje establecía ...,clausula 11 incluida boletos viaje establecia ...,1
2,0893-2020/SPC-INDECOPI,cláusula 11 incluida boletos viaje establecía ...,clausula 11 incluida boletos viaje establecia ...,1
3,0051-2020/SPC-INDECOPI,empresa implementó establecimiento comercial l...,empresa implemento establecimiento comercial l...,1
4,0944-2020/SPC-INDECOPI,empresa brindado servicio idóneo ruta trujillo...,empresa brindado servicio idoneo ruta trujillo...,1
...,...,...,...,...
279,1133-2010/SPC-INDECOPI,i 3 abril 2009 contrató servicios expreso cial...,i 3 abril 2009 contrato servicios expreso cial...,1
280,1241-2010/SPC-INDECOPI,señaló denunciado brindado servicio transporte...,señalo denunciado brindado servicio transporte...,0
281,1404-2010/SPC-INDECOPI,señaló 29 abril 2009 contrató servicios denunc...,señalo 29 abril 2009 contrato servicios denunc...,1
282,1468-2010/SPC-INDECOPI,denuncio viaje arequipa cusco realizado 22 jun...,denuncio viaje arequipa cusco realizado 22 jun...,1


In [6]:
# The classes are imbalanced, so precision and recall are needed to
# evaluate the results.

# Share of resolutions per value of the target label.
resolutions_per_class = data.groupby(["dec_fav_denunciante"])["id_resol"].count()
total_resolutions = data["id_resol"].count()
resolutions_per_class / total_resolutions

dec_fav_denunciante
0    0.31338
1    0.68662
Name: id_resol, dtype: float64

In [7]:
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from imblearn.pipeline import Pipeline as PipelineIL
from imblearn.over_sampling import RandomOverSampler

# Separación del conjunto de datos en entrenamiento y test

In [8]:
# Cross-validation splitter: 3 stratified folds, shuffled with a fixed seed.
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)

In [9]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and test sets:
# 20% is held out for testing, stratified on the target label.
target_column = "dec_fav_denunciante"

X_train, X_test, Y_train, Y_test = train_test_split(
    data.drop(columns=target_column),
    data[target_column],
    test_size=0.2,
    stratify=data[target_column],
    random_state=0,
)

print("training set size: ", len(X_train))
print("test set size: ", len(X_test))

training set size:  227
test set size:  57


# Optimización de hiperparámetros mediante CV

In [10]:
# Empty frame that accumulates the results of every experiment: the
# parameter set, the model name and a mean/std pair for each tracked metric.
_experiment_columns = ['params', 'model_name']
for _metric in ('train_accuracy', 'test_accuracy', 'test_f1',
                'test_precision', 'test_recall'):
    _experiment_columns.extend([f'mean_{_metric}', f'std_{_metric}'])

experiments_results = pd.DataFrame(columns=_experiment_columns)

In [11]:
def run_CV(model, model_name, parameters, cv_iter, X, Y):
    """Grid-search the hyperparameters of one model with cross-validation.

    Parameters
    ----------
    model : estimator
        Estimator (here a pipeline) handed to ``GridSearchCV``.
    model_name : str
        Label written to the ``model_name`` column of the returned frame.
    parameters : dict
        Parameter grid to explore.
    cv_iter : cross-validation iterator
        Splitter used for every parameter combination.
    X, Y : array-like
        Training inputs and labels passed to ``GridSearchCV.fit``.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination, holding the parameters, the
        mean/std of the selected train and test metrics, and ``model_name``.
    """
    # Entries of cv_results_ that are kept in the summary.
    kept_columns = (
        'params', 'model_name',
        'mean_train_accuracy', 'std_train_accuracy',
        'mean_test_accuracy', 'std_test_accuracy',
        'mean_test_f1', 'std_test_f1',
        'mean_test_precision', 'std_test_precision',
        'mean_test_recall', 'std_test_recall',
    )

    search = GridSearchCV(
        estimator=model,
        param_grid=parameters,
        cv=cv_iter,
        # evaluation metrics computed on every fold
        scoring=['accuracy', 'precision', 'recall', 'f1'],
        # do not train a final model on the whole data set
        refit=False,
        return_train_score=True,
    )

    # Run the cross-validation experiments.
    search.fit(X, Y)

    summary = pd.DataFrame(
        {column: values
         for column, values in search.cv_results_.items()
         if column in kept_columns}
    )
    summary["model_name"] = model_name

    return summary

In [12]:
def report_metrics(data):
    """Show three scatter plots comparing the metrics of the experiments.

    The panels are, from left to right: test precision vs. test recall,
    test accuracy vs. test F1, and train accuracy vs. test accuracy.
    Both axes of every panel are fixed to the [0, 1] range.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``mean_*`` metric columns that are plotted.
    """
    # (x column, y column) for each panel, left to right.
    metric_pairs = (
        ("mean_test_precision", "mean_test_recall"),
        ("mean_test_accuracy", "mean_test_f1"),
        ("mean_train_accuracy", "mean_test_accuracy"),
    )

    fig, axis = plt.subplots(1, 3, figsize=(17, 5))

    for panel, (x_metric, y_metric) in zip(axis, metric_pairs):
        sns.scatterplot(x=x_metric, y=y_metric, data=data, ax=panel)
        panel.set_ylim(0.0, 1.0)
        panel.set_xlim(0.0, 1.0)

    plt.show()


## model config

In [13]:
# Text columns of the data set that are used, one at a time, as model input.
columnas = [
    "cargos limpio",
]

In [14]:
# # Original arguments
# default_ngrams_args = {
#     'FeaExt__max_df': [1.0, 0.9], 
#     'FeaExt__min_df': [0.0, 0.1],
#     'FeaExt__ngram_range': [(1, 1), (2, 2), (1, 2)]
# }

# default_tfidf_args = {
#     'FeaExt__max_df': [1.0, 0.9], 
#     'FeaExt__min_df': [0.0, 0.1],
#     'FeaExt__ngram_range': [(1, 1), (2, 2), (1, 2)]
# }

# default_mlp_args = {
#     'Pred__random_state': [0],
#     'Pred__max_iter': [200],
#     'Pred__tol': [0.001],
#     'Pred__learning_rate_init': [0.001],
#     'Pred__activation': ['tanh', 'relu'],
#     #'Pred__alpha': [0.0001, 0.001],
#     'Pred__batch_size': [25, 'auto'],
#     'Pred__hidden_layer_sizes': [(10), (10, 5)],
# }

# default_rf_args = {
#   'Pred__random_state': [0], 
#   'Pred__criterion': ["entropy", "giny"], 
#   'Pred__n_estimators': [100],
#   'Pred__min_samples_leaf':[3, 5, 7],
#   'Pred__bootstrap': [True, False],
#   'Pred__max_samples': [0.95, 0.90],
# }

# default_svm_args ={
#     'Pred__random_state': [0], 
#     'Pred__C': [0.1, 1, 10], 
#     'Pred__kernel': ['linear', 'rbf'],
# }

In [15]:
# Parameter grids shared by the model specifications. Keys use the
# "<pipeline step>__<parameter>" convention: "FeaExt" is the text vectorizer
# step and "Pred" is the classifier step.

# Grid for the n-gram count vectorizer.
default_ngrams_args = {
    'FeaExt__max_df': [1.0, 0.9],
    'FeaExt__min_df': [0.0, 0.1],
    'FeaExt__ngram_range': [(1, 1), (2, 2), (1, 2)]
}

# Grid for the TF-IDF vectorizer (same values as the n-gram grid).
default_tfidf_args = {
    'FeaExt__max_df': [1.0, 0.9],
    'FeaExt__min_df': [0.0, 0.1],
    'FeaExt__ngram_range': [(1, 1), (2, 2), (1, 2)]
}

# Grid for the multi-layer perceptron.
default_mlp_args = {
    'Pred__random_state': [0],
    'Pred__max_iter': [200],
    'Pred__tol': [0.001],
    'Pred__learning_rate_init': [0.001],
    'Pred__activation': ['tanh', 'relu'],
    'Pred__batch_size': [25, 'auto'],
    # One hidden layer of 10 units, or two layers of 10 and 5 units.
    # `(10,)` is a one-element tuple; `(10)` would just be the integer 10.
    'Pred__hidden_layer_sizes': [(10,), (10, 5)],
}

# Grid for the random forest.
# NOTE(review): `max_samples` only applies when `bootstrap=True`. Depending on
# the installed scikit-learn version, the `bootstrap=False` combinations are
# either duplicated runs (value ignored) or failed fits -- confirm and
# consider splitting this grid.
default_rf_args = {
  'Pred__random_state': [0],
  'Pred__criterion': ["entropy",],
  'Pred__n_estimators': [100],
  'Pred__min_samples_leaf':[1, 3, 5, 7],
  'Pred__bootstrap': [True, False],
  'Pred__max_samples': [0.95, 0.90],
}

# Grid for the support vector machine.
default_svm_args ={
    'Pred__random_state': [0],
    'Pred__C': [0.1, 1, 10],
    'Pred__kernel': ['linear', 'rbf'],
}

In [16]:
def _plain_pipeline(vectorizer, predictor):
    """Two-step scikit-learn pipeline: text vectorizer, then classifier."""
    return Pipeline([("FeaExt", vectorizer), ('Pred', predictor)])


def _oversampled_pipeline(vectorizer, predictor):
    """imbalanced-learn pipeline with random oversampling before the classifier.

    The sampler is part of the pipeline, so each CV fold is resampled on its
    own rather than the whole training set; when predicting, the oversampling
    step does nothing.
    """
    return PipelineIL([
        ("FeaExt", vectorizer),
        ("Oversampling", RandomOverSampler(random_state=0, sampling_strategy='not majority')),
        ('Pred', predictor),
    ])


def _model_spec(name, pipeline, *param_grids):
    """Bundle a model name, its pipeline and the merged parameter grids."""
    merged_grid = {}
    for grid in param_grids:
        merged_grid.update(grid)
    return {"model": name, "pipeline": pipeline, "params": merged_grid}


# Every experiment: feature extraction (n-grams or TF-IDF), optional random
# oversampling, and a classifier (MLP, random forest or SVM).
modelos = [

    # N-GRAMS + MLP
    _model_spec("ngramas + mlp",
                _plain_pipeline(CountVectorizer(), MLPClassifier()),
                default_ngrams_args,
                default_mlp_args),

    # N-GRAMS + RF
    _model_spec("ngramas + rf",
                _plain_pipeline(CountVectorizer(), RandomForestClassifier()),
                default_ngrams_args,
                default_rf_args,
                {'Pred__class_weight': ['balanced', None]}),

    # N-GRAMS + SVM
    _model_spec("ngramas + svm",
                _plain_pipeline(CountVectorizer(), SVC()),
                default_ngrams_args,
                default_svm_args,
                {'Pred__class_weight': [None, 'balanced']}),

    # OVERSAMPLING + N-GRAMS + MLP
    _model_spec("oversampling + ngramas + mlp",
                _oversampled_pipeline(CountVectorizer(), MLPClassifier()),
                default_ngrams_args,
                default_mlp_args),

    # OVERSAMPLING + N-GRAMS + RF
    _model_spec("oversampling + ngramas + rf",
                _oversampled_pipeline(CountVectorizer(), RandomForestClassifier()),
                default_ngrams_args,
                default_rf_args),

    # OVERSAMPLING + N-GRAMS + SVM
    _model_spec("oversampling + ngramas + svm",
                _oversampled_pipeline(CountVectorizer(), SVC()),
                default_ngrams_args,
                default_svm_args),

    # TF-IDF + MLP
    _model_spec("tf-idf + mlp",
                _plain_pipeline(TfidfVectorizer(), MLPClassifier()),
                default_tfidf_args,
                default_mlp_args),

    # TF-IDF + RF
    _model_spec("tf-idf + rf",
                _plain_pipeline(TfidfVectorizer(), RandomForestClassifier()),
                default_tfidf_args,
                default_rf_args,
                {'Pred__class_weight': ['balanced', None]}),

    # TF-IDF + SVM
    _model_spec("tf-idf + svm",
                _plain_pipeline(TfidfVectorizer(), SVC()),
                default_tfidf_args,
                default_svm_args,
                {'Pred__class_weight': [None, 'balanced']}),

]

In [17]:
# Run the grid search of every model specification on every text column and
# accumulate the per-experiment metrics in `experiments_results`.
#
# The per-model frames are collected in a list and concatenated once per
# column: `DataFrame.append` is deprecated since pandas 1.4 and removed in
# pandas 2.0, and appending inside the loop copies the whole frame each time.

# Column order of the results frame as declared before this cell.
result_columns = list(experiments_results.columns)

# Keep results gathered earlier; an empty placeholder frame is left out so
# that it cannot influence the dtypes of the concatenated result.
collected_results = [] if experiments_results.empty else [experiments_results]

for columna in columnas:
    for model_spec in modelos:

        pipeline = model_spec["pipeline"]
        params = model_spec["params"]
        name = model_spec["model"]

        print(f"entrenando modelo: {name} - {columna}")

        # Select the column as a 1-D array of raw documents, which is the
        # input format expected by the text vectorizers.
        X_train_temp = X_train[columna].to_numpy()

        # Cross-validated grid search for this model / column pair.
        exp_results_df = run_CV(pipeline,
                                f"{name} - {columna}",
                                params,
                                kf,
                                X_train_temp, Y_train)

        # Store the results.
        collected_results.append(exp_results_df)

    if collected_results:
        combined = pd.concat(collected_results)
        # Declared columns first (as `append` produced), then any extra ones.
        extra_columns = [c for c in combined.columns if c not in result_columns]
        experiments_results = combined.reindex(columns=result_columns + extra_columns)

    # Save the results gathered so far to a timestamped CSV file.
    time_sv = dt.now().strftime("%m-%d-%H-%M")
    experiments_results.to_csv(f"resultados_{time_sv}.csv")

entrenando modelo: ngramas + mlp - cargos limpio


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


entrenando modelo: ngramas + rf - cargos limpio
entrenando modelo: ngramas + svm - cargos limpio
entrenando modelo: oversampling + ngramas + mlp - cargos limpio


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


entrenando modelo: oversampling + ngramas + rf - cargos limpio
entrenando modelo: oversampling + ngramas + svm - cargos limpio
entrenando modelo: tf-idf + mlp - cargos limpio


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


entrenando modelo: tf-idf + rf - cargos limpio
entrenando modelo: tf-idf + svm - cargos limpio


In [20]:
experiments_results

Unnamed: 0,params,model_name,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_test_f1,std_test_f1,mean_test_precision,std_test_precision,mean_test_recall,std_test_recall
0,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + rf - cargos limpio,1.000000,0.000000,0.722222,0.040726,0.828186,0.025238,0.720244,0.019322,0.974359,0.036262
1,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + rf - cargos limpio,0.936040,0.027826,0.695965,0.029134,0.786285,0.029598,0.758318,0.013935,0.820513,0.070804
2,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + rf - cargos limpio,0.856832,0.002922,0.625731,0.028946,0.711382,0.039717,0.753195,0.015633,0.679487,0.080576
3,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + rf - cargos limpio,0.799451,0.035231,0.656433,0.027816,0.735002,0.028443,0.786940,0.058120,0.698718,0.077455
4,"{'FeaExt__max_df': 1.0, 'FeaExt__min_df': 0.0,...",ngramas + rf - cargos limpio,1.000000,0.000000,0.713567,0.017892,0.823793,0.010735,0.713806,0.010874,0.974359,0.023985
...,...,...,...,...,...,...,...,...,...,...,...,...
283,"{'FeaExt__max_df': 0.9, 'FeaExt__min_df': 0.1,...",ngramas + svm - cargos limpio,0.920704,0.005413,0.665322,0.020523,0.766568,0.018856,0.735634,0.015400,0.801282,0.036262
284,"{'FeaExt__max_df': 0.9, 'FeaExt__min_df': 0.1,...",ngramas + svm - cargos limpio,0.942721,0.016551,0.595146,0.074443,0.682267,0.069712,0.732276,0.045053,0.641026,0.092004
285,"{'FeaExt__max_df': 0.9, 'FeaExt__min_df': 0.1,...",ngramas + svm - cargos limpio,0.988992,0.003102,0.669649,0.009079,0.764916,0.006224,0.748706,0.011858,0.782051,0.009065
286,"{'FeaExt__max_df': 0.9, 'FeaExt__min_df': 0.1,...",ngramas + svm - cargos limpio,0.942721,0.016551,0.595146,0.074443,0.682267,0.069712,0.732276,0.045053,0.641026,0.092004
