In [1]:
import pandas as pd

In [2]:
# Read the first sheet of cargos_categorias.xlsx and keep the values of its
# "CÓDIGO" column as a plain Python list.
df_cargos_desc = pd.read_excel("cargos_categorias.xlsx", sheet_name=0)
cargos_list = [codigo for codigo in df_cargos_desc["CÓDIGO"]]

In [3]:
# Read res_completo.csv using its first column as the row index, then print a
# summary of the columns, their non-null counts and dtypes.
data = pd.read_csv(filepath_or_buffer="res_completo.csv", index_col=0)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 281 entries, 0 to 280
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id_resol             281 non-null    object
 1   año                  281 non-null    int64 
 2   cargos               281 non-null    object
 3   códigos              280 non-null    object
 4   Empresa denunciada   281 non-null    object
 5   Multas               281 non-null    object
 6   Medidas correctivas  281 non-null    object
 7   dec_fav_denunciante  281 non-null    int64 
 8   cargos limpio        281 non-null    object
dtypes: int64(2), object(7)
memory usage: 22.0+ KB


In [4]:
# Build the multi-label list of charge codes for every row.
def _split_codes(raw):
    """Return the charge codes held in one ``códigos`` cell as a list.

    * ``str``  -> split on commas; whitespace around each code is stripped and
      empty tokens are discarded.
    * ``list`` -> returned unchanged, so re-running this cell does not wipe
      values that were already parsed.
    * anything else (e.g. the missing value reported by ``data.info()``) ->
      empty list.
    """
    if isinstance(raw, list):
        return raw
    if isinstance(raw, str):
        return [code.strip() for code in raw.split(",") if code.strip()]
    return []


data["códigos"] = data["códigos"].apply(_split_codes)

In [None]:
# Inspect the frame; "códigos" now holds a list of codes per row.
data

Unnamed: 0,id_resol,año,cargos,códigos,Empresa denunciada,Multas,Medidas correctivas,dec_fav_denunciante,cargos limpio
0,0002-2020/SPC-INDECOPI,2020,Luego de contratar los servicios de la denunci...,[ser1],EMPRESA DE TRANSPORTES TURISMO SEÑOR DE ATACO ...,0,0,0,luego contratar servicios denunciada 26 octubr...
1,0894-2020/SPC-INDECOPI,2020,La cláusula 11 incluida en sus boletos de viaj...,[c1],EMPRESA CAPLINA DE TRANSPORTES TURÍSTICOS INTE...,1 UIT,Consignar mecanismos de indemnización por pérd...,1,cláusula 11 incluida boletos viaje establecía ...
2,0893-2020/SPC-INDECOPI,2020,La cláusula 11 incluida en sus boletos de viaj...,[c1],EMPRESA DE TRANSPORTES FLORES HNOS S.R.L.,1 UIT,Consignar mecanismos de indemnización por pérd...,1,cláusula 11 incluida boletos viaje establecía ...
3,0051-2020/SPC-INDECOPI,2020,la empresa no implementó en su establecimiento...,"[o1, o2, seg1, c2]",TURISMO EXPRESO LATINO AMERICANO E.I.R.L,4.68 UIT,Adecuar boletos y difundirlo,1,empresa implementó establecimiento comercial l...
4,0944-2020/SPC-INDECOPI,2020,La empresa no habría brindado un servicio idón...,[ser3],EMPRESA DE TRANSPORTES Y SERVICIOS MÚLTIPLES S...,50 UIT,0,1,empresa brindado servicio idóneo ruta trujillo...
...,...,...,...,...,...,...,...,...,...
276,1133-2010/SPC-INDECOPI,2010,(i) El 3 de abril de 2009 contrató los servic...,"[ser1, ser7, ser13]",Empresa de Transportes Expreso Cial S.A.C.,2 UIT Pago de costas y costos,S/.550.00,1,i 3 abril 2009 contrató servicios expreso cial...
277,1241-2010/SPC-INDECOPI,2010,Señaló que el denunciado no le habría brindado...,"[ser2, ser16]",TURISMO CIVA S.A.C.,0,0,0,señaló denunciado brindado servicio transporte...
278,1404-2010/SPC-INDECOPI,2010,Señaló que el 29 de abril de 2009 contrató\nlo...,[ser2],JULSA ÁNGELES TOURS S.A.C.,1 UIT Pago de costas y costos,S/.40.00,1,señaló 29 abril 2009 contrató servicios denunc...
279,1468-2010/SPC-INDECOPI,2010,se denuncio que en el viaje de Arequipa a\nCus...,[ser2],EMPRESA DE TRANSPORTES TURISMO REAL DEL SUR S....,2 UIT,S/.862.50,1,denuncio viaje arequipa cusco realizado 22 jun...


In [5]:
# The classes are imbalanced, so precision and recall have to be used to
# evaluate the results.

# Share of rows in each class of the target variable.
(
    data.groupby(["dec_fav_denunciante"])["id_resol"]
    .count()
    .div(data["id_resol"].count())
)

dec_fav_denunciante
0    0.313167
1    0.686833
Name: id_resol, dtype: float64

# Separación del conjunto de datos en entrenamiento y test

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. The split is stratified on the
# target column (dec_fav_denunciante) and uses a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    data.drop(columns="dec_fav_denunciante"),
    data["dec_fav_denunciante"],
    test_size=0.2,
    stratify=data["dec_fav_denunciante"],
    random_state=0,
)

print("training set size: ", len(X_train))
print("test set size: ", len(X_test))

training set size:  224
test set size:  57


In [7]:
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [8]:
# Cross-validation splitter: 5 shuffled folds with a fixed seed.
# NOTE(review): KFold does not stratify, although the train/test split above
# does — consider whether StratifiedKFold is more appropriate here.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Optimización de hiperparámetros mediante CV

In [16]:
# Empty frame that accumulates the results of all experiments: the parameter
# combination, the model name, and the mean/std of every test metric.
experiments_results = pd.DataFrame(
    columns=["params", "model_name"] + [
        stat + "_test_" + metric
        for metric in ("accuracy", "f1", "precision", "recall")
        for stat in ("mean", "std")
    ]
)

In [17]:
def run_CV(model, model_name, parameters, cv_iter, X, Y):
    """Run a cross-validated grid search for one model and tabulate the scores.

    Parameters
    ----------
    model : estimator handed to ``GridSearchCV``.
    model_name : label written to the ``model_name`` column of every row.
    parameters : hyperparameter grid (``param_grid`` of ``GridSearchCV``).
    cv_iter : cross-validation splitter (``cv`` of ``GridSearchCV``).
    X, Y : data and targets passed to ``GridSearchCV.fit``.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination with ``params``, the mean and std of
        the test accuracy, precision, recall and f1, plus ``model_name``.
    """
    # Entries of cv_results_ that are kept in the output.
    wanted_columns = {
        'params', 'model_name',
        'mean_test_accuracy', 'std_test_accuracy',
        'mean_test_f1', 'std_test_f1',
        'mean_test_precision', 'std_test_precision',
        'mean_test_recall', 'std_test_recall',
    }

    search = GridSearchCV(
        estimator=model,
        param_grid=parameters,
        cv=cv_iter,
        scoring=['accuracy', 'precision', 'recall', 'f1'],
        refit=False,  # do not train a final model on all the data
    )

    # Run the cross-validation experiments.
    search.fit(X, Y)

    # Keep only the wanted entries, in the order cv_results_ provides them.
    selected = {}
    for column, values in search.cv_results_.items():
        if column in wanted_columns:
            selected[column] = values

    results = pd.DataFrame(selected)
    results["model_name"] = model_name

    return results

### Experimentos modelo RF + n-gramas

In [18]:
# Model: n-gram counts of the cleaned charges text followed by a random forest.
column_trans = ColumnTransformer(
    transformers=[
        # CountVectorizer applied to the 'cargos limpio' column.
        ('CargosLimpioNGramas', CountVectorizer(), 'cargos limpio'),
    ],
    remainder='drop',  # all remaining columns are discarded
)

pipeline = Pipeline(steps=[
    # Step 1: feature extraction.
    ("FeatureExtraction", column_trans),
    # Step 2: prediction of the decision (random state frozen).
    ('RandForestPred', RandomForestClassifier(criterion="entropy",
                                              class_weight="balanced",
                                              random_state=0)),
])

In [19]:
# Hyperparameter grid to explore.
parameters = dict(
    # CountVectorizer settings
    FeatureExtraction__CargosLimpioNGramas__max_features=[50, None],  # number of features to extract
    FeatureExtraction__CargosLimpioNGramas__max_df=[1.0, 0.95],
    FeatureExtraction__CargosLimpioNGramas__min_df=[0.0, 0.05],
    FeatureExtraction__CargosLimpioNGramas__ngram_range=[(1, 1), (2, 2), (3, 3)],  # n-gram length
    # Random forest settings
    RandForestPred__min_samples_leaf=[1, 3, 5, 7],
    RandForestPred__bootstrap=[True, False],
)

In [21]:
# Grid-search the n-grams + random forest pipeline on the training split.
exp_results_df = run_CV(pipeline,
                        "N-gramas + RF",
                        parameters,
                        kf,
                        X_train, Y_train)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat adds the new rows to the accumulated results on every version.
experiments_results = pd.concat([experiments_results, exp_results_df])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# Show the experiment results accumulated so far.
experiments_results

Unnamed: 0,params,model_name,mean_test_accuracy,std_test_accuracy,mean_test_f1,std_test_f1,mean_test_precision,std_test_precision,mean_test_recall,std_test_recall
0,{'FeatureExtraction__CargosLimpioNGramas__max_...,N-gramas + RF,0.678586,0.045571,0.789059,0.036740,0.715941,0.043175,0.881419,0.052536
1,{'FeatureExtraction__CargosLimpioNGramas__max_...,N-gramas + RF,0.629596,0.079630,0.741110,0.069190,0.703960,0.055860,0.783632,0.088863
2,{'FeatureExtraction__CargosLimpioNGramas__max_...,N-gramas + RF,0.612020,0.070742,0.718735,0.060193,0.712610,0.057571,0.726970,0.074309
3,{'FeatureExtraction__CargosLimpioNGramas__max_...,N-gramas + RF,0.625152,0.071434,0.716771,0.067663,0.736442,0.062988,0.699274,0.076368
4,{'FeatureExtraction__CargosLimpioNGramas__max_...,N-gramas + RF,0.687374,0.066500,0.790253,0.051520,0.725876,0.045111,0.867377,0.061093
...,...,...,...,...,...,...,...,...,...,...
187,{'FeatureExtraction__CargosLimpioNGramas__max_...,N-gramas + RF,0.442929,0.120605,0.372426,0.250940,0.691818,0.118514,0.338242,0.345333
188,{'FeatureExtraction__CargosLimpioNGramas__max_...,N-gramas + RF,0.415455,0.059357,0.297505,0.117077,0.801667,0.165362,0.187072,0.085434
189,{'FeatureExtraction__CargosLimpioNGramas__max_...,N-gramas + RF,0.411111,0.057090,0.291737,0.105042,0.795000,0.164621,0.181065,0.071481
190,{'FeatureExtraction__CargosLimpioNGramas__max_...,N-gramas + RF,0.411111,0.069567,0.285240,0.141707,0.775000,0.174005,0.180635,0.100452


### Experimentos modelo SVC + n-gramas

In [25]:
# Model: n-gram counts of the cleaned charges text followed by an SVC.
column_trans = ColumnTransformer(
    transformers=[
        # CountVectorizer applied to the 'cargos limpio' column.
        ('CargosLimpioNGramas', CountVectorizer(), 'cargos limpio'),
    ],
    remainder='drop',  # all remaining columns are discarded
)

pipeline = Pipeline(steps=[
    # Step 1: feature extraction.
    ("FeatureExtraction", column_trans),
    # Step 2: prediction of the decision (random state frozen).
    ('SVMPred', SVC(class_weight="balanced", random_state=0)),
])

In [26]:
# Hyperparameter grid to explore.
parameters = dict(
    # CountVectorizer settings
    FeatureExtraction__CargosLimpioNGramas__max_features=[50, None],  # number of features to extract
    FeatureExtraction__CargosLimpioNGramas__max_df=[1.0, 0.95],
    FeatureExtraction__CargosLimpioNGramas__min_df=[0.0, 0.05],
    FeatureExtraction__CargosLimpioNGramas__ngram_range=[(1, 1), (2, 2), (3, 3)],  # n-gram length
    # SVC settings
    SVMPred__C=[0.1, 1, 10],
    SVMPred__kernel=['linear', 'rbf'],
)

In [27]:
# Grid-search the n-grams + SVC pipeline on the training split.
exp_results_df = run_CV(pipeline,
                        "N-gramas + SVC",
                        parameters,
                        kf,
                        X_train, Y_train)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat adds the new rows to the accumulated results on every version.
experiments_results = pd.concat([experiments_results, exp_results_df])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

### Experimentos modelo RF + TF-IDF

In [28]:
# Model: TF-IDF n-grams of the cleaned charges text followed by a random forest.
column_trans_2 = ColumnTransformer(
    transformers=[
        # TfidfVectorizer applied to the 'cargos limpio' column.
        ('CargosLimpiosTFIDF', TfidfVectorizer(), 'cargos limpio'),
    ],
    remainder='drop',  # all remaining columns are discarded
)

pipeline_2 = Pipeline(steps=[
    # Step 1: feature extraction.
    ("FeatureExtraction", column_trans_2),
    # Step 2: prediction of the decision (random state frozen).
    ('RandForestPred', RandomForestClassifier(criterion="entropy",
                                              class_weight="balanced",
                                              random_state=0)),
])

In [29]:
# Hyperparameter grid to explore.
parameters_2 = dict(
    # TfidfVectorizer settings for the 'cargos limpio' column
    FeatureExtraction__CargosLimpiosTFIDF__ngram_range=[(1, 1), (2, 2), (3, 3)],  # n-gram length
    FeatureExtraction__CargosLimpiosTFIDF__max_features=[50, None],  # number of features to extract
    FeatureExtraction__CargosLimpiosTFIDF__max_df=[1.0, 0.95],
    FeatureExtraction__CargosLimpiosTFIDF__min_df=[0.0, 0.05],
    # Random forest settings
    RandForestPred__min_samples_leaf=[1, 3, 5, 7],
    RandForestPred__bootstrap=[True, False],
)

In [None]:
# Grid-search the TF-IDF + random forest pipeline on the training split.
exp_results_df = run_CV(pipeline_2,
                        "TF-IDF + RF",
                        parameters_2,
                        kf,
                        X_train, Y_train)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat adds the new rows to the accumulated results on every version.
experiments_results = pd.concat([experiments_results, exp_results_df])

### Experimentos modelo SVC + TF-IDF

In [None]:
# Model: TF-IDF n-grams of the cleaned charges text followed by an SVC.
column_trans = ColumnTransformer(
    transformers=[
        # TfidfVectorizer applied to the 'cargos limpio' column.
        ('CargosLimpiosTFIDF', TfidfVectorizer(), 'cargos limpio'),
    ],
    remainder='drop',  # all remaining columns are discarded
)

pipeline = Pipeline(steps=[
    # Step 1: feature extraction.
    ("FeatureExtraction", column_trans),
    # Step 2: prediction of the decision (random state frozen).
    ('SVMPred', SVC(class_weight="balanced", random_state=0)),
])

In [None]:
# Hyperparameter grid to explore.
parameters = dict(
    # TfidfVectorizer settings for the 'cargos limpio' column
    FeatureExtraction__CargosLimpiosTFIDF__ngram_range=[(1, 1), (2, 2), (3, 3)],  # n-gram length
    FeatureExtraction__CargosLimpiosTFIDF__max_features=[50, None],  # number of features to extract
    FeatureExtraction__CargosLimpiosTFIDF__max_df=[1.0, 0.95],
    FeatureExtraction__CargosLimpiosTFIDF__min_df=[0.0, 0.05],
    # SVC settings
    SVMPred__C=[0.1, 1, 10],
    SVMPred__kernel=['linear', 'rbf'],
)

In [None]:
# Grid-search the TF-IDF + SVC pipeline on the training split.
exp_results_df = run_CV(pipeline,
                        "TF-IDF + SVC",
                        parameters,
                        kf,
                        X_train, Y_train)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat adds the new rows to the accumulated results on every version.
experiments_results = pd.concat([experiments_results, exp_results_df])

### Experimento Cargos Multicat + RF

In [None]:
# Fit the multi-label binarizer with cargos_list as its set of classes.
mlb = MultiLabelBinarizer(classes=cargos_list).fit([cargos_list])

# TODO (original author's note): there are some errors in the coding of the
# charges that still have to be corrected.
X_train_proc = mlb.transform(X_train["códigos"])

print(X_train_proc.shape)

In [None]:
# Model: random forest on the multi-category charge codes.
pipeline_3 = Pipeline(steps=[
    # Prediction of the decision (random state frozen).
    ('RandForestPred', RandomForestClassifier(class_weight="balanced",
                                              random_state=0)),
])

In [None]:
# Hyperparameter grid to explore.
parameters_3 = dict(
    RandForestPred__criterion=['entropy'],
    RandForestPred__min_samples_leaf=[1, 3, 5, 7],
    RandForestPred__bootstrap=[True, False],
)

In [None]:
# Grid-search the random forest on the binarized charge codes of the training split.
exp_results_df = run_CV(pipeline_3,
                        "CargosMulticat + RF",
                        parameters_3,
                        kf,
                        X_train_proc, Y_train)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat adds the new rows to the accumulated results on every version.
experiments_results = pd.concat([experiments_results, exp_results_df])

### Experimento Cargos Multicat + SVC

In [None]:
# Model: SVC on the multi-category charge codes.
pipeline = Pipeline(steps=[
    # Prediction of the decision (random state frozen).
    ('SVMPred', SVC(class_weight="balanced", random_state=0)),
])

In [None]:
# Hyperparameter grid to explore.
parameters = dict(
    SVMPred__C=[0.1, 1, 10],
    SVMPred__kernel=['linear', 'rbf'],
)

In [None]:
# Grid-search the SVC on the binarized charge codes of the training split.
# NOTE(review): this label says "SVM" while the other experiments are labelled
# "... + SVC" — confirm whether downstream code depends on it before renaming.
exp_results_df = run_CV(pipeline,
                        "CargosMulticat + SVM",
                        parameters,
                        kf,
                        X_train_proc, Y_train)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat adds the new rows to the accumulated results on every version.
experiments_results = pd.concat([experiments_results, exp_results_df])

In [None]:
## Guardar los resultados

In [None]:
# Write the accumulated experiment results (index included) to disk.
experiments_results.to_csv(path_or_buf="resultados_experimentos.csv")