In [1]:
# ===========================================> Carga librerías <====================================================

import pandas as pd
import numpy as np
import pickle

# Transformación de datos
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import category_encoders as ce

# Modelos
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Seleccion de variables y tuning de hiperparámetros
from sklearn.feature_selection import RFECV, RFE
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Métricas para evaluar un modelo de clasificación
from sklearn.metrics import classification_report, precision_recall_curve, auc, roc_curve, roc_auc_score, average_precision_score, confusion_matrix

# Librerías para visualización de resultados
import matplotlib.pyplot as plt
import seaborn as sns

# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocesado y modelado
# ------------------------------------------------------------------------------
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz, export_text
from sklearn.model_selection import GridSearchCV
#from sklearn.metrics import accuracy_score, confusion_matrix, auc, plot_roc_curve, roc_curve, classification_report
from sklearn.metrics import accuracy_score, confusion_matrix, auc, RocCurveDisplay , roc_curve, classification_report

# Lift pandas' display limits so that listings of rows and columns are not truncated when shown.
for opcion_display in ("display.max_columns", "display.max_rows"):
    pd.set_option(opcion_display, None)

In [3]:
# ===========================================> Load data <==========================================================

# Read the processed training set into a DataFrame; low_memory=False turns off
# pandas' chunked low-memory parsing for this file.
datos = pd.read_csv(
    filepath_or_buffer="../../data/processed/train.csv",
    low_memory=False,
)

In [5]:
# ===========================================> Load processed train columns <=======================================

# Load the two pickled column collections used below to select and cast columns
# of `datos` (presumably lists of column names — they are concatenated with `+`
# further down; confirm against the script that wrote them).
#
# Each file is opened in a `with` block so the handle is closed as soon as the
# object has been read, instead of being left open for the kernel's lifetime.
#
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load .pkl files produced by this project.
with open("../../columns/cat_pro.pkl", "rb") as archivo_cat:
    cat_cols = pickle.load(archivo_cat)

with open("../../columns/num_pro.pkl", "rb") as archivo_num:
    num_cols = pickle.load(archivo_num)

In [6]:
# ===========================================> Define variables <===================================================

# Column-name constants: the identifier column and the label column of the dataset.
IDENTIFIER, LABEL = "MachineIdentifier", "HasDetections"

In [7]:
# ===========================================> Change dtypes <======================================================

# Cast each column group to its target dtype, in the same order as before:
# first the categorical columns to "category", then the numeric columns to "float64".
for grupo_columnas, tipo_destino in ((cat_cols, "category"), (num_cols, "float64")):
    datos[grupo_columnas] = datos[grupo_columnas].astype(tipo_destino)

In [8]:
# ===========================================> Data <===============================================================

# Split features and target into train and test partitions.
#   * features: `datos` without the label column, restricted to the categorical + numeric columns
#   * target:   the label column
# No split size is passed, so scikit-learn's default proportion applies; the fixed
# random_state makes the shuffle reproducible.
particion = train_test_split(
    datos.drop(columns=LABEL)[cat_cols + num_cols],
    datos[LABEL],
    random_state=123,
)
X_train, X_test, y_train, y_test = particion

In [10]:
# ===========================================> DecisionTreeClassifier <=============================================

# Build and fit a single entropy-criterion decision tree (depth capped at 5,
# at least 3 samples per leaf, fixed seed) on the training partition.
modelo_DecisionTreeClassifier = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=5,
    min_samples_leaf=3,
    random_state=123,
).fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the test partition.
predicciones_DecisionTreeClassifier = modelo_DecisionTreeClassifier.predict(X_test)
pred_proba_DecisionTreeClassifier = modelo_DecisionTreeClassifier.predict_proba(X_test)

# Per-class precision / recall / F1 on the test partition, printed with three decimals.
reporte_DecisionTreeClassifier = classification_report(
    y_test,
    predicciones_DecisionTreeClassifier,
    digits=3,
    zero_division=True,
)
print(reporte_DecisionTreeClassifier)

              precision    recall  f1-score   support

         0.0      0.691     0.443     0.540    111718
         1.0      0.589     0.801     0.679    111319

    accuracy                          0.622    223037
   macro avg      0.640     0.622     0.609    223037
weighted avg      0.640     0.622     0.609    223037



In [11]:
# ===========================================> LGBMClassifier <=====================================================

# Build and fit a LightGBM classifier (25 estimators, learning rate 0.2,
# depth capped at 5, up to 50 leaves, fixed seed) on the training partition.
modelo_LGBMClassifier = lgb.LGBMClassifier(
    learning_rate=0.2,
    max_depth=5,
    n_estimators=25,
    num_leaves=50,
    random_state=123,
).fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the test partition.
predicciones_LGBMClassifier = modelo_LGBMClassifier.predict(X_test)
pred_proba_LGBMClassifier = modelo_LGBMClassifier.predict_proba(X_test)

# Per-class precision / recall / F1 on the test partition, printed with three decimals.
reporte_LGBMClassifier = classification_report(
    y_test,
    predicciones_LGBMClassifier,
    digits=3,
    zero_division=True,
)
print(reporte_LGBMClassifier)

              precision    recall  f1-score   support

         0.0      0.660     0.608     0.633    111718
         1.0      0.636     0.686     0.660    111319

    accuracy                          0.647    223037
   macro avg      0.648     0.647     0.647    223037
weighted avg      0.648     0.647     0.646    223037



In [12]:
# ===========================================> LogisticRegression <=================================================

# Build and fit a logistic regression with default hyperparameters (only the seed is set)
# on the training partition.
# NOTE(review): the recorded report for this cell shows ~0.50 accuracy, i.e. chance level.
# Possibly the features are fed unscaled / unencoded or the solver did not converge —
# TODO confirm before using this model as a baseline.
modelo_LogisticRegression = LogisticRegression(random_state=123).fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the test partition.
predicciones_LogisticRegression = modelo_LogisticRegression.predict(X_test)
pred_proba_LogisticRegression = modelo_LogisticRegression.predict_proba(X_test)

# Per-class precision / recall / F1 on the test partition, printed with three decimals.
reporte_LogisticRegression = classification_report(
    y_test,
    predicciones_LogisticRegression,
    digits=3,
    zero_division=True,
)
print(reporte_LogisticRegression)

              precision    recall  f1-score   support

         0.0      0.502     0.583     0.539    111718
         1.0      0.501     0.420     0.457    111319

    accuracy                          0.501    223037
   macro avg      0.501     0.501     0.498    223037
weighted avg      0.501     0.501     0.498    223037



In [13]:
# ===========================================> RandomForestClassifier <=============================================

# Build and fit an entropy-criterion random forest (depth capped at 5,
# at least 3 samples per leaf, fixed seed) on the training partition.
modelo_RandomForestClassifier = RandomForestClassifier(
    criterion="entropy",
    max_depth=5,
    min_samples_leaf=3,
    random_state=123,
).fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the test partition.
predicciones_RandomForestClassifier = modelo_RandomForestClassifier.predict(X_test)
pred_proba_RandomForestClassifier = modelo_RandomForestClassifier.predict_proba(X_test)

# Per-class precision / recall / F1 on the test partition, printed with three decimals.
reporte_RandomForestClassifier = classification_report(
    y_test,
    predicciones_RandomForestClassifier,
    digits=3,
    zero_division=True,
)
print(reporte_RandomForestClassifier)

              precision    recall  f1-score   support

         0.0      0.659     0.562     0.607    111718
         1.0      0.617     0.708     0.659    111319

    accuracy                          0.635    223037
   macro avg      0.638     0.635     0.633    223037
weighted avg      0.638     0.635     0.633    223037



In [14]:
# ===========================================> XGBClassifier <======================================================

# Build and fit an XGBoost classifier (tree booster, binary logistic objective,
# depth capped at 5, fixed seed) on the training partition.
modelo_XGBClassifier = xgb.XGBClassifier(
    booster="gbtree",
    max_depth=5,
    objective="binary:logistic",
    random_state=123,
).fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the test partition.
predicciones_XGBClassifier = modelo_XGBClassifier.predict(X_test)
pred_proba_XGBClassifier = modelo_XGBClassifier.predict_proba(X_test)

# Per-class precision / recall / F1 on the test partition, printed with three decimals.
reporte_XGBClassifier = classification_report(
    y_test,
    predicciones_XGBClassifier,
    digits=3,
    zero_division=True,
)
print(reporte_XGBClassifier)

              precision    recall  f1-score   support

         0.0      0.663     0.650     0.656    111718
         1.0      0.655     0.668     0.662    111319

    accuracy                          0.659    223037
   macro avg      0.659     0.659     0.659    223037
weighted avg      0.659     0.659     0.659    223037

