In [29]:
# ===========================================> Carga librerías <====================================================

import pandas as pd
import numpy as np
import pickle

# Transformación de datos
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import category_encoders as ce

# Modelos
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb

# Seleccion de variables y tuning de hiperparámetros
from sklearn.feature_selection import RFECV, RFE
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Métricas para evaluar un modelo de clasificación
from sklearn.metrics import classification_report, precision_recall_curve, auc, roc_curve, roc_auc_score, average_precision_score, confusion_matrix

# Librerías para visualización de resultados
import matplotlib.pyplot as plt
import seaborn as sns

# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocesado y modelado
# ------------------------------------------------------------------------------
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz, export_text
from sklearn.model_selection import GridSearchCV
#from sklearn.metrics import accuracy_score, confusion_matrix, auc, plot_roc_curve, roc_curve, classification_report
from sklearn.metrics import accuracy_score, confusion_matrix, auc, RocCurveDisplay , roc_curve, classification_report

# Show every row and column when displaying frames (no truncation of listings).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [30]:
# ===========================================> Load data <==========================================================

# low_memory=False makes pandas parse the file in one pass instead of chunk by chunk.
datos = pd.read_csv(filepath_or_buffer='../../data/raw/train.csv', low_memory=False)

In [31]:
# ===========================================> Define variables <===================================================

IDENTIFIER = "MachineIdentifier"
LABEL = "HasDetections"

# Placeholders only: both lists are rebuilt from the column dtypes in later cells.
cat_cols, num_cols = [], []

# Fix the dtype of the identifier and of the target before any column selection.
for fixed_col, fixed_dtype in ((IDENTIFIER, "category"), (LABEL, "float64")):
    datos[fixed_col] = datos[fixed_col].astype(fixed_dtype)

In [32]:
# ===========================================> Drop mostly-null columns <===========================================

# Minimum fraction of missing values (0-1) at which a column is discarded.
drop_cols_min_nulls = 0.7

total_rows = len(datos)

# The identifier and the label are never candidates for removal.
drop_cols_nulls = [
    col
    for col in datos.columns
    if col not in (IDENTIFIER, LABEL)
    and datos[col].isna().sum() / total_rows >= drop_cols_min_nulls
]

datos.drop(columns=drop_cols_nulls, inplace=True)

In [33]:
# ===========================================> Drop badly balanced columns <========================================

# Minimum share (in percent) of the most frequent value at which a column is discarded.
drop_cols_min_big_cat = 70

# value_counts sorts by frequency in descending order, so position 0 is the dominant value;
# dropna=False makes NaN compete as a value of its own.
drop_cols_big_cat = [
    col
    for col in datos.columns
    if col not in (IDENTIFIER, LABEL)
    and datos[col].value_counts(normalize=True, dropna=False).iloc[0] * 100 >= drop_cols_min_big_cat
]

datos.drop(columns=drop_cols_big_cat, inplace=True)

In [34]:
# ===========================================> Collect categorical columns <========================================

categorical_dtypes = ['object', 'category']

cat_cols = list(datos.select_dtypes(include=categorical_dtypes).columns)
# The identifier was cast to "category" earlier, so it is picked up here; it is not a feature.
cat_cols.remove(IDENTIFIER)

datos[cat_cols] = datos[cat_cols].astype("category")

In [35]:
# ===========================================> Collect numeric columns <============================================

numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

num_cols = list(datos.select_dtypes(include=numeric_dtypes).columns)
# The label was cast to float64 earlier, so it is picked up here; it is the target, not a feature.
num_cols.remove(LABEL)

# Use a single float dtype for every numeric feature.
datos[num_cols] = datos[num_cols].astype("float64")

In [36]:
# ===========================================> Move numeric-looking categoricals to numeric <=======================

# A categorical column qualifies only if every value parses as a number
# (values that cannot be parsed are coerced to NaN, and so are missing values).
cat_cols_num = [
    col
    for col in cat_cols
    if pd.to_numeric(datos[col], errors='coerce').notna().all()
]

# Reclassify the qualifying columns: out of the categorical list, into the numeric one.
num_cols.extend(cat_cols_num)
for col in cat_cols_num:
    cat_cols.remove(col)

datos[cat_cols_num] = datos[cat_cols_num].astype("float64")

In [37]:
# ===========================================> Move low-cardinality numerics to categorical <=======================

# Maximum ratio (in percent) of distinct values to non-null values for a numeric
# column to be treated as categorical.
num_cols_cat_max = 1

num_cols_cat = [
    col
    for col in num_cols
    if (datos[col].nunique() / datos[col].count()) * 100 <= num_cols_cat_max
]

# Reclassify the qualifying columns: out of the numeric list, into the categorical one.
cat_cols.extend(num_cols_cat)
for col in num_cols_cat:
    num_cols.remove(col)

datos[cat_cols] = datos[cat_cols].astype("category")

In [38]:
# ===========================================> Save categorical columns <===========================================

# Context manager guarantees the file is flushed and closed, even if pickling fails.
with open("../../columns/train/cat.pkl", 'wb') as pkl_file:
    pickle.dump(cat_cols, pkl_file)

In [39]:
# ===========================================> Save numeric columns <===============================================

# Context manager guarantees the file is flushed and closed, even if pickling fails.
with open("../../columns/train/num.pkl", 'wb') as pkl_file:
    pickle.dump(num_cols, pkl_file)

In [40]:
# ===========================================> Impute nulls in categorical columns <================================

# Missing categorical values are replaced by the most frequent value of each column.
cat_simple_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Fit and apply in one step; the fitted imputer stays bound to cat_simple_imputer.
datos[cat_cols] = cat_simple_imputer.fit_transform(datos[cat_cols])

In [41]:
# ===========================================> Save SimpleImputer for categorical columns <=========================

# Context manager guarantees the file is flushed and closed, even if pickling fails.
with open("../../imputers/train/cat_simple.pkl", 'wb') as pkl_file:
    pickle.dump(cat_simple_imputer, pkl_file)

In [42]:
# ===========================================> Split 3-part masks and versions <====================================

# A column qualifies when it has no nulls and every value contains exactly two dots.
cat_cols_mask_3 = [
    col
    for col in cat_cols
    if datos[col].notnull().all()
    and datos[col].astype(str).map(lambda value: value.count('.') == 2).all()
]

for col in cat_cols_mask_3:
    # One new categorical column per dot-separated component: <col>_1 ... <col>_3.
    part_cols = [f"{col}_{part}" for part in range(1, 4)]
    datos[part_cols] = datos[col].str.split(".", expand=True)
    cat_cols.remove(col)
    cat_cols.extend(part_cols)

# The original dotted columns are fully replaced by their parts.
datos.drop(columns=cat_cols_mask_3, inplace=True)

In [43]:
# ===========================================> Save 3-part mask/version columns <===================================

# Context manager guarantees the file is flushed and closed, even if pickling fails.
with open("../../columns/train/cat_mask_3.pkl", 'wb') as pkl_file:
    pickle.dump(cat_cols_mask_3, pkl_file)

In [44]:
# ===========================================> Split 4-part masks and versions <====================================

# A column qualifies when it has no nulls and every value contains exactly three dots.
cat_cols_mask_4 = [
    col
    for col in cat_cols
    if datos[col].notnull().all()
    and datos[col].astype(str).map(lambda value: value.count('.') == 3).all()
]

for col in cat_cols_mask_4:
    # One new categorical column per dot-separated component: <col>_1 ... <col>_4.
    part_cols = [f"{col}_{part}" for part in range(1, 5)]
    datos[part_cols] = datos[col].str.split(".", expand=True)
    cat_cols.remove(col)
    cat_cols.extend(part_cols)

# The original dotted columns are fully replaced by their parts.
datos.drop(columns=cat_cols_mask_4, inplace=True)

In [45]:
# ===========================================> Save 4-part mask/version columns <===================================

# Context manager guarantees the file is flushed and closed, even if pickling fails.
with open("../../columns/train/cat_mask_4.pkl", 'wb') as pkl_file:
    pickle.dump(cat_cols_mask_4, pkl_file)

In [46]:
# ===========================================> Split 5-part masks and versions <====================================

# A column qualifies when it has no nulls and every value contains exactly four dots.
cat_cols_mask_5 = [
    col
    for col in cat_cols
    if datos[col].notnull().all()
    and datos[col].astype(str).map(lambda value: value.count('.') == 4).all()
]

for col in cat_cols_mask_5:
    # One new categorical column per dot-separated component: <col>_1 ... <col>_5.
    part_cols = [f"{col}_{part}" for part in range(1, 6)]
    datos[part_cols] = datos[col].str.split(".", expand=True)
    cat_cols.remove(col)
    cat_cols.extend(part_cols)

# The original dotted columns are fully replaced by their parts.
datos.drop(columns=cat_cols_mask_5, inplace=True)

In [47]:
# ===========================================> Save 5-part mask/version columns <===================================

# Context manager guarantees the file is flushed and closed, even if pickling fails.
with open("../../columns/train/cat_mask_5.pkl", 'wb') as pkl_file:
    pickle.dump(cat_cols_mask_5, pkl_file)

In [48]:
# ===========================================> Split 6-part masks and versions <====================================

# A column qualifies when it has no nulls and every value contains exactly five dots.
cat_cols_mask_6 = [
    col
    for col in cat_cols
    if datos[col].notnull().all()
    and datos[col].astype(str).map(lambda value: value.count('.') == 5).all()
]

for col in cat_cols_mask_6:
    # One new categorical column per dot-separated component: <col>_1 ... <col>_6.
    part_cols = [f"{col}_{part}" for part in range(1, 7)]
    datos[part_cols] = datos[col].str.split(".", expand=True)
    cat_cols.remove(col)
    cat_cols.extend(part_cols)

# The original dotted columns are fully replaced by their parts.
datos.drop(columns=cat_cols_mask_6, inplace=True)

In [49]:
# ===========================================> Save 6-part mask/version columns <===================================

# Context manager guarantees the file is flushed and closed, even if pickling fails.
with open("../../columns/train/cat_mask_6.pkl", 'wb') as pkl_file:
    pickle.dump(cat_cols_mask_6, pkl_file)

In [50]:
# ===========================================> Apply target encoder <===============================================

# Categorical columns with more distinct values than this are target-encoded.
cat_cols_target_encoder_min = 5

cat_cols_target_encoder = [
    col for col in cat_cols if datos[col].nunique() > cat_cols_target_encoder_min
]

# NOTE(review): 'ignore' does not appear among the handle_unknown options documented by
# category_encoders ('error', 'return_nan', 'value') - confirm how unseen categories are treated.
target_encoder = ce.TargetEncoder(handle_unknown='ignore')

# NOTE(review): the encoder is fitted and applied on the same rows, so the encoded features
# carry information from the label; consider out-of-fold encoding - confirm with the author.
target_encoder.fit(datos[cat_cols_target_encoder], datos[LABEL])
datos[cat_cols_target_encoder] = target_encoder.transform(datos[cat_cols_target_encoder])

# The encoded columns keep their names but are numeric features from here on.
num_cols_target_encoder = list(cat_cols_target_encoder)
for col in cat_cols_target_encoder:
    cat_cols.remove(col)
num_cols.extend(cat_cols_target_encoder)

In [51]:
# ===========================================> Save target encoder <================================================

# Context manager guarantees the file is flushed and closed, even if pickling fails.
with open("../../encoders/train/target.pkl", 'wb') as pkl_file:
    pickle.dump(target_encoder, pkl_file)

In [52]:
# ===========================================> Save target encoder columns <========================================

# Context managers guarantee each file is flushed and closed, even if pickling fails.
with open("../../columns/train/cat_target_encoder.pkl", 'wb') as pkl_file:
    pickle.dump(cat_cols_target_encoder, pkl_file)

with open("../../columns/train/num_target_encoder.pkl", 'wb') as pkl_file:
    pickle.dump(num_cols_target_encoder, pkl_file)

In [53]:
# ===========================================> Apply one-hot encoder <==============================================

# Categorical columns with at most this many distinct values are one-hot encoded.
cat_cols_onehot_encoder_max = 5

cat_cols_onehot_encoder = [
    col for col in cat_cols if datos[col].nunique() <= cat_cols_onehot_encoder_max
]

# Categories unseen at fit time encode as all zeros.
# (Original note: infrequent_if_exist - presumably an alternative handle_unknown setting that was considered.)
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
onehot_encoder = onehot_encoder.fit(datos[cat_cols_onehot_encoder])

# transform returns a sparse matrix; densify it so it can be stored as regular columns.
onehot_datos = onehot_encoder.transform(datos[cat_cols_onehot_encoder]).toarray()
num_cols_onehot_encoder = onehot_encoder.get_feature_names_out(cat_cols_onehot_encoder)

# Build the dummy frame on datos' own index: DataFrame assignment aligns on index labels,
# so a default RangeIndex would silently produce NaN/misaligned rows whenever datos has
# been filtered or re-indexed before this cell.
onehot_frame = pd.DataFrame(onehot_datos, columns=num_cols_onehot_encoder, index=datos.index)
datos[num_cols_onehot_encoder] = onehot_frame

# The source columns are fully replaced by their dummy columns.
datos.drop(columns=cat_cols_onehot_encoder, inplace=True)

for col in cat_cols_onehot_encoder:
    cat_cols.remove(col)

# The dummy columns are numeric features from here on.
num_cols.extend(num_cols_onehot_encoder)

In [54]:
# ===========================================> Save one-hot encoder <===============================================

# Context manager guarantees the file is flushed and closed, even if pickling fails.
with open("../../encoders/train/onehot.pkl", 'wb') as pkl_file:
    pickle.dump(onehot_encoder, pkl_file)

In [55]:
# ===========================================> Save one-hot encoder columns <=======================================

# Context managers guarantee each file is flushed and closed, even if pickling fails.
with open("../../columns/train/cat_onehot_encoder.pkl", 'wb') as pkl_file:
    pickle.dump(cat_cols_onehot_encoder, pkl_file)

with open("../../columns/train/num_onehot_encoder.pkl", 'wb') as pkl_file:
    pickle.dump(num_cols_onehot_encoder, pkl_file)

In [56]:
# ===========================================> Impute nulls in numeric columns <====================================

# Missing numeric values are replaced by the mean of each column.
num_simple_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit and apply in one step; the fitted imputer stays bound to num_simple_imputer.
datos[num_cols] = num_simple_imputer.fit_transform(datos[num_cols])

In [57]:
# ===========================================> Save SimpleImputer for numeric columns <=============================

# Context manager guarantees the file is flushed and closed, even if pickling fails.
with open("../../imputers/train/num_simple.pkl", 'wb') as pkl_file:
    pickle.dump(num_simple_imputer, pkl_file)

In [58]:
# ===========================================> Compute the interquartile range <====================================

# Per-column first and third quartiles of the numeric features.
first_quartile = datos[num_cols].quantile(0.25)
third_quartile = datos[num_cols].quantile(0.75)

intercuartilico = {
    "Q1": first_quartile,
    "Q3": third_quartile,
    "IQR": third_quartile - first_quartile,
}

In [59]:
# ===========================================> Save interquartile statistics <======================================

# Context manager guarantees the file is flushed and closed, even if pickling fails.
with open("../../interquartiles/train/intercuartilico.pkl", 'wb') as pkl_file:
    pickle.dump(intercuartilico, pkl_file)

In [60]:
# ===========================================> Outlier treatment <==================================================

# Disabled: this cell currently performs no work. The statement below would keep only the rows
# whose numeric values all lie within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
# NOTE(review): as written it assigns the row-filtered frame back into datos[num_cols]; pandas
# aligns on index, so filtered rows would presumably become NaN instead of being removed from
# datos - confirm before re-enabling.
#datos[num_cols] = datos[num_cols][~((datos[num_cols] < (intercuartilico["Q1"] - 1.5 * intercuartilico["IQR"])) |(datos[num_cols] > (intercuartilico["Q3"] + 1.5 * intercuartilico["IQR"]))).any(axis=1)]

In [61]:
# ===========================================> Drop columns via correlation matrix <================================

# Absolute pairwise correlation between numeric features.
corr_matrix = datos[num_cols].corr().abs()

# Keep only the strict upper triangle so every pair is inspected once and the diagonal is ignored.
upper_triangle_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_)
upper = corr_matrix.where(upper_triangle_mask)

# A column is dropped when it correlates above 0.95 with any column that precedes it.
is_redundant = (upper > 0.95).any(axis=0)
drop_cols_corr = is_redundant[is_redundant].index.tolist()

datos.drop(columns=drop_cols_corr, inplace=True)

for col in drop_cols_corr:
    num_cols.remove(col)

In [62]:
# ===========================================> Save columns dropped by correlation <================================

# Context manager guarantees the file is flushed and closed, even if pickling fails.
with open("../../columns/train/num_correlacion.pkl", 'wb') as pkl_file:
    pickle.dump(drop_cols_corr, pkl_file)

In [63]:
# ===========================================> Save processed categorical columns <=================================

# Context manager guarantees the file is flushed and closed, even if pickling fails.
with open("../../columns/train/cat_pro.pkl", 'wb') as pkl_file:
    pickle.dump(cat_cols, pkl_file)

In [64]:
# ===========================================> Save processed numeric columns <=====================================

# Context manager guarantees the file is flushed and closed, even if pickling fails.
with open("../../columns/train/num_pro.pkl", 'wb') as pkl_file:
    pickle.dump(num_cols, pkl_file)

In [65]:
# ===========================================> Order columns <======================================================

# Final layout: identifier, categorical features, numeric features, label.
ordered_columns = [IDENTIFIER, *cat_cols, *num_cols, LABEL]
datos = datos[ordered_columns]

In [66]:
# ===========================================> Save the preprocessed data <=========================================

# index=False keeps the row index out of the output file.
datos.to_csv(path_or_buf='../../data/processed/train.csv', index=False)