In [139]:
# ===========================================> Carga librerías <====================================================

import pandas as pd
import numpy as np
import pickle

# Transformación de datos
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import category_encoders as ce

# Modelos
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

# Seleccion de variables y tuning de hiperparámetros
from sklearn.feature_selection import RFECV, RFE
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Métricas para evaluar un modelo de clasificación
from sklearn.metrics import classification_report, precision_recall_curve, auc, roc_curve, roc_auc_score, average_precision_score, confusion_matrix

# Librerías para visualización de resultados
import matplotlib.pyplot as plt
import seaborn as sns

# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocesado y modelado
# ------------------------------------------------------------------------------
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz, export_text
from sklearn.model_selection import GridSearchCV
#from sklearn.metrics import accuracy_score, confusion_matrix, auc, plot_roc_curve, roc_curve, classification_report
from sklearn.metrics import accuracy_score, confusion_matrix, auc, RocCurveDisplay , roc_curve, classification_report

# Para que no se corten el listado de filas y columnas al ejecutar instrucciones
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [140]:
# ===========================================> Carga Datos <========================================================

datos = pd.read_csv('../data/test.csv', low_memory=False)

cat_cols = pickle.load(open("../eda/cat_cols_raw.pkl", 'rb'))
numeric_cols = pickle.load(open("../eda/numeric_cols_raw.pkl", 'rb'))

target_encoder_cat = pickle.load(open("../eda/target_encoder_cat.pkl", 'rb'))

intercuartilico = pickle.load(open("../eda/intercuartilico.pkl", 'rb'))
imp_cat = pickle.load(open("../eda/symple_imputer_cat.pkl", 'rb'))
imp_num = pickle.load(open("../eda/symple_imputer_num.pkl", 'rb'))

In [141]:
datos["HasDetections"] = 0

In [142]:
# ===========================================> Eliminar valores nulos <=============================================

drop_cols_min_nulls = 0.7
drop_cols_nulls = []

for col in datos.columns:
    if datos[col].isna().sum() / len(datos) >= drop_cols_min_nulls:
        drop_cols_nulls.append(col)

datos.drop(columns=drop_cols_nulls, inplace=True)

In [143]:
# ===========================================> Separar datos por tipos categoricas <================================

datos[cat_cols] = datos[cat_cols].astype("category")

In [144]:
# ===========================================> Separar datos por tipos numericas <==================================

datos[numeric_cols] = datos[numeric_cols].astype("float64")

In [145]:
# ===========================================> Imputar nulos para tipos categoricas <===============================

datos[cat_cols] = imp_cat.transform(datos[cat_cols])

In [146]:
# ===========================================> Procesar mascaras y versiones 3 partes <=============================

mask_cols_3 = []

for col in cat_cols:
    if datos[col].notnull().all() and datos[col].astype(str).apply(lambda x: x.count('.') == 2).all():
        mask_cols_3.append(col)

for col in mask_cols_3:
    datos[[col + "_1", col + "_2", col + "_3"]] = datos[col].str.split(".", expand=True)

for col in mask_cols_3:
    cat_cols.remove(col)
    cat_cols.append(col + "_1")
    cat_cols.append(col + "_2")
    cat_cols.append(col + "_3")

datos.drop(columns=mask_cols_3, inplace=True)

In [147]:
# ===========================================> Procesar mascaras y versiones 4 partes <=============================

mask_cols_4 = []

for col in cat_cols:
    if datos[col].notnull().all() and datos[col].astype(str).apply(lambda x: x.count('.') == 3).all():
        mask_cols_4.append(col)

for col in mask_cols_4:
    datos[[col + "_1", col + "_2", col + "_3", col + "_4"]] = datos[col].str.split(".", expand=True)

for col in mask_cols_4:
    cat_cols.remove(col)
    cat_cols.append(col + "_1")
    cat_cols.append(col + "_2")
    cat_cols.append(col + "_3")
    cat_cols.append(col + "_4")

datos.drop(columns=mask_cols_4, inplace=True)

mask_cols_4

['EngineVersion', 'AppVersion', 'AvSigVersion', 'OsVer', 'Census_OSVersion']

In [149]:
# ===========================================> Procesar mascaras y versiones 5 partes <=============================

mask_cols_5 = []

for col in cat_cols:
    if datos[col].notnull().all() and datos[col].astype(str).apply(lambda x: x.count('.') == 4).all():
        mask_cols_5.append(col)

for col in mask_cols_5:
    datos[[col + "_1", col + "_2", col + "_3", col + "_4", col + "_5"]] = datos[col].str.split(".", expand=True)

for col in mask_cols_5:
    cat_cols.remove(col)
    cat_cols.append(col + "_1")
    cat_cols.append(col + "_2")
    cat_cols.append(col + "_3")
    cat_cols.append(col + "_4")
    cat_cols.append(col + "_5")

datos.drop(columns=mask_cols_5, inplace=True)

In [150]:
# ===========================================> Procesar mascaras y versiones 6 partes <=============================

mask_cols_6 = []

for col in cat_cols:
    if datos[col].notnull().all() and datos[col].astype(str).apply(lambda x: x.count('.') == 5).all():
        mask_cols_6.append(col)

for c in mask_cols_6:
    datos[[col + "_1", col + "_2", col + "_3", col + "_4", col + "_5", col + "_6"]] = datos[col].str.split(".", expand=True)

for col in mask_cols_6:
    cat_cols.remove(col)
    cat_cols.append(col + "_1")
    cat_cols.append(col + "_2")
    cat_cols.append(col + "_3")
    cat_cols.append(col + "_4")
    cat_cols.append(col + "_5")
    cat_cols.append(col + "_6")

datos.drop(columns=mask_cols_6, inplace=True)

In [151]:
# ===========================================> Procesar target encoder <============================================

cat_cols_target_encoder_min = 5
cat_cols_target_encoder = []

for col in cat_cols:
    if datos[col].nunique() > cat_cols_target_encoder_min:
        cat_cols_target_encoder.append(col)

for col in cat_cols_target_encoder:
    cat_cols.remove(col)

encoded_data_cat_target_encoder = target_encoder_cat.transform(datos[cat_cols_target_encoder])
encoded_data_cat_target_encoder_test = target_encoder_cat.transform(X_test[cat_cols_target_encoder])

datos = pd.concat([datos.drop(columns=cat_cols_target_encoder), encoded_data_cat_target_encoder], axis=1)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- AppVersion_2
- AppVersion_3
- AppVersion_4
- AvSigVersion_2
- AvSigVersion_3
- ...
Feature names seen at fit time, yet now missing:
- AppVersion
- AutoSampleOptIn
- AvSigVersion
- Census_DeviceFamily
- Census_GenuineStateName
- ...


In [None]:
# ===========================================> Procesar onehot encoder <============================================

cat_cols_onehot_encoder_max = 5
cat_cols_onehot_encoder = []

onehot_encoder = OneHotEncoder(handle_unknown='ignore')

for col in cat_cols:
    if datos[col].nunique() <= cat_cols_onehot_encoder_max:
        cat_cols_onehot_encoder.append(col)

for col in cat_cols_onehot_encoder:
    cat_cols.remove(col)

encoded_cols_onehot_encoder = onehot_encoder.fit_transform(datos[cat_cols_onehot_encoder]).toarray()
encoded_data_cat_cols_onehot_encoder = pd.DataFrame(encoded_cols_onehot_encoder, columns=onehot_encoder.get_feature_names_out(cat_cols_onehot_encoder))

datos = pd.concat([datos.drop(columns=cat_cols_onehot_encoder), encoded_data_cat_cols_onehot_encoder], axis=1)

In [None]:
# ===========================================> Reiniciar Separar datos por tipos categoricas <======================

cat_cols = datos.select_dtypes(include=['object', 'category']).columns.to_list()

datos[cat_cols] = datos[cat_cols].astype("category")

In [None]:
# ===========================================> Reiniciar Separar datos por tipos numericas <========================

numeric_cols = datos.select_dtypes(include=['int16', 'int32', 'int64', 'float16', 'float32', 'float64']).columns.to_list()

datos[numeric_cols] = datos[numeric_cols].astype("float64")

In [None]:
# ===========================================> Reiniciar datos numericas a categoricas <============================

numeric_cols_cat_max = 1
numeric_cols_cat = []

for col in numeric_cols:
    if (datos[col].nunique() / datos[col].count()) * 100 <= numeric_cols_cat_max:
        numeric_cols_cat.append(col) 

for col in numeric_cols_cat:
    cat_cols.append(col)
    numeric_cols.remove(col)

datos[cat_cols] = datos[cat_cols].astype("category")

In [None]:
# ===========================================> Tratamiento de valores atípicos <====================================

datos[numeric_cols] = datos[numeric_cols][~((datos[numeric_cols] < (intercuartilico["Q1"] - 1.5 * intercuartilico["IQR"])) |(datos[numeric_cols] > (intercuartilico["Q3"] + 1.5 * intercuartilico["IQR"]))).any(axis=1)]

In [None]:
# ===========================================> Eliminar por Matriz de correlación <=================================

drop_cols_corr = []

corr_matrix = datos[numeric_cols].corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))

drop_cols_corr = [column for column in upper.columns if any(upper[column] > 0.95)]

datos.drop(columns=drop_cols_corr, inplace=True)

for col in drop_cols_corr:
    numeric_cols.remove(col)

In [None]:
# ===========================================> Imputar nulos para tipos numericas <=================================

datos[numeric_cols] = imp_num.transform(datos[numeric_cols])

In [None]:
# ===========================================> Reiniciar Separar datos por tipos categoricas <======================

cat_cols = datos.select_dtypes(include=['object', 'category']).columns.to_list()
cat_cols.remove('MachineIdentifier')

datos[cat_cols] = datos[cat_cols].astype("category")

In [None]:
# ===========================================> Reiniciar Separar datos por tipos numericas <========================

numeric_cols = datos.select_dtypes(include=['int16', 'int32', 'int64', 'float16', 'float32', 'float64']).columns.to_list()

datos[numeric_cols] = datos[numeric_cols].astype("float64")

In [None]:
# ===========================================> Reiniciar datos numericas a categoricas <============================

numeric_cols_cat_max = 1
numeric_cols_cat = []

for col in numeric_cols:
    if (datos[col].nunique() / datos[col].count()) * 100 <= numeric_cols_cat_max:
        numeric_cols_cat.append(col) 

for col in numeric_cols_cat:
    cat_cols.append(col)
    numeric_cols.remove(col)

datos[cat_cols] = datos[cat_cols].astype("category")

In [None]:
# ===========================================> Ordenar variables <==================================================

ordered_columns = cat_cols+numeric_cols
datos = datos[['MachineIdentifier']ordered_columns+['HasDetections']]

In [None]:
# ===========================================> Guardamos los datos preprocesados <==================================

datos.to_csv('../data/test_pre.csv', index=False)