In [142]:
# ===========================================> Carga librerías <====================================================

import pandas as pd
import numpy as np
import pickle

# Transformación de datos
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import category_encoders as ce

# Modelos
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Seleccion de variables y tuning de hiperparámetros
from sklearn.feature_selection import RFECV, RFE
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Métricas para evaluar un modelo de clasificación
from sklearn.metrics import classification_report, precision_recall_curve, auc, roc_curve, roc_auc_score, average_precision_score, confusion_matrix

# Librerías para visualización de resultados
import matplotlib.pyplot as plt
import seaborn as sns

# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocesado y modelado
# ------------------------------------------------------------------------------
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz, export_text
from sklearn.model_selection import GridSearchCV
#from sklearn.metrics import accuracy_score, confusion_matrix, auc, plot_roc_curve, roc_curve, classification_report
from sklearn.metrics import accuracy_score, confusion_matrix, auc, RocCurveDisplay , roc_curve, classification_report

# Para que no se corten el listado de filas y columnas al ejecutar instrucciones
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Libs
import data_type

In [143]:
# ===========================================> Carga Datos <========================================================

datos = pd.read_csv('../../data/raw/train.csv', low_memory=False)
#datos = datos.sample(20000)

In [144]:
# ===========================================> Definir variables <==================================================

IDENTIFIER = "MachineIdentifier"
LABEL = "HasDetections"

cat_cols = []
num_cols = []

#datos[IDENTIFIER] = datos[IDENTIFIER].astype("category")
#datos[LABEL] = datos[LABEL].astype("float64")

In [145]:
# ===========================================> Eliminar valores nulos <=============================================

drop_cols_min_nulls = 0.5
#drop_cols_min_nulls = 0.5
drop_cols_nulls = []

for col in datos.columns:
    if col != IDENTIFIER and col != LABEL and datos[col].isna().sum() / len(datos) >= drop_cols_min_nulls:
        drop_cols_nulls.append(col)

datos.drop(columns=drop_cols_nulls, inplace=True)

print(drop_cols_nulls)

['DefaultBrowsersIdentifier', 'PuaMode', 'Census_ProcessorClass', 'Census_IsFlightingInternal']


In [146]:
# ===========================================> Eliminar valores mal balanceados <===================================

drop_cols_min_big_cat = 95
drop_cols_big_cat = []

for col in datos.columns:
    if col != IDENTIFIER and col != LABEL and datos[col].value_counts(normalize=True, dropna=False).values[0] * 100 >= drop_cols_min_big_cat:
        drop_cols_big_cat.append(col)

datos.drop(columns=drop_cols_big_cat, inplace=True)

print(drop_cols_big_cat)

['ProductName', 'IsBeta', 'RtpStateBitfield', 'IsSxsPassiveMode', 'AVProductsEnabled', 'HasTpm', 'Platform', 'OsVer', 'AutoSampleOptIn', 'Firewall', 'UacLuaenable', 'Census_DeviceFamily', 'Census_IsPortableOperatingSystem', 'Census_IsFlightsDisabled', 'Census_IsVirtualDevice', 'Census_IsPenCapable']


In [147]:
# ===========================================> Separar datos por tipos <============================================

columns = data_type.get_type_data(datos)

cat_cols = columns["cat"]
if IDENTIFIER in cat_cols: cat_cols.remove(IDENTIFIER)
if LABEL in cat_cols: cat_cols.remove(LABEL)

num_cols = columns["num"]
if IDENTIFIER in num_cols: num_cols.remove(IDENTIFIER)
if LABEL in num_cols: num_cols.remove(LABEL)

#datos[cat_cols] = datos[cat_cols].astype("category")
#datos[num_cols] = datos[num_cols].astype("float64")

pickle.dump(cat_cols, open("../../columns/cat.pkl", 'wb'))
pickle.dump(num_cols, open("../../columns/num.pkl", 'wb'))

print("cat", cat_cols)
print("num", num_cols)

cat ['EngineVersion', 'AppVersion', 'AvSigVersion', 'AVProductsInstalled', 'Processor', 'OsPlatformSubRelease', 'OsBuildLab', 'SkuEdition', 'IsProtected', 'SMode', 'SmartScreen', 'Census_MDC2FormFactor', 'Census_ProcessorManufacturerIdentifier', 'Census_PrimaryDiskTypeName', 'Census_HasOpticalDiskDrive', 'Census_ChassisTypeName', 'Census_PowerPlatformRoleName', 'Census_InternalBatteryType', 'Census_OSVersion', 'Census_OSArchitecture', 'Census_OSBranch', 'Census_OSEdition', 'Census_OSSkuName', 'Census_OSInstallTypeName', 'Census_OSWUAutoUpdateOptionsName', 'Census_GenuineStateName', 'Census_ActivationChannel', 'Census_FlightRing', 'Census_ThresholdOptIn', 'Census_IsSecureBootEnabled', 'Census_IsWIMBootEnabled', 'Census_IsTouchEnabled', 'Census_IsAlwaysOnAlwaysConnectedCapable', 'Wdft_IsGamer']
num ['AVProductStatesIdentifier', 'CountryIdentifier', 'CityIdentifier', 'OrganizationIdentifier', 'GeoNameIdentifier', 'LocaleEnglishNameIdentifier', 'OsBuild', 'OsSuite', 'IeVerIdentifier', 'Cen

In [148]:
# ===========================================> Imputar nulos para tipos categoricas <===============================

cat_simple_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

cat_simple_imputer = cat_simple_imputer.fit(datos[cat_cols])
datos[cat_cols] = cat_simple_imputer.transform(datos[cat_cols])

pickle.dump(cat_simple_imputer, open("../../imputers/cat_simple.pkl", 'wb'))

print(datos[cat_cols].isnull().sum())


EngineVersion                              0
AppVersion                                 0
AvSigVersion                               0
AVProductsInstalled                        0
Processor                                  0
OsPlatformSubRelease                       0
OsBuildLab                                 0
SkuEdition                                 0
IsProtected                                0
SMode                                      0
SmartScreen                                0
Census_MDC2FormFactor                      0
Census_ProcessorManufacturerIdentifier     0
Census_PrimaryDiskTypeName                 0
Census_HasOpticalDiskDrive                 0
Census_ChassisTypeName                     0
Census_PowerPlatformRoleName               0
Census_InternalBatteryType                 0
Census_OSVersion                           0
Census_OSArchitecture                      0
Census_OSBranch                            0
Census_OSEdition                           0
Census_OSS

In [149]:
# ===========================================> Procesar mascaras y versiones 3 partes <=============================

cat_cols_mask_3 = []

for col in cat_cols:
    if datos[col].notnull().all() and datos[col].astype(str).apply(lambda x: x.count('.') == 2).all():
        cat_cols_mask_3.append(col)

for col in cat_cols_mask_3:
    datos[[col + "_1", col + "_2", col + "_3"]] = datos[col].str.split(".", expand=True)

for col in cat_cols_mask_3:
    cat_cols.remove(col)
    cat_cols.append(col + "_1")
    cat_cols.append(col + "_2")
    cat_cols.append(col + "_3")

datos.drop(columns=cat_cols_mask_3, inplace=True)

pickle.dump(cat_cols_mask_3, open("../../columns/cat_mask_3.pkl", 'wb'))

print(cat_cols_mask_3)

[]


In [150]:
# ===========================================> Procesar mascaras y versiones 4 partes <=============================

cat_cols_mask_4 = []

for col in cat_cols:
    if datos[col].notnull().all() and datos[col].astype(str).apply(lambda x: x.count('.') == 3).all():
        cat_cols_mask_4.append(col)

for col in cat_cols_mask_4:
    datos[[col + "_1", col + "_2", col + "_3", col + "_4"]] = datos[col].str.split(".", expand=True)

for col in cat_cols_mask_4:
    cat_cols.remove(col)
    cat_cols.append(col + "_1")
    cat_cols.append(col + "_2")
    cat_cols.append(col + "_3")
    cat_cols.append(col + "_4")

datos.drop(columns=cat_cols_mask_4, inplace=True)

pickle.dump(cat_cols_mask_4, open("../../columns/cat_mask_4.pkl", 'wb'))

print(cat_cols_mask_4)

['EngineVersion', 'AppVersion', 'AvSigVersion', 'Census_OSVersion']


In [151]:
# ===========================================> Procesar mascaras y versiones 5 partes <=============================

cat_cols_mask_5 = []

for col in cat_cols:
    if datos[col].notnull().all() and datos[col].astype(str).apply(lambda x: x.count('.') == 4).all():
        cat_cols_mask_5.append(col)

for col in cat_cols_mask_5:
    datos[[col + "_1", col + "_2", col + "_3", col + "_4", col + "_5"]] = datos[col].str.split(".", expand=True)

for col in cat_cols_mask_5:
    cat_cols.remove(col)
    cat_cols.append(col + "_1")
    cat_cols.append(col + "_2")
    cat_cols.append(col + "_3")
    cat_cols.append(col + "_4")
    cat_cols.append(col + "_5")

datos.drop(columns=cat_cols_mask_5, inplace=True)

pickle.dump(cat_cols_mask_5, open("../../columns/cat_mask_5.pkl", 'wb'))

print(cat_cols_mask_5)

['OsBuildLab']


In [152]:
# ===========================================> Procesar mascaras y versiones 6 partes <=============================

cat_cols_mask_6 = []

for col in cat_cols:
    if datos[col].notnull().all() and datos[col].astype(str).apply(lambda x: x.count('.') == 5).all():
        cat_cols_mask_6.append(col)

for col in cat_cols_mask_6:
    datos[[col + "_1", col + "_2", col + "_3", col + "_4", col + "_5", col + "_6"]] = datos[col].str.split(".", expand=True)

for col in cat_cols_mask_6:
    cat_cols.remove(col)
    cat_cols.append(col + "_1")
    cat_cols.append(col + "_2")
    cat_cols.append(col + "_3")
    cat_cols.append(col + "_4")
    cat_cols.append(col + "_5")
    cat_cols.append(col + "_6")

datos.drop(columns=cat_cols_mask_6, inplace=True)

pickle.dump(cat_cols_mask_6, open("../../columns/cat_mask_6.pkl", 'wb'))

print(cat_cols_mask_6)

[]


In [153]:
# ===========================================> Procesar target encoder <============================================

cat_cols_target_encoder_min = 5
cat_cols_target_encoder = []

target_encoder = ce.TargetEncoder(handle_unknown='ignore')

for col in cat_cols:
    if datos[col].nunique() > cat_cols_target_encoder_min:
        cat_cols_target_encoder.append(col)

target_encoder = target_encoder.fit(datos[cat_cols_target_encoder], datos[LABEL])

datos[cat_cols_target_encoder] = target_encoder.transform(datos[cat_cols_target_encoder])

for col in cat_cols_target_encoder:
    cat_cols.remove(col)
    num_cols.append(col)

pickle.dump(target_encoder, open("../../encoders/target.pkl", 'wb'))
pickle.dump(cat_cols_target_encoder, open("../../columns/cat_target_encoder.pkl", 'wb'))

print(cat_cols_target_encoder)

['AVProductsInstalled', 'OsPlatformSubRelease', 'SkuEdition', 'SmartScreen', 'Census_MDC2FormFactor', 'Census_ChassisTypeName', 'Census_PowerPlatformRoleName', 'Census_InternalBatteryType', 'Census_OSBranch', 'Census_OSEdition', 'Census_OSSkuName', 'Census_OSInstallTypeName', 'Census_OSWUAutoUpdateOptionsName', 'Census_ActivationChannel', 'Census_FlightRing', 'EngineVersion_3', 'EngineVersion_4', 'AppVersion_2', 'AppVersion_3', 'AppVersion_4', 'AvSigVersion_2', 'AvSigVersion_3', 'Census_OSVersion_3', 'Census_OSVersion_4', 'OsBuildLab_1', 'OsBuildLab_2', 'OsBuildLab_4', 'OsBuildLab_5']


In [154]:
# ===========================================> Procesar onehot encoder <============================================

cat_cols_onehot_encoder_max = 5
cat_cols_onehot_encoder = []
num_cols_onehot_encoder = []

onehot_encoder = OneHotEncoder(handle_unknown='ignore')

for col in cat_cols:
    if datos[col].nunique() <= cat_cols_onehot_encoder_max:
        cat_cols_onehot_encoder.append(col)

onehot_encoder = onehot_encoder.fit(datos[cat_cols_onehot_encoder])

onehot_datos = onehot_encoder.transform(datos[cat_cols_onehot_encoder]).toarray()
num_cols_onehot_encoder = onehot_encoder.get_feature_names_out(cat_cols_onehot_encoder)

datos[num_cols_onehot_encoder] = pd.DataFrame(onehot_datos, columns=num_cols_onehot_encoder)[num_cols_onehot_encoder]

datos.drop(columns=cat_cols_onehot_encoder, inplace=True)

for col in cat_cols_onehot_encoder:
    cat_cols.remove(col)

for col in num_cols_onehot_encoder:
    num_cols.append(col) 

pickle.dump(onehot_encoder, open("../../encoders/onehot.pkl", 'wb'))
pickle.dump(cat_cols_onehot_encoder, open("../../columns/cat_onehot_encoder.pkl", 'wb'))
pickle.dump(num_cols_onehot_encoder, open("../../columns/num_onehot_encoder.pkl", 'wb'))

print("cat", cat_cols_onehot_encoder) 
print("num", num_cols_onehot_encoder)

  datos[num_cols_onehot_encoder] = pd.DataFrame(onehot_datos, columns=num_cols_onehot_encoder)[num_cols_onehot_encoder]
  datos[num_cols_onehot_encoder] = pd.DataFrame(onehot_datos, columns=num_cols_onehot_encoder)[num_cols_onehot_encoder]
  datos[num_cols_onehot_encoder] = pd.DataFrame(onehot_datos, columns=num_cols_onehot_encoder)[num_cols_onehot_encoder]
  datos[num_cols_onehot_encoder] = pd.DataFrame(onehot_datos, columns=num_cols_onehot_encoder)[num_cols_onehot_encoder]
  datos[num_cols_onehot_encoder] = pd.DataFrame(onehot_datos, columns=num_cols_onehot_encoder)[num_cols_onehot_encoder]
  datos[num_cols_onehot_encoder] = pd.DataFrame(onehot_datos, columns=num_cols_onehot_encoder)[num_cols_onehot_encoder]
  datos[num_cols_onehot_encoder] = pd.DataFrame(onehot_datos, columns=num_cols_onehot_encoder)[num_cols_onehot_encoder]
  datos[num_cols_onehot_encoder] = pd.DataFrame(onehot_datos, columns=num_cols_onehot_encoder)[num_cols_onehot_encoder]


cat ['Processor', 'IsProtected', 'SMode', 'Census_ProcessorManufacturerIdentifier', 'Census_PrimaryDiskTypeName', 'Census_HasOpticalDiskDrive', 'Census_OSArchitecture', 'Census_GenuineStateName', 'Census_ThresholdOptIn', 'Census_IsSecureBootEnabled', 'Census_IsWIMBootEnabled', 'Census_IsTouchEnabled', 'Census_IsAlwaysOnAlwaysConnectedCapable', 'Wdft_IsGamer', 'EngineVersion_1', 'EngineVersion_2', 'AppVersion_1', 'AvSigVersion_1', 'AvSigVersion_4', 'Census_OSVersion_1', 'Census_OSVersion_2', 'OsBuildLab_3']
num ['Processor_arm64' 'Processor_x64' 'Processor_x86' 'IsProtected_0.0'
 'IsProtected_1.0' 'SMode_0.0' 'SMode_1.0'
 'Census_ProcessorManufacturerIdentifier_1.0'
 'Census_ProcessorManufacturerIdentifier_3.0'
 'Census_ProcessorManufacturerIdentifier_5.0'
 'Census_ProcessorManufacturerIdentifier_10.0'
 'Census_PrimaryDiskTypeName_HDD' 'Census_PrimaryDiskTypeName_SSD'
 'Census_PrimaryDiskTypeName_UNKNOWN'
 'Census_PrimaryDiskTypeName_Unspecified' 'Census_HasOpticalDiskDrive_0'
 'Census_

In [155]:
# ===========================================> Imputar nulos para tipos numericas <=================================

num_simple_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

num_simple_imputer = num_simple_imputer.fit(datos[num_cols])
datos[num_cols] = num_simple_imputer.transform(datos[num_cols])

pickle.dump(num_simple_imputer, open("../../imputers/num_simple.pkl", 'wb'))

print(datos[num_cols].isnull().sum())

AVProductStatesIdentifier                            0
CountryIdentifier                                    0
CityIdentifier                                       0
OrganizationIdentifier                               0
GeoNameIdentifier                                    0
LocaleEnglishNameIdentifier                          0
OsBuild                                              0
OsSuite                                              0
IeVerIdentifier                                      0
Census_OEMNameIdentifier                             0
Census_OEMModelIdentifier                            0
Census_ProcessorCoreCount                            0
Census_ProcessorModelIdentifier                      0
Census_PrimaryDiskTotalCapacity                      0
Census_SystemVolumeTotalCapacity                     0
Census_TotalPhysicalRAM                              0
Census_InternalPrimaryDiagonalDisplaySizeInInches    0
Census_InternalPrimaryDisplayResolutionHorizontal    0
Census_Int

In [156]:
# ===========================================> Calculamos el rango intercuartílico <================================

intercuartilico = {}

for col in num_cols:
    Q1 = datos[col].quantile(0.25)
    Q3 = datos[col].quantile(0.75)
    IQR = Q3 - Q1
    intercuartilico[col] = {
        'Q1': Q1, 
        'Q3': Q3, 
        'IQR': IQR,
        "lower": Q1 - 1.5 * IQR,
        "upper" : Q3 + 1.5 * IQR
    }

#intercuartilico["Q1"] = datos[num_cols].quantile(0.25)
#intercuartilico["Q3"] = datos[num_cols].quantile(0.75)
#intercuartilico["IQR"] = intercuartilico["Q3"] - intercuartilico["Q1"]

pickle.dump(intercuartilico, open("../../interquartiles/intercuartilico.pkl", 'wb'))

print(intercuartilico)

{'AVProductStatesIdentifier': {'Q1': 49480.0, 'Q3': 53447.0, 'IQR': 3967.0, 'lower': 43529.5, 'upper': 59397.5}, 'CountryIdentifier': {'Q1': 51.0, 'Q3': 162.0, 'IQR': 111.0, 'lower': -115.5, 'upper': 328.5}, 'CityIdentifier': {'Q1': 38026.0, 'Q3': 121270.0, 'IQR': 83244.0, 'lower': -86840.0, 'upper': 246136.0}, 'OrganizationIdentifier': {'Q1': 24.860810405364862, 'Q3': 27.0, 'IQR': 2.1391895946351376, 'lower': 21.652026013412154, 'upper': 30.208784391952705}, 'GeoNameIdentifier': {'Q1': 89.0, 'Q3': 267.0, 'IQR': 178.0, 'lower': -178.0, 'upper': 534.0}, 'LocaleEnglishNameIdentifier': {'Q1': 75.0, 'Q3': 182.0, 'IQR': 107.0, 'lower': -85.5, 'upper': 342.5}, 'OsBuild': {'Q1': 15063.0, 'Q3': 17134.0, 'IQR': 2071.0, 'lower': 11956.5, 'upper': 20240.5}, 'OsSuite': {'Q1': 256.0, 'Q3': 768.0, 'IQR': 512.0, 'lower': -512.0, 'upper': 1536.0}, 'IeVerIdentifier': {'Q1': 111.0, 'Q3': 137.0, 'IQR': 26.0, 'lower': 72.0, 'upper': 176.0}, 'Census_OEMNameIdentifier': {'Q1': 1443.0, 'Q3': 2668.0, 'IQR': 1

In [157]:
# ===========================================> Tratamiento de valores atípicos <====================================
# ===========================================> NO SE APLICA PORQUE BAJA MUCHO <====================================

#for col in num_cols:
#    datos[col] = datos[col].apply(lambda x: x if intercuartilico[col]["lower"] <= x <= intercuartilico[col]["upper"] else intercuartilico[col]["mean"])


#for col in num_cols:
#    datos[col] = datos[col].apply(lambda x: None if x < intercuartilico[col]["lower"] or x > intercuartilico[col]["upper"] else x)
    
#datos[num_cols] = datos[num_cols][~((datos[num_cols] < (intercuartilico["Q1"] - 1.5 * intercuartilico["IQR"])) |(datos[num_cols] > (intercuartilico["Q3"] + 1.5 * intercuartilico["IQR"]))).any(axis=1)]
#datos[num_cols] = datos[num_cols][~((datos[num_cols] < (intercuartilico["Q1"] - 1.5 * intercuartilico["IQR"])) |(datos[num_cols] > (intercuartilico["Q3"] + 1.5 * intercuartilico["IQR"])))].any(axis=1)

In [158]:
# ===========================================> Eliminar por Matriz de correlación <=================================

drop_cols_corr = []

corr_matrix = datos[num_cols].corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))

drop_cols_corr = [column for column in upper.columns if any(upper[column] > 0.95)]

datos.drop(columns=drop_cols_corr, inplace=True)

for col in drop_cols_corr:
    num_cols.remove(col)

pickle.dump(drop_cols_corr, open("../../columns/num_correlacion.pkl", 'wb'))

print(drop_cols_corr)

['Census_OSUILocaleIdentifier', 'Census_OSSkuName', 'AppVersion_3', 'AvSigVersion_2', 'OsBuildLab_1', 'OsBuildLab_5', 'Processor_x86', 'IsProtected_1.0', 'SMode_1.0', 'Census_ProcessorManufacturerIdentifier_5.0', 'Census_ProcessorManufacturerIdentifier_10.0', 'Census_HasOpticalDiskDrive_1', 'Census_OSArchitecture_amd64', 'Census_OSArchitecture_arm64', 'Census_OSArchitecture_x86', 'Census_ThresholdOptIn_1.0', 'Census_IsSecureBootEnabled_1', 'Census_IsTouchEnabled_1', 'Census_IsAlwaysOnAlwaysConnectedCapable_1.0', 'Wdft_IsGamer_1.0', 'AvSigVersion_1_1', 'Census_OSVersion_1_6', 'Census_OSVersion_2_0', 'Census_OSVersion_2_3', 'OsBuildLab_3_amd64fre', 'OsBuildLab_3_arm64fre', 'OsBuildLab_3_x86fre']


In [159]:
# ===========================================> Quitamos columnas no importantes RFECV <=============================
# ===========================================> No se usa porque baja mucho la puntuación <==========================

columnas_importantes_xgb_classiffier = ['AVProductStatesIdentifier' 'CountryIdentifier' 'CityIdentifier'
 'OrganizationIdentifier' 'GeoNameIdentifier'
 'LocaleEnglishNameIdentifier' 'OsBuild' 'OsSuite' 'IeVerIdentifier'
 'Census_OEMNameIdentifier' 'Census_OEMModelIdentifier'
 'Census_ProcessorCoreCount' 'Census_ProcessorModelIdentifier'
 'Census_PrimaryDiskTotalCapacity' 'Census_SystemVolumeTotalCapacity'
 'Census_TotalPhysicalRAM'
 'Census_InternalPrimaryDiagonalDisplaySizeInInches'
 'Census_InternalPrimaryDisplayResolutionHorizontal'
 'Census_InternalPrimaryDisplayResolutionVertical'
 'Census_InternalBatteryNumberOfCharges' 'Census_OSBuildNumber'
 'Census_OSBuildRevision' 'Census_OSInstallLanguageIdentifier'
 'Census_FirmwareManufacturerIdentifier'
 'Census_FirmwareVersionIdentifier' 'Wdft_RegionIdentifier'
 'AVProductsInstalled' 'OsPlatformSubRelease' 'SkuEdition' 'SmartScreen'
 'Census_MDC2FormFactor' 'Census_ChassisTypeName'
 'Census_PowerPlatformRoleName' 'Census_OSBranch' 'Census_OSEdition'
 'Census_OSInstallTypeName' 'Census_OSWUAutoUpdateOptionsName'
 'Census_ActivationChannel' 'EngineVersion_3' 'EngineVersion_4'
 'AppVersion_2' 'AppVersion_4' 'AvSigVersion_3' 'Census_OSVersion_3'
 'Census_OSVersion_4' 'OsBuildLab_2' 'OsBuildLab_4'
 'Census_PrimaryDiskTypeName_HDD' 'Census_PrimaryDiskTypeName_SSD'
 'Census_PrimaryDiskTypeName_UNKNOWN'
 'Census_PrimaryDiskTypeName_Unspecified' 'Census_ThresholdOptIn_0.0'
 'Census_IsSecureBootEnabled_0' 'Census_IsWIMBootEnabled_0.0'
 'Wdft_IsGamer_0.0' 'EngineVersion_1_1' 'EngineVersion_2_1'
 'OsBuildLab_3_amd64fre' 'OsBuildLab_3_arm64fre']

#for col in cat_cols:
#    if col not in columnas_importantes_xgb_classiffier:
#        cat_cols.remove(col)

#for col in num_cols:
#    if col not in columnas_importantes_xgb_classiffier:
#        num_cols.remove(col)       

In [160]:
# ===========================================> Ordenar Datos <======================================================

datos = datos[[IDENTIFIER]+cat_cols+num_cols+[LABEL]]

In [161]:
# ===========================================> Exploración valores y estadísticas <=================================

stats_2 = []
# Unique Values         col 
# Unique Values         datos[col].nunique() 
# Unique Values %       (datos[col].nunique() / datos[col].count()) * 100
# Missing Values %      datos[col].isnull().sum() * 100 / datos.shape[0]
# Biggest Category %    datos[col].value_counts(normalize=True, dropna=False).values[0] * 100
# Type                  datos[col].dtype

for col in datos.columns:
    stats_2.append((col, datos[col].nunique(), (datos[col].nunique() / datos[col].count()) * 100, datos[col].isnull().sum() * 100 / datos.shape[0], datos[col].value_counts(normalize=True, dropna=False).values[0] * 100, datos[col].dtype))
    
stats_2 = pd.DataFrame(stats_2, columns=['Feature', 'Unique Values', 'Unique Values %', 'Missing Values %', 'Biggest Category %', 'Type'])
stats_2.sort_values('Feature', ascending=False)

Unnamed: 0,Feature,Unique Values,Unique Values %,Missing Values %,Biggest Category %,Type
26,Wdft_RegionIdentifier,16,0.001793,0.0,20.19564,float64
70,Wdft_IsGamer_0.0,2,0.000224,0.0,72.669333,float64
30,SmartScreen,11,0.001233,0.0,83.959948,float64
29,SkuEdition,8,0.000897,0.0,61.801181,float64
53,SMode_0.0,2,0.000224,0.0,99.953707,float64
51,Processor_x64,2,0.000224,0.0,90.868443,float64
50,Processor_arm64,2,0.000224,0.0,99.996077,float64
8,OsSuite,10,0.001121,0.0,62.31679,float64
28,OsPlatformSubRelease,9,0.001009,0.0,43.944502,float64
49,OsBuildLab_4,41,0.004596,0.0,43.944502,float64


In [162]:
# ===========================================> Guardamos los datos preprocesados <==================================

pickle.dump(cat_cols, open("../../columns/cat_pro.pkl", 'wb'))
pickle.dump(num_cols, open("../../columns/num_pro.pkl", 'wb'))

datos.to_csv('../../data/processed/train.csv', index=False)