In [None]:
# ===========================================> Carga librerías <====================================================

import pandas as pd
import numpy as np
import pickle

# Transformación de datos
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import category_encoders as ce

# Modelos
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

# Seleccion de variables y tuning de hiperparámetros
from sklearn.feature_selection import RFECV, RFE
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Métricas para evaluar un modelo de clasificación
from sklearn.metrics import classification_report, precision_recall_curve, auc, roc_curve, roc_auc_score, average_precision_score, confusion_matrix

# Librerías para visualización de resultados
import matplotlib.pyplot as plt
import seaborn as sns

# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocesado y modelado
# ------------------------------------------------------------------------------
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz, export_text
from sklearn.model_selection import GridSearchCV
#from sklearn.metrics import accuracy_score, confusion_matrix, auc, plot_roc_curve, roc_curve, classification_report
from sklearn.metrics import accuracy_score, confusion_matrix, auc, RocCurveDisplay , roc_curve, classification_report

# Do not truncate the list of rows and columns when displaying results
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [None]:
# ===========================================> Load data <==========================================================

# Development sample: 20,000 rows taken from the train file
datos = pd.read_csv(
    '../data/dev.csv',
    low_memory=False,
)

In [None]:
# ===========================================> Exploration: number of columns <=====================================

# Second entry of the (rows, columns) shape tuple
datos.shape[1]

In [None]:
# ===========================================> Exploration: basic statistics of numeric columns <===================

# Summary statistics using describe()'s default column selection
datos.describe()

In [None]:
# ===========================================> Exploration: basic statistics of object columns <====================

# Summary statistics restricted to object-dtype columns
datos.describe(include=object)

In [None]:
# ===========================================> Exploration: first 20 rows <==========================================

# Display the first 20 rows of the frame
datos.head(20)

In [None]:
# ===========================================> Exploration: column types <===========================================

# Print the per-column summary produced by DataFrame.info()
datos.info()

In [None]:
# ===========================================> Exploration: null values <============================================

# Null count per column, columns with the most nulls first
datos.isna().sum().sort_values(ascending=False)

In [195]:
# ===========================================> Exploration: values and statistics <==================================

# One record per column of `datos`; the fields of each record are, in order:
#   Feature               column name
#   Unique Values         number of distinct non-null values
#   Unique Values %       distinct values relative to the non-null count, times 100
#   Missing Values %      nulls relative to the total number of rows, times 100
#   Biggest Category %    share of the most frequent value (nulls included), times 100
#   Type                  column dtype
stats_1 = []

for name, serie in datos.items():
    n_unique = serie.nunique()
    stats_1.append((
        name,
        n_unique,
        (n_unique / serie.count()) * 100,
        serie.isnull().sum() * 100 / datos.shape[0],
        serie.value_counts(normalize=True, dropna=False).values[0] * 100,
        serie.dtype,
    ))

stats = pd.DataFrame(
    stats_1,
    columns=['Feature', 'Unique Values', 'Unique Values %', 'Missing Values %', 'Biggest Category %', 'Type'],
)
stats.sort_values('Unique Values %', ascending=False)

Unnamed: 0,Feature,Unique Values,Unique Values %,Missing Values %,Biggest Category %,Type
0,MachineIdentifier,20000,100.0,0.0,0.005,category
13,Census_SystemVolumeTotalCapacity,12171,60.855,0.0,1.26,category
10,Census_OEMModelIdentifier,6160,30.8,0.0,5.08,category
3,CityIdentifier,5826,29.13,0.0,4.94,category
25,Census_FirmwareVersionIdentifier,5403,27.015,0.0,2.765,category
32,AvSigVersion_3,1373,6.865,0.0,1.26,category
11,Census_ProcessorModelIdentifier,1197,5.985,0.0,3.64,category
1,AVProductStatesIdentifier,808,4.04,0.0,65.955,category
17,Census_InternalBatteryNumberOfCharges,698,3.49,0.0,59.38,category
9,Census_OEMNameIdentifier,399,1.995,0.0,15.975,category


# ===========================================> LIMPIEZA INICIAL <===================================================

In [None]:
# ===========================================> Drop columns that are mostly null <===================================

# Minimum fraction of null rows for a column to be dropped
drop_cols_min_nulls = 0.7

# Fraction of null rows per column, computed for all columns at once
null_ratio = datos.isna().sum() / len(datos)
drop_cols_nulls = null_ratio[null_ratio >= drop_cols_min_nulls].index.to_list()

datos.drop(columns=drop_cols_nulls, inplace=True)

drop_cols_nulls

In [None]:
# ===========================================> Drop identifier columns <=============================================

# The list is empty here, so the drop below removes nothing;
# add column names to this list to have them dropped.
drop_cols_id = []

datos.drop(columns=drop_cols_id, inplace=True)

drop_cols_id

# ===========================================> SEPARAR DATOS POR TIPOS <============================================


In [None]:
# ===========================================> Split columns by type: categorical <=================================

# Object and category columns are the categorical predictors
cat_cols = list(datos.select_dtypes(include=['object', 'category']).columns)

# Cast each of them to the pandas category dtype
for name in cat_cols:
    datos[name] = datos[name].astype("category")

cat_cols

In [None]:
# ===========================================> Split columns by type: numeric <=====================================

numeric_cols = list(
    datos.select_dtypes(
        include=['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    ).columns
)
# The target column is selected as numeric too, but it is not a predictor
del numeric_cols[numeric_cols.index('HasDetections')]

# Use a single float dtype for every numeric predictor
for name in numeric_cols:
    datos[name] = datos[name].astype("float64")

numeric_cols

In [None]:
# ===========================================> Move numeric columns to categorical <=================================

# Threshold, in percent, on distinct values relative to the non-null count
numeric_cols_cat_min = 1

# Numeric columns at or above the threshold are treated as categorical from here on
numeric_cols_cat = [
    name for name in numeric_cols
    if (datos[name].nunique() / datos[name].count()) * 100 >= numeric_cols_cat_min
]

for name in numeric_cols_cat:
    datos[name] = datos[name].astype("category")
    numeric_cols.remove(name)

cat_cols.extend(numeric_cols_cat)

numeric_cols_cat

# ===========================================> CATEGORICAS <========================================================

In [None]:
# ===========================================> Impute nulls in categorical columns <=================================

# Replace missing values with the most frequent value of each column
imp_cat = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
datos[cat_cols] = imp_cat.fit_transform(datos[cat_cols])

# Remaining null count per column
datos.isnull().sum()

In [None]:
# ===========================================> Process masks and versions with 3 parts <=============================

# Categorical columns whose values all look like "a.b.c" (exactly two dots)
mask_cols_3 = []

for col in cat_cols:
    # Keep the column only if it has no nulls and every value contains exactly two dots
    if datos[col].notnull().all() and datos[col].astype(str).apply(lambda x: x.count('.') == 2).all():
        # Append the loop variable itself; no other name is bound in this cell
        mask_cols_3.append(col)

# Split each selected column into one new column per dot-separated part
for col in mask_cols_3:
    datos[[col + "_1", col + "_2", col + "_3"]] = datos[col].str.split(".", expand=True)

# Replace the original name by the names of its parts in the categorical list
for col in mask_cols_3:
    cat_cols.remove(col)
    cat_cols.append(col + "_1")
    cat_cols.append(col + "_2")
    cat_cols.append(col + "_3")

# The unsplit columns are no longer needed
datos.drop(columns=mask_cols_3, inplace=True)

mask_cols_3

In [None]:
# ===========================================> Process masks and versions with 4 parts <=============================

# Categorical columns with no nulls whose values all contain exactly three dots
mask_cols_4 = [
    name for name in cat_cols
    if datos[name].notnull().all()
    and datos[name].astype(str).map(lambda texto: texto.count('.') == 3).all()
]

for name in mask_cols_4:
    part_names = [f"{name}_{i}" for i in range(1, 5)]
    # One new column per dot-separated part
    datos[part_names] = datos[name].str.split(".", expand=True)
    # The parts take the place of the original name in the categorical list
    cat_cols.remove(name)
    cat_cols.extend(part_names)

datos.drop(columns=mask_cols_4, inplace=True)

mask_cols_4

In [None]:
# ===========================================> Process masks and versions with 5 parts <=============================

# Categorical columns with no nulls whose values all contain exactly four dots
mask_cols_5 = [
    name for name in cat_cols
    if datos[name].notnull().all()
    and datos[name].astype(str).map(lambda texto: texto.count('.') == 4).all()
]

for name in mask_cols_5:
    part_names = [f"{name}_{i}" for i in range(1, 6)]
    # One new column per dot-separated part
    datos[part_names] = datos[name].str.split(".", expand=True)
    # The parts take the place of the original name in the categorical list
    cat_cols.remove(name)
    cat_cols.extend(part_names)

datos.drop(columns=mask_cols_5, inplace=True)

mask_cols_5

In [None]:
# ===========================================> Process masks and versions with 6 parts <=============================

# Categorical columns whose values all contain exactly five dots (six parts)
mask_cols_6 = []

for col in cat_cols:
    # Keep the column only if it has no nulls and every value contains exactly five dots
    if datos[col].notnull().all() and datos[col].astype(str).apply(lambda x: x.count('.') == 5).all():
        mask_cols_6.append(col)

# Split each selected column into one new column per part.
# The new names are built from the same variable that selects the column being split,
# so they match the names appended to cat_cols below.
for col in mask_cols_6:
    datos[[col + "_1", col + "_2", col + "_3", col + "_4", col + "_5", col + "_6"]] = datos[col].str.split(".", expand=True)

# Replace the original name by the names of its parts in the categorical list
for col in mask_cols_6:
    cat_cols.remove(col)
    cat_cols.append(col + "_1")
    cat_cols.append(col + "_2")
    cat_cols.append(col + "_3")
    cat_cols.append(col + "_4")
    cat_cols.append(col + "_5")
    cat_cols.append(col + "_6")

# The unsplit columns are no longer needed
datos.drop(columns=mask_cols_6, inplace=True)

mask_cols_6

In [None]:
# ===========================================> Apply target encoder <================================================

# Categorical columns with more distinct values than this are target-encoded
cat_cols_target_encoder_max = 5

cat_cols_target_encoder = [
    col for col in cat_cols
    if datos[col].nunique() > cat_cols_target_encoder_max
]
# Remaining categorical columns; kept under this name because the cell used to display it
cat_cols_onehot = [col for col in cat_cols if col not in cat_cols_target_encoder]

target_encoder = ce.TargetEncoder()

# NOTE(review): the encoder is fitted on all rows of `datos`, before the train/test split
# performed later in the notebook, so target information from the future test rows is used
# here. Confirm whether this leakage is acceptable for the development experiments.
if cat_cols_target_encoder:  # nothing to encode when no column exceeds the threshold
    datos[cat_cols_target_encoder] = target_encoder.fit_transform(
        datos[cat_cols_target_encoder], datos['HasDetections']
    )

cat_cols_target_encoder

In [None]:
# ===========================================> Recompute the categorical column list <==============================

# Whatever is still object/category after the previous steps is categorical
cat_cols = list(datos.select_dtypes(include=['object', 'category']).columns)

# Cast all of them to the category dtype in a single call
datos = datos.astype({name: "category" for name in cat_cols})

cat_cols

In [None]:
# ===========================================> Recompute the numeric column list <==================================

numeric_cols = list(
    datos.select_dtypes(
        include=['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    ).columns
)
# The target column is selected as numeric too, but it is not a predictor
del numeric_cols[numeric_cols.index('HasDetections')]

numeric_cols

# ===========================================> NUMERICAS <==========================================================

In [None]:
# ===========================================> Compute the interquartile range <====================================

# Per-column first and third quartiles of the numeric predictors
intercuartilico = {
    "Q1": datos[numeric_cols].quantile(0.25),
    "Q3": datos[numeric_cols].quantile(0.75),
}
intercuartilico["IQR"] = intercuartilico["Q3"] - intercuartilico["Q1"]

# Number of values per column lying more than 1.5 * IQR outside the quartiles
(
    datos[numeric_cols].lt(intercuartilico["Q1"] - 1.5 * intercuartilico["IQR"])
    | datos[numeric_cols].gt(intercuartilico["Q3"] + 1.5 * intercuartilico["IQR"])
).sum()

In [None]:
# ===========================================> Outlier treatment <===================================================

# Bounds beyond which a value counts as an outlier (1.5 * IQR rule, per column)
lower_bound = intercuartilico["Q1"] - 1.5 * intercuartilico["IQR"]
upper_bound = intercuartilico["Q3"] + 1.5 * intercuartilico["IQR"]
is_outlier = (datos[numeric_cols] < lower_bound) | (datos[numeric_cols] > upper_bound)

# Blank out only the outlying cells. Assigning back a row-filtered frame (rows without any
# outlier) is realigned on the index and turns EVERY numeric value of a row into NaN as soon
# as a single column is out of range, discarding valid data in the other columns.
# NOTE(review): confirm that per-cell masking is the intended treatment.
datos[numeric_cols] = datos[numeric_cols].mask(is_outlier)

# Check: outliers left per column with respect to the same bounds
((datos[numeric_cols] < lower_bound) | (datos[numeric_cols] > upper_bound)).sum()

In [None]:
# ===========================================> Drop columns using the correlation matrix <==========================

# Absolute pairwise correlations of the numeric predictors
corr_matrix = datos[numeric_cols].corr().abs()
# Keep only the strict upper triangle so every pair is looked at once
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# A column is dropped when it correlates above 0.95 with any column placed before it
highly_correlated = (upper > 0.95).any()
drop_cols_corr = highly_correlated[highly_correlated].index.to_list()

datos.drop(columns=drop_cols_corr, inplace=True)

numeric_cols = [name for name in numeric_cols if name not in drop_cols_corr]

drop_cols_corr

In [None]:
# ===========================================> Impute nulls in numeric columns <=====================================

# Replace missing values with the mean of each column
imp_num = SimpleImputer(strategy='mean', missing_values=np.nan)
imp_num.fit(datos[numeric_cols])
datos[numeric_cols] = imp_num.transform(datos[numeric_cols])

# Remaining null count per column
datos.isnull().sum()

In [None]:
# ===========================================> Recompute the categorical column list <==============================

# Whatever is still object/category after the numeric processing is categorical
cat_cols = list(datos.select_dtypes(include=['object', 'category']).columns)

# Cast all of them to the category dtype in a single call
datos = datos.astype({name: "category" for name in cat_cols})

cat_cols

In [None]:
# ===========================================> Recompute the numeric column list <==================================

numeric_cols = list(
    datos.select_dtypes(
        include=['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    ).columns
)
# The target column is selected as numeric too, but it is not a predictor
del numeric_cols[numeric_cols.index('HasDetections')]

numeric_cols

In [None]:
# ===========================================> Order the variables <=================================================

# Categorical predictors first, numeric predictors next, target last
ordered_columns = [*cat_cols, *numeric_cols]
datos = datos.loc[:, [*ordered_columns, 'HasDetections']]

ordered_columns

In [None]:
# ===========================================> Encode the variables <================================================

# One-hot encode the categorical columns; every other column is passed through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ],
    remainder='passthrough',
)

In [None]:
# Encode the whole development set and save it to disk
datos2 = preprocessor.fit_transform(datos[ordered_columns])
# ColumnTransformer only returns a sparse matrix when the combined output is sparse enough
# (its sparse_threshold); otherwise the result is already a dense array without .toarray()
if hasattr(datos2, "toarray"):
    datos2 = datos2.toarray()

# Column labels: one-hot feature names followed by the passed-through numeric columns
encoded_cat = preprocessor.named_transformers_['onehot'].get_feature_names_out(cat_cols)
labels = np.concatenate([encoded_cat, numeric_cols])

datos2 = pd.DataFrame(datos2, columns=labels)
# Attach the target by position: datos2 has a fresh 0..n-1 index, which would not line up
# with a non-default index on `datos`
datos2['HasDetections'] = datos['HasDetections'].to_numpy()

datos2.to_csv('../data/dev_pre.csv', index=False)

In [None]:
# ===========================================> Train Test Split <===================================================

X_train, X_test, y_train, y_test = train_test_split(
                                        datos[ordered_columns],   # predictors only; the target is not in ordered_columns
                                        datos['HasDetections'],
                                        random_state = 123)

# Fit the encoder on the training rows only and reuse it for the test rows
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep  = preprocessor.transform(X_test)

# ColumnTransformer only returns a sparse matrix when the combined output is sparse enough
# (its sparse_threshold); otherwise the results are already dense arrays without .toarray()
if hasattr(X_train_prep, "toarray"):
    X_train_prep = X_train_prep.toarray()
if hasattr(X_test_prep, "toarray"):
    X_test_prep = X_test_prep.toarray()

# Column labels: one-hot feature names followed by the passed-through numeric columns
encoded_cat = preprocessor.named_transformers_['onehot'].get_feature_names_out(cat_cols)
labels = np.concatenate([encoded_cat, numeric_cols])

# Conversion to dataframe
X_train_prep = pd.DataFrame(X_train_prep, columns=labels)
X_test_prep  = pd.DataFrame(X_test_prep, columns=labels)
X_train_prep.info()


In [None]:
# ==================================================================================================================
# ===========================================> Model <==============================================================
# ==================================================================================================================

# Decision tree with fixed hyperparameters and a fixed seed
modelo = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=5,
    min_samples_leaf=3,
    random_state=123,
)

modelo.fit(X_train_prep, y_train)

#fig, ax = plt.subplots(figsize=(16, 6))

#plot = plot_tree(
#            decision_tree = modelo,
#            feature_names = labels.tolist(),
#            class_names   = 'HasDetections',
#            filled        = True,
#            impurity      = False,
#            fontsize      = 7,
#            ax            = ax)

In [None]:
# Predicted classes and class probabilities for the test set
predicciones = modelo.predict(X_test_prep)
pred_proba = modelo.predict_proba(X_test_prep)

In [None]:
# Importance assigned by the fitted tree to each encoded predictor
importancia_predictores = pd.DataFrame({
    "predictor": labels.tolist(),
    "importancia": modelo.feature_importances_,
})

fig, ax = plt.subplots(figsize=(16, 6))

# Sorted from least to most important; the slice skips the first 10 rows of that order.
# NOTE(review): this plots everything except the 10 least important predictors - confirm
# whether the 10 most important ones (the last 10 rows) were intended instead.
(
    importancia_predictores
    .set_index("predictor")
    .sort_values("importancia", ascending=True)
    .iloc[10:]
    .plot(kind="barh", ax=ax)
)

In [None]:
# Test-set accuracy as a percentage rounded to one decimal
print('Accuracy: {}%'.format(round(100 * accuracy_score(y_test, predicciones), 1)))

In [None]:
# Per-class precision/recall/F1 on the test set; undefined metrics are reported as 1
report_text = classification_report(y_test, predicciones, digits=3, zero_division=1)
print(report_text)

In [None]:
# ROC curve of the positive class on the test set and the area under it
fpr, tpr, _ = roc_curve(y_test, pred_proba[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
plt.figure(figsize=(6, 5))
roc_ax = plt.gca()
roc_ax.plot(fpr, tpr, color='darkorange', lw=lw,
            label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1)
roc_ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver operating characteristic example')
roc_ax.legend(loc="lower right")
plt.show()

In [None]:
# ==================================================================================================================
# ===========================================> Procesar Unique Tipos Cat <==========================================
# ==================================================================================================================

#typed_min_unique = 10
#typed_cols = []

#for c in datos.columns:
#    if ((datos[c].nunique() / datos[c].count()) * 100) <= typed_min_unique:
#        typed_cols.append(c)
        
#for c in typed_cols:
#    datos[c] = datos[c].astype("category")

#print(typed_cols)

In [None]:
# ==================================================================================================================
# ===========================================> Procesar Object Tipos Cat <==========================================
# ==================================================================================================================

#typed_cols = []
#cat_cols = []

#for c in datos.columns:
#    if datos[c].dtype == "object":
#        typed_cols.append(c)
#        cat_cols.append(c)
        
#for c in typed_cols:
#   datos[c] = datos[c].astype("category")

#datos["HasDetections"] = datos["HasDetections"].astype(int)

#print(typed_cols)

In [None]:
#from sklearn.compose import ColumnTransformer


#from sklearn.model_selection import train_test_split
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.metrics import accuracy_score


# Dividir los datos en características (X) y etiquetas (y)
#X = datos.drop('HasDetections', axis=1)
#y = datos['HasDetections']

# Dividir los datos en conjuntos de entrenamiento y prueba
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenar un modelo (por ejemplo, RandomForestClassifier)
#model = RandomForestClassifier()
#model.fit(X_train, y_train)


# Realizar predicciones en el conjunto de prueba
#y_pred = model.predict(X_test)

# Calcular la precisión del modelo
#accuracy = accuracy_score(y_test, y_pred)
#print(f"Precisión del modelo: {accuracy}")
