In [1]:
# --- Librerías estándar ---
import os  # Rutas de los archivos

# --- Librerías científicas y de análisis ---
import matplotlib.pyplot as plt # Gráficos
import numpy as np  # Funciones matemáticas
import pandas as pd  # Manejo de dataframes
import sweetviz as sviz # EDA

# --- Modelos de clasificación ---
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier, 
                              ExtraTreesClassifier, GradientBoostingClassifier, 
                              RandomForestClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# --- Metricas y evaluación --- 
from sklearn.metrics import (
    accuracy_score, precision_score, f1_score, recall_score
)
from sklearn.model_selection import cross_val_score, train_test_split

# --- Optimizador ---
import optuna

# --- Elaboración propia ---
from ProbadorHipotesis import ProbadorHipotesis
from ModeladorDatos import ModeladorDatos

  from .autonotebook import tqdm as notebook_tqdm


Lectura de datos

In [4]:
ruta_actual = os.getcwd()
ruta_datos = os.path.join(ruta_actual, '../..', 'data/base.csv') 
ruta_normalizada = os.path.normpath(ruta_datos) 
base = pd.read_csv(ruta_normalizada)

In [6]:
base.head()

Unnamed: 0,ZONA,REGION,V1,V2A,V3,V4,V5,V6,V6A,V7A,...,V19,V21,R4A,TamViv,A4,A5,A6,CondMig,NivInst,A22A
0,Urbana,Central,En fila o contigua,Alquiler o cesión,Block o ladrillo,Lámina de metal o zinc,Sí,"Mosaico, cerámica, terrazo",No,Bueno,...,SÍ,No,No,2.0,Hombre,45 a 59,Separado/Divorciado/Viudo,No migrante,Secundaria completa,Sí
1,Urbana,Central,En fila o contigua,Alquiler o cesión,Block o ladrillo,Lámina de metal o zinc,Sí,"Mosaico, cerámica, terrazo",No,Bueno,...,SÍ,No,No,2.0,Mujer,13 a 17,Soltero(a),Migrante interno,Primaria completa,No
2,Urbana,Central,En fila o contigua,Propiedad,Otro,Lámina de metal o zinc,Sí,"Mosaico, cerámica, terrazo",No,Bueno,...,SÍ,No,No,3.0,Mujer,45 a 59,Casado(a),Migrante externo,Primaria completa,No
3,Urbana,Central,En fila o contigua,Propiedad,Otro,Lámina de metal o zinc,Sí,"Mosaico, cerámica, terrazo",No,Bueno,...,SÍ,No,No,3.0,Hombre,60 a 74,Casado(a),Migrante externo,Primaria completa,Sí
4,Urbana,Central,En fila o contigua,Propiedad,Otro,Lámina de metal o zinc,Sí,"Mosaico, cerámica, terrazo",No,Bueno,...,SÍ,No,No,3.0,Mujer,25 a 34,Soltero(a),No migrante,Superior,Sí


# Se evaluan los modelos

Test de normalidad de Kolmogorov

In [8]:
# test = ProbadorHipotesis(base)
# resultados_normalidad = test.normalidad_ks(base)
# ruta_resultado_KS = os.path.join(ruta_actual, '..', 'Resultados/TestNormalidadKS.xlsx')
# ruta_resultado_KS_norm = os.path.normpath(ruta_resultado_KS)
# resultados_normalidad.to_excel(ruta_resultado_KS_norm, index=False)

In [11]:
base['V21'] = np.where(base['V21']=='No',0,1)

In [12]:
base['V21'].value_counts()

V21
0    21406
1     5499
Name: count, dtype: int64

In [14]:
feature_conf = sviz.FeatureConfig(force_num = ['V21']) #sustituir '-' con el nombre de la variable objetivo
eda = sviz.analyze(base, target_feat = 'V21', feat_cfg = feature_conf)
eda.show_html('EDA.html')

Done! Use 'show' commands to display/save.   |██████████| [100%]   00:03 -> (00:00 left)


Report EDA.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# Se evaluan los modelos

In [None]:
models = {
    "LogisticRegression": LogisticRegression(),
    "DecisionTree": DecisionTreeClassifier(),
    "SVM": SVC(probability=True),
    "RandomForest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(),
    "KNN": KNeighborsClassifier(),
    "NeuralNetwork": MLPClassifier(),
    "LDA": LinearDiscriminantAnalysis(),
    "CatBoost": CatBoostClassifier(),
    "LightGBM": LGBMClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "GradientBoosting": GradientBoostingClassifier(),
    "NaiveBayes": GaussianNB(),
    "QDA": QuadraticDiscriminantAnalysis(),
    "Bagging": BaggingClassifier(base_estimator=DecisionTreeClassifier()),
    "ExtraTrees": ExtraTreesClassifier()
}
modelador = ModeladorDatos(df = base, var_objetivo = 'Class', modelos = models)

In [15]:
results_df = modelador.evaluar_modelos()

Training LogisticRegression...
Training DecisionTree...
Training SVM...
Training RandomForest...
Training XGBoost...
Training KNN...
Training NeuralNetwork...
Training LDA...
Training CatBoost...




0:	learn: 0.5342062	total: 161ms	remaining: 1m 20s
1:	learn: 0.4003576	total: 164ms	remaining: 40.7s
2:	learn: 0.3209170	total: 165ms	remaining: 27.3s
3:	learn: 0.2521842	total: 166ms	remaining: 20.6s
4:	learn: 0.2031222	total: 167ms	remaining: 16.5s
5:	learn: 0.1676343	total: 169ms	remaining: 13.9s
6:	learn: 0.1372828	total: 170ms	remaining: 11.9s
7:	learn: 0.1229757	total: 171ms	remaining: 10.5s
8:	learn: 0.1047626	total: 172ms	remaining: 9.38s
9:	learn: 0.0926162	total: 173ms	remaining: 8.48s
10:	learn: 0.0834909	total: 175ms	remaining: 7.79s
11:	learn: 0.0748838	total: 177ms	remaining: 7.21s
12:	learn: 0.0651809	total: 180ms	remaining: 6.74s
13:	learn: 0.0606059	total: 181ms	remaining: 6.29s
14:	learn: 0.0545338	total: 182ms	remaining: 5.9s
15:	learn: 0.0496220	total: 184ms	remaining: 5.55s
16:	learn: 0.0462638	total: 185ms	remaining: 5.25s
17:	learn: 0.0423514	total: 186ms	remaining: 4.98s
18:	learn: 0.0385021	total: 187ms	remaining: 4.74s
19:	learn: 0.0357103	total: 188ms	remaini



Training ExtraTrees...


In [None]:
# Guardar en Excel
ruta_resultado_modelos = os.path.join(ruta_actual, '../..', 'Resultados/resultados_modelos.xlsx')
ruta_resultado_modelos_norm = os.path.normpath(ruta_resultado_modelos)
results_df.drop(columns=["Predictions"]).to_excel(ruta_resultado_modelos_norm, index=False)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(base.drop(['-'], axis = 1),
                                                    base['-'],
                                                    test_size = 0.2,
                                                    #stratify = base['-'],
                                                    random_state = 2
                                                    )

In [None]:
def objective_tree(trial):
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 20, 100)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 20, 50)
    max_features = trial.suggest_float('max_features', 0.5, 1)
    max_leaf_nodes =  trial.suggest_int('max_leaf_nodes', 10, 100)

    model = DecisionTreeClassifier(
        max_depth = max_depth,
        min_samples_split = min_samples_split,
        min_samples_leaf = min_samples_leaf,
        max_features = max_features,
        max_leaf_nodes = max_leaf_nodes
    ) 

    # Manejo de errores para evitar que valores 'None' causen problemas
    try:
        score = cross_val_score(model, X_train, y_train, n_jobs = -1, cv = 3, scoring = 'f1_macro').mean()
        if score is None: # Si el puntaje es None, retorna un valor muy bajo
            return float("-inf")
        return score
    except Exception as e:
        print(f'Error en el ensayo: {e}')
        return float('-inf') # Valor de retorno bajo en caso de error

In [None]:
study = optuna.create_study(direction = 'maximize', sampler = optuna.sampler.RandomSampler(seed = 2))

In [None]:
study.optimize(objective_tree, n_trials = 50)

In [None]:
best_parameters = study.best_params
best_score = study.best_value
print(f'Best Hyperparameters: {best_parameters}')
print(f'Best Accuracy: {best_score}')

In [None]:
model = DecisionTreeClassifier(
        max_depth = best_parameters['max_depth'],
        min_samples_leaf = best_parameters['min_samples_leaf'],
        max_features = best_parameters['max_features'],
        max_leaf_nodes = best_parameters['max_leaf_nodes']
    ) 


In [None]:
X_test_dummies = pd.get_dummies(X_test)
results_model = []

y_pred_proba = model.predict_proba(X_test_dummies)[:, 1]

for i in range(0,11):
    y_pred_class = y_pred_proba >= i*0.1
    accuracy = accuracy_score(y_test, y_pred_class)
    precision = precision_score(y_test, y_pred_class)
    recall = recall_score(y_test, y_pred_class)
    f1 = f1_score(y_test, y_pred_class)

    results_model.append([accuracy, precision, recall, f1])

results_model = pd.DataFrame(results_model)
results_model.columns = ['Accuracy', 'Precisión', 'Recall', 'F1']

ruta_resultado_modelo = os.path.join(ruta_normalizada, 'Resultados/Resultado_modelo_optuna')
results_model.to_excel(ruta_resultado_modelo, index = False)