In [1]:
# standard library
import pickle
from pathlib import Path

import pandas as pd
import numpy as np

# sklearn tools
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the processed dataset from disk.
# The path is relative to the notebook's working directory; if the CSV sits
# next to the notebook instead, use 'features_for_model.csv'.
features_csv = '../data/processed/features_for_model.csv'
dataset = pd.read_csv(features_csv)

In [3]:
# Split the table into the label (y) and every remaining column (X).
target_col = 'HighSatisfaction'
y = dataset[target_col]
X = dataset.drop(columns=[target_col])

In [4]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the shuffled
# split reproducible across runs.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# classes turn out to be imbalanced (this would change the reported scores).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=2025,
)

In [5]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to both splits, so the test rows never influence
# the learned mean/variance.
std_scaler = StandardScaler().fit(X_train)
X_train_std = std_scaler.transform(X_train)
X_test_std = std_scaler.transform(X_test)

In [6]:
# Persist the fitted scaler so the same transformation can be re-applied later.
# The target directory is created first because opening the file in 'wb' mode
# raises FileNotFoundError when '../artifacts/' does not exist yet (e.g. on a
# fresh checkout where the artifacts folder is not versioned).
# To write next to the notebook instead, point scaler_path at 'std_scaler.pkl'.
scaler_path = Path('../artifacts/std_scaler.pkl')
scaler_path.parent.mkdir(parents=True, exist_ok=True)
with scaler_path.open('wb') as f:
    pickle.dump(std_scaler, f)

In [7]:
# Define the models and their hyperparameters.
# Model 1: Random Forest
# Hyperparameters to tune here: n_estimators, max_depth and random_state.
modelo_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=2025,
).fit(X_train_std, y_train)  # fit() returns the estimator itself

# Score the fitted model on the held-out split.
y_preds_rf = modelo_rf.predict(X_test_std)
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_preds_rf)

In [8]:
# Model 2: Logistic Regression
# Hyperparameters to tune here: C, solver and random_state.
modelo_rl = LogisticRegression(
    C=1.0,
    solver='liblinear',
    random_state=2025,
).fit(X_train_std, y_train)  # fit() returns the estimator itself

# Score the fitted model on the held-out split.
y_preds_rl = modelo_rl.predict(X_test_std)
accuracy_rl = accuracy_score(y_true=y_test, y_pred=y_preds_rl)

In [9]:
# Model 3: Support Vector Classifier
# Hyperparameters to tune here: C, kernel and random_state.
modelo_svc = SVC(
    C=1.0,
    kernel='rbf',
    random_state=2025,
).fit(X_train_std, y_train)  # fit() returns the estimator itself

# Score the fitted model on the held-out split.
y_preds_svc = modelo_svc.predict(X_test_std)
accuracy_svc = accuracy_score(y_true=y_test, y_pred=y_preds_svc)

In [10]:
# Model 4: K-Nearest Neighbors
# Hyperparameters to tune here: n_neighbors and weights.
modelo_knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
).fit(X_train_std, y_train)  # fit() returns the estimator itself

# Score the fitted model on the held-out split.
y_preds_knn = modelo_knn.predict(X_test_std)
accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_preds_knn)

In [11]:
# Model 5: Decision Tree
# Hyperparameters to tune here: max_depth, min_samples_split and random_state.
modelo_dt = DecisionTreeClassifier(
    max_depth=None,
    min_samples_split=2,
    random_state=2025,
).fit(X_train_std, y_train)  # fit() returns the estimator itself

# Score the fitted model on the held-out split.
y_preds_dt = modelo_dt.predict(X_test_std)
accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_preds_dt)

In [12]:
# Gather the test-set accuracy of every model, keyed by model name, so they
# can be compared side by side.
resultados = dict(
    RandomForest=accuracy_rf,
    LogisticRegression=accuracy_rl,
    SVC=accuracy_svc,
    KNeighbors=accuracy_knn,
    DecisionTree=accuracy_dt,
)

In [13]:
# Report each model's accuracy, formatted to four decimal places.
print("Resultados de precisión por modelo:")
for modelo, accuracy in resultados.items():
    print(modelo, format(accuracy, ".4f"), sep=": ")

Resultados de precisión por modelo:
RandomForest: 0.9615
LogisticRegression: 0.7656
SVC: 0.8608
KNeighbors: 0.7674
DecisionTree: 0.8608


In [14]:
# Persist the model that gave the best results (Random Forest in this run).
# If a different model wins after re-tuning, dump that estimator here instead.
# The target directory is created first because opening the file in 'wb' mode
# raises FileNotFoundError when '../models/' does not exist yet.
# To write next to the notebook instead, point model_path at
# 'random_forest_v1.pkl'.
model_path = Path('../models/random_forest_v1.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as f:
    pickle.dump(modelo_rf, f)