In [6]:
!pip install scikit-learn==1.2.2



In [7]:
# Importar librerías necesarias
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import joblib
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [8]:
sklearn.__version__


'1.2.2'

In [9]:
!python --version

Python 3.10.12


In [10]:
# Cargar datos
data = pd.read_csv('data_evaluacion.csv')

In [11]:
print("Primeras filas del dataset:")
print(data.head())

Primeras filas del dataset:
   39         State-gov   77516  Bachelors  13       Never-married  \
0  50  Self-emp-not-inc   83311  Bachelors  13  Married-civ-spouse   
1  38           Private  215646    HS-grad   9            Divorced   
2  53           Private  234721       11th   7  Married-civ-spouse   
3  28           Private  338409  Bachelors  13  Married-civ-spouse   
4  37           Private  284582    Masters  14  Married-civ-spouse   

        Adm-clerical  Not-in-family  White    Male  2174  0  40  \
0    Exec-managerial        Husband  White    Male     0  0  13   
1  Handlers-cleaners  Not-in-family  White    Male     0  0  40   
2  Handlers-cleaners        Husband  Black    Male     0  0  40   
3     Prof-specialty           Wife  Black  Female     0  0  40   
4    Exec-managerial           Wife  White  Female     0  0  40   

   United-States  <=50K  
0  United-States  <=50K  
1  United-States  <=50K  
2  United-States  <=50K  
3           Cuba  <=50K  
4  United-States  

In [12]:
# Preprocesamiento
data_dummies = pd.get_dummies(data, drop_first=True)


In [13]:
# Ajustar nombre de la columna objetivo
target_column = '<=50K_>50K'

In [14]:
# Verificar si la columna objetivo existe en los datos
if target_column not in data_dummies.columns:
    raise KeyError(f"La columna '{target_column}' no se encuentra en los datos preprocesados.")

X = data_dummies.drop(target_column, axis=1)
y = data_dummies[target_column]

In [15]:
# Dividir datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [16]:
# Guardar las columnas del modelo
model_columns = list(X.columns)
joblib.dump(model_columns, 'model_columns.pkl')

['model_columns.pkl']

In [17]:
# Escalar características
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
# Guardar el escalador
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [19]:
# Entrenar el modelo
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

In [20]:
# Evaluar el modelo
def evaluate_model(model, X_train, y_train, X_test, y_test):
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    train_accuracy = np.mean(train_pred == y_train)
    test_accuracy = np.mean(test_pred == y_test)

    print(f"Precisión en entrenamiento: {train_accuracy:.4f}")
    print(f"Precisión en prueba: {test_accuracy:.4f}")

    if train_accuracy - test_accuracy > 0.1:
        print("Posible overfitting")
    elif test_accuracy - train_accuracy > 0.1:
        print("Posible underfitting")
    else:
        print("Modelo bien ajustado")

print("Regresión Logística")
print("Confusión en entrenamiento:")
print(confusion_matrix(y_train, log_reg.predict(X_train_scaled)))
print("Reporte de clasificación en entrenamiento:")
print(classification_report(y_train, log_reg.predict(X_train_scaled)))

print("Confusión en prueba:")
print(confusion_matrix(y_test, log_reg.predict(X_test_scaled)))
print("Reporte de clasificación en prueba:")
print(classification_report(y_test, log_reg.predict(X_test_scaled)))


Regresión Logística
Confusión en entrenamiento:
[[24215  1773]
 [ 3241  4959]]
Reporte de clasificación en entrenamiento:
              precision    recall  f1-score   support

       False       0.88      0.93      0.91     25988
        True       0.74      0.60      0.66      8200

    accuracy                           0.85     34188
   macro avg       0.81      0.77      0.79     34188
weighted avg       0.85      0.85      0.85     34188

Confusión en prueba:
[[10425   741]
 [ 1408  2079]]
Reporte de clasificación en prueba:
              precision    recall  f1-score   support

       False       0.88      0.93      0.91     11166
        True       0.74      0.60      0.66      3487

    accuracy                           0.85     14653
   macro avg       0.81      0.76      0.78     14653
weighted avg       0.85      0.85      0.85     14653



In [21]:
# Evaluar underfitting y overfitting
evaluate_model(log_reg, X_train_scaled, y_train, X_test_scaled, y_test)


Precisión en entrenamiento: 0.8533
Precisión en prueba: 0.8533
Modelo bien ajustado


In [22]:
# Guardar el modelo
joblib.dump(log_reg, 'mejor_modelo_log_reg.pkl')

['mejor_modelo_log_reg.pkl']