In [None]:
# Importar las librerías necesarias
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Cargar el dataset
data = pd.read_csv('/content/data_adults.csv')  

In [None]:
# Identificar características y objetivo
X = data.drop('income', axis=1)  
y = data['income']  

In [4]:
# Identificar columnas numéricas y categóricas
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [5]:
# Preprocesamiento: escalado para características numéricas y One-Hot Encoding denso para categóricas
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ]
)


In [6]:
# Definir los pipelines para cada modelo
pipelines = {
    'SVC': Pipeline([
        ('preprocessor', preprocessor),
        ('svc', SVC())
    ]),
    'Naive Bayes': Pipeline([
        ('preprocessor', preprocessor),
        ('nb', GaussianNB())
    ]),
    'K-Nearest Neighbors': Pipeline([
        ('preprocessor', preprocessor),
        ('knn', KNeighborsClassifier())
    ]),
    'Gradient Boosting': Pipeline([
        ('preprocessor', preprocessor),
        ('gboost', GradientBoostingClassifier())
    ])
}

In [7]:
# Dividir en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Entrenar y evaluar cada pipeline
results = {}
for model_name, pipeline in pipelines.items():
    # Entrenar el modelo
    pipeline.fit(X_train, y_train)

    # Predecir
    y_pred = pipeline.predict(X_test)

    # Calcular la precisión y el informe de clasificación
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    # Guardar los resultados
    results[model_name] = {
        'Accuracy': accuracy,
        'Classification Report': report
    }
    print(f"Modelo: {model_name}")
    print(f"Precisión: {accuracy}")
    print(f"Reporte de Clasificación:\n{report}")
    print("="*50)

Modelo: SVC
Precisión: 0.5915651550824035
Reporte de Clasificación:
              precision    recall  f1-score   support

       <=50K       0.60      0.94      0.73      4936
      <=50K.       0.93      0.08      0.15      2478
        >50K       0.52      0.58      0.54      1562
       >50K.       0.78      0.02      0.04       793

    accuracy                           0.59      9769
   macro avg       0.71      0.41      0.37      9769
weighted avg       0.68      0.59      0.50      9769

Modelo: Naive Bayes
Precisión: 0.2241785239021394
Reporte de Clasificación:
              precision    recall  f1-score   support

       <=50K       0.74      0.11      0.18      4936
      <=50K.       0.37      0.31      0.34      2478
        >50K       0.17      0.13      0.14      1562
       >50K.       0.12      0.87      0.21       793

    accuracy                           0.22      9769
   macro avg       0.35      0.36      0.22      9769
weighted avg       0.50      0.22      0.

In [9]:
# Comparación de modelos
best_model = max(results, key=lambda x: results[x]['Accuracy'])
print(f"El mejor modelo es: {best_model} con una precisión de {results[best_model]['Accuracy']}")

El mejor modelo es: Gradient Boosting con una precisión de 0.5976046678267991
