<a href="https://colab.research.google.com/github/JuanEan/Juan/blob/main/Actividad%20%231.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Cargar y preparar los datos
Data = pd.read_csv("/content/data_adults.csv")

# Eliminar columnas irrelevantes
Data_cop = Data.drop(["fnlwgt", "education-num"], axis=1)

# Separar características (X) y variable objetivo (y)
X = Data_cop.drop("income", axis=1)
y = Data_cop['income'].isin(['>50K.', '>50K']).astype(int)

# Identificar características numéricas y categóricas
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Transformador para características numéricas
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Transformador para características categóricas para SVM y Gradient Boosting
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocesador general para SVM y Gradient Boosting
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Transformador para características categóricas específico para Naive Bayes
categorical_transformer_nb = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # Usar sparse_output=False
])

# Preprocesador específico para Naive Bayes
preprocessor_nb = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer_nb, categorical_features)
    ])

# División de los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Usar una muestra más pequeña de los datos para pruebas rápidas
X_train_sample, _, y_train_sample, _ = train_test_split(X_train, y_train, test_size=0.8, random_state=42)

# Función para mostrar los mejores parámetros y resultados
def mostrar_resultados(random_search, model_name):
    print(f"Best parameters for {model_name}: {random_search.best_params_}")
    print(f"Best score for {model_name}: {random_search.best_score_}")

# Optimización de Hiperparámetros

# 1. SVM
svm_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', SVC())])

param_grid_svm = {
    'classifier__C': [0.1, 1, 10],
    'classifier__gamma': [1, 0.1, 0.01],
    'classifier__kernel': ['linear', 'rbf']
}

random_search_svm = RandomizedSearchCV(svm_pipeline, param_distributions=param_grid_svm, n_iter=5, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_svm.fit(X_train_sample, y_train_sample)
mostrar_resultados(random_search_svm, "SVM")

# 2. Gradient Boosting
gb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', GradientBoostingClassifier())])

param_grid_gb = {
    'classifier__n_estimators': [50, 100, 150],
    'classifier__learning_rate': [0.1, 0.05],
    'classifier__max_depth': [3, 4]
}

random_search_gb = RandomizedSearchCV(gb_pipeline, param_distributions=param_grid_gb, n_iter=5, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_gb.fit(X_train_sample, y_train_sample)
mostrar_resultados(random_search_gb, "Gradient Boosting")

# 3. Naive Bayes
nb_pipeline = Pipeline(steps=[('preprocessor', preprocessor_nb),
                              ('classifier', GaussianNB())])

param_grid_nb = {
    'classifier__var_smoothing': np.logspace(0, -9, num=10)  # Reducir rango de var_smoothing para rapidez
}

random_search_nb = RandomizedSearchCV(nb_pipeline, param_distributions=param_grid_nb, n_iter=5, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_nb.fit(X_train_sample, y_train_sample)
mostrar_resultados(random_search_nb, "Naive Bayes")


Best parameters for SVM: {'classifier__kernel': 'linear', 'classifier__gamma': 0.1, 'classifier__C': 1}
Best score for SVM: 0.8446687143483985
Best parameters for Gradient Boosting: {'classifier__n_estimators': 150, 'classifier__max_depth': 4, 'classifier__learning_rate': 0.1}
Best score for Gradient Boosting: 0.8636829018575399
Best parameters for Naive Bayes: {'classifier__var_smoothing': 1.0}
Best score for Naive Bayes: 0.794793037882112
