<a href="https://colab.research.google.com/github/JosueHuarauyaFabian/Machin/blob/main/untitled8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sagemaker
!pip install jupyter
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn
!pip install sklearn
!pip install catboost

Collecting sagemaker
  Downloading sagemaker-2.168.0.tar.gz (844 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/844.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.5/844.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m844.7/844.7 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting boto3<2.0,>=1.26.131 (from sagemaker)
  Downloading boto3-1.26.163-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.9/135.9 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf3-to-dict<1.0,>=0.1.5 (from sagemaker)
  Downloading protobuf3-to-dict-0.1.5.tar.gz (3.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting smdebug_rulesconfig==1.0.1 (from sagemaker)
  Downloading smdebug_rulesconfig-1.0.1-py2.py3-none

In [None]:
# Importando las bibliotecas necesarias
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Load the UCI "Adult" dataset (fetched over HTTPS).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

# Missing values are marked with '?' and every field is preceded by a blank.
# skipinitialspace strips that blank, so the marker can be declared via
# na_values instead of matching the literal ' ?' after loading.
data = pd.read_csv(url, names=column_names, skipinitialspace=True, na_values='?')

# Fill missing entries with the most frequent value of each column.
# NOTE(review): the imputer is fitted on the full dataset before the
# train/test split; fit it on the training portion only if strict isolation
# of the test set is required.
imputer = SimpleImputer(strategy='most_frequent')
# fit_transform returns a single array, which discards the per-column dtypes;
# cast back so numeric columns stay numeric instead of becoming `object`.
data_imputed = pd.DataFrame(
    imputer.fit_transform(data),
    columns=data.columns,
    index=data.index,
).astype(data.dtypes.to_dict())

# Encode categorical columns (including the target) as integer codes.
# One encoder per column is kept so every encoding can be inverted later.
cat_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']
label_encoders = {}
for feature in cat_features:
    label_encoders[feature] = LabelEncoder()
    data_imputed[feature] = label_encoders[feature].fit_transform(data_imputed[feature])
# Backward compatibility: `le` remains the encoder fitted on the last column
# of cat_features ('income'), exactly as the single shared encoder ended up.
le = label_encoders[cat_features[-1]]

# Split into features / target and into train / test partitions.
X = data_imputed.drop('income', axis=1)
y = data_imputed['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Evaluation metric used throughout the notebook
def evaluate(model, X_test, y_test):
    """Return the accuracy of ``model`` on the supplied samples.

    Args:
        model: Fitted estimator exposing a ``predict`` method.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True labels compared against the predictions.

    Returns:
        The value of ``accuracy_score(y_test, model.predict(X_test))``.
    """
    return accuracy_score(y_test, model.predict(X_test))

# Baseline: CatBoost classifier with no hyperparameters set (training log silenced)
model = CatBoostClassifier(verbose=False)
model.fit(X_train, y_train)

# Score the baseline on the held-out test split
baseline_predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, baseline_predictions)
print("Accuracy: ", accuracy)

# Hyperparameter tuning: exhaustive grid search with 3-fold cross-validation.
# NOTE(review): every candidate limits `iterations` to at most 100, while the
# baseline model is built without an `iterations` argument. In the recorded
# run the tuned model scored slightly below the baseline, so consider widening
# this grid.
param_grid = {
    'depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [30, 50, 100],
}

# Search over a fresh, unfitted estimator rather than the already-trained
# baseline. Accuracy is requested explicitly so the selection criterion
# matches the metric reported by evaluate().
grid = GridSearchCV(
    estimator=CatBoostClassifier(verbose=False),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    refit=True,
)
grid.fit(X_train, y_train)

print("Best parameters found: ", grid.best_params_)

# With refit=True the search has already retrained the winning configuration
# on the whole training set, so a second, identical fit is unnecessary.
model_best = grid.best_estimator_

# Score the tuned model on the held-out test split
accuracy_best = evaluate(model_best, X_test, y_test)
print("Accuracy after hyperparameter tuning: ", accuracy_best)

# Sanity-check baseline: a classifier that always predicts the most frequent class.
# DummyClassifier is already imported with the other libraries at the top of
# this cell. The former `from sklearn import dummy` was dropped because the
# variable below rebinds the name `dummy` immediately.
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)

# Score the dummy baseline on the held-out test split
dummy_accuracy = evaluate(dummy, X_test, y_test)
print("Dummy accuracy: ", dummy_accuracy)

# Feature selection based on the feature-importance scores of the tuned model
feature_importances = model_best.get_feature_importance()
feature_names = X_train.columns
feature_importances_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})

# Show the features ranked from most to least important
print(feature_importances_df.sort_values(by='importance', ascending=False))

# Keep only the features whose importance exceeds the threshold.
# NOTE(review): in the recorded run the scores sum to roughly 100, so the
# threshold reads as "more than 10% of total importance" — confirm this holds
# for the importance type in use.
IMPORTANCE_THRESHOLD = 10
important_features = feature_importances_df.loc[
    feature_importances_df['importance'] > IMPORTANCE_THRESHOLD, 'feature'
]
if important_features.empty:
    # Fail with a clear message instead of an opaque error from fitting on zero columns.
    raise ValueError(
        f"No feature has importance above {IMPORTANCE_THRESHOLD}; lower IMPORTANCE_THRESHOLD."
    )

# Retrain with the best hyperparameters, using only the selected features.
# Unpacking best_params_ keeps this in sync with whatever the grid searched over.
model_important = CatBoostClassifier(**grid.best_params_, verbose=False)
model_important.fit(X_train[important_features], y_train)

# Score the reduced-feature model on the held-out test split
accuracy_important = evaluate(model_important, X_test[important_features], y_test)
print("Accuracy after feature selection: ", accuracy_important)


Accuracy:  0.8748656533087671
Best parameters found:  {'depth': 10, 'iterations': 100, 'learning_rate': 0.1}
Accuracy after hyperparameter tuning:  0.8717948717948718
Dummy accuracy:  0.7587901120835252
           feature  importance
7     relationship   20.839096
4    education-num   13.059669
0              age   11.406642
5   marital-status   10.739558
10    capital-gain   10.264734
6       occupation    8.491052
12  hours-per-week    7.394297
11    capital-loss    4.810380
1        workclass    3.128793
9              sex    3.039696
2           fnlwgt    2.527099
3        education    2.328740
8             race    1.396844
13  native-country    0.573399
Accuracy after feature selection:  0.8545984953170582
