In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

In [10]:
# Load the data. The CSV is read with header=None (its first line is treated as data),
# so the column names are assigned explicitly afterwards.
column_names = [
    'Edad', 'Trabajo', 'Peso', 'Educacion', 'EduNum',
    'EstadoCivil', 'Ocupacion', 'Relacion', 'Raza', 'Sexo',
    'GananciaCapital', 'PerdidaCapital', 'HorasSemana', 'PaisOrigen',
    'Ingreso',
]
data = pd.read_csv('data_evaluacion.csv', header=None)
data.columns = column_names
data.head()

Unnamed: 0,Edad,Trabajo,Peso,Educacion,EduNum,EstadoCivil,Ocupacion,Relacion,Raza,Sexo,GananciaCapital,PerdidaCapital,HorasSemana,PaisOrigen,Ingreso
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [11]:
# Convert the target variable to binary: rows labelled '>50K' become 1, everything else 0.
#
# The conversion is skipped when the column is already numeric, so re-running this cell
# is a no-op. Without the guard, a second run would compare the already-converted
# integers against the string '>50K' and silently set every label to 0.
if not pd.api.types.is_numeric_dtype(data['Ingreso']):
    # Tolerate surrounding whitespace and a trailing period in the raw label text;
    # an exact-equality test would silently map such rows to 0.
    ingreso_text = data['Ingreso'].astype(str).str.strip().str.rstrip('.')
    data['Ingreso'] = (ingreso_text == '>50K').astype(int)

In [12]:
# Convert categorical labels to numeric codes.
# One LabelEncoder is fitted per categorical column and kept in `label_encoders`
# (keyed by column name) so the fitted encoders remain available after this cell.
categorical_columns = [
    'Trabajo', 'Educacion', 'EstadoCivil', 'Ocupacion',
    'Relacion', 'Raza', 'Sexo', 'PaisOrigen',
]
label_encoders = {column: LabelEncoder().fit(data[column]) for column in categorical_columns}

# Replace each categorical column in place with its encoded values.
for column, le in label_encoders.items():
    data[column] = le.transform(data[column])

In [15]:
# Preview the first ten rows of the frame
data.head(n=10)


Unnamed: 0,Edad,Trabajo,Peso,Educacion,EduNum,EstadoCivil,Ocupacion,Relacion,Raza,Sexo,GananciaCapital,PerdidaCapital,HorasSemana,PaisOrigen,Ingreso
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0
5,37,4,284582,12,14,2,4,5,4,0,0,0,40,39,0
6,49,4,160187,6,5,3,8,1,2,0,0,0,16,23,0
7,52,6,209642,11,9,2,4,0,4,1,0,0,45,39,1
8,31,4,45781,12,14,4,10,1,4,0,14084,0,50,39,1
9,42,4,159449,9,13,2,4,0,4,1,5178,0,40,39,1


In [13]:
# Separate the features from the target column.
# X is left as the unscaled feature frame; only the train/test partitions are scaled.
X = data.drop('Ingreso', axis=1)
y = data['Ingreso']

# Split into train and test sets BEFORE scaling, so that no statistics computed from
# the test rows can reach the preprocessing step (avoids train/test leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features: the scaler is fitted on the training rows only, and the
# same fitted transform is then applied to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
# Train a Gaussian Naive Bayes classifier and predict on the held-out test set
model = GaussianNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Compute the evaluation metrics first, then print them together
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Naive Bayes - Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Naive Bayes - Accuracy: 0.8028457365134609
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      7414
           1       0.69      0.32      0.44      2355

    accuracy                           0.80      9769
   macro avg       0.76      0.64      0.66      9769
weighted avg       0.79      0.80      0.77      9769

