In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

In [3]:
# Load the dataset. The CSV is read with header=None, so the column labels
# are assigned explicitly right after loading.
column_names = [
    'Edad', 'Trabajo', 'Peso', 'Educacion', 'EduNum',
    'EstadoCivil', 'Ocupacion', 'Relacion', 'Raza', 'Sexo',
    'GananciaCapital', 'PerdidaCapital', 'HorasSemana', 'PaisOrigen',
    'Ingreso',
]
data = pd.read_csv('data_evaluacion.csv', header=None)
data.columns = column_names
data.head()

Unnamed: 0,Edad,Trabajo,Peso,Educacion,EduNum,EstadoCivil,Ocupacion,Relacion,Raza,Sexo,GananciaCapital,PerdidaCapital,HorasSemana,PaisOrigen,Ingreso
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Convert the target variable to binary: 1 when the label is '>50K', 0 otherwise.
# The dtype guard makes the cell safe to re-run: once the column is numeric it
# is left alone. Without it, a second execution would compare the 0/1 values
# against '>50K' and silently overwrite every label with 0.
if not pd.api.types.is_numeric_dtype(data['Ingreso']):
    # Strip surrounding whitespace so labels such as ' >50K' still match.
    # Missing values compare unequal and therefore map to 0.
    data['Ingreso'] = (data['Ingreso'].str.strip() == '>50K').astype('int64')

In [5]:
# Separate the target column from the predictor columns.
target_col = 'Ingreso'
y = data[target_col]
X = data.drop(columns=[target_col])

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Names of the numeric and categorical feature columns.
num = ['Edad', 'Peso', 'EduNum', 'GananciaCapital', 'PerdidaCapital', 'HorasSemana']
cat = ['Trabajo', 'Educacion', 'EstadoCivil', 'Ocupacion', 'Relacion', 'Raza', 'Sexo', 'PaisOrigen']

# Work on explicit copies: the frames returned by train_test_split are derived
# from X, and assigning columns into them directly can trigger pandas'
# SettingWithCopyWarning. Copying makes the ownership unambiguous.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardize the numeric features. The scaler is fitted on the training rows
# only and then applied to the test rows, so no test statistics leak into it.
scaler = StandardScaler()
X_train[num] = scaler.fit_transform(X_train[num])
X_test[num] = scaler.transform(X_test[num])

In [7]:
# One-hot encode the categorical features.
# The `sparse=False` keyword was deprecated in scikit-learn 1.2 and removed in
# 1.4 (renamed to `sparse_output`), so passing it raises TypeError on current
# releases. Keeping the default sparse output and densifying with .toarray()
# gives the same dense matrix on both old and new versions.
# drop='first' removes one level per feature; handle_unknown='ignore' encodes
# categories unseen during fit as all zeros instead of raising.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')

# Fit on the training rows only, then reuse the fitted encoder for the test rows.
# Both frames get a fresh 0..n-1 index (they are built from plain arrays).
X_train_cat = pd.DataFrame(
    encoder.fit_transform(X_train[cat]).toarray(),
    columns=encoder.get_feature_names_out(cat),
)
X_test_cat = pd.DataFrame(
    encoder.transform(X_test[cat]).toarray(),
    columns=encoder.get_feature_names_out(cat),
)



In [8]:
# Join the scaled numeric columns with the one-hot encoded columns, side by side.
# Each piece gets its index reset first so rows are matched by position rather
# than by the index labels left over from the train/test split.
train_parts = [X_train[num], X_train_cat]
test_parts = [X_test[num], X_test_cat]

X_train = pd.concat([part.reset_index(drop=True) for part in train_parts], axis=1)
X_test = pd.concat([part.reset_index(drop=True) for part in test_parts], axis=1)


In [9]:
# Sweep k = 1..20 for k-NN, recording the accuracy obtained for each value.
# NOTE(review): k is selected using accuracy on the test split, so the reported
# best score is optimistically biased; consider choosing k with a validation
# split or cross-validation on the training data instead.
k_values = range(1, 21)
accuracy_by_k = {}

for k in k_values:
    model_knn = KNeighborsClassifier(n_neighbors=k)
    model_knn.fit(X_train, y_train)
    y_pred_knn = model_knn.predict(X_test)
    score = accuracy_score(y_test, y_pred_knn)
    accuracy_by_k[k] = score

    print(f"k-NN con k={k} - Accuracy:", score)

# max() returns the first maximal key in insertion (ascending k) order, so ties
# resolve to the smallest k.
best_k = max(accuracy_by_k, key=accuracy_by_k.get)
best_score = accuracy_by_k[best_k]

print(f"Mejor k: {best_k} con Accuracy: {best_score}")


k-NN con k=1 - Accuracy: 0.7973180468829972
k-NN con k=2 - Accuracy: 0.8157436789845429
k-NN con k=3 - Accuracy: 0.8219879209745112
k-NN con k=4 - Accuracy: 0.8271061521138294
k-NN con k=5 - Accuracy: 0.8303818200429931
k-NN con k=6 - Accuracy: 0.8352953219367386
k-NN con k=7 - Accuracy: 0.8344764049544477
k-NN con k=8 - Accuracy: 0.8387757191114751
k-NN con k=9 - Accuracy: 0.8381615313747569
k-NN con k=10 - Accuracy: 0.8408230115672024
k-NN con k=11 - Accuracy: 0.839594636093766
k-NN con k=12 - Accuracy: 0.8413348346811342
k-NN con k=13 - Accuracy: 0.8404135530760569
k-NN con k=14 - Accuracy: 0.8430750332685024
k-NN con k=15 - Accuracy: 0.8445081379875116
k-NN con k=16 - Accuracy: 0.8461459719520934
k-NN con k=17 - Accuracy: 0.8436892210052206
k-NN con k=18 - Accuracy: 0.8457365134609479
k-NN con k=19 - Accuracy: 0.8445081379875116
k-NN con k=20 - Accuracy: 0.8457365134609479
Mejor k: 16 con Accuracy: 0.8461459719520934
