# 1. Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
import time
import re

# 2. Data Pre-processing

In [4]:
# Load data
# Neither file has a header row, so column names are supplied explicitly.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation',
    'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income'
]

# Fields are separated by a comma followed by whitespace; a regex separator
# requires the python parsing engine.
read_csv_kwargs = dict(header=None, names=column_names, sep=r',\s+', engine='python')
train_data = pd.read_csv('adult.data', **read_csv_kwargs)
# The first line of the test file is skipped (presumably a non-data banner line).
test_data = pd.read_csv('adult.test', skiprows=1, **read_csv_kwargs)

# Remove '.' characters from the test labels so they match the label strings
# the encoder learns from the training set.
test_data['income'] = test_data['income'].str.replace('.', '', regex=False)

# Handle missing values ('?' marks a missing entry).
train_data = train_data.replace('?', np.nan)
test_data = test_data.replace('?', np.nan)

# Impute with the per-column mode of the TRAINING data only. Filling the test
# set from its own modes would leak test-set statistics into preprocessing and
# make the two sets be transformed inconsistently.
train_modes = train_data.mode().iloc[0]
train_data = train_data.fillna(train_modes)
test_data = test_data.fillna(train_modes)

# Encode the target label; the encoder is fitted on the training labels only.
le = LabelEncoder()
train_data['income'] = le.fit_transform(train_data['income'])
test_data['income'] = le.transform(test_data['income'])

# One-Hot Encoding
categorical_features = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
train_data = pd.get_dummies(train_data, columns=categorical_features)
test_data = pd.get_dummies(test_data, columns=categorical_features)

# Align the test columns to the training columns: categories that appear only
# in the training set become all-zero columns in the test set, and categories
# that appear only in the test set are dropped (join='left').
train_data, test_data = train_data.align(test_data, join='left', axis=1, fill_value=0)

# Separate features and labels
X_train = train_data.drop('income', axis=1)
y_train = train_data['income']
X_test = test_data.drop('income', axis=1)
y_test = test_data['income']

# Both feature matrices must expose the same columns in the same order,
# otherwise a fitted model would silently read the wrong features.
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns are misaligned"

# Feature standardization (statistics are learned from the training set only).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. The Model: Decision Tree & K-Nearest Neighbors

In [6]:
def _fit_and_time(search, X_fit, y_fit, X_eval):
    """Fit a search object, predict on held-out data, and time both steps.

    Parameters
    ----------
    search : object exposing ``fit(X, y)`` and ``predict(X)``
        Here, a ``GridSearchCV`` instance.
    X_fit, y_fit : training features and labels passed to ``fit``.
    X_eval : features passed to ``predict``.

    Returns
    -------
    tuple
        ``(predictions, fit_seconds, predict_seconds)``.
    """
    # perf_counter is a monotonic high-resolution clock intended for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    fit_start = time.perf_counter()
    search.fit(X_fit, y_fit)
    fit_seconds = time.perf_counter() - fit_start

    predict_start = time.perf_counter()
    predictions = search.predict(X_eval)
    predict_seconds = time.perf_counter() - predict_start

    return predictions, fit_seconds, predict_seconds


def _print_report(model_name, y_true, y_pred, train_seconds, test_seconds):
    """Print the classification report, accuracy and timings for one model."""
    print(f"\n{model_name} Classification Report:")
    print(classification_report(y_true, y_pred))
    print(f"{model_name} Accuracy: ", accuracy_score(y_true, y_pred))
    print(f"{model_name} Train Time: {train_seconds:.4f}s, Test Time: {test_seconds:.4f}s")


# Decision Tree Model
dt = DecisionTreeClassifier(random_state=42)
param_grid_dt = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6]
}

# 5-fold cross-validated grid search. The tree is trained on the unscaled
# features. The reported train time covers the whole search, not a single fit.
grid_search_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, cv=5, n_jobs=-1)
y_pred_dt, train_time_dt, test_time_dt = _fit_and_time(grid_search_dt, X_train, y_train, X_test)

# K-Nearest Neighbors Model
knn = KNeighborsClassifier()
param_grid_knn = {
    'n_neighbors': [10, 20, 30, 35, 40],
    'weights': ['uniform', 'distance']
}

# KNN is trained on the standardized features.
# NOTE(review): the scaler was fitted on the full training set before the
# cross-validation split, so validation folds influence the scaling statistics;
# wrapping scaler + KNN in a Pipeline would avoid this.
grid_search_knn = GridSearchCV(estimator=knn, param_grid=param_grid_knn, cv=5, n_jobs=-1)
y_pred_knn, train_time_knn, test_time_knn = _fit_and_time(
    grid_search_knn, X_train_scaled, y_train, X_test_scaled
)

# Best hyperparameters
print("Best hyperparameters of the Decision Tree:", grid_search_dt.best_params_)
print("Best hyperparameters of the KNN:", grid_search_knn.best_params_)

_print_report("Decision Tree", y_test, y_pred_dt, train_time_dt, test_time_dt)
_print_report("KNN", y_test, y_pred_knn, train_time_knn, test_time_knn)

Best hyperparameters of the Decision Tree: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best hyperparameters of the KNN: {'n_neighbors': 30, 'weights': 'uniform'}

Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     12435
           1       0.77      0.60      0.67      3846

    accuracy                           0.86     16281
   macro avg       0.83      0.77      0.79     16281
weighted avg       0.86      0.86      0.86     16281

Decision Tree Accuracy:  0.862600577360113
Decision Tree Train Time: 38.7501s, Test Time: 0.0114s

KNN Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.93      0.89     12435
           1       0.69      0.53      0.60      3846

    accuracy                           0.83     16281
   macro avg       0.78      0.73      0.75     16281
weighted avg       0.82      0.83      0.83     16281
