In [13]:
from typing import List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier

from helper.helper_functions import load_dataset, save_model, get_features_and_target, encode_all_features

### Loading the cleaned dataset

In [14]:
# Location of the cleaned income spreadsheet, relative to this notebook.
dataset_path = '../data/assignment2_income_cleaned.xlsx'
# Read it through the project helper (presumably returns a DataFrame — TODO confirm).
data = load_dataset(dataset_path)

In [15]:
# Separate the 'income' target (y) from the remaining feature columns (X).
X, y = get_features_and_target(data, 'income')

# Columns passed to the encoding helper as exclusions.
columns_to_exclude = [
    'sex',
    'ability to speak english',
    'gave birth this year',
]

# Encode features and target; the helper is told which columns to exclude.
X_encoded, y_encoded = encode_all_features(X, y, columns_to_exclude)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

### Model

In [16]:
# Random forest classifier with a fixed random_state.
# NOTE(review): the `dt_` prefix reads like "decision tree", but the estimator
# is a RandomForestClassifier; the name is kept because other cells use it.
dt_model = RandomForestClassifier(random_state=42)
# Train on the training split only; the test split is reserved for evaluation.
dt_model.fit(X=X_train, y=y_train)

In [17]:
# Predict labels for the held-out test rows with the fitted model
dt_preds = dt_model.predict(X_test)
# Overall accuracy of those predictions against the true test labels
dt_accuracy = accuracy_score(y_test, dt_preds)

# Per-class precision / recall / F1 alongside the single accuracy figure
print(classification_report(y_test, dt_preds))
# dt_model is a RandomForestClassifier (despite the `dt_` prefix on the
# variable names), so the printed label names the model actually evaluated.
print("Random Forest Accuracy:", dt_accuracy)

              precision    recall  f1-score   support

           0       0.80      0.83      0.82      1175
           1       0.67      0.62      0.64       625

    accuracy                           0.76      1800
   macro avg       0.73      0.73      0.73      1800
weighted avg       0.76      0.76      0.76      1800

Decision Tree Accuracy: 0.7594444444444445


In [ ]:
# Hyperparameter search for a k-nearest-neighbours classifier.
# Requires `np`, `KNeighborsClassifier` and `GridSearchCV` from the import cell.
param_grid = {
    'n_neighbors': np.arange(2, 30, 1),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

knn = KNeighborsClassifier()

# Perform grid search with 5-fold cross-validation
knn_cv = GridSearchCV(knn, param_grid=param_grid, cv=5, verbose=1)

# Tune on the training split only: X_test is scored at the end of this cell,
# so fitting the search on the full encoded data would leak the test rows
# into model selection and inflate the reported accuracy.
knn_cv.fit(X_train, y_train)

# Get cross-validation scores (one entry per parameter combination)
cv_results = knn_cv.cv_results_
mean_scores = cv_results['mean_test_score']
std_scores = cv_results['std_test_score']

# Best mean score and the fold-to-fold standard deviation of that same
# combination. (Taking np.std over the scores equal to the maximum is always
# 0, which would make the tolerance below meaningless.)
best_index = int(np.argmax(mean_scores))
best_mean_score = mean_scores[best_index]
std_of_best_mean = std_scores[best_index]

# Keep every combination within 1 standard deviation of the best mean score
candidate_params = [
    (params, mean_score, std_score)
    for params, mean_score, std_score in zip(cv_results['params'], mean_scores, std_scores)
    if mean_score >= best_mean_score - std_of_best_mean
]

# Every parameter dict has the same number of keys, so "fewest
# hyperparameters" cannot separate candidates. Prefer the simplest model
# instead: the largest n_neighbors (smoothest decision boundary), breaking
# ties by the higher mean CV score.
# NOTE(review): confirm this is the intended notion of "simplest".
best_params, _, _ = max(candidate_params, key=lambda c: (c[0]['n_neighbors'], c[1]))

# Refit with the selected parameters so the evaluated model matches the
# printed hyperparameters (knn_cv.best_estimator_ is the top-scoring
# combination, which may differ from the one selected above).
best_model = KNeighborsClassifier(**best_params).fit(X_train, y_train)

accuracy = best_model.score(X_test, y_test)

print("Best Hyperparameters:", best_params)
print("Best Model:", best_model)
print("Best Model Accuracy:", accuracy)

In [18]:
# Persist the fitted classifier through the project helper.
# NOTE(review): the file name says "decision_tree" although dt_model is a
# RandomForestClassifier; the path is left unchanged in case other code
# loads the model from it.
model_path = '../output/saved_models/decision_tree_model.joblib'
save_model(dt_model, model_path)