In [41]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier


In [42]:
# Load the preprocessed dataset from disk and report its (rows, columns) size.
csv_path = "../data/income_cleaned.csv"
data = pd.read_csv(csv_path)
print("Dataset shape:", data.shape)

Dataset shape: (23499, 11)


In [43]:
# Separate the label column (y) from the remaining predictor columns (X).
target_col = "income"
y = data[target_col]
X = data.drop(columns=[target_col])

In [44]:
# Feature columns grouped by the preprocessing each group receives later.
numeric_cols = [
    "age",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]
categorical_cols = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "gender",
]

# Hold out 20% of the rows as a test set. The split is stratified on y and
# seeded so that re-running the notebook reproduces the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

Training set shape: (18799, 10)
Test set shape: (4700, 10)


In [45]:
# Column-wise preprocessing: standardise the numeric columns and one-hot
# encode the categorical ones into a dense array. With handle_unknown="ignore"
# the encoder does not raise on categories it did not see during fitting.
numeric_step = ("num", StandardScaler(), numeric_cols)
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    categorical_cols,
)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [46]:
# Candidate classifiers, keyed by the display name used when reporting results.
# Seeds are fixed so the tree-based models are reproducible across runs.
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
}

# One preprocessing + classifier pipeline per candidate model.
#
# Each pipeline receives its own unfitted copy of `preprocessor` via clone().
# Pipeline.fit fits its steps in place, so sharing a single ColumnTransformer
# instance would mean that fitting one pipeline overwrites the preprocessing
# state held by the others (including a pipeline that is later saved to disk).
pipelines = {
    model_name: Pipeline([
        ("preprocessor", clone(preprocessor)),
        ("classifier", classifier),
    ])
    for model_name, classifier in classifiers.items()
}

In [47]:
# Fit every candidate pipeline on the training split, score it on the test
# split, and record its accuracy under the model's display name.
accuracies = {}
for name, pipeline in pipelines.items():
    # Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    accuracies[name] = acc

    matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"\n--- {name} ---")
    print("Accuracy:", acc)
    print("Confusion Matrix:\n", matrix)
    print("Classification Report:\n", report)


--- Logistic Regression ---
Accuracy: 0.8242553191489361
Confusion Matrix:
 [[3466  266]
 [ 560  408]]
Classification Report:
               precision    recall  f1-score   support

       <=50K       0.86      0.93      0.89      3732
        >50K       0.61      0.42      0.50       968

    accuracy                           0.82      4700
   macro avg       0.73      0.68      0.70      4700
weighted avg       0.81      0.82      0.81      4700


--- Decision Tree ---
Accuracy: 0.778936170212766
Confusion Matrix:
 [[3293  439]
 [ 600  368]]
Classification Report:
               precision    recall  f1-score   support

       <=50K       0.85      0.88      0.86      3732
        >50K       0.46      0.38      0.41       968

    accuracy                           0.78      4700
   macro avg       0.65      0.63      0.64      4700
weighted avg       0.77      0.78      0.77      4700


--- Random Forest ---
Accuracy: 0.7921276595744681
Confusion Matrix:
 [[3296  436]
 [ 541  427]]

In [48]:
# Choose the pipeline with the highest recorded accuracy; on a tie, max()
# returns the first such entry in dictionary order.
# NOTE(review): `accuracies` holds scores computed on the test split, so the
# model is being selected on test data. Consider selecting on a validation
# split or with cross-validation on the training data instead.
best_model_name = max(accuracies, key=accuracies.get)
best_pipeline = pipelines[best_model_name]

model_path = "../models/best_income_model_pipeline.pkl"
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists (e.g. on a fresh checkout) before writing the file.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(best_pipeline, model_path)

print(f"\n✅ Best model pipeline ({best_model_name}) saved as '{model_path}'")


✅ Best model pipeline (Logistic Regression) saved as '../models/best_income_model_pipeline.pkl'
