In [89]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier


In [90]:
# Load the preprocessed dataset from disk and report its dimensions
# as a (rows, columns) tuple.
data = pd.read_csv("../data/income_cleaned.csv")
print(f"Dataset shape: {data.shape}")

Dataset shape: (31284, 11)


In [91]:
# Separate the target column ("income") from the remaining feature columns.
# `drop` returns a new frame, so `data` itself is left unmodified.
y = data["income"]
X = data.drop(columns="income")

In [92]:
# Column groups used by the preprocessing step: numeric columns are scaled,
# categorical columns are one-hot encoded.
numeric_cols = [
    "age",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]
categorical_cols = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "gender",
]

# Hold out 20% of the rows for testing. Stratifying on `y` keeps the class
# proportions the same in both splits; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

Training set shape: (25027, 10)
Test set shape: (6257, 10)


In [93]:
# Feature preprocessing shared by every candidate model:
#   - "num": standardise the numeric columns.
#   - "cat": one-hot encode the categorical columns into a dense array;
#            categories not seen during fit are ignored at transform time
#            instead of raising.
# Columns not listed in either group are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_cols),
        (
            "cat",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            categorical_cols,
        ),
    ]
)

In [94]:
# Candidate models, keyed by display name. Each pipeline applies the feature
# preprocessing and then a classifier.
#
# Every pipeline receives its own unfitted copy of `preprocessor` via `clone`.
# Pipeline does not copy its steps, so passing the same ColumnTransformer
# instance to all three would make each `fit` call re-fit one shared object,
# and the pipeline that is later pickled would alias state owned by the others.
pipelines = {
    "Logistic Regression": Pipeline([
        ("preprocessor", clone(preprocessor)),
        ("classifier", LogisticRegression(max_iter=1000, random_state=42))
    ]),
    "Decision Tree": Pipeline([
        ("preprocessor", clone(preprocessor)),
        ("classifier", DecisionTreeClassifier(random_state=42))
    ]),
    "Random Forest": Pipeline([
        ("preprocessor", clone(preprocessor)),
        ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
    ])
}

In [95]:
# Fit every candidate pipeline on the training split, score it on the test
# split, and record its accuracy under the model's display name so the best
# one can be looked up afterwards.
accuracies = {}
for name, pipeline in pipelines.items():
    # Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracies[name] = acc

    # Per-model report: headline accuracy, confusion matrix, and the
    # per-class precision / recall / F1 table.
    print(f"\n--- {name} ---")
    print("Accuracy:", acc)
    print(
        "Confusion Matrix:\n",
        confusion_matrix(y_test, y_pred),
    )
    print(
        "Classification Report:\n",
        classification_report(y_test, y_pred),
    )


--- Logistic Regression ---
Accuracy: 0.8213201214639604
Confusion Matrix:
 [[4216  409]
 [ 709  923]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.91      0.88      4625
           1       0.69      0.57      0.62      1632

    accuracy                           0.82      6257
   macro avg       0.77      0.74      0.75      6257
weighted avg       0.81      0.82      0.82      6257


--- Decision Tree ---
Accuracy: 0.813968355441905
Confusion Matrix:
 [[4122  503]
 [ 661  971]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.89      0.88      4625
           1       0.66      0.59      0.63      1632

    accuracy                           0.81      6257
   macro avg       0.76      0.74      0.75      6257
weighted avg       0.81      0.81      0.81      6257


--- Random Forest ---
Accuracy: 0.8246763624740291
Confusion Matrix:
 [[4132  493]
 [ 604 1028]]

In [96]:
# Select the pipeline with the highest recorded accuracy and persist it.
# NOTE(review): `accuracies` holds test-set scores, so the winner is chosen on
# the same data used to report its performance; that reported accuracy is
# therefore optimistic. Consider selecting via cross-validation on the
# training split instead.
best_model_name = max(accuracies, key=accuracies.get)
best_pipeline = pipelines[best_model_name]

model_path = "../models/best_income_model_pipeline.pkl"
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists (e.g. on a fresh checkout) before writing.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_pipeline, model_path)

print(f"\n✅ Best model pipeline ({best_model_name}) saved as '{model_path}'")


✅ Best model pipeline (Random Forest) saved as '../models/best_income_model_pipeline.pkl'
