# Libraries

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_curve, precision_score,recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
# Load data using preprocessing.py functions
import sys
import os
sys.path.append(os.path.abspath(".."))
from src.preprocessing import load_data, split_data, get_preprocessor

# Load Dataset

In [47]:
df = load_data("../data/raw/adult.csv")
df['class'] = df['class'].map({
    '<=50K': 1,
    '>50K': 0
})
X_train, X_test, y_train, y_test = split_data(df)
preprocessor = get_preprocessor(X_train)

# Define Models

In [49]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42)
}

# Evaluate models

In [38]:
for name, model in models.items():
    pipeline = Pipeline([("preprocess", preprocessor) , ("model", model)])
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    print(f"Model: {name}")
    print("Accuracy:", accuracy_score(y_test, preds))
    print(classification_report(y_test, preds))

Model: LogisticRegression
Accuracy: 0.8533114955471389
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.91      7431
        >50K       0.74      0.60      0.66      2338

    accuracy                           0.85      9769
   macro avg       0.81      0.77      0.78      9769
weighted avg       0.85      0.85      0.85      9769

Model: RandomForest
Accuracy: 0.8601699252738254
              precision    recall  f1-score   support

       <=50K       0.89      0.93      0.91      7431
        >50K       0.74      0.63      0.68      2338

    accuracy                           0.86      9769
   macro avg       0.82      0.78      0.80      9769
weighted avg       0.85      0.86      0.86      9769



# Train Logistic Regeression

In [50]:
# Model
log_reg = LogisticRegression(max_iter=1000)
# Pipeline
model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", log_reg)
])
# Fitting model
model.fit(X_train, y_train)
# Predictions
y_pred = model.predict(X_test)

## Precision & Recall

In [51]:
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision, recall

(0.8830169317598768, 0.9263894496030144)

## Confusion Matrix

In [52]:
cm = confusion_matrix(y_test, y_pred)

In [53]:
cm

array([[1426,  912],
       [ 547, 6884]], dtype=int64)

In [54]:
y_scores = model.predict_proba(X_test)[:, 1]  # class 1 = <=50K

from sklearn.metrics import precision_recall_curve
import numpy as np

precisions, recalls, thresholds = precision_recall_curve(y_test, y_scores)

precisions = precisions[:-1]
recalls = recalls[:-1]

target_precision = 0.92
valid_idxs = np.where(precisions >= target_precision)[0]

if len(valid_idxs) > 0:
    best_idx = valid_idxs[0]
    best_threshold = thresholds[best_idx]
else:
    best_threshold = 0.5

y_pred_custom = (y_scores >= best_threshold).astype(int)

precision_new = precision_score(y_test, y_pred_custom)
recall_new = recall_score(y_test, y_pred_custom)

print("Threshold:", best_threshold)
print("Precision:", precision_new)
print("Recall:", recall_new)

Threshold: 0.6708405720878097
Precision: 0.9200870195794054
Recall: 0.853720898936886
