# Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_curve, precision_score,recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
# Add the parent directory to sys.path so the helpers in src/preprocessing.py can be imported
import sys
import os
sys.path.append(os.path.abspath(".."))
from src.preprocessing import load_data, split_data, get_preprocessor

# Load Dataset

In [2]:
# Read the raw CSV, split it into train/test features and labels, and build
# the preprocessing step from the training features only.
RAW_DATA_PATH = "../data/raw/adult.csv"
df = load_data(RAW_DATA_PATH)
(X_train, X_test, y_train, y_test) = split_data(df)
preprocessor = get_preprocessor(X_train)

# Define Models

In [3]:
# Candidate classifiers, keyed by the label used when reporting results.
# Insertion order (logistic regression first) determines evaluation order.
models = {}
models["LogisticRegression"] = LogisticRegression(max_iter=1000)
models["RandomForest"] = RandomForestClassifier(
    n_estimators=200,
    random_state=42,  # fixed seed so the forest is reproducible across runs
)

# Evaluate models

In [13]:
# Fit every candidate model behind the shared preprocessing step and print
# its accuracy and classification report on the test split.
for name, model in models.items():
    steps = [("preprocess", preprocessor), ("model", model)]
    pipeline = Pipeline(steps=steps)
    # fit() returns the pipeline itself, so training and prediction can be chained.
    preds = pipeline.fit(X_train, y_train).predict(X_test)

    print(f"Model: {name}")
    test_accuracy = accuracy_score(y_test, preds)
    print("Accuracy:", test_accuracy)
    report_text = classification_report(y_test, preds)
    print(report_text)

Model: LogisticRegression
Accuracy: 0.8533114955471389
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.91      7431
        >50K       0.74      0.60      0.66      2338

    accuracy                           0.85      9769
   macro avg       0.81      0.77      0.78      9769
weighted avg       0.85      0.85      0.85      9769

Model: RandomForest
Accuracy: 0.8601699252738254
              precision    recall  f1-score   support

       <=50K       0.89      0.93      0.91      7431
        >50K       0.74      0.63      0.68      2338

    accuracy                           0.86      9769
   macro avg       0.82      0.78      0.80      9769
weighted avg       0.85      0.86      0.86      9769



# Train Logistic Regression