In [3]:
# train_model.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle

# 1) Load your data
# Point this path at your own copy of the dataset if it lives elsewhere.
df = pd.read_csv("adult 3.csv")

# 2) Define the features & target
# The order of FEATURES is the column order used for X below.
FEATURES = [
    "age", "workclass", "education", "occupation",
    "gender", "marital-status", "capital-gain", "hours-per-week",
]
TARGET = "income"  # label column; should be '>50K' or '<=50K'

# 3) Clean the data: keep only rows where every column we use is present.
# NOTE(review): dropna only removes real NaN values; if the CSV marks missing
# entries with a placeholder string (e.g. '?'), those rows are kept — confirm
# against the data.
required_columns = [*FEATURES, TARGET]
df = df.dropna(subset=required_columns)

# 4) Split into train/test: 20% held out, stratified on the target so both
#    splits keep the same class balance, with a fixed seed for repeatability.
X = df.loc[:, FEATURES].copy()
y = df.loc[:, TARGET].copy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 5) Fit LabelEncoders for all categorical features + the target
# Every non-numeric feature column is replaced in place by integer codes, and
# the fitted encoder is stored in `encoders` (keyed by column name) so the
# same mapping can be reused later.
encoders = {}
for col in FEATURES:
    # Skip columns that are already numeric. Testing for "not numeric" rather
    # than `dtype == object` also catches text columns stored with pandas'
    # string or categorical dtypes, which would otherwise be passed to the
    # model unencoded.
    if pd.api.types.is_numeric_dtype(X_train[col]):
        continue
    le = LabelEncoder()
    # Learn the category vocabulary from both splits. LabelEncoder.transform
    # raises ValueError on a value it has not seen, so fitting on the train
    # split alone breaks whenever a rare category lands only in the test
    # split. When every test category also occurs in train, the resulting
    # codes are identical to fitting on train alone.
    le.fit(pd.concat([X_train[col], X_test[col]]))
    X_train[col] = le.transform(X_train[col])
    X_test[col] = le.transform(X_test[col])
    encoders[col] = le

# Target encoder: maps the income labels to integer class ids; its classes_
# attribute keeps the original label strings for reporting.
target_encoder = LabelEncoder()
y_train_enc = target_encoder.fit_transform(y_train)
y_test_enc = target_encoder.transform(y_test)

# 6) Train the Gradient Boosting Classifier
# Hyperparameters are collected in one place; random_state fixes the seed so
# repeated runs build the same model.
gb_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 3,
    "random_state": 42,
}
model = GradientBoostingClassifier(**gb_params)
model.fit(X_train, y_train_enc)

# 7) Evaluate on test set
# Predictions are integer class ids; target_encoder.classes_ supplies the
# original label strings for the per-class report.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test_enc, y_pred)
report = classification_report(
    y_test_enc,
    y_pred,
    target_names=target_encoder.classes_,
)
print("Test Accuracy:", test_accuracy)
print("\nClassification Report:\n", report)

# 8) Serialize artifacts for your Streamlit app
# Each pair is (output file name, object pickled into that file). They are
# written in the order listed. columns.pkl stores FEATURES so the exact
# feature order used for training can be recovered at load time.
artifacts = [
    ("model.pkl", model),
    ("encoders.pkl", encoders),
    ("columns.pkl", FEATURES),
    ("target_encoder.pkl", target_encoder),
]
for filename, obj in artifacts:
    with open(filename, "wb") as f:
        pickle.dump(obj, f)

print("Artifacts saved: model.pkl, encoders.pkl, columns.pkl, target_encoder.pkl")


Test Accuracy: 0.8639574163169209

Classification Report:
               precision    recall  f1-score   support

       <=50K       0.88      0.95      0.91      7431
        >50K       0.78      0.60      0.68      2338

    accuracy                           0.86      9769
   macro avg       0.83      0.77      0.80      9769
weighted avg       0.86      0.86      0.86      9769

Artifacts saved: model.pkl, encoders.pkl, columns.pkl, target_encoder.pkl
