In [15]:
# model_training.py

import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the data
df = pd.read_csv("adult 3.csv")
# Add 'experience' column manually
df['experience'] = df['age'] - 22
df['experience'] = df['experience'].apply(lambda x: max(x, 0))  # No negative experience


# ✅ Keep only the selected 5 input features + target
df = df[['age', 'education', 'occupation', 'hours-per-week', 'experience', 'income']]

# Define features (X) and target (y)
X = df.drop("income", axis=1)
y = df["income"]

# Define categorical and numerical columns
categorical_cols = ["education", "occupation"]
numerical_cols = ["age", "hours-per-week", "experience"]

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
], remainder='passthrough')  # Pass through numerical columns

# Full pipeline with preprocessing + classifier
model = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("classifier", GradientBoostingClassifier())
])

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Save the trained model pipeline
joblib.dump(model, "best_model.pkl")
print("\u2705 Model trained and saved as best_model.pkl with only 5 input features.")


✅ Model trained and saved as best_model.pkl with only 5 input features.
