In [3]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os

# Step 2: Load the data
df = pd.read_csv('data/adult 3.csv')

# Step 3: Clean the data
df.replace(' ?', np.nan, inplace=True)
df.dropna(inplace=True)

# Step 4: Encode text data (like workclass, education, etc.)
cat_cols = df.select_dtypes(include='object').columns
encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# Step 5: Split data
X = df.drop('income', axis=1)
y = df['income']  # income is the target column

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 7: Evaluate
y_pred = model.predict(X_test)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Step 8: Save model and encoders
if not os.path.exists('model'):
    os.makedirs('model')

joblib.dump(model, 'model/salary_model.pkl')
joblib.dump(encoders, 'model/encoders.pkl')
joblib.dump(X.columns.tolist(), 'model/features.pkl')


✅ Accuracy: 0.7925069096120381
              precision    recall  f1-score   support

           0       0.82      0.94      0.87      7479
           1       0.62      0.30      0.41      2290

    accuracy                           0.79      9769
   macro avg       0.72      0.62      0.64      9769
weighted avg       0.77      0.79      0.76      9769



['model/features.pkl']