## Income Classifier Inference

This notebook demonstrates how to load the trained income classifier, preprocess user-supplied attributes, and generate a predicted income bracket.


In [3]:
import pandas as pd
import joblib
import numpy as np

# Load cleaned data
data = pd.read_csv('../data/cleaned.csv')

# Load the saved best model (XGBoost or any other model)
best_model = joblib.load('../models/best_xgboost_model.pkl')

# Load the scalers for preprocessing
hours_per_week_scaler = joblib.load('../tools/hours_scaler.pkl')
education_encoder = joblib.load('../tools/education_encoder.pkl')

# Mapping from user-friendly race terms to dataset's race labels
race_mapping = {
    'White': 'White',
    'Black': 'Black',
    'Asian': 'Asian-Pac-Islander',
    'Hispanic': 'Amer-Indian-Eskimo',
    'Other': 'Other'
}

# Function to preprocess user input with Label Encoding for 'education' and One-Hot Encoding for others
def preprocess_input(user_input):
    # Label encode 'education' column (since it is label encoded)
    user_input['education'] = education_encoder.transform([user_input['education']])[0]

    # Map user-friendly race input to dataset's race values
    race_value = user_input['race'].iloc[0]
    user_input['race'] = race_mapping.get(race_value, 'Other')  # Default to 'Other' if the input doesn't match


    # One-Hot Encode categorical variables (e.g., 'gender', 'workclass', 'marital-status', etc.)
    user_input = pd.get_dummies(user_input, columns=['gender', 'workclass', 'marital-status', 'occupation', 'relationship', 'race', 'native-country'])
    
    # Scale 'hours-per-week' field
    user_input['hours-per-week'] = hours_per_week_scaler.transform(user_input[['hours-per-week']])

    # Log scaling for 'capital-gain' and 'capital-loss'
    user_input['capital-gain'] = np.log1p(user_input['capital-gain'])  # log(x + 1) to avoid log(0)
    user_input['capital-loss'] = np.log1p(user_input['capital-loss'])  # log(x + 1) to avoid log(0)

    # Ensure all columns match the training data (add missing columns with 0s)
    required_columns = [col for col in data.columns if col != 'income']  # Get feature names from the model
    
    # Identify missing columns and add them with 0s
    missing_cols = set(required_columns) - set(user_input.columns)
    for col in missing_cols:
        user_input[col] = 0  # Add missing columns and set to 0
    
    # Reorder columns to match training data
    user_input = user_input[required_columns]
    
    return user_input


user_input = pd.DataFrame([{
    'age': 40,
    'education': 'HS-grad',  
    'capital-gain': 0,
    'capital-loss': 0,
    'hours-per-week': 10,
    'workclass': 'Private',
    'marital-status': 'Married-civ-spouse',
    'occupation': 'Machine-op-inspct',
    'relationship': 'Wife',
    'race': 'Black',
    'native-country': 'United-States',
    'gender': 'Female'  # This will be one-hot encoded
}])

# Preprocess the input data
user_input_processed = preprocess_input(user_input)

# Output the processed dataframe for inspection
pd.set_option('display.max_columns', None)

print("Processed User Input DataFrame:")
print(user_input_processed)

# Make prediction using the trained model
prediction = best_model.predict(user_input_processed)

# Decode the prediction
prediction_label = ">=50K" if prediction[0] == 1 else "<=50K"

# Output the prediction
print(f"Predicted Income Class: {prediction_label}")

Processed User Input DataFrame:
   age  education  capital-gain  capital-loss  hours-per-week  \
0   40         11           0.0           0.0            10.0   

   workclass_Federal-gov  workclass_Local-gov  workclass_Never-worked  \
0                      0                    0                       0   

   workclass_Private  workclass_Self-emp-inc  workclass_Self-emp-not-inc  \
0               True                       0                           0   

   workclass_State-gov  workclass_Unknown  workclass_Without-pay  \
0                    0                  0                      0   

   marital-status_Divorced  marital-status_Married-AF-spouse  \
0                        0                                 0   

   marital-status_Married-civ-spouse  marital-status_Married-spouse-absent  \
0                               True                                     0   

   marital-status_Never-married  marital-status_Separated  \
0                             0                      

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
