In [None]:
import pandas as pd

# Read the census-income CSV into a DataFrame.
# The path below points at the Colab upload location; change it (or upload
# your own file) when running somewhere else.
df = pd.read_csv(filepath_or_buffer='/content/adult 3.csv')

# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Treat the '?' placeholder as a missing value, then discard every row that
# contains at least one missing value.
# NOTE: this cell mutates `df` in place, so it is meant to run once per load;
# reload the data before running it a second time.
df.replace(to_replace='?', value=pd.NA, inplace=True)
df.dropna(inplace=True)

# Remove the columns judged to have too many categories or little relevance.
df.drop(columns=['fnlwgt', 'education', 'native-country'], inplace=True)

# Create one LabelEncoder per object-dtype column and keep them, keyed by
# column name, so the same mapping can be re-applied later.
categorical_cols = df.select_dtypes(include='object').columns
label_encoders = {col: LabelEncoder() for col in categorical_cols}

# Fit each encoder on its column and overwrite the column with integer codes.
for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col])

df.head()


In [None]:
# Target vector: the 'income' column; feature matrix: every other column.
y = df.loc[:, 'income']
X = df.drop(columns=['income'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train a random forest on the training split.
# A fixed random_state makes the bootstrap sampling and feature selection
# deterministic, so the fitted model and the report below are reproducible
# across "Restart & Run All" (the same seed is used for the train/test split).
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out split: per-class precision, recall and F1.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


In [None]:
import joblib

# Persist the fitted label encoders next to the model: inference must apply
# the exact category -> integer mapping used for training, so the model file
# alone is not enough to reproduce predictions outside this kernel session.
joblib.dump(label_encoders, 'label_encoders.pkl')

# Persist the trained classifier. Kept as the last expression so the cell
# still displays the value returned for the model file.
joblib.dump(model, 'salary_model.pkl')


In [None]:
import gradio as gr
import numpy as np

# Restore the classifier from the pickle written by the save step.
# Only load pickle files you created yourself: unpickling can execute code.
model = joblib.load(filename="salary_model.pkl")

def predict_salary(age, education_num, marital_status, occupation, relationship, race, gender,
                   capital_gain, capital_loss, hours_per_week, workclass):
    """Predict the income bracket for a single person.

    The arguments are arranged into a one-row DataFrame using the column
    names and order listed below. Every column that has an entry in the
    module-level ``label_encoders`` dict is converted with that encoder's
    ``transform``; the row is then passed to the module-level ``model``.

    Returns:
        "Income >50K" when the predicted class equals 1, otherwise
        "Income <=50K".
    """
    feature_names = [
        'age', 'workclass', 'educational-num', 'marital-status',
        'occupation', 'relationship', 'race', 'gender',
        'capital-gain', 'capital-loss', 'hours-per-week',
    ]
    # Values listed in the same order as feature_names; note that `workclass`
    # is the last function argument but the second column.
    feature_values = [
        age, workclass, education_num, marital_status,
        occupation, relationship, race, gender,
        capital_gain, capital_loss, hours_per_week,
    ]
    input_data = pd.DataFrame([feature_values], columns=feature_names)

    # Re-apply the stored encoders to the columns that have one.
    for col in feature_names:
        encoder = label_encoders.get(col)
        if encoder is not None:
            input_data[col] = encoder.transform(input_data[col])

    predicted_class = model.predict(input_data)[0]
    if predicted_class == 1:
        return "Income >50K"
    return "Income <=50K"
