In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OrdinalEncoder

In [None]:
# Read the raw records from the CSV file into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="/content/adult 3.csv",
)

In [None]:
# Relabel the "?" placeholder in the occupation and workclass columns with
# explicit category names (presumably "?" marks missing values in this dataset).
# The result is assigned back to the column instead of calling
# Series.replace(..., inplace=True) on an attribute-accessed column: that chained
# in-place form raises a FutureWarning on pandas 2.x and leaves `data` unchanged
# once Copy-on-Write is enabled.
data["occupation"] = data["occupation"].replace({"?": "Not-listed"})
data["workclass"] = data["workclass"].replace({"?": "Others"})

In [None]:
# Keep only the rows that do NOT belong to any of the excluded categories below.
# A row is removed when at least one of the four conditions matches.
data = data[
    ~(
        data["workclass"].isin(["Without-pay", "Never-worked"])
        | data["education"].isin(["5th-6th", "1st-4th", "Preschool"])
        | data["marital-status"].eq("Married-AF-spouse")
        | data["occupation"].eq("Armed-Forces")
    )
]

In [None]:
# Remove the columns the author treats as redundant; none of them appears in the
# feature lists used later for modelling.
for redundant_col in ("educational-num", "race", "relationship", "fnlwgt"):
    data = data.drop(columns=redundant_col)

In [None]:
# Box plot of the age column to inspect its spread and any outlying values.
fig, ax = plt.subplots()
ax.boxplot(data["age"])
plt.show()

In [None]:
# Restrict the data to ages 17 through 75 (both bounds included).
data = data[data["age"].between(17, 75)]

In [None]:
# Box plot of the age column as it currently stands in `data`.
fig, ax = plt.subplots()
ax.boxplot(data["age"])
plt.show()

In [None]:
# Show the DataFrame as the cell output, as it stands after the cleaning steps above.
data

In [None]:
# Separate the target column ("income") from the predictor columns.
y = data["income"]
X = data.drop(columns="income")

In [None]:
# Feature columns grouped by how they are preprocessed:
# categorical columns are ordinal-encoded, numerical columns are scaled.
categorical_cols = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "gender",
    "native-country",
]
numerical_cols = [
    "age",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]

In [None]:
# Column-wise preprocessing:
#   "cat" - ordinal-encodes the categorical columns; categories not seen during
#           fit are encoded as -1 instead of raising an error.
#   "num" - rescales the numerical columns with min-max scaling.
preprocessor = ColumnTransformer(
    [
        (
            "cat",
            OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value"),
            categorical_cols,
        ),
        (
            "num",
            MinMaxScaler(),
            numerical_cols,
        ),
    ]
)

In [None]:
# Chain the preprocessing step and the gradient-boosting classifier into a single
# estimator, so the same transformations run at both fit and predict time.
# random_state is fixed (same seed as the train/test split) so that re-running the
# notebook reproduces the same trained model; without it the classifier is unseeded.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", GradientBoostingClassifier(random_state=42)),
])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(X_train, y=y_train)

In [None]:
# Score the held-out test rows with the trained pipeline.
y_pred = pipeline.predict(X_test)

# Overall fraction of correct predictions.
accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Accuracy: {accuracy:.4f}")

# Per-class metrics report.
print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred))

# Counts of true labels versus predicted labels.
print("\n🧮 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [None]:
# Persist the whole pipeline (preprocessing + classifier) to disk with joblib.
joblib.dump(value=pipeline, filename="model.pkl")
print("✅ Model pipeline saved as model.pkl")

In [None]:
# Reload the saved pipeline and print its type to confirm that it loads from disk.
# joblib is already imported in the notebook's import cell, so it is not re-imported here.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
# load model files that come from a trusted source.
model = joblib.load("model.pkl")
print(type(model))