In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere below the Kaggle
# input directory, then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# ==============================================
# EMPLOYEE SALARY PREDICTION USING ML MODELS
# ==============================================

# Salary prediction determines whether a person earns <=50K or >50K
# based on demographic and employment-related features.


# ==============================================
# Step 1: Import Libraries
# ==============================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


# ==============================================
# Step 2: Load Dataset (from the Kaggle input directory)
# ==============================================

# Location of the adult census income CSV under the Kaggle input directory.
DATASET_CSV = "/kaggle/input/datasets/priyamchoksi/adult-census-income-dataset/adult.csv"

# Read the whole file into the DataFrame that the following steps operate on.
data = pd.read_csv(DATASET_CSV)


# ==============================================
# Step 3: Data Preprocessing
# ==============================================

# Clean column names (headers may carry stray surrounding spaces)
data.columns = data.columns.str.strip()

# Separate numeric & categorical columns by dtype *family* rather than by exact
# dtype name, so columns stored as e.g. int32 or a pandas string dtype are not
# silently left out of the cleaning and encoding below.
numeric_cols = data.select_dtypes(include="number").columns
categorical_cols = data.select_dtypes(exclude="number").columns

# Missing numeric values become 0
data[numeric_cols] = data[numeric_cols].fillna(0)


def _strip_text(value):
    """Return `value` without surrounding whitespace; non-strings pass through unchanged."""
    return value.strip() if isinstance(value, str) else value


# Clean and encode every categorical column. Each fitted encoder is kept in
# `label_encoders` (keyed by column name) so the integer codes can be mapped
# back to the original labels via `.classes_` / `.inverse_transform(...)`.
label_encoders = {}
for col in categorical_cols:
    # Strip first, so both "?" and a whitespace-padded " ?" count as the
    # missing-value marker; then label every missing entry "Unknown".
    cleaned = data[col].map(_strip_text)
    cleaned = cleaned.replace("?", np.nan).fillna("Unknown")

    le = LabelEncoder()
    data[col] = le.fit_transform(cleaned)
    label_encoders[col] = le


# ==============================================
# Step 4: Split Dataset
# ==============================================

# Features are every column except `income`; `income` itself is the target.
X = data.drop(columns="income")
y = data["income"]

# Hold out 20% of the rows for evaluation. `stratify=y` keeps the proportion of
# each income class the same in the training and test partitions, so the
# per-class precision/recall computed on the test set is not skewed by an
# unlucky split. `random_state` pins the shuffle so reruns give the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# ==============================================
# Step 5: Random Forest
# ==============================================

def fit_and_report(title, model, X_train, y_train, X_test, y_test):
    """Fit `model`, print its classification report, and return its test predictions.

    Parameters
    ----------
    title : str
        Algorithm name shown in the report banner.
    model : estimator
        Classifier exposing `fit` and `predict`; it is fitted in place.
    X_train, y_train
        Training features and labels passed to `model.fit`.
    X_test, y_test
        Held-out features passed to `model.predict`, and the true labels the
        predictions are compared against in the report.

    Returns
    -------
    The predictions of the fitted model for `X_test`.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print(f"===== {title} Classification Report =====")
    print(classification_report(y_test, predictions))
    return predictions


# All three models share the same number of estimators and the same seed, and
# go through the identical fit / predict / report sequence.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_pred = fit_and_report("Random Forest", rf, X_train, y_train, X_test, y_test)


# ==============================================
# Step 6: Gradient Boosting
# ==============================================

gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_pred = fit_and_report("Gradient Boosting", gb, X_train, y_train, X_test, y_test)


# ==============================================
# Step 7: AdaBoost
# ==============================================

ab = AdaBoostClassifier(n_estimators=100, random_state=42)
ab_pred = fit_and_report("AdaBoost", ab, X_train, y_train, X_test, y_test)


# ==============================================
# Step 8: Comparison Table
# ==============================================

# Pair each algorithm's display name with the predictions it made on the test set.
predictions_by_algorithm = [
    ("Random Forest", rf_pred),
    ("Gradient Boosting", gb_pred),
    ("AdaBoost", ab_pred),
]

# Metric functions, listed in the same order as the score columns of the table.
metric_functions = (accuracy_score, precision_score, recall_score, f1_score)

# One row per algorithm: [name, accuracy, precision, recall, f1].
results = [
    [algorithm, *(metric(y_test, predicted) for metric in metric_functions)]
    for algorithm, predicted in predictions_by_algorithm
]

table_columns = ["Algorithm", "Accuracy", "Precision", "Recall", "F1-Score"]
comparison = pd.DataFrame(results, columns=table_columns)

print("\n===== Algorithm Comparison =====")
print(comparison)


# ==============================================
# Step 9: Best Algorithm
# ==============================================

# Row label of the highest F1-Score (idxmax returns the first one when scores tie).
best_row_label = comparison["F1-Score"].idxmax()

# Pull that algorithm's full row of metrics out of the comparison table.
best = comparison.loc[best_row_label]

print("\n===== Best Algorithm Based on F1-Score =====")
print(best)