In [None]:
# Imports, Loading data, and preprocessing
# Imports
import pandas as pd
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
# noinspection PyUnresolvedReferences
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Load data
# header=None with explicit names: the column labels are supplied here rather
# than read from the file.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'
]
# ' ?' (including the leading space) is parsed as missing, and every row that
# contains a missing value is dropped.
df = pd.read_csv(url, header=None, names=columns, na_values=' ?').dropna()

# Encode categorical features
# Every non-numeric column is integer-encoded. Selecting by "not a number"
# instead of dtype 'object' keeps this working regardless of whether pandas
# stores text columns as object or as a dedicated string dtype.
# The fitted encoder for each column is kept in `label_encoders`, keyed by
# column name.
label_encoders = {}
for col in df.select_dtypes(exclude='number').columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Train-test split
# 'income' is the target; all remaining columns are features. 20% of the rows
# are held out, with a fixed seed so the split is repeatable.
X = df.drop('income', axis=1)
y = df['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Smote
def apply_smote(X_train, y_train, seed=42):
    """Oversample a training set with SMOTE.

    Args:
        X_train: Feature matrix to resample.
        y_train: Target labels aligned with ``X_train``.
        seed: ``random_state`` handed to ``SMOTE``.

    Returns:
        Whatever ``SMOTE.fit_resample`` returns for the given inputs
        (the resampled features and labels).
    """
    return SMOTE(random_state=seed).fit_resample(X_train, y_train)


In [None]:
# Model Training and Evaluation Function
def train_and_evaluate(model, name, X_train, y_train, X_eval=None, y_eval=None):
    """Fit ``model`` and print its accuracy and classification report.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        name: Label used in the printed header.
        X_train: Features the model is fitted on.
        y_train: Targets the model is fitted on.
        X_eval: Features to evaluate on. When omitted, the notebook-level
            ``X_test`` is used (the original behaviour).
        y_eval: True labels for ``X_eval``. When omitted, the notebook-level
            ``y_test`` is used (the original behaviour).

    Returns:
        None. Results are printed.
    """
    # Imported here so this cell does not depend on a later cell having
    # already imported the metrics.
    from sklearn.metrics import accuracy_score, classification_report

    # Fall back to the notebook globals only when the caller gave no explicit
    # evaluation data; the lookup happens at call time, not definition time.
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    model.fit(X_train, y_train)
    y_pred = model.predict(X_eval)
    print(f"\n🧠 {name} Results")
    print("Accuracy:", accuracy_score(y_eval, y_pred))
    print(classification_report(y_eval, y_pred))


In [None]:

# Import required libraries
from sklearn.ensemble import RandomForestClassifier
# noinspection PyUnresolvedReferences - this is still required; scikit-learn a metrics common problem
from sklearn.metrics import accuracy_score, classification_report

# Random forest: oversample the training split (SMOTE seed 1), then fit and report.
X_rf, y_rf = apply_smote(X_train, y_train, seed=1)
rf = RandomForestClassifier(
    n_estimators=150,
    max_depth=15,
    random_state=42,
)
train_and_evaluate(model=rf, name="Random Forest", X_train=X_rf, y_train=y_rf)


In [None]:
# XGBoost: oversample the training split (SMOTE seed 2), then fit and report.
X_xgb, y_xgb = apply_smote(X_train, y_train, seed=2)
xgb = XGBClassifier(
    eval_metric='logloss',
    n_estimators=150,
    max_depth=7,
    random_state=42,
)
train_and_evaluate(model=xgb, name="XGBoost", X_train=X_xgb, y_train=y_xgb)

In [None]:
# Gradient boosting: oversample the training split (SMOTE seed 3), then fit and report.
X_gbc, y_gbc = apply_smote(X_train, y_train, seed=3)
gbc = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
train_and_evaluate(model=gbc, name="Gradient Boosting", X_train=X_gbc, y_train=y_gbc)

In [None]:
# LightGBM: oversample the training split (SMOTE seed 4), then fit and report.
X_lgb, y_lgb = apply_smote(X_train, y_train, seed=4)
lgbm = LGBMClassifier(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
train_and_evaluate(model=lgbm, name="LightGBM", X_train=X_lgb, y_train=y_lgb)


In [None]:
# using optuna for hyperparameter tuning (slow and often does not get you the best ranges, but if you're willing to sacrifice a lot of time it could help.)
# Make sure optuna is installed
import optuna
from sklearn.model_selection import cross_val_score

# Recreate the 80/20 split with the fixed seed, then oversample only the
# training portion with SMOTE.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)


def objective(trial):
    """Optuna objective: mean 3-fold CV accuracy of a gradient-boosting model.

    SMOTE is applied inside each cross-validation fold (through an imblearn
    pipeline) rather than once up front. Cross-validating on data that was
    already oversampled lets synthetic points interpolated from validation
    rows end up in the training folds, which inflates the score.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Mean accuracy over the 3 validation folds of ``X_train``/``y_train``.
    """
    # Local import: the imblearn pipeline resamples during fit only, so the
    # validation fold of every split is scored un-resampled.
    from imblearn.pipeline import Pipeline

    # Suggest hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 100, 600)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.5)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)

    model = GradientBoostingClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        subsample=subsample,
        random_state=42
    )

    resample_then_fit = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('gbc', model),
    ])

    # Use cross-validation for stability; folds are cut from the original
    # (not oversampled) training split.
    score = cross_val_score(resample_then_fit, X_train, y_train, cv=3, scoring='accuracy').mean()
    return score


# Run optimization
# The sampler is seeded so that repeated runs propose the same sequence of
# hyperparameters; without a seed every run explores different trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# study.optimize(objective, n_trials=30) # Try 30 different combos - 15 is the best combo
study.optimize(objective, n_trials=18) # A more practical take

# Show the best result
print("✅ Best Parameters:", study.best_params)



In [None]:
# 📦 Import stacking-related modules
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# noinspection PyUnresolvedReferences
from sklearn.metrics import accuracy_score, classification_report

# Level-0 learners: each (name, estimator) pair becomes one input to the stack.
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier(eval_metric='logloss', random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
    ('lgb', LGBMClassifier(random_state=42)),
]

# Level-1 learner that combines the base models' outputs.
# LogisticRegression is a faster alternative; GradientBoosting is the more
# powerful choice used here.
meta_model = GradientBoostingClassifier(random_state=42)

# Assemble the stacking classifier (5-fold CV, all available cores).
stack = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
)
#  FixMe: Cpt!
# Fit the stack on SMOTE-resampled data - DISABLED: this currently fails with a library version conflict (invalid in this environment)
# stack.fit(X_resampled, y_resampled)

# ✅ Evaluate on untouched test data
# y_pred_stack = stack.predict(X_test)
# print("📊 Stacked Accuracy:", accuracy_score(y_test, y_pred_stack))
# print(classification_report(y_test, y_pred_stack))



In [None]:
# Using SHAP
import shap


# Build and fit the gradient-boosting model that SHAP will explain.
# sklearn's fit() returns the estimator itself, so construction and fitting
# are chained.
model = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    min_samples_leaf=10,
    random_state=42,
).fit(X_train, y_train)

# TreeExplainer only supports tree-based models.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Summary plots on the test split: bar chart first, then the default summary view.
shap.summary_plot(shap_values, X_test, plot_type="bar")
shap.summary_plot(shap_values, X_test)


In [None]:
# threshold tuning
from sklearn.metrics import precision_recall_curve
import numpy as np

# Positive-class probabilities (column 1 of predict_proba) for the test split.
probs = model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, probs)

# Find best F1 threshold
# Guard the division: where precision + recall == 0 the plain formula yields
# NaN, and np.argmax would select that NaN entry. Those points get F1 = 0.
pr_sum = precision + recall
f1_scores = np.divide(
    2 * (precision * recall),
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)
# precision/recall carry one more entry than thresholds (the final point has
# no associated threshold), so the argmax is limited to the entries that do.
best_threshold = thresholds[np.argmax(f1_scores[:-1])]
# NOTE: the threshold is selected on the test split, so metrics computed on
# that same split with this threshold are optimistic.
print("🔍 Best Threshold:", best_threshold)



In [None]:
# Use the best threshold to make final predictions.
# `best_threshold` is used directly instead of a hard-coded copy of its value,
# so the predictions stay in sync when the data or model changes. `>=` matches
# precision_recall_curve, which counts scores equal to the threshold as positive.
final_preds = (probs >= best_threshold).astype(int)

# Evaluate again
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("✅ Accuracy (Custom Threshold):", accuracy_score(y_test, final_preds))
print(classification_report(y_test, final_preds))
print(confusion_matrix(y_test, final_preds))


In [None]:
# shap dropping unimportant features
import shap
from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient-boosting model on the SMOTE-resampled training data.
# sklearn's fit() returns the estimator itself, so construction and fitting
# are chained.
model = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.05,
    random_state=42,
).fit(X_resampled, y_resampled)

# SHAP values are computed for every row of the full feature matrix X.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Bar chart of per-feature importance.
shap.summary_plot(shap_values, X, plot_type="bar")


In [None]:
import numpy as np
import pandas as pd

# Per-feature importance = mean of |SHAP value| taken over axis 0.
shap_abs_mean = np.abs(shap_values).mean(axis=0)

# One row per feature, most important first.
shap_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': shap_abs_mean})
    .sort_values(by='importance', ascending=False)
)

print(shap_importance)


In [None]:
# Number of highest-ranked features to keep (adjust as needed).
top_k = 10
# shap_importance is already sorted descending, so the first top_k rows are
# the most important features.
top_features = shap_importance['feature'].iloc[:top_k].tolist()

# Restrict the feature matrix to the selected columns.
X_top = X[top_features]


In [None]:
# Split
# Same test_size and seed as before, now on the reduced feature set.
# Note: this rebinds X_train / X_test / y_train / y_test for the rest of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=42)

# Train again
model = GradientBoostingClassifier(n_estimators=300, learning_rate=0.05, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_probs = model.predict_proba(X_test)[:, 1]
# Reuse the tuned `best_threshold` rather than a hard-coded copy of its value,
# so this stays in sync with the threshold-tuning cell. `>=` matches
# precision_recall_curve, which counts scores equal to the threshold as positive.
# NOTE(review): the threshold was tuned for a different model; consider
# re-tuning it for this retrained one.
y_preds = (y_probs >= best_threshold).astype(int)

from sklearn.metrics import accuracy_score, classification_report

print("Accuracy (retrained):", accuracy_score(y_test, y_preds))
print(classification_report(y_test, y_preds))


Rdd13r grade is A, not A+ - version-specific changes required.