<a href="https://colab.research.google.com/github/NetaYinon26/projects/blob/main/Ensemble_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Predictions from the FLAML, TPOT, and H2O AutoML models**

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import shap
from sklearn.ensemble import RandomForestClassifier


# FLAML AutoML
from flaml import AutoML

# TPOT AutoML
from tpot import TPOTClassifier

# H2O AutoML
import h2o
from h2o.automl import H2OAutoML

# ==============
# loading data
# ==============
# Column names shared by every later step: the label to predict and the row
# identifier, which is excluded from the model features.
TARGET = 'smoking'
ID_COL = 'id'

train_df, test_df = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

# data pre-processing
# y is the label column; X / X_test hold every remaining column except the id.
y = train_df[TARGET]
X = train_df.drop(columns=[TARGET, ID_COL])
X_test = test_df.drop(columns=ID_COL)

# ==============
# FLAML AutoML
# ==============
# Keyword arguments forwarded verbatim to AutoML.fit: search budget,
# optimisation metric, task type, log destination and a fixed seed.
flaml_settings = dict(
    time_budget=3600,
    metric="roc_auc",
    task="classification",
    log_file_name="flaml_log.log",
    seed=42,
)

flaml_automl = AutoML()
flaml_automl.fit(X, y, **flaml_settings)

print(f"FLAML Best Hyperparameters: {flaml_automl.best_config}")
print(f"FLAML Best Model: {flaml_automl.best_estimator}")

# prediction for train and test set
# Column 1 of predict_proba is kept as the score for each row.
flaml_train_preds, flaml_test_preds = (
    flaml_automl.predict_proba(features)[:, 1] for features in (X, X_test)
)

# AUC for training set
# Scored on the same rows the model was fitted on, so this is not a
# held-out estimate.
flaml_auc = roc_auc_score(y, flaml_train_preds)
print(f"FLAML AUC (train): {flaml_auc:.4f}")

# ==============
# TPOT AutoML
# ==============
# Search configuration passed to the TPOTClassifier constructor.
_tpot_options = {
    "generations": 3,
    "population_size": 30,
    "verbosity": 2,
    "scoring": 'roc_auc',
    "random_state": 42,
    "n_jobs": -1,
}
tpot = TPOTClassifier(**_tpot_options)

tpot.fit(X, y)
print(f"TPOT Best Pipeline: {tpot.fitted_pipeline_}")

# prediction for train and test set
# Column 1 of predict_proba is kept as the score for each row.
tpot_train_preds, tpot_test_preds = (
    tpot.predict_proba(features)[:, 1] for features in (X, X_test)
)

# AUC for training set
# Scored on the same rows the pipeline was fitted on, so this is not a
# held-out estimate.
tpot_auc = roc_auc_score(y, tpot_train_preds)
print(f"TPOT AUC (train): {tpot_auc:.4f}")


# ==============
# H2O AutoML
# ==============
h2o.init()

# Copy both pandas frames into H2O frames; the training target is converted
# to a factor (categorical) column before AutoML is run.
train_h2o = h2o.H2OFrame(train_df)
train_h2o[TARGET] = train_h2o[TARGET].asfactor()
test_h2o = h2o.H2OFrame(test_df)

# Train on the same feature columns the other models use (id and target
# excluded), with a fixed seed and a runtime cap.
_h2o_feature_names = X.columns.tolist()
aml = H2OAutoML(max_runtime_secs=3600, seed=42)
aml.train(x=_h2o_feature_names, y=TARGET, training_frame=train_h2o)

# prediction for train and test set
# The 'p1' column of the leader's prediction frame is used as the score.
h2o_train_preds, h2o_test_preds = (
    aml.leader.predict(frame).as_data_frame()['p1'].values
    for frame in (train_h2o, test_h2o)
)

# AUC for training set
# Scored on the same rows the leader was trained on, so this is not a
# held-out estimate.
h2o_auc = roc_auc_score(y, h2o_train_preds)
print(f"H2O AUC (train): {h2o_auc:.4f}")

print("H2O Best Model:", aml.leader.algo)
print("Hyperparameters of the best model:")
print(aml.leader.params)


# **Creating a weighted-voting ensemble from the three models' predictions**

In [None]:
# Soft-voting blend: each model's probabilities are weighted by that model's
# AUC (flaml_auc / tpot_auc / h2o_auc) and normalised by the sum of the weights.
# NOTE(review): those AUCs are measured on the training data, so a model that
# overfits receives a larger weight -- consider held-out scores instead.
def _auc_weighted_blend(flaml_proba, tpot_proba, h2o_proba):
    """Return the AUC-weighted mean of the three models' probabilities."""
    weighted_sum = (
        flaml_proba * flaml_auc
        + tpot_proba * tpot_auc
        + h2o_proba * h2o_auc
    )
    return weighted_sum / (flaml_auc + tpot_auc + h2o_auc)


# computing test predictions according to train AUC score of each model
combined_proba = _auc_weighted_blend(
    flaml_test_preds, tpot_test_preds, h2o_test_preds
)

# prediction for ensemble training set
combined_train_proba = _auc_weighted_blend(
    flaml_train_preds, tpot_train_preds, h2o_train_preds
)

combined_train_proba_auc = roc_auc_score(y, combined_train_proba)
print("combined train auc: ", combined_train_proba_auc)

# Submission frame for the blended test predictions: one row per test id.
_ensemble_columns = {ID_COL: test_df[ID_COL], TARGET: combined_proba}
submission_combined_proba = pd.DataFrame(_ensemble_columns)

# ==============
# submission files
# ==============

# One submission frame per individual model: test ids next to that model's
# predicted scores.
submission_flaml = pd.DataFrame({ID_COL: test_df[ID_COL], TARGET: flaml_test_preds})
submission_tpot = pd.DataFrame({ID_COL: test_df[ID_COL], TARGET: tpot_test_preds})
submission_h2o = pd.DataFrame({ID_COL: test_df[ID_COL], TARGET: h2o_test_preds})

# Write the three single-model files first, then the ensemble file, all
# without the pandas index column.
for _frame, _csv_name in (
    (submission_flaml, 'submission_flaml.csv'),
    (submission_tpot, 'submission_tpot.csv'),
    (submission_h2o, 'submission_h2o.csv'),
    (submission_combined_proba, 'submission_combined_proba.csv'),
):
    _frame.to_csv(_csv_name, index=False)

print("Saved submission_combined_proba.csv successfully!")

print("Saved all submission files successfully!")


# **SHAP Analysis with KernelExplainer**

In [None]:
# 1) Prediction wrapper handed to KernelExplainer, which needs a plain callable
def model_predict(data):
    """Score rows with the fitted FLAML model.

    Args:
        data: Feature rows whose columns are in the same order as ``X``;
            they are re-wrapped in a DataFrame so the model sees the
            training column names.

    Returns:
        Column 1 of ``flaml_automl.predict_proba`` for every input row.
    """
    named_rows = pd.DataFrame(data, columns=X.columns)
    class_probabilities = flaml_automl.predict_proba(named_rows)
    return class_probabilities[:, 1]

# 2) Background data: 100 randomly sampled training rows (fixed seed)
X_background = X.sample(n=100, random_state=42)

# 3) KernelExplainer built from the prediction wrapper and the background rows
explainer = shap.KernelExplainer(model_predict, X_background)

# 4) SHAP values for a 200-row sample of the training features; both the
#    sample size and nsamples=100 are kept small to limit computation time
X_shap_eval = X.sample(n=200, random_state=42)
shap_values = explainer.shap_values(X_shap_eval, nsamples=100)

# (Optional) Load the JS needed for interactive visuals in Jupyter
shap.initjs()

# 5) Local explanations: one waterfall plot for each of the first three rows
#    of the evaluation sample
sample_indices = [0, 1, 2]
for idx in sample_indices:
    row = X_shap_eval.iloc[idx]
    explanation = shap.Explanation(
        values=shap_values[idx],
        base_values=explainer.expected_value,
        data=row,
        feature_names=X.columns
    )
    shap.plots.waterfall(explanation)

# 6) Global feature importance: summary plot over the whole evaluation sample
shap.summary_plot(shap_values, X_shap_eval, feature_names=X.columns)

# 7) Rank features by mean absolute SHAP value and keep the five largest,
#    most important first
mean_abs_shap = np.abs(shap_values).mean(axis=0)
top5_indices = np.argsort(mean_abs_shap)[::-1][:5]
top5_features = X.columns[top5_indices]
print("Top 5 Important Features:", top5_features.tolist())

# 8) Dependence plot for each top feature, with a custom y-axis label and title
for feat in top5_features:
    # interaction_index=None turns interaction colouring off (SHAP only picks
    # an interaction feature automatically when this is "auto").
    # show=False defers rendering so the labels below can be applied first.
    shap.dependence_plot(
        feat, shap_values, X_shap_eval, interaction_index=None, show=False
    )
    plt.ylabel("SHAP Value")
    plt.title(f"SHAP Dependence Plot for {feat}")
    plt.show()