Gradient Boosting Machines (GBMs) with neural network backbones

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Import Necessary Libraries

In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

Load and Prepare the Dataset

In [3]:
# # Load your dataset
# clean_df = pd.read_csv('clean_df.csv')
# clean_df.head(5)

file_path = '/content/drive/MyDrive/DU_AI_Bootcamp/23_FinalProject/House_of_Hope/clean_df.csv'
clean_df = pd.read_csv(file_path)
clean_df.head()

# Separate features and target
X = clean_df.drop('REASON', axis=1)
y = clean_df['REASON']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


Create DMatrix for XGBoost

In [4]:
# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)


Initial parameter setting

In [7]:
# Initial parameter setting
params = {
    'objective': 'binary:logistic',
    'max_depth': 6,
    'learning_rate': 0.1,
    #'n_estimators': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'auc'
}


Cross-validation to find the best number of boosting rounds

In [14]:
# Cross-validation to find the best number of boosting rounds
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=1000,
    nfold=5,
    metrics='auc',
    early_stopping_rounds=10,
    as_pandas=True,
    seed=42
)

# Best number of boosting rounds
best_num_boost_round = cv_results['test-auc-mean'].idxmax()
print(f'Best number of boosting rounds: {best_num_boost_round}')


Best number of boosting rounds: 999


Train final model with the best number of boosting rounds

In [15]:
# Train final model with the best number of boosting rounds
# model = xgb.train(
#     params,
#     dtrain,
#     num_boost_round=best_num_boost_round
# )
evals = [(dtrain, 'train'), (dtest, 'eval')]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=best_num_boost_round,
    evals=evals,
    early_stopping_rounds=10
)


[0]	train-auc:0.76687	eval-auc:0.76663
[1]	train-auc:0.81327	eval-auc:0.81353
[2]	train-auc:0.82144	eval-auc:0.82171
[3]	train-auc:0.82488	eval-auc:0.82529
[4]	train-auc:0.82828	eval-auc:0.82872
[5]	train-auc:0.83333	eval-auc:0.83375
[6]	train-auc:0.83403	eval-auc:0.83448
[7]	train-auc:0.83410	eval-auc:0.83454
[8]	train-auc:0.83572	eval-auc:0.83613
[9]	train-auc:0.83738	eval-auc:0.83776
[10]	train-auc:0.83867	eval-auc:0.83900
[11]	train-auc:0.83966	eval-auc:0.83995
[12]	train-auc:0.84182	eval-auc:0.84208
[13]	train-auc:0.84235	eval-auc:0.84261
[14]	train-auc:0.84335	eval-auc:0.84358
[15]	train-auc:0.84563	eval-auc:0.84588
[16]	train-auc:0.84643	eval-auc:0.84666
[17]	train-auc:0.84756	eval-auc:0.84779
[18]	train-auc:0.84832	eval-auc:0.84855
[19]	train-auc:0.84928	eval-auc:0.84952
[20]	train-auc:0.85059	eval-auc:0.85081
[21]	train-auc:0.85106	eval-auc:0.85126
[22]	train-auc:0.85201	eval-auc:0.85222
[23]	train-auc:0.85247	eval-auc:0.85265
[24]	train-auc:0.85322	eval-auc:0.85337
[25]	train

Make predictions

In [16]:
# Make predictions
y_pred = model.predict(dtest)
predictions = [round(value) for value in y_pred]


Evaluate model

In [17]:
from sklearn.metrics import accuracy_score

# Evaluate model
accuracy = accuracy_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')


Accuracy: 0.8228
ROC AUC: 0.9020


Plot training results

In [21]:
# Plot training results
import matplotlib.pyplot as plt

results = model.evals_result()
epochs = len(results['validation_0']['auc'])
x_axis = range(0, epochs)

fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['auc'], label='Train')
ax.plot(x_axis, results['validation_1']['auc'], label='Test')
ax.legend()
plt.ylabel('AUC')
plt.title('XGBoost AUC')
plt.show()


NameError: name 'results' is not defined

Grid Search for Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective='binary:logistic', eval_metric='auc'),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    verbose=1
)

grid_search.fit(X_train, y_train)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best ROC AUC: {grid_search.best_score_}')

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])
print(f'Test ROC AUC: {roc_auc:.4f}')



Fitting 5 folds for each of 324 candidates, totalling 1620 fits


# STOP HERE

Train Gradient Boosting Machine

In [None]:
# Initialize and train Gradient Boosting Classifier
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbm.fit(X_train, y_train)

# Make predictions with GBM
gbm_train_preds = gbm.predict_proba(X_train)[:, 1]
gbm_test_preds = gbm.predict_proba(X_test)[:, 1]

Prepare Data for Neural Network

In [None]:
# Combine original features with GBM predictions
X_train_nn = np.hstack((X_train, gbm_train_preds.reshape(-1, 1)))
X_test_nn = np.hstack((X_test, gbm_test_preds.reshape(-1, 1)))

In [None]:
# One-hot encode the target variable
y_train_nn = to_categorical(y_train)
y_test_nn = to_categorical(y_test)

Build and Train the Neural Network

In [None]:
# Define the neural network model
model = Sequential()
model.add(Dense(64, input_dim=X_train_nn.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='softmax'))  # categorical_crossentropy
#model.add(Dense(1, activation='sigmoid'))  # Assuming binary classification
# model.add(Dense(1, activation='softmax'))  # Assuming binary classification with softmax

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_nn, y_train_nn, epochs=10, batch_size=32, validation_data=(X_test_nn, y_test_nn))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7be23bc04fa0>

Evaluate the Model

In [None]:
# Evaluate the model on the test data
y_pred_nn = model.predict(X_test_nn)
y_pred_nn_classes = np.argmax(y_pred_nn, axis=1)
print(y_pred_nn)

[[0.99071026 0.0092898 ]
 [0.72661775 0.27338213]
 [0.7915172  0.2084829 ]
 ...
 [0.02831719 0.97168285]
 [0.08901953 0.91098046]
 [0.02422837 0.9757716 ]]


Calculate Accuracy

In [None]:
from sklearn.metrics import accuracy_score
# Calculate accuracy
# accuracy = accuracy_score(y_test, y_pred_nn)
# print('Accuracy:', accuracy)
# Assuming y_test needs to be converted to binary format
y_test_binary = np.where(y_test > 0.5, 1, 0) # Convert probabilities to binary (adjust threshold if needed)


# Calculate accuracy using the binary version of y_test
accuracy = accuracy_score(y_test_binary, y_pred_nn_classes)
print('Accuracy:', accuracy)

Accuracy: 0.8125955721287221


In [None]:
# # Classification report and ROC AUC score
# print(classification_report(y_test, y_pred_nn_classes))
# print('ROC AUC Score:', roc_auc_score(y_test, y_pred_nn[:, 1]))

# Classification report and ROC AUC score
print(classification_report(y_test, y_pred_nn_classes))
print('ROC AUC Score:', roc_auc_score(y_test_nn[:, 1], y_pred_nn[:, 1]))

              precision    recall  f1-score   support

           0       0.78      0.68      0.73    472472
           1       0.83      0.89      0.86    815822

    accuracy                           0.81   1288294
   macro avg       0.80      0.79      0.79   1288294
weighted avg       0.81      0.81      0.81   1288294

ROC AUC Score: 0.8921170350566425


In [None]:
# confusion_matrix for predicted values
conf_matrix = confusion_matrix(y_test, y_pred_nn_classes)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[322645 149827]
 [ 91605 724217]]


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Define the confusion matrix
conf_matrix = np.array([[322645, 149827], [91605, 724217]])

# Create the heatmap
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'])

# Add labels and title
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix Heatmap')
plt.show()



In [None]:
# mse, rmse, rsquared, mae # between y_true_value(actuals) and y_predict

In [None]:
# class_validations report # How the model is performing (CV scores)
from sklearn.model_selection import cross_val_score

# Perform cross-validation
cv_scores = cross_val_score(gbm, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Score:", cv_scores.mean())

Cross-Validation Scores: [0.76917924 0.71932028 0.76780766 0.75125088 0.77515829]
Mean Cross-Validation Score: 0.7565432696509167


Save the GBM Model

In [None]:
# Save the model
joblib.dump(gbm, '/content/drive/MyDrive/DU_AI_Bootcamp/23_FinalProject/House_of_Hope/gbm_model_classified_crossentropy.pkl')

['/content/drive/MyDrive/DU_AI_Bootcamp/23_FinalProject/House_of_Hope/gbm_model_classified_crossentropy.pkl']

Load the Saved GBM Model

In [None]:
# Load the model
gbm_loaded = joblib.load('/content/drive/MyDrive/DU_AI_Bootcamp/23_FinalProject/House_of_Hope/gbm_model_classified_crossentropy.pkl')

# Verify the loaded model
print(gbm_loaded.predict(X_test[:5]))  # Predict using the loaded model to verify

[0 1 0 0 1]


Summary


*   Load and preprocess the data: Normalize features and split the dataset.
*   Train a Gradient Boosting Machine (GBM): Use the GBM to predict probabilities.
*   Prepare data for the neural network: Combine the original features with GBM predictions and one-hot encode the target.
*   Build and train a neural network: Use Keras to define, compile, and train the neural network.
*   Evaluate the model: Assess the performance using classification metrics and ROC AUC score.

This approach leverages the strengths of both GBMs and neural networks, potentially leading to better performance on complex datasets.