In [None]:
# SMOTE-NC: oversample the training split; the test split is only copied.
from imblearn.over_sampling import SMOTENC

# Positional indices handed to SMOTENC as the categorical features
# (presumably column positions in X_train -- TODO confirm against the frame).
sm = SMOTENC(
    categorical_features=[0, 1, 3, 4, 5, 6, 17, 18, 19, 20, 21],
    random_state=42,
)
X_res, y_res = sm.fit_resample(X_train, y_train)

# Test data is never resampled: plain copies under the "_res_test" names.
X_res_test, y_res_test = X_test.copy(), y_test.copy()

# Fraction of resampled labels equal to 1 (printed as a ratio, not multiplied by 100).
positive_share = np.sum(y_res == 1) / len(y_res)
print("Percentage of 'Yes' in y_res: ", positive_share)

# Column positions earmarked for integer rounding of synthetic values.
# The rounding itself is currently disabled:
#   for col in discrete_columns:
#       X_res.iloc[:, col] = np.round(X_res.iloc[:, col])
discrete_columns = [8, 10, 11, 12, 13, 14, 15]

# Display the resampled training frame as the cell output.
X_res

In [None]:
# Correlation-based feature pruning.
# For every pair of numeric columns in X_res whose absolute correlation
# (rounded to 2 decimals) reaches the threshold, one column of the pair is
# dropped from both X_res and X_res_test.

# Name of the correlation column in the long-format table, and the cut-off used.
CORR_COL = 'Taux de correlation'
CORR_THRESHOLD = 0.5

# Only numeric/bool columns can be correlated. Calling DataFrame.corr() on a
# frame that still holds object columns raises in pandas >= 2.0, so the
# selection is made explicit here instead of relying on the default.
corr = X_res.select_dtypes(include=["number", "bool"]).corr()

# Mask the upper triangle AND the diagonal: each pair is then listed once and
# self-correlations (always 1) never enter the table.
triangle_mask = np.triu(np.ones_like(corr, dtype=bool))

# Long format: one row per remaining (row label, column label) pair.
new_correlation = corr.mask(triangle_mask).stack().reset_index()
new_correlation.columns = ['Variable 1', 'Variable 2', CORR_COL]

# dropna removes any masked/undefined cells that stack() may have kept;
# values are rounded to 2 decimals, made absolute, then sorted strongest first.
new_correlation = (
    new_correlation
    .dropna(subset=[CORR_COL])
    .assign(**{CORR_COL: lambda frame: frame[CORR_COL].round(2).abs()})
    .sort_values(by=CORR_COL, ascending=False)
)

# Unique variables in the "Variable 2" column whose correlation reaches the
# threshold. Because only the lower triangle is kept, "Variable 2" is the
# member of each pair that appears earlier in the column order.
unique_variables = new_correlation.loc[
    new_correlation[CORR_COL] >= CORR_THRESHOLD, 'Variable 2'
].unique()

# Apply the same column removal to the training and the test frame.
X_res = X_res.drop(columns=unique_variables)
X_res_test = X_res_test.drop(columns=unique_variables)

In [None]:
# Train an XGBoost classifier on the resampled data and plot the per-round
# classification error for the training and test evaluation sets.

# Cast every object column of X_res to category. The matching test column
# reuses the TRAINING CategoricalDtype so both frames share one
# label -> code mapping. Casting each frame on its own derives the categories
# from that frame's values, so the same label can receive different codes when
# the two frames do not contain identical label sets.
# Test labels that never occur in training become NaN under this cast.
for col in X_res.select_dtypes(include=['object']).columns:
    X_res[col] = X_res[col].astype('category')
    X_res_test[col] = X_res_test[col].astype(X_res[col].dtype)

# Retrain the model.
# eval_metric is configured on the estimator: passing it to fit() is deprecated
# in the XGBoost scikit-learn API and rejected by recent releases.
# NOTE(review): tree_method="gpu_hist" is itself deprecated in XGBoost 2.x
# (device="cuda" with tree_method="hist") -- confirm the installed version
# before changing it.
eval_metric = ["auc", "error"]
xgb_smotenc = XGBClassifier(
    n_estimators=900,
    enable_categorical=True,
    tree_method="gpu_hist",
    eval_metric=eval_metric,
)

# Evaluation sets in order: training data first, test data second.
eval_set_smotenc = [(X_res, y_res), (X_res_test, y_res_test)]
xgb_smotenc.fit(X_res, y_res, eval_set=eval_set_smotenc)

# Predictions on the training frame.
# NOTE(review): confirm whether test-set predictions (X_res_test) were intended.
xgb_smotenc_pred = xgb_smotenc.predict(X_res)

# Plot error: 'validation_0' is the first eval_set entry (train),
# 'validation_1' the second (test).
results = xgb_smotenc.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)
fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['error'], label='Train')
ax.plot(x_axis, results['validation_1']['error'], label='Test')
ax.legend()
ax.set_ylabel('error')
ax.set_title('Model Classification Error')
plt.show()

In [None]:
# ROC curve: the trained XGBoost model against a constant "no skill" baseline,
# both evaluated on the test split.
from sklearn.metrics import roc_auc_score, roc_curve
from matplotlib import pyplot

# Baseline: every sample gets score 0 (always predicts the same class).
ns_probs = [0] * len(y_res_test)
ns_auc = roc_auc_score(y_res_test, ns_probs)

# Model: keep only the second predict_proba column (positive outcome).
xgb_probs = xgb_smotenc.predict_proba(X_res_test)[:, 1]
xgb_auc = roc_auc_score(y_res_test, xgb_probs)

# Report both areas under the curve.
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('XGB: ROC AUC=%.3f' % (xgb_auc))

# False/true positive rates for each scorer; the thresholds are discarded.
ns_fpr, ns_tpr, _ = roc_curve(y_res_test, ns_probs)
xgb_fpr, xgb_tpr, _ = roc_curve(y_res_test, xgb_probs)

# Draw the baseline as a dashed line and the model with point markers.
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(xgb_fpr, xgb_tpr, marker='.', label='XGB')

# Axis labels, legend, then render.
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()