In [515]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, log_loss, brier_score_loss
import xgboost as xgb
from sklearn.calibration import CalibratedClassifierCV
import warnings

from sklearn.preprocessing import label_binarize


warnings.filterwarnings('ignore')


In [516]:
# Load the dataset
data = pd.read_csv('EPL_Updated.csv')

# Display the first few rows of the dataset to understand its structure
data.tail()


Unnamed: 0,season,week,date,home_team,home_xg,score,away_xg,away_team,referee,game_id,...,away_points_to_date,home_form,away_form,match_result,League Division,Home Team,Away Team,Pinnacle Closing Home Win Odds,Pinnacle Closing Draw Odds,Pinnacle Closing Away Win Odds
1895,2324,38,2024-05-19,crystal palace,2.5,5-0,0.9,aston villa,Darren Bond,c975c7a6,...,68,2.6,1.6,0,E0,Crystal Palace,Aston Villa,1.63,4.64,4.99
1896,2324,38,2024-05-19,liverpool,4.5,2-0,0.5,wolves,Chris Kavanagh,d4823ed5,...,46,1.6,0.6,0,E0,Liverpool,Wolves,1.13,10.69,16.27
1897,2324,38,2024-05-19,luton town,2.0,2-4,1.1,fulham,Matt Donohue,0fde9d70,...,44,0.2,1.0,2,E0,Luton,Fulham,3.11,3.99,2.19
1898,2324,38,2024-05-19,manchester city,1.9,3-1,0.4,west ham,John Brooks,29335211,...,52,3.0,0.8,0,E0,Man City,West Ham,1.07,15.01,27.84
1899,2324,38,2024-05-19,sheffield utd,1.0,0-3,3.1,tottenham,Andy Madley,273a89b4,...,63,0.0,0.6,2,E0,Sheffield United,Tottenham,7.99,6.11,1.35


In [517]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1900 entries, 0 to 1899
Data columns (total 39 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   season                          1900 non-null   int64  
 1   week                            1900 non-null   int64  
 2   date                            1900 non-null   object 
 3   home_team                       1900 non-null   object 
 4   home_xg                         1900 non-null   float64
 5   score                           1900 non-null   object 
 6   away_xg                         1900 non-null   float64
 7   away_team                       1900 non-null   object 
 8   referee                         1900 non-null   object 
 9   game_id                         1900 non-null   object 
 10  home_team_elo                   1900 non-null   float64
 11  away_team_elo                   1900 non-null   float64
 12  home_starters                   19

In [518]:
# Combine X and y into a single dataframe to drop rows with missing values together
data_cleaned = data.drop(columns=['week','home_xg', 'score', 'away_xg','game_id','home_match_points', 'away_match_points','home_goals_scored', 'away_goals_scored', 'referee', 'date','home_team', 'away_team', 'League Division', 'Home Team', 'Away Team', 'home_starters', 'away_starters' ]).dropna()


In [519]:
data_cleaned.tail()

Unnamed: 0,season,home_team_elo,away_team_elo,home_team_strength,away_team_strength,home_xG_to_date,away_xG_to_date,home_xG_against_to_date,away_xG_against_to_date,home_goals_scored_to_date,...,home_goals_conceded_to_date,away_goals_conceded_to_date,home_points_to_date,away_points_to_date,home_form,away_form,match_result,Pinnacle Closing Home Win Odds,Pinnacle Closing Draw Odds,Pinnacle Closing Away Win Odds
1895,2324,1742.560303,1787.528564,0.387,0.387,46.2,62.6,51.3,57.5,52,...,58,56,46,68,2.6,1.6,0,1.63,4.64,4.99
1896,2324,1897.319702,1681.174927,1.122,0.225,83.3,46.5,45.3,63.2,84,...,41,63,79,46,1.6,0.6,0,1.13,10.69,16.27
1897,2324,1572.322388,1707.862427,-0.017,0.207,40.3,49.7,76.7,61.0,50,...,81,59,26,44,0.2,1.0,2,3.11,3.99,2.19
1898,2324,2048.724609,1728.056885,1.685,0.455,78.7,52.1,35.0,69.4,93,...,33,71,88,52,3.0,0.8,0,1.07,15.01,27.84
1899,2324,1522.744995,1784.94458,0.0,0.512,37.6,64.9,73.6,62.5,35,...,101,61,16,63,0.0,0.6,2,7.99,6.11,1.35


In [520]:
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1900 entries, 0 to 1899
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   season                          1900 non-null   int64  
 1   home_team_elo                   1900 non-null   float64
 2   away_team_elo                   1900 non-null   float64
 3   home_team_strength              1900 non-null   float64
 4   away_team_strength              1900 non-null   float64
 5   home_xG_to_date                 1900 non-null   float64
 6   away_xG_to_date                 1900 non-null   float64
 7   home_xG_against_to_date         1900 non-null   float64
 8   away_xG_against_to_date         1900 non-null   float64
 9   home_goals_scored_to_date       1900 non-null   int64  
 10  away_goals_scored_to_date       1900 non-null   int64  
 11  home_goals_conceded_to_date     1900 non-null   int64  
 12  away_goals_conceded_to_date     19

In [521]:
# Split the data by season using integer comparison
train_data = data_cleaned[data_cleaned['season'] < 2324]
test_data = data_cleaned[data_cleaned['season'] == 2324]

# Drop the season column as it is no longer needed
train_data = train_data.drop(['season'], axis=1)
test_data = test_data.drop(['season'], axis=1)

# Separate features and target variables
X_train = train_data.drop(['match_result'], axis=1)
y_train = train_data['match_result']
X_test = test_data.drop(['match_result'], axis=1)
y_test = test_data['match_result']

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [522]:

# Separate X and y after dropping missing values
#X = train_data.drop('match_result', axis=1)
#y = test_data['match_result']



In [523]:
# Standardize the features
#scaler = StandardScaler()
#X_scaled = scaler.fit_transform(X)


In [524]:
# Split the dataset into training set and test set using stratified sampling
#X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)


In [525]:
# Initialize the XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', objective='multi:softprob', num_class=3)

# Train the model
xgb_model.fit(X_train, y_train)


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_class=3, num_parallel_tree=None, objective='multi:softprob', ...)

In [526]:
# Predict on the test set
y_pred = xgb_model.predict(X_test_scaled)
y_prob = xgb_model.predict_proba(X_test_scaled)

# Generate a classification report
class_report = classification_report(y_test, y_pred, target_names=['Home Win', 'Away Win', 'Draw'])
print(class_report)

# Calculate the ROC AUC score
roc_auc = roc_auc_score(label_binarize(y_test, classes=[0, 1, 2]), y_prob, multi_class="ovr", average="macro")
print(f"ROC AUC Score: {roc_auc:.4f}")

# Calculate Log Loss
log_loss_score = log_loss(y_test, y_prob)
print(f"Log Loss: {log_loss_score:.4f}")

# Calculate Brier Score for each class
brier_scores = []
for i in range(y_prob.shape[1]):
    y_test_bin_class = (y_test == i).astype(int)
    brier_score_class = brier_score_loss(y_test_bin_class, y_prob[:, i])
    brier_scores.append(brier_score_class)

mean_brier_score = np.mean(brier_scores)
print(f"Mean Brier Score: {mean_brier_score:.4f}")


              precision    recall  f1-score   support

    Home Win       0.50      0.75      0.60       175
    Away Win       0.24      0.05      0.08        82
        Draw       0.51      0.41      0.45       123

    accuracy                           0.49       380
   macro avg       0.41      0.40      0.38       380
weighted avg       0.44      0.49      0.44       380

ROC AUC Score: 0.5897
Log Loss: 1.1192
Mean Brier Score: 0.2158


# Perform stratified cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
roc_auc_scores = []

for train_index, test_index in skf.split(X_test_scaled, y_test):
    X_train_cv, X_test_cv = X_test_scaled[train_index], X_test_scaled[test_index]
    y_train_cv, y_test_cv = y_test[train_index], y_test[test_index]
    
    xgb_model_cv = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', objective='multi:softprob', num_class=3)
    xgb_model_cv.fit(X_train_cv, y_train_cv)
    
    y_prob_cv = xgb_model_cv.predict_proba(X_test_cv)
    roc_auc_cv = roc_auc_score(label_binarize(y_test_cv, classes=[0, 1, 2]), y_prob_cv, multi_class="ovr", average="macro")
    roc_auc_scores.append(roc_auc_cv)

print("Cross-Validated ROC AUC Scores:", roc_auc_scores)
print(f"Mean ROC AUC Score: {np.mean(roc_auc_scores):.4f}")


## Ensemble Modeling ##

In [527]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import warnings


warnings.filterwarnings('ignore')

In [528]:
from sklearn.metrics import make_scorer, roc_auc_score

# Define a custom scoring function for multiclass ROC AUC
def multiclass_roc_auc_score(y_true, y_pred, average="macro"):
    y_true = label_binarize(y_true, classes=[0, 1, 2])
    return roc_auc_score(y_true, y_pred, average=average, multi_class="ovr")

# Create a scorer using make_scorer
roc_auc_scorer = make_scorer(multiclass_roc_auc_score, needs_proba=True)


In [529]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Split the data by season
train_data = data_cleaned[data_cleaned['season'] < 2324]
test_data = data_cleaned[data_cleaned['season'] == 2324]

# Prepare features and target variables
X_train = train_data.drop(['match_result'], axis=1)
y_train = train_data['match_result']
X_test = test_data.drop(['match_result'], axis=1)
y_test = test_data['match_result']

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [530]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Define the parameter grid for XGBoost
param_grid_xgb = {
    'n_estimators': [80],
    'learning_rate': [0.00001],
    'max_depth': [2],
    'subsample': [0.2],
    'colsample_bytree': [.8]
}

# Initialize the XGBoost model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', objective='multi:softprob', num_class=3)

# Initialize GridSearchCV for XGBoost with the custom scorer
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, cv=5, scoring=roc_auc_scorer, verbose=1, n_jobs=-1)

# Fit GridSearchCV
grid_search_xgb.fit(X_train_scaled, y_train)


# Get the best parameters and the best model
best_xgb_model = grid_search_xgb.best_estimator_
print("Best XGBoost Parameters:", grid_search_xgb.best_params_)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best XGBoost Parameters: {'colsample_bytree': 0.8, 'learning_rate': 1e-05, 'max_depth': 2, 'n_estimators': 80, 'subsample': 0.2}


In [531]:
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [90, 100, 120, 130, 150],
    'max_depth': [2,3],
    'min_samples_split': [2,3,4,5,6],
    'min_samples_leaf': [3,4,5,6,7,8],
    'bootstrap': [True]
}

# Initialize the Random Forest model
rf_model = RandomForestClassifier()

# Initialize GridSearchCV for Random Forest with the custom scorer
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv=5, scoring=roc_auc_scorer, verbose=1, n_jobs=-1)

# Fit GridSearchCV
grid_search_rf.fit(X_train_scaled, y_train)

# Get the best parameters and the best model
best_rf_model = grid_search_rf.best_estimator_
print("Best Random Forest Parameters:", grid_search_rf.best_params_)


Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Best Random Forest Parameters: {'bootstrap': True, 'max_depth': 2, 'min_samples_leaf': 8, 'min_samples_split': 2, 'n_estimators': 120}


In [532]:
from sklearn.linear_model import LogisticRegression

# Define the parameter grid for Logistic Regression
param_grid_lr = {
    'C': [0.0001, 0.001, 0.01],
    'solver': ['lbfgs'],
    'penalty': ['l2'],  # L1 penalty can be used with 'liblinear'
    'max_iter': [100000, 50000, 30000]
}

# Initialize the Logistic Regression model
logistic_model = LogisticRegression(multi_class='multinomial')

# Initialize GridSearchCV for Logistic Regression with the custom scorer
grid_search_lr = GridSearchCV(estimator=logistic_model, param_grid=param_grid_lr, cv=5, scoring=roc_auc_scorer, verbose=1, n_jobs=-1)

# Fit GridSearchCV
grid_search_lr.fit(X_train_scaled, y_train)

# Get the best parameters and the best model
best_lr_model = grid_search_lr.best_estimator_
print("Best Logistic Regression Parameters:", grid_search_lr.best_params_)


Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Logistic Regression Parameters: {'C': 0.001, 'max_iter': 100000, 'penalty': 'l2', 'solver': 'lbfgs'}


In [533]:
from sklearn.ensemble import VotingClassifier

# Create a soft voting classifier with the best models
voting_clf_soft = VotingClassifier(estimators=[
    ('lr', best_lr_model), ('rf', best_rf_model), ('xgb', best_xgb_model)
], voting='soft')

# Train the soft voting classifier
voting_clf_soft.fit(X_train_scaled, y_train)

# Predict using the tuned ensemble model
y_pred_soft_ensemble = voting_clf_soft.predict(X_test_scaled)

# Evaluate the tuned ensemble model
y_prob_soft_ensemble = voting_clf_soft.predict_proba(X_test_scaled)

# Generate a classification report
class_report_ensemble = classification_report(y_test, y_pred_soft_ensemble, target_names=['Home Win', 'Away Win', 'Draw'])
print(class_report_ensemble)

# Calculate the ROC AUC score
roc_auc_ensemble = roc_auc_score(label_binarize(y_test, classes=[0, 1, 2]), y_prob_soft_ensemble, multi_class="ovr", average="macro")
print(f"ROC AUC Score (Tuned Ensemble): {roc_auc_ensemble:.4f}")

# Calculate Log Loss
log_loss_ensemble = log_loss(y_test, y_prob_soft_ensemble)
print(f"Log Loss (Tuned Ensemble): {log_loss_ensemble:.4f}")

# Calculate Brier Score for each class
brier_scores_ensemble = []
for i in range(y_prob_soft_ensemble.shape[1]):
    y_test_bin_class = (y_test == i).astype(int)
    brier_score_class_ensemble = brier_score_loss(y_test_bin_class, y_prob_soft_ensemble[:, i])
    brier_scores_ensemble.append(brier_score_class_ensemble)

mean_brier_score_ensemble = np.mean(brier_scores_ensemble)
print(f"Mean Brier Score (Tuned Ensemble): {mean_brier_score_ensemble:.4f}")


              precision    recall  f1-score   support

    Home Win       0.61      0.85      0.71       175
    Away Win       0.00      0.00      0.00        82
        Draw       0.58      0.65      0.61       123

    accuracy                           0.60       380
   macro avg       0.40      0.50      0.44       380
weighted avg       0.47      0.60      0.53       380

ROC AUC Score (Tuned Ensemble): 0.7107
Log Loss (Tuned Ensemble): 0.9683
Mean Brier Score (Tuned Ensemble): 0.1910


Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Best Random Forest Parameters: {'bootstrap': True, 'max_depth': 2, 'min_samples_leaf': 7, 'min_samples_split': 5, 'n_estimators': 150}
Selection deleted

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Logistic Regression Parameters: {'C': 0.001, 'max_iter': 100000, 'penalty': 'l2', 'solver': 'lbfgs'}

# Train the models on the original training data (no SMOTE applied)
xgb_model_no_smote = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    objective='multi:softprob',
    num_class=3,
    colsample_bytree=0.8,
    learning_rate=0.01,
    max_depth=4,
    n_estimators=80,
    subsample=0.8
)

rf_model_no_smote = RandomForestClassifier(
    bootstrap=True,
    max_depth=4,
    min_samples_leaf=7,
    min_samples_split=5,
    n_estimators=150
)

lr_model_no_smote = LogisticRegression(
    C=0.1,
    solver='lbfgs',
    max_iter=100000,
    penalty='l2',
    multi_class='multinomial'
)

# Fit models without SMOTE
xgb_model_no_smote.fit(X_train_scaled, y_train)
rf_model_no_smote.fit(X_train_scaled, y_train)
lr_model_no_smote.fit(X_train_scaled, y_train)

# Create a soft voting classifier without SMOTE
voting_clf_soft_no_smote = VotingClassifier(
    estimators=[
        ('xgb', xgb_model_no_smote),
        ('rf', rf_model_no_smote),
        ('lr', lr_model_no_smote)
    ],
    voting='soft'
)

# Train the voting classifier without SMOTE
voting_clf_soft_no_smote.fit(X_train_scaled, y_train)

# Predict and evaluate the model without SMOTE
y_pred_no_smote = voting_clf_soft_no_smote.predict(X_test_scaled)
y_prob_no_smote = voting_clf_soft_no_smote.predict_proba(X_test_scaled)

# Generate a classification report
print("Ensemble Model without SMOTE:")
print(classification_report(y_test, y_pred_no_smote, target_names=['Home Win', 'Draw', 'Away Win']))

# Calculate and print ROC AUC Score
roc_auc_no_smote = roc_auc_score(label_binarize(y_test, classes=[0, 1, 2]), y_prob_no_smote, multi_class="ovr", average="macro")
print(f"ROC AUC Score (Ensemble without SMOTE): {roc_auc_no_smote:.4f}")

# Calculate and print Log Loss
log_loss_no_smote = log_loss(y_test, y_prob_no_smote)
print(f"Log Loss (Ensemble without SMOTE): {log_loss_no_smote:.4f}")

# Calculate and print Brier Score
brier_scores_no_smote = []
for i in range(y_prob_no_smote.shape[1]):
    y_test_bin_class = (y_test == i).astype(int)
    brier_score_class_no_smote = brier_score_loss(y_test_bin_class, y_prob_no_smote[:, i])
    brier_scores_no_smote.append(brier_score_class_no_smote)

mean_brier_score_no_smote = np.mean(brier_scores_no_smote)
print(f"Mean Brier Score (Ensemble without SMOTE): {mean_brier_score_no_smote:.4f}")


In [534]:
# BEST CURRENT MODEL
from sklearn.calibration import CalibratedClassifierCV

# Step 1: Initialize the base models
xgb_model_no_smote = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    objective='multi:softprob',
    num_class=3,
    colsample_bytree=0.8,
    learning_rate=0.01,
    max_depth=4,
    n_estimators=80,
    subsample=0.8
)

rf_model_no_smote = RandomForestClassifier(
    bootstrap=True,
    max_depth=4,
    min_samples_leaf=7,
    min_samples_split=5,
    n_estimators=150
)

lr_model_no_smote = LogisticRegression(
    C=0.1,
    solver='lbfgs',
    max_iter=100000,
    penalty='l2',
    multi_class='multinomial'
)

# Step 2: Calibrate each base model without using cv='prefit'
calibrated_xgb = CalibratedClassifierCV(base_estimator=xgb_model_no_smote, method='isotonic', cv=5)
calibrated_rf = CalibratedClassifierCV(base_estimator=rf_model_no_smote, method='isotonic', cv=5)
calibrated_lr = CalibratedClassifierCV(base_estimator=lr_model_no_smote, method='isotonic', cv=5)

# Step 3: Fit the calibration models
calibrated_xgb.fit(X_train_scaled, y_train)
calibrated_rf.fit(X_train_scaled, y_train)
calibrated_lr.fit(X_train_scaled, y_train)

# Step 4: Create a soft voting classifier with the calibrated models
voting_clf_soft_calibrated = VotingClassifier(
    estimators=[
        ('xgb', calibrated_xgb),
        ('rf', calibrated_rf),
        ('lr', calibrated_lr)
    ],
    voting='soft'
)

# Train the voting classifier with calibrated models
voting_clf_soft_calibrated.fit(X_train_scaled, y_train)

# Step 5: Predict and evaluate the calibrated ensemble
y_pred_calibrated_ensemble = voting_clf_soft_calibrated.predict(X_test_scaled)
y_prob_calibrated_ensemble = voting_clf_soft_calibrated.predict_proba(X_test_scaled)

# Generate a classification report
print("Calibrated Ensemble Model without SMOTE:")
print(classification_report(y_test, y_pred_calibrated_ensemble, target_names=['Home Win', 'Draw', 'Away Win']))

# Calculate and print ROC AUC Score
roc_auc_calibrated_ensemble = roc_auc_score(label_binarize(y_test, classes=[0, 1, 2]), y_prob_calibrated_ensemble, multi_class="ovr", average="macro")
print(f"ROC AUC Score (Calibrated Ensemble): {roc_auc_calibrated_ensemble:.4f}")

# Calculate and print Log Loss
log_loss_calibrated_ensemble = log_loss(y_test, y_prob_calibrated_ensemble)
print(f"Log Loss (Calibrated Ensemble): {log_loss_calibrated_ensemble:.4f}")

# Calculate and print Brier Score
brier_scores_calibrated_ensemble = []
for i in range(y_prob_calibrated_ensemble.shape[1]):
    y_test_bin_class = (y_test == i).astype(int)
    brier_score_class_calibrated_ensemble = brier_score_loss(y_test_bin_class, y_prob_calibrated_ensemble[:, i])
    brier_scores_calibrated_ensemble.append(brier_score_class_calibrated_ensemble)

mean_brier_score_calibrated_ensemble = np.mean(brier_scores_calibrated_ensemble)
print(f"Mean Brier Score (Calibrated Ensemble): {mean_brier_score_calibrated_ensemble:.4f}")


Calibrated Ensemble Model without SMOTE:
              precision    recall  f1-score   support

    Home Win       0.61      0.85      0.71       175
        Draw       0.00      0.00      0.00        82
    Away Win       0.60      0.67      0.63       123

    accuracy                           0.61       380
   macro avg       0.40      0.51      0.45       380
weighted avg       0.48      0.61      0.53       380

ROC AUC Score (Calibrated Ensemble): 0.7109
Log Loss (Calibrated Ensemble): 0.9087
Mean Brier Score (Calibrated Ensemble): 0.1766


## Forward Chaining Cross-Validation ##

In [None]:
# Assuming you have already defined your dataset and preprocessing steps.
# Make sure to run this cell before running the cross-validation loop.

# Import necessary libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss, classification_report
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import numpy as np

# Drop the season column as it is no longer needed
data_cleaned = data_cleaned.drop(['season'], axis=1)


# Define your features and target
X = data_cleaned.drop(columns=['match_result'])  # Replace 'data' with your actual DataFrame
y = data_cleaned['match_result']  # Replace 'data' with your actual DataFrame

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize TimeSeriesSplit for forward-chaining cross-validation
tscv = TimeSeriesSplit(n_splits=10)

# To store the results across folds
roc_auc_scores = []
log_loss_scores = []
mean_brier_scores = []

# Cross-validation loop
for fold, (train_index, test_index) in enumerate(tscv.split(X_scaled)):
    print(f"Training fold {fold + 1}...")
    
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # Step 1: Initialize the base models
    xgb_model_no_smote = XGBClassifier(
        use_label_encoder=False,
        eval_metric='mlogloss',
        objective='multi:softprob',
        num_class=3,
        colsample_bytree=0.8,
        learning_rate=0.01,
        max_depth=4,
        n_estimators=80,
        subsample=0.8
    )

    rf_model_no_smote = RandomForestClassifier(
        bootstrap=True,
        max_depth=4,
        min_samples_leaf=7,
        min_samples_split=5,
        n_estimators=150
    )

    lr_model_no_smote = LogisticRegression(
        C=0.1,
        solver='lbfgs',
        max_iter=100000,
        penalty='l2',
        multi_class='multinomial'
    )

    # Step 2: Calibrate each base model without using cv='prefit'
    calibrated_xgb = CalibratedClassifierCV(base_estimator=xgb_model_no_smote, method='isotonic', cv=5)
    calibrated_rf = CalibratedClassifierCV(base_estimator=rf_model_no_smote, method='isotonic', cv=5)
    calibrated_lr = CalibratedClassifierCV(base_estimator=lr_model_no_smote, method='isotonic', cv=5)

    # Step 3: Fit the calibration models
    calibrated_xgb.fit(X_train_fold, y_train_fold)
    calibrated_rf.fit(X_train_fold, y_train_fold)
    calibrated_lr.fit(X_train_fold, y_train_fold)

    # Step 4: Create a soft voting classifier with the calibrated models
    voting_clf_soft_calibrated = VotingClassifier(
        estimators=[
            ('xgb', calibrated_xgb),
            ('rf', calibrated_rf),
            ('lr', calibrated_lr)
        ],
        voting='soft'
    )
    
    # Train the voting classifier with calibrated models
    voting_clf_soft_calibrated.fit(X_train_fold, y_train_fold)

    # Step 5: Predict and evaluate the calibrated ensemble on the test fold
    y_pred_calibrated_ensemble = voting_clf_soft_calibrated.predict(X_test_fold)
    y_prob_calibrated_ensemble = voting_clf_soft_calibrated.predict_proba(X_test_fold)

    # Generate a classification report for each fold
    print(f"Classification Report for Fold {fold + 1}:")
    print(classification_report(y_test_fold, y_pred_calibrated_ensemble, target_names=['Home Win', 'Draw', 'Away Win']))

    # Calculate and store ROC AUC Score
    roc_auc_calibrated_ensemble = roc_auc_score(label_binarize(y_test_fold, classes=[0, 1, 2]), y_prob_calibrated_ensemble, multi_class="ovr", average="macro")
    roc_auc_scores.append(roc_auc_calibrated_ensemble)

    # Calculate and store Log Loss
    log_loss_calibrated_ensemble = log_loss(y_test_fold, y_prob_calibrated_ensemble)
    log_loss_scores.append(log_loss_calibrated_ensemble)

    # Calculate and store Brier Score
    brier_scores_calibrated_ensemble = []
    for i in range(y_prob_calibrated_ensemble.shape[1]):
        y_test_bin_class = (y_test_fold == i).astype(int)
        brier_score_class_calibrated_ensemble = brier_score_loss(y_test_bin_class, y_prob_calibrated_ensemble[:, i])
        brier_scores_calibrated_ensemble.append(brier_score_class_calibrated_ensemble)
    mean_brier_scores.append(np.mean(brier_scores_calibrated_ensemble))

# After cross-validation, print the average scores across all folds
print(f"Average ROC AUC Score (Calibrated Ensemble): {np.mean(roc_auc_scores):.4f}")
print(f"Average Log Loss (Calibrated Ensemble): {np.mean(log_loss_scores):.4f}")
print(f"Average Mean Brier Score (Calibrated Ensemble): {np.mean(mean_brier_scores):.4f}")


## Testing w/ 23/24 as unseen data ##

In [536]:
data_cleaned.columns

Index(['home_team_elo', 'away_team_elo', 'home_team_strength',
       'away_team_strength', 'home_xG_to_date', 'away_xG_to_date',
       'home_xG_against_to_date', 'away_xG_against_to_date',
       'home_goals_scored_to_date', 'away_goals_scored_to_date',
       'home_goals_conceded_to_date', 'away_goals_conceded_to_date',
       'home_points_to_date', 'away_points_to_date', 'home_form', 'away_form',
       'match_result', 'Pinnacle Closing Home Win Odds',
       'Pinnacle Closing Draw Odds', 'Pinnacle Closing Away Win Odds'],
      dtype='object')

from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import TimeSeriesSplit

# Step 1: Split the data by season
train_data = data_cleaned[data_cleaned['season'] < 2324]
test_data = data_cleaned[data_cleaned['season'] == 2324]

# Step 2: Extract features and target variable from training data
X_train = train_data.drop('match_result', axis=1)
y_train = train_data['match_result']

# Standardize the features (if not already done)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Initialize forward-chaining cross-validation (TimeSeriesSplit)
n_splits = 10  # Adjust as necessary
tscv = TimeSeriesSplit(n_splits=n_splits)

# Initialize lists to store the results
roc_auc_scores = []
log_losses = []
mean_brier_scores = []

for fold, (train_index, test_index) in enumerate(tscv.split(X_train_scaled)):
    print(f"Training fold {fold + 1}...")
    X_fold_train, X_fold_test = X_train_scaled[train_index], X_train_scaled[test_index]
    y_fold_train, y_fold_test = y_train.iloc[train_index], y_train.iloc[test_index]

    # Initialize the base models
    xgb_model_no_smote = XGBClassifier(
        use_label_encoder=False,
        eval_metric='mlogloss',
        objective='multi:softprob',
        num_class=3,
        colsample_bytree=0.8,
        learning_rate=0.01,
        max_depth=4,
        n_estimators=80,
        subsample=0.8
    )

    rf_model_no_smote = RandomForestClassifier(
        bootstrap=True,
        max_depth=4,
        min_samples_leaf=7,
        min_samples_split=5,
        n_estimators=150
    )

    lr_model_no_smote = LogisticRegression(
        C=0.1,
        solver='lbfgs',
        max_iter=100000,
        penalty='l2',
        multi_class='multinomial'
    )

    # Fit the base models
    xgb_model_no_smote.fit(X_fold_train, y_fold_train)
    rf_model_no_smote.fit(X_fold_train, y_fold_train)
    lr_model_no_smote.fit(X_fold_train, y_fold_train)

    # Calibrate each model (without using prefit)
    calibrated_xgb = CalibratedClassifierCV(base_estimator=xgb_model_no_smote, method='isotonic', cv=5)
    calibrated_rf = CalibratedClassifierCV(base_estimator=rf_model_no_smote, method='isotonic', cv=5)
    calibrated_lr = CalibratedClassifierCV(base_estimator=lr_model_no_smote, method='isotonic', cv=5)

    # Fit the calibration models
    calibrated_xgb.fit(X_fold_train, y_fold_train)
    calibrated_rf.fit(X_fold_train, y_fold_train)
    calibrated_lr.fit(X_fold_train, y_fold_train)

    # Create the ensemble voting classifier
    voting_clf_soft_calibrated = VotingClassifier(
        estimators=[
            ('xgb', calibrated_xgb),
            ('rf', calibrated_rf),
            ('lr', calibrated_lr)
        ],
        voting='soft'
    )

    # Train the voting classifier
    voting_clf_soft_calibrated.fit(X_fold_train, y_fold_train)

    # Evaluate the model
    y_fold_pred = voting_clf_soft_calibrated.predict(X_fold_test)
    y_fold_prob = voting_clf_soft_calibrated.predict_proba(X_fold_test)

    print(f"Classification Report for Fold {fold + 1}:")
    print(classification_report(y_fold_test, y_fold_pred, target_names=['Home Win', 'Draw', 'Away Win']))

    roc_auc_score_fold = roc_auc_score(label_binarize(y_fold_test, classes=[0, 1, 2]), y_fold_prob, multi_class="ovr", average="macro")
    roc_auc_scores.append(roc_auc_score_fold)

    log_loss_fold = log_loss(y_fold_test, y_fold_prob)
    log_losses.append(log_loss_fold)

    brier_scores_fold = []
    for i in range(y_fold_prob.shape[1]):
        y_test_bin_class = (y_fold_test == i).astype(int)
        brier_score_class_fold = brier_score_loss(y_test_bin_class, y_fold_prob[:, i])
        brier_scores_fold.append(brier_score_class_fold)
    mean_brier_score_fold = np.mean(brier_scores_fold)
    mean_brier_scores.append(mean_brier_score_fold)

    print(f"Fold {fold + 1} - ROC AUC: {roc_auc_score_fold:.4f}, Log Loss: {log_loss_fold:.4f}, Mean Brier Score: {mean_brier_score_fold:.4f}")

# Final performance metrics
average_roc_auc = np.mean(roc_auc_scores)
average_log_loss = np.mean(log_losses)
average_mean_brier_score = np.mean(mean_brier_scores)

print(f"Average ROC AUC Score (Calibrated Ensemble): {average_roc_auc:.4f}")
print(f"Average Log Loss (Calibrated Ensemble): {average_log_loss:.4f}")
print(f"Average Mean Brier Score (Calibrated Ensemble): {average_mean_brier_score:.4f}")


## SHAP values ##

import shap

# SHAP Explainer for XGBoost model
explainer_xgb = shap.TreeExplainer(calibrated_xgb)
shap_values_xgb = explainer_xgb.shap_values(X_test_scaled)

# Plot the SHAP summary plot
shap.summary_plot(shap_values_xgb, X_test_scaled, feature_names=X_test.columns)


# SHAP Explainer for Random Forest model
explainer_rf = shap.TreeExplainer(calibrated_rf)
shap_values_rf = explainer_rf.shap_values(X_test_scaled)

# Summary plot for Random Forest model
shap.summary_plot(shap_values_rf, X_test_scaled, feature_names=X_train.columns)


In [None]:
import pickle

# Save the ensemble model without SMOTE to a file
with open('ensemble_model_without_smote.pkl', 'wb') as model_file:
    pickle.dump(voting_clf_soft_no_smote, model_file)

print("Model saved successfully as 'ensemble_model_without_smote.pkl'!")


In [537]:
import pickle

# Save the ensemble model without SMOTE to a file
with open('ensemble_model_without_smote.pkl', 'wb') as model_file:
    pickle.dump(voting_clf_soft_calibrated, model_file)

print("Model saved successfully as 'ensemble_model_without_smote.pkl'!")


Model saved successfully as 'ensemble_model_without_smote.pkl'!


In [538]:
data

Unnamed: 0,season,week,date,home_team,home_xg,score,away_xg,away_team,referee,game_id,...,away_points_to_date,home_form,away_form,match_result,League Division,Home Team,Away Team,Pinnacle Closing Home Win Odds,Pinnacle Closing Draw Odds,Pinnacle Closing Away Win Odds
0,1920,1,2019-08-09,liverpool,1.8,4-1,0.9,norwich city,Michael Oliver,928467bd,...,0,0.0,0.0,0,E0,Liverpool,Norwich,1.14,10.43,19.63
1,1920,1,2019-08-10,bournemouth,1.3,1-1,1.3,sheffield utd,Kevin Friend,d402cacd,...,0,0.0,0.0,1,E0,Bournemouth,Sheffield United,1.98,3.67,4.06
2,1920,1,2019-08-10,burnley,0.9,3-0,1.2,southampton,Graham Scott,34b99058,...,0,0.0,0.0,0,E0,Burnley,Southampton,2.71,3.19,2.90
3,1920,1,2019-08-10,crystal palace,0.9,0-0,1.1,everton,Jonathan Moss,a802f51e,...,0,0.0,0.0,1,E0,Crystal Palace,Everton,3.37,3.45,2.27
4,1920,1,2019-08-10,tottenham,2.4,3-1,0.7,aston villa,Chris Kavanagh,404ee5d3,...,0,0.0,0.0,0,E0,Tottenham,Aston Villa,1.39,5.35,8.42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1895,2324,38,2024-05-19,crystal palace,2.5,5-0,0.9,aston villa,Darren Bond,c975c7a6,...,68,2.6,1.6,0,E0,Crystal Palace,Aston Villa,1.63,4.64,4.99
1896,2324,38,2024-05-19,liverpool,4.5,2-0,0.5,wolves,Chris Kavanagh,d4823ed5,...,46,1.6,0.6,0,E0,Liverpool,Wolves,1.13,10.69,16.27
1897,2324,38,2024-05-19,luton town,2.0,2-4,1.1,fulham,Matt Donohue,0fde9d70,...,44,0.2,1.0,2,E0,Luton,Fulham,3.11,3.99,2.19
1898,2324,38,2024-05-19,manchester city,1.9,3-1,0.4,west ham,John Brooks,29335211,...,52,3.0,0.8,0,E0,Man City,West Ham,1.07,15.01,27.84


# Get predicted probabilities for each outcome
predicted_probabilities = voting_clf_soft_calibrated.predict_proba(X_test_scaled)

# Create a DataFrame to store the probabilities
probability_df = pd.DataFrame(predicted_probabilities, columns=['Probability_Home_win', 'Probability_Draw','Probability_Away_win'])

probability_df

# Get predicted probabilities for each outcome
predicted_probabilities = voting_clf_soft_no_smote.predict_proba(X_test_scaled)

# Create a DataFrame to store the probabilities
probability_df = pd.DataFrame(predicted_probabilities, columns=['Probability_Home_win', 'Probability_Draw','Probability_Away_win'])

probability_df

In [540]:
# Assuming the probability_df has the predicted probabilities
# with columns ['Home Win Probability', 'Away Win Probability', 'Draw Probability']

# Calculate the average probability for each outcome
average_probabilities = probability_df.mean()

# Display the average probabilities
print("Average Probability for Each Outcome:")
print(average_probabilities)


Average Probability for Each Outcome:
Probability_Home_win    0.486375
Probability_Draw        0.199365
Probability_Away_win    0.314259
dtype: float64


In [541]:
csv_file_path = '/Users/lkimball/Desktop/Flatiron/CapstoneProject/test_predicted_prob.csv'
probability_df.to_csv(csv_file_path, index=True)


In [542]:
# Reset the index of the test set to ensure it matches with the probabilities
test_data_reset = test_data.reset_index(drop=True)

# Combine the predicted probabilities with the test set
combined_df = pd.concat([test_data_reset, probability_df], axis=1)



In [543]:
# Display the combined DataFrame
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   season                          380 non-null    int64  
 1   home_team_elo                   380 non-null    float64
 2   away_team_elo                   380 non-null    float64
 3   home_team_strength              380 non-null    float64
 4   away_team_strength              380 non-null    float64
 5   home_xG_to_date                 380 non-null    float64
 6   away_xG_to_date                 380 non-null    float64
 7   home_xG_against_to_date         380 non-null    float64
 8   away_xG_against_to_date         380 non-null    float64
 9   home_goals_scored_to_date       380 non-null    int64  
 10  away_goals_scored_to_date       380 non-null    int64  
 11  home_goals_conceded_to_date     380 non-null    int64  
 12  away_goals_conceded_to_date     380 

In [544]:
combined_df

Unnamed: 0,season,home_team_elo,away_team_elo,home_team_strength,away_team_strength,home_xG_to_date,away_xG_to_date,home_xG_against_to_date,away_xG_against_to_date,home_goals_scored_to_date,...,away_points_to_date,home_form,away_form,match_result,Pinnacle Closing Home Win Odds,Pinnacle Closing Draw Odds,Pinnacle Closing Away Win Odds,Probability_Home_win,Probability_Draw,Probability_Away_win
0,2324,1726.401733,2077.252197,-0.136,1.833,0.0,0.0,0.0,0.0,0,...,0,0.0,0.0,2,9.62,5.81,1.33,0.045855,0.204530,0.749615
1,2324,1919.369019,1673.962769,1.222,0.232,0.0,0.0,0.0,0.0,0,...,0,0.0,0.0,0,1.19,8.00,16.00,0.825895,0.140220,0.033885
2,2324,1660.638062,1779.450317,0.089,0.393,0.0,0.0,0.0,0.0,0,...,0,0.0,0.0,1,2.75,3.60,2.63,0.309362,0.276278,0.414360
3,2324,1828.169312,1606.522949,0.700,0.028,0.0,0.0,0.0,0.0,0,...,0,0.0,0.0,0,1.27,6.36,11.36,0.721345,0.173106,0.105550
4,2324,1708.234741,1736.637207,0.180,0.245,0.0,0.0,0.0,0.0,0,...,0,0.0,0.0,2,2.39,3.32,3.30,0.369555,0.278770,0.351675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,2324,1742.560303,1787.528564,0.387,0.387,46.2,62.6,51.3,57.5,52,...,68,2.6,1.6,0,1.63,4.64,4.99,0.651998,0.195887,0.152115
376,2324,1897.319702,1681.174927,1.122,0.225,83.3,46.5,45.3,63.2,84,...,46,1.6,0.6,0,1.13,10.69,16.27,0.795698,0.169163,0.035139
377,2324,1572.322388,1707.862427,-0.017,0.207,40.3,49.7,76.7,61.0,50,...,44,0.2,1.0,2,3.11,3.99,2.19,0.323184,0.217117,0.459699
378,2324,2048.724609,1728.056885,1.685,0.455,78.7,52.1,35.0,69.4,93,...,52,3.0,0.8,0,1.07,15.01,27.84,0.802491,0.160714,0.036795


In [545]:
data

Unnamed: 0,season,week,date,home_team,home_xg,score,away_xg,away_team,referee,game_id,...,away_points_to_date,home_form,away_form,match_result,League Division,Home Team,Away Team,Pinnacle Closing Home Win Odds,Pinnacle Closing Draw Odds,Pinnacle Closing Away Win Odds
0,1920,1,2019-08-09,liverpool,1.8,4-1,0.9,norwich city,Michael Oliver,928467bd,...,0,0.0,0.0,0,E0,Liverpool,Norwich,1.14,10.43,19.63
1,1920,1,2019-08-10,bournemouth,1.3,1-1,1.3,sheffield utd,Kevin Friend,d402cacd,...,0,0.0,0.0,1,E0,Bournemouth,Sheffield United,1.98,3.67,4.06
2,1920,1,2019-08-10,burnley,0.9,3-0,1.2,southampton,Graham Scott,34b99058,...,0,0.0,0.0,0,E0,Burnley,Southampton,2.71,3.19,2.90
3,1920,1,2019-08-10,crystal palace,0.9,0-0,1.1,everton,Jonathan Moss,a802f51e,...,0,0.0,0.0,1,E0,Crystal Palace,Everton,3.37,3.45,2.27
4,1920,1,2019-08-10,tottenham,2.4,3-1,0.7,aston villa,Chris Kavanagh,404ee5d3,...,0,0.0,0.0,0,E0,Tottenham,Aston Villa,1.39,5.35,8.42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1895,2324,38,2024-05-19,crystal palace,2.5,5-0,0.9,aston villa,Darren Bond,c975c7a6,...,68,2.6,1.6,0,E0,Crystal Palace,Aston Villa,1.63,4.64,4.99
1896,2324,38,2024-05-19,liverpool,4.5,2-0,0.5,wolves,Chris Kavanagh,d4823ed5,...,46,1.6,0.6,0,E0,Liverpool,Wolves,1.13,10.69,16.27
1897,2324,38,2024-05-19,luton town,2.0,2-4,1.1,fulham,Matt Donohue,0fde9d70,...,44,0.2,1.0,2,E0,Luton,Fulham,3.11,3.99,2.19
1898,2324,38,2024-05-19,manchester city,1.9,3-1,0.4,west ham,John Brooks,29335211,...,52,3.0,0.8,0,E0,Man City,West Ham,1.07,15.01,27.84


In [546]:
# Save the average odds to another CSV file for feeding into your model
data.to_csv('Model_input_sample_data.csv', index=False)
print("Average odds data saved to Model_input_sample_data.csv")


Average odds data saved to Model_input_sample_data.csv


## Final Model Deployment ##

In [552]:
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Assume `data_cleaned` is your complete dataset
# Drop the 'season' column if it's not needed for training
#data_cleaned = data_cleaned.drop(['season'], axis=1)

# Separate the features and target
X = data_cleaned.drop(columns=['match_result'])
y = data_cleaned['match_result']

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 1: Initialize the models with the best parameters from cross-validation
xgb_model_final = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    objective='multi:softprob',
    num_class=3,
    colsample_bytree=0.8,
    learning_rate=0.01,
    max_depth=4,
    n_estimators=80,
    subsample=0.8
)

rf_model_final = RandomForestClassifier(
    bootstrap=True,
    max_depth=4,
    min_samples_leaf=7,
    min_samples_split=5,
    n_estimators=150
)

lr_model_final = LogisticRegression(
    C=0.1,
    solver='lbfgs',
    max_iter=100000,
    penalty='l2',
    multi_class='multinomial'
)

# Step 2: Calibrate each base model
calibrated_xgb_final = CalibratedClassifierCV(base_estimator=xgb_model_final, method='isotonic', cv=5)
calibrated_rf_final = CalibratedClassifierCV(base_estimator=rf_model_final, method='isotonic', cv=5)
calibrated_lr_final = CalibratedClassifierCV(base_estimator=lr_model_final, method='isotonic', cv=5)

# Step 3: Fit the final calibrated models on the full dataset
calibrated_xgb_final.fit(X_scaled, y)
calibrated_rf_final.fit(X_scaled, y)
calibrated_lr_final.fit(X_scaled, y)

# Step 4: Create the final ensemble model
voting_clf_final = VotingClassifier(
    estimators=[
        ('xgb', calibrated_xgb_final),
        ('rf', calibrated_rf_final),
        ('lr', calibrated_lr_final)
    ],
    voting='soft'
)

# Train the final ensemble model on the full dataset
voting_clf_final.fit(X_scaled, y)

# Step 5: Save the final model to a file for future use
with open('final_ensemble_model.pkl', 'wb') as model_file:
    pickle.dump(voting_clf_final, model_file)

print("Final model trained and saved successfully!")


Final model trained and saved successfully!


In [554]:
import shap

# Load your saved model
with open('final_ensemble_model.pkl', 'rb') as model_file:
    voting_clf_final = pickle.load(model_file)

# Prepare SHAP explainers for each model in the ensemble
# Initialize SHAP explainers for the tree-based models
explainer_xgb = shap.TreeExplainer(voting_clf_final.named_estimators_['xgb'].base_estimator)
explainer_rf = shap.TreeExplainer(voting_clf_final.named_estimators_['rf'].base_estimator)

# For logistic regression, use LinearExplainer
explainer_lr = shap.LinearExplainer(voting_clf_final.named_estimators_['lr'].base_estimator, X_scaled)

# Compute SHAP values for a subset of your data (to speed up the process, use a smaller subset)
X_explain = X_scaled[:100]  # Adjust this according to your dataset size
shap_values_xgb = explainer_xgb.shap_values(X_explain)
shap_values_rf = explainer_rf.shap_values(X_explain)
shap_values_lr = explainer_lr.shap_values(X_explain)

# Visualize SHAP values for each model
# XGBoost model
shap.summary_plot(shap_values_xgb, X_explain, feature_names=X.columns)

# RandomForest model
shap.summary_plot(shap_values_rf, X_explain, feature_names=X.columns)

# Logistic Regression model
shap.summary_plot(shap_values_lr, X_explain, feature_names=X.columns)

# Optionally, create a combined plot for the ensemble model's average SHAP values
average_shap_values = (shap_values_xgb + shap_values_rf + shap_values_lr) / 3
shap.summary_plot(average_shap_values, X_explain, feature_names=X.columns)


NotFittedError: need to call fit or load_model beforehand