## CatBoost Modeling

We have established CatBoost as the top performer among our classification models on the numerical data. It has the additional benefit of being able to accept categorical features without one hot encoding.

In [7]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import label_binarize
from itertools import cycle

import warnings
import os
import time
import math
import requests

import pandas as pd
import numpy as np

warnings.filterwarnings('ignore')

In [8]:
df = pd.read_csv('EPL_Updated.csv')

In [9]:
df.columns

Index(['Pinnacle Closing Home Win Odds', 'Pinnacle Closing Draw Odds',
       'Pinnacle Closing Away Win Odds', 'date', 'home_team', 'away_team',
       'week', 'date.1', 'home_team.1', 'home_xg', 'score', 'away_xg',
       'away_team.1', 'referee', 'game_id', 'home_team_elo', 'away_team_elo',
       'season', 'home_xG_to_date', 'away_xG_to_date',
       'home_xG_against_to_date', 'away_xG_against_to_date',
       'home_goals_scored', 'away_goals_scored', 'home_goals_scored_to_date',
       'away_goals_scored_to_date', 'home_goals_conceded_to_date',
       'away_goals_conceded_to_date', 'home_match_points', 'away_match_points',
       'home_points_to_date', 'away_points_to_date', 'home_form', 'away_form',
       'match_result'],
      dtype='object')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2660 entries, 0 to 2659
Data columns (total 35 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Pinnacle Closing Home Win Odds  2660 non-null   float64
 1   Pinnacle Closing Draw Odds      2660 non-null   float64
 2   Pinnacle Closing Away Win Odds  2660 non-null   float64
 3   date                            2660 non-null   object 
 4   home_team                       2660 non-null   object 
 5   away_team                       2660 non-null   object 
 6   week                            2280 non-null   float64
 7   date.1                          2280 non-null   object 
 8   home_team.1                     2280 non-null   object 
 9   home_xg                         2280 non-null   float64
 10  score                           2280 non-null   object 
 11  away_xg                         2280 non-null   float64
 12  away_team.1                     22

In [11]:
columns_to_drop = ['week', 'date','home_xg', 'score', 'away_xg','game_id',
                   'season','home_match_points', 'away_match_points', 
                   'home_goals_scored', 'away_goals_scored', 'date.1', 'home_team.1', 'away_team.1', 'referee', 'home_team',
 'away_team']

In [12]:
df_model = df.drop(columns=columns_to_drop)

# Display the first few rows of the dataframe after dropping the columns
df_model.columns

Index(['Pinnacle Closing Home Win Odds', 'Pinnacle Closing Draw Odds',
       'Pinnacle Closing Away Win Odds', 'home_team_elo', 'away_team_elo',
       'home_xG_to_date', 'away_xG_to_date', 'home_xG_against_to_date',
       'away_xG_against_to_date', 'home_goals_scored_to_date',
       'away_goals_scored_to_date', 'home_goals_conceded_to_date',
       'away_goals_conceded_to_date', 'home_points_to_date',
       'away_points_to_date', 'home_form', 'away_form', 'match_result'],
      dtype='object')

In [13]:
feature_order = df_model.columns.tolist()
feature_order

['Pinnacle Closing Home Win Odds',
 'Pinnacle Closing Draw Odds',
 'Pinnacle Closing Away Win Odds',
 'home_team_elo',
 'away_team_elo',
 'home_xG_to_date',
 'away_xG_to_date',
 'home_xG_against_to_date',
 'away_xG_against_to_date',
 'home_goals_scored_to_date',
 'away_goals_scored_to_date',
 'home_goals_conceded_to_date',
 'away_goals_conceded_to_date',
 'home_points_to_date',
 'away_points_to_date',
 'home_form',
 'away_form',
 'match_result']

In [14]:
df_model

Unnamed: 0,Pinnacle Closing Home Win Odds,Pinnacle Closing Draw Odds,Pinnacle Closing Away Win Odds,home_team_elo,away_team_elo,home_xG_to_date,away_xG_to_date,home_xG_against_to_date,away_xG_against_to_date,home_goals_scored_to_date,away_goals_scored_to_date,home_goals_conceded_to_date,away_goals_conceded_to_date,home_points_to_date,away_points_to_date,home_form,away_form,match_result
0,1.49,4.73,7.25,1884.934448,1697.498169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11.75,6.15,1.29,1673.780518,1576.490356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.33,5.40,12.25,1633.799683,1692.951660,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,1.79,3.56,5.51,1567.101318,1837.004272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,1.82,3.49,5.42,1670.871338,1914.848877,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2655,1.63,4.64,4.99,,,,,,,,,,,,,,,
2656,1.13,10.69,16.27,,,,,,,,,,,,,,,
2657,3.11,3.99,2.19,,,,,,,,,,,,,,,
2658,1.07,15.01,27.84,,,,,,,,,,,,,,,


In [15]:
csv_file_path = '/Users/lkimball/Desktop/Betting/TotalSet_Prob_distribution.csv'
df_model.to_csv(csv_file_path, index=True)

In [16]:
# Assuming df_dropped is your DataFrame after dropping columns

# Define the target variable and the feature set
X = df_model.drop('match_result', axis=1)
y = df_model['match_result']

# Identify categorical features for CatBoost
categorical_features_indices = list(X.select_dtypes(include=['object']).columns)

# Determine the number of records to include in the test set
num_test_records = 380

# Split the features data
X_train = X.iloc[:-num_test_records]
X_test = X.iloc[-num_test_records:]

# Split the target data
y_train = y.iloc[:-num_test_records]
y_test = y.iloc[-num_test_records:]

# Separate numerical features
numerical_features = X.select_dtypes(exclude=['object'])

# Initialize the StandardScaler
scaler = StandardScaler()

# Standardize the numerical features using only the training data
X_train[numerical_features.columns] = scaler.fit_transform(X_train[numerical_features.columns])

# Use the same scaling parameters to scale the numerical features in the testing data
X_test[numerical_features.columns] = scaler.transform(X_test[numerical_features.columns])

# Initialize CatBoostClassifier
catboost_model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=3,
    cat_features=categorical_features_indices,
    verbose=10
)

# Train CatBoostClassifier
catboost_model.fit(X_train, y_train)

# Predict on the testing set
y_pred = catboost_model.predict(X_test)


# Predict probabilities on the testing set using the best estimator
y_pred_proba = catboost_model.predict_proba(X_test)

# Calculate the Log Loss with the best estimator
logloss = log_loss(y_test, y_pred_proba)

print(f'Best Log Loss: {logloss}')


0:	learn: 1.0794822	total: 60.2ms	remaining: 5.96s
10:	learn: 0.9910403	total: 76.6ms	remaining: 619ms
20:	learn: 0.9645600	total: 90.4ms	remaining: 340ms
30:	learn: 0.9508591	total: 104ms	remaining: 232ms
40:	learn: 0.9413634	total: 118ms	remaining: 170ms
50:	learn: 0.9335290	total: 132ms	remaining: 127ms
60:	learn: 0.9284762	total: 149ms	remaining: 95ms
70:	learn: 0.9223885	total: 166ms	remaining: 67.8ms
80:	learn: 0.9174896	total: 185ms	remaining: 43.3ms
90:	learn: 0.9105336	total: 207ms	remaining: 20.5ms
99:	learn: 0.9051728	total: 226ms	remaining: 0us


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
X_test

In [None]:
# Create a CatBoost classifier instance
catboost = CatBoostClassifier(cat_features=categorical_features_indices, verbose=0)

# Define the pipeline
pipeline = Pipeline([
    ('classifier', catboost)
])

# Define the hyperparameters grid to be tested
params_grid = {
    'classifier__iterations': [25, 50, 100],
    'classifier__depth': [5, 6, 7, 8],
    'classifier__learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2]
    # You can add other parameters you want to tune
}

# Setup the GridSearchCV object
# Note: Scoring is now 'neg_log_loss' to use log loss. The "negative" is because GridSearchCV always tries to maximize its score, 
# so negating the log loss makes it a score to be maximized (lower log loss is better).
grid_search = GridSearchCV(pipeline, params_grid, cv=5, scoring='neg_log_loss', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Predict probabilities on the testing set using the best estimator
y_pred_proba = best_estimator.predict_proba(X_test)

# Calculate the Log Loss with the best estimator
logloss = log_loss(y_test, y_pred_proba)

print(f'Best parameters: {best_params}')
print(f'Best Log Loss: {logloss}')


In [None]:
import pickle
# Save the scaler object
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [None]:
# Get feature importances from the best estimator
feature_importances = best_estimator.named_steps['classifier'].get_feature_importance()

# Get feature names
feature_names = X_train.columns

# Create a DataFrame to display feature importances
importances_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})

# Sort the DataFrame to show the most important features at the top
importances_df.sort_values(by='Importance', ascending=False, inplace=True)

print(importances_df)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Ensure your DataFrame 'importances_df' is defined as in your provided code

# Create the bar plot
plt.figure(figsize=(10, 6))
sns.barplot(data=importances_df, x='Importance', y='Feature')

# Add plot labels and title
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importances')

# Show the plot
plt.show()


In [None]:
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, log_loss

# Assuming X_train, X_test, y_train, y_test, and categorical_features_indices are already defined

# Create a CatBoost classifier instance
catboost = CatBoostClassifier(cat_features=categorical_features_indices, verbose=0)

# Define the pipeline
pipeline = Pipeline([
    # Here you can add other preprocessing steps if needed
    ('classifier', catboost)
])

# Define the hyperparameters grid to be tested
params_grid = {
    'classifier__iterations': [25, 50, 100],
    'classifier__depth': [5, 6, 7, 8],
    'classifier__learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2]
    # You can add other parameters you want to tune
}

# Setup the GridSearchCV object
# Change scoring to 'neg_log_loss'
grid_search = GridSearchCV(pipeline, params_grid, cv=5, scoring='neg_log_loss', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Best hyperparameters and estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Predict on the testing set using the best estimator
y_pred = best_estimator.predict(X_test)

# Predict probabilities for log loss calculation
y_pred_proba = best_estimator.predict_proba(X_test)

# Calculate various scores with the best estimator
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
logloss = log_loss(y_test, y_pred_proba)

# Print the best parameters and all scores
print(f'Best parameters: {best_params}')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print(f'Log Loss: {logloss:.4f}')

# Print the full classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


In [None]:
X_train.columns

In [None]:
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np

# Assuming X_train, X_test, y_train, y_test, and categorical_features_indices are already defined

# Create a CatBoost classifier instance
catboost = CatBoostClassifier(cat_features=categorical_features_indices, verbose=0)

# Define the pipeline
pipeline = Pipeline([
    # Here you can add other preprocessing steps if needed
    ('classifier', catboost)
])

# Define the hyperparameters grid to be tested
params_grid = {
    'classifier__iterations': [25, 50, 100],
    'classifier__depth': [5, 6, 7, 8],
    'classifier__learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2]
    # You can add other parameters you want to tune
}

# Setup the GridSearchCV object
grid_search = GridSearchCV(pipeline, params_grid, cv=5, scoring='neg_log_loss', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Best hyperparameters and estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Predict on the testing set using the best estimator
y_pred = best_estimator.predict(X_test)

# Predict probabilities for log loss calculation
y_pred_proba = best_estimator.predict_proba(X_test)

# Calculate various scores with the best estimator
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
logloss = log_loss(y_test, y_pred_proba)

# Print the best parameters and all scores
print(f'Best parameters: {best_params}')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print(f'Log Loss: {logloss:.4f}')

# Print the full classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Calculate class-wise accuracy
class_wise_accuracy = cm.diagonal() / cm.sum(axis=1)

# Print class-wise accuracy
print("\nClass-wise Accuracy:")
for i, accuracy in enumerate(class_wise_accuracy):
    print(f'Class {i} Accuracy: {accuracy:.4f}')


In [None]:
cm

## K-Folds so I can plot log-loss

In [None]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import log_loss
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline




# Define a TimeSeriesSplit cross-validation strategy
tscv = TimeSeriesSplit(n_splits=5)

best_log_loss = np.inf
best_params = None

# Define your hyperparameters grid to be tested
iterations_options = [150, 175, 200, 225]
depth_options = [1, 2, 3, 4]
learning_rate_options = [.01, .025, 0.05, 0.1, .25]
l2_reg_options = [3, 4, 5]  # L2 regularization values

# Find the best hyperparameters
for iterations in iterations_options:
    for depth in depth_options:
        for learning_rate in learning_rate_options:
            for l2_reg in l2_reg_options:
                catboost = CatBoostClassifier(iterations=iterations, depth=depth, 
                                              learning_rate=learning_rate, 
                                              l2_leaf_reg=l2_reg,
                                              cat_features=categorical_features_indices, 
                                              verbose=0)
                pipeline = Pipeline([('classifier', catboost)])
                
                val_scores = []

                for train_index, val_index in tscv.split(X_train):
                    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
                    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

                    pipeline.fit(X_train_fold, y_train_fold)
                    y_val_pred_proba = pipeline.predict_proba(X_val_fold)
                    val_log_loss = log_loss(y_val_fold, y_val_pred_proba)
                    val_scores.append(val_log_loss)

                avg_val_log_loss = np.mean(val_scores)
                if avg_val_log_loss < best_log_loss:
                    best_log_loss = avg_val_log_loss
                    best_params = (iterations, depth, learning_rate, l2_reg)

# Display best hyperparameters
print(f"Best Hyperparameters: Iterations={best_params[0]}, Depth={best_params[1]}, LR={best_params[2]}, L2={best_params[3]}")

# (Continue with your model training and evaluation as needed)


In [None]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import log_loss
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline

# Assuming X_train, y_train, and categorical_features_indices are already defined

# Define a TimeSeriesSplit cross-validation strategy
tscv = TimeSeriesSplit(n_splits=5)

best_log_loss = np.inf
best_params = None


# Define your hyperparameters grid to be tested
iterations_options = [150, 175, 200, 225]
depth_options = [1, 2, 3, 4]
learning_rate_options = [.01, .025, 0.05, 0.1, .25]
l2_reg_options = [3, 4, 5]  # L2 regularization values

# To track training and validation log losses
all_train_log_losses = []
all_val_log_losses = []

# Find the best hyperparameters
for iterations in iterations_options:
    for depth in depth_options:
        for learning_rate in learning_rate_options:
            for l2_reg in l2_reg_options:
                catboost = CatBoostClassifier(iterations=iterations, depth=depth, 
                                              learning_rate=learning_rate, 
                                              l2_leaf_reg=l2_reg,
                                              cat_features=categorical_features_indices, 
                                              verbose=0)
                pipeline = Pipeline([('classifier', catboost)])
                
                train_log_losses = []
                val_log_losses = []

                for train_index, val_index in tscv.split(X_train, y_train):
                    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
                    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

                    pipeline.fit(X_train_fold, y_train_fold)

                    # Track training log loss
                    y_train_pred_proba = pipeline.predict_proba(X_train_fold)
                    train_log_loss = log_loss(y_train_fold, y_train_pred_proba)
                    train_log_losses.append(train_log_loss)

                    # Track validation log loss
                    y_val_pred_proba = pipeline.predict_proba(X_val_fold)
                    val_log_loss = log_loss(y_val_fold, y_val_pred_proba)
                    val_log_losses.append(val_log_loss)

                avg_train_log_loss = np.mean(train_log_losses)
                avg_val_log_loss = np.mean(val_log_losses)

                # Store the log losses for analysis
                all_train_log_losses.append(train_log_losses)
                all_val_log_losses.append(val_log_losses)

                if avg_val_log_loss < best_log_loss:
                    best_log_loss = avg_val_log_loss
                    best_params = (iterations, depth, learning_rate, l2_reg)

# Display best hyperparameters
print(f"Best Hyperparameters: Iterations={best_params[0]}, Depth={best_params[1]}, LR={best_params[2]}, L2={best_params[3]}")

# Analyze the stored log losses for overfitting
# ...


In [None]:
import matplotlib.pyplot as plt



# Plotting the training and validation log losses for the best hyperparameters
plt.figure(figsize=(10, 6))
plt.plot(range(1, 6), train_log_losses, 'o-', color="r", label="Training Log Loss")
plt.plot(range(1, 6), val_log_losses, 'o-', color="g", label="Validation Log Loss")
plt.title('Training vs Validation Log Loss for Best Hyperparameters')
plt.xlabel('Fold')
plt.ylabel('Log Loss')
plt.xticks(range(1, 6))
plt.legend()
plt.grid(True)
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Plotting the results
plt.figure(figsize=(12, 6))
plt.plot(train_log_losses, label='Train Log Loss')
plt.plot(val_log_losses, label='Validation Log Loss')
plt.xlabel('Fold')
plt.ylabel('Log Loss')
plt.title(f'Training vs Validation Log Loss (Best Params: Iterations={best_params[0]}, Depth={best_params[1]}, LR={best_params[2]})')
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Assuming you have train_log_losses and val_log_losses from the previous cross-validation
data_to_plot = [train_log_losses, val_log_losses]

plt.figure(figsize=(10, 6))
plt.boxplot(data_to_plot, labels=['Train Log Loss', 'Validation Log Loss'])
plt.title('Box Plot of Model Performance Across Folds')
plt.ylabel('Log Loss')
plt.show()


Final Eval of model

In [None]:
# Initialize the model with the best hyperparameters
best_model = CatBoostClassifier(
    iterations=75, 
    depth=2, 
    learning_rate=0.05, 
    l2_leaf_reg=1,
    cat_features=categorical_features_indices, 
    verbose=0
)

# Train the model on the entire training dataset
best_model.fit(X_train, y_train)


In [None]:
# Make predictions on the test set
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)

# Evaluate the model's performance
test_log_loss = log_loss(y_test, y_pred_proba)
print("Log Loss on Test Set:", test_log_loss)

# You may also compute other metrics, depending on your requirements


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Create a confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)

# Plot the confusion matrix
disp.plot()
plt.show()


In [None]:
# Assuming best_model is your trained CatBoostClassifier
feature_importances = best_model.get_feature_importance()

# Create a bar chart for feature importances
plt.figure(figsize=(12, 6))
plt.bar(range(len(feature_importances)), feature_importances)
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance')
plt.xticks(range(len(feature_importances)), X_train.columns, rotation=90)
plt.show()


In [None]:
import shap

# Create a SHAP explainer object using your trained CatBoost model
explainer = shap.TreeExplainer(best_model)

# Calculate SHAP values for the test set
shap_values = explainer.shap_values(X_second_test)  # Replace X_second_test with your second test set features

# For a specific instance (adjust instance_index as needed)
instance_index = 0  # Change this index to explore different instances

# Generate a force plot for that instance
# Make sure to pass the correct arguments based on your SHAP version
shap.force_plot(explainer.expected_value, shap_values[instance_index], X_second_test.iloc[instance_index], matplotlib=True)


In [None]:
# Assuming the best_estimator from the GridSearchCV is already trained and available

# Predicting probabilities for each class for each record in the test set
class_probabilities_TS = best_model.predict_proba(X_test)

# Displaying the first few records of the probabilities
print(class_probabilities_TS[:5])


In [None]:
# Assuming the best_estimator from the GridSearchCV is already trained and available

# Predicting probabilities for each class for each record in the test set
class_probabilities = best_estimator.predict_proba(X_test)

# Displaying the first few records of the probabilities
print(class_probabilities[:5])


In [None]:
probability_df = pd.DataFrame(class_probabilities_TS, columns=['Probability_Home_win', 'Probability_Draw', 'Probability_Away_win'])

# Create a new column in X_test to store the original index values
X_test['original_index'] = X_test.index

# Reset the index of X_test
X_test_reset = X_test.reset_index(drop=True)

# Create the probability DataFrame
probability_df_TS = pd.DataFrame(class_probabilities_TS, columns=['Probability_Home_win', 'Probability_Draw', 'Probability_Away_win'])

# Concatenate the reset X_test DataFrame and the probability DataFrame
result_df = pd.concat([X_test_reset, probability_df_TS], axis=1)

# Displaying the first few records of the resulting DataFrame
result_df


In [None]:
test_predicted_prob = result_df[['original_index','Probability_Home_win', 'Probability_Draw', 'Probability_Away_win']]

In [None]:
test_predicted_prob

In [None]:
csv_file_path = '/Users/lkimball/Desktop/Flatiron/CapstoneProject/test_predicted_prob.csv'
test_predicted_prob.to_csv(csv_file_path, index=True)

In [None]:

with open('best_catboost_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)