### Pre-Processing

In [109]:
!pip install pytorch-lightning 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [110]:
from typing import Any

import numpy as np
import pandas as pd
import pytorch_lightning as pl
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from torch import nn, Tensor
from torch.utils.data.dataloader import DataLoader, Dataset
from sklearn.linear_model import LogisticRegression
import torch.nn.functional as F
import matplotlib.pyplot as plt

import torch

from sklearn.model_selection import GridSearchCV
# from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization 
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from keras.utils import to_categorical
from sklearn.metrics import precision_recall_curve,PrecisionRecallDisplay
from sklearn.decomposition import PCA
from sklearn.calibration import CalibratedClassifierCV
from tqdm import tqdm
from torch import optim

In [111]:
data=pd.read_csv('joined.csv')
df2 = pd.read_csv('profit_df.csv')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8648 entries, 0 to 8647
Data columns (total 71 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Home                            8648 non-null   object 
 1   Away                            8648 non-null   object 
 2   1_PROB                          8648 non-null   float64
 3   home_win                        8648 non-null   int64  
 4   season                          8648 non-null   int64  
 5   home_cumulative_season_win      8411 non-null   float64
 6   away_cumulative_season_win      8412 non-null   float64
 7   diff_cumulative_season_win      8400 non-null   float64
 8   home_cumulative_win             8584 non-null   float64
 9   away_cumulative_win             8577 non-null   float64
 10  diff_cumulative_win             8562 non-null   float64
 11  home_cumulative_season_xG_diff  8411 non-null   float64
 12  away_cumulative_season_xG_diff  84

In [112]:
data['home_last8_false_pred_exp'] = df2['home_last8_false_pred_exp']
data['away_last8_false_pred_exp'] = df2['away_last8_false_pred_exp']

In [113]:
data = data.dropna()

In [114]:
data = data.drop(columns=['Home','Away'])

In [115]:
data.shape

(7473, 71)

In [116]:
train = data.loc[data.season<=3]
val = data.loc[data.season==4]
test = data.loc[data.season==5]

tmp = list(data.columns)
tmp.remove('home_win')
# tmp.remove('season')

train_X = train[tmp]
train_y = train['home_win']

val_X = val[tmp]
val_y = val['home_win']

test_X = test[tmp]
test_y = test['home_win']

#### Dataset Class

In [117]:
# Dataset class 

class MLDataset(Dataset):
    def __init__(self, module='train', path='profit_df.csv', cols=None):
        df = pd.read_csv(path)
        cols = ['1_PROB', 'home_win', 'season', 'diff_cumulative_season_win',
       'diff_cumulative_win', 'diff_cumulative_season_xG_diff',
       'diff_cumulative_xG_diff', 'diff_last12_win_exp_0.25',
       'diff_last12_xG_diff_exp_0.25'] if cols is None else cols
       # , 'home_last8_false_pred_exp','away_last8_false_pred_exp' add these back later
        df = df[cols].dropna()
        # print(df.columns)
        if module == 'train':
            data = df.loc[df.season <= 3].drop(columns=['season'])
        elif module == 'val':
            data = df.loc[df.season == 4].drop(columns=['season'])
        else:
            assert module == 'test'
            data = df.loc[df.season == 5].drop(columns=['season'])
        tmp = list(data.columns)
        tmp.remove('home_win')





        X = data[tmp]
        y = data['home_win']

        self.train_X = torch.Tensor(X.values)
        self.train_y = torch.Tensor(y.values)

    def __len__(self):
        return self.train_X.shape[0]

    def __getitem__(self, idx):
        if self.train_y[idx].shape == torch.Size([]):
            return self.train_X[idx], self.train_y[idx].view(1)
        return self.train_X[idx], self.train_y[idx].view(-1, 1)




In [118]:
class MLDataset2(Dataset):
    def __init__(self, X, y):
        
        self.train_X = torch.Tensor(X)
        self.train_y = torch.Tensor(y)

    def __len__(self):
        return self.train_X.shape[0]

    def __getitem__(self, idx):
        if self.train_y[idx].shape == torch.Size([]):
            return self.train_X[idx], self.train_y[idx].view(1)
        return self.train_X[idx], self.train_y[idx].view(-1, 1)


### Custom Logistic Regression classes


#### Base Logistic Regression Classes

In [119]:
class CustomLogReg(pl.LightningModule):
    def __init__(self, in_features):
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(in_features, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, batch) -> Any:
        return self.model(batch)

    def training_step(self, train_batch, batch_idx):
        X, y = train_batch
        probs = X[:, 0]

        logits = self.forward(X)

        loss = self.custom_loss(logits, y, probs)

        self.log("train_loss", loss, on_step=True, on_epoch=True)

        return loss

    def validation_step(self, val_batch, batch_idx):
        X, y = val_batch
        probs = X[:, 0].view(-1, 1)

        logits = self.forward(X)

        loss = self.custom_loss(logits, y, probs)
        # profit = self.profit(logits, y, probs)
        acc = (logits > 0.5) == y

        self.log("val_loss", loss, on_step=True, on_epoch=True)
        # self.log("val_profit", profit, on_step=True, on_epoch=True)
        self.log("val_acc", acc.type(torch.float64).mean(), on_step=True, on_epoch=True)

        return loss

    def test_step(self, test_batch, batch_idx):
        X, y = test_batch
        probs = X[:, 0].view(-1, 1)

        logits = self.forward(X)

        loss = self.custom_loss(logits, y, probs)
        # profit = self.profit(logits, y, probs)
        acc = (logits > 0.5) == y

        self.log("test_loss", loss, on_step=True, on_epoch=True)
        # self.log("test_profit", profit, on_step=True, on_epoch=True)
        self.log("test_acc", acc.type(torch.float64).mean(), on_step=True, on_epoch=True)

        return loss

    def custom_loss(self, logits, y, probs):
        return F.binary_cross_entropy(logits, y)

    

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters())
        return optimizer

    def custom_test(self, test_dataset):

        X, y = test_dataset[:]
        probs = X[:, 0].view(-1, 1)

        logits = self.forward(X)

        loss = self.custom_loss(logits, y, probs)
        # profit = self.profit(logits, y, probs)
        acc = (logits > 0.5) == y
        #'profit': self.profit(logits, y, probs),
        return {'loss': self.custom_loss(logits, y, probs),
                'acc': acc.type(torch.float64).mean()}



#### Manual Logistic Regression class 

In [120]:
# class with manual cross entropy loss to make sure manual losses work
class NewLogReg(CustomLogReg):
    def custom_loss(self, logits, y, probs):
        loss = y * torch.log(logits) + (1-y) * torch.log(1 - logits)
        return -loss.mean()




#### weighted by prob Logistic Regression class

In [121]:
# class with cross entropy loss weighted by reward, outside ln
class ProbLogReg(CustomLogReg):
    def custom_loss(self, logits, y, probs):
        loss = probs * y * torch.log(logits) + (1-probs) * (1-y) * torch.log(1 - logits)
        return -loss.mean()



### Timed profit function

In [131]:
def get_timed_profit(model: CustomLogReg, dataset: MLDataset, I=100,  threshold: float = 0.):
    """
    when1 is when we would bet home win
    when2 is when we would bet home loss
    the profit when correctly betting home_win is ((1/probs) - 1) * I
    similarly for correct loss is   ((1/(1-probs)) - 1) * I
    incorrect is always -I
    profit is thus: (1[home_win] * win_mult - 1) * (pred > prob) * I
    #               ...  + (1[home_win] * loss_mult * (preds < threshold) - 1) * I
    """
    X, y = dataset[:]
    
    # pytorhc lightning models
    try:
      preds = model(X) 
      probs = X[:, 0].view(-1,1)
      when1 = preds > (probs + threshold)
      when2 = preds < (probs - threshold)

      profits = I * (y * (1 / probs) - 1) * when1 + I * ((1-y) * (1 / (1-probs)) - 1) * when2

      return profits.cumsum(0)
    # sklearn models
    except:
      preds = model.predict(X).reshape(-1,1)
      y = y.reshape(-1, 1)
      probs = X[:, 0].detach().numpy().reshape(-1,1)
      when1 = preds > (probs + threshold)
      when2 = preds < (probs - threshold)
      profits = I * (y * (1 / probs) - 1) * when1 + I * ((1-y) * (1 / (1-probs)) - 1) * when2
      
      return profits.cumsum(0)


In [139]:
def roi(model, dataset, I=1000, threshold=0):
  X,y = dataset[:]
  try:
    preds = model(X) 
    probs = X[:, 0].view(-1,1)
    when1 = preds > (probs + threshold)
    when2 = preds < (probs - threshold)

    profits = I * (y * (1 / probs) - 1) * when1 + I * ((1-y) * (1 / (1-probs)) - 1) * when2

    # sklearn models
  except:
    preds = model.predict(X).reshape(-1,1)
    y = y.reshape(-1, 1)
    probs = X[:, 0].detach().numpy().reshape(-1,1)
    when1 = preds > (probs + threshold)
    when2 = preds < (probs - threshold)
    profits = I * (y * (1 / probs) - 1) * when1 + I * ((1-y) * (1 / (1-probs)) - 1) * when2

  return profits.sum() / (len(dataset) * I)
  
  

## Pre-processing Pipeline

In [141]:
pipes1 = {}

pipes1['LogisticRegression'] = Pipeline([
    ('scaler',StandardScaler()),
    ('logr',LogisticRegression(max_iter=25000))
])
pipes1['LogisticRegressionWithPCA'] = Pipeline([
    ('scaler',StandardScaler()),
    ('pca',PCA()),
    ('logr',LogisticRegression(max_iter=25000))
])

In [142]:
pipes2 = {}

pipes2['problogreg'] = None

In [143]:
results = {}
results['ModelName'] = []
results['TestAccuracy'] = []
results['TestProfit'] = []
results['ROI'] = []
profit_graphs = []

In [144]:
params1 = {'logreg__C': 10, 'logreg__penalty':'l2'}
params2 = {'C': 10, 'penalty':'l2'}

In [145]:
models = {}
for k in pipes1.keys():
  models[k] = LogisticRegression(max_iter=1000, **params2).fit(train_X,train_y)
  results['ModelName'].append(k)
  results['TestAccuracy'].append(accuracy_score(test_y, models[k].predict(test_X)))
  probs = test_X.to_numpy()[:,0]
  t = MLDataset2(test_X.to_numpy(), test_y.to_numpy())
  results['TestProfit'].append(get_timed_profit(models[k], t)[-1])
  results['ROI'].append(roi(models[k], t, 1000))

  profit_graphs.append(get_timed_profit(models[k], MLDataset2(test_X.to_numpy(), test_y.to_numpy()),  I=1000)[-1])
  print(k, 'DONE')



LogisticRegression DONE
LogisticRegressionWithPCA DONE




In [146]:
trainers = {}
max_epochs=1

In [147]:
for k in pipes2.keys():
    trainers[k] = pl.Trainer(max_epochs=max_epochs, default_root_dir=f"checkpoint/{k}/")
    trainer = trainers[k]
    train_x_np = train_X.to_numpy()
    train_y_np = train_y.to_numpy()
    val_x_np = val_X.to_numpy()
    val_y_np = val_y.to_numpy()

    test_x_np = test_X.to_numpy()
    test_y_np = test_y.to_numpy()

    train_data = MLDataset2(train_x_np, train_y_np)
    val_data = MLDataset2(val_x_np, val_y_np )
    test_data = MLDataset2(test_x_np, test_y_np )

    train_loader = DataLoader(train_data)
    val_loader = DataLoader(val_data)
    test_loader = DataLoader(test_data)

    if k == 'problogreg':
      model = ProbLogReg(train_x_np.shape[1])
    elif k == 'torchlogreg':
      model = CustomLogReg(train_x_np.shape[1])
    else:
      raise AssertionError
    
    models[k] = model

    trainer.fit(model, train_loader, val_loader)

    logits = model(torch.Tensor(test_x_np)).detach().numpy()
    pred = logits > 0.5
    probs = test_x_np[:, 0]

    results['ModelName'].append(k)
    results['TestAccuracy'].append(accuracy_score(test_y_np, pred))
    results['TestProfit'].append(get_timed_profit(model, test_data, I=1000)[-1])
    results['ROI'].append(roi(model, test_data, I=1000))



INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 9.2 K 
-------------------------------------
9.2 K     Trainable params
0         Non-trainable params
9.2 K     Total params
0.037     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


In [148]:
profit_row = []
thresholds = [0.0, 0.025, 0.05, 0.075, 0.1]
for name, model in models.items():
  profits = np.zeros(len(thresholds))
  for i, thresh in enumerate(thresholds):
    profit = (thresh, get_timed_profit(model, test_data, I=100, threshold=thresh)[-1].numpy())
    profits[i] = profit[1]
    print(name, profit[0], profit[1])
  j = profits.argmax()
  print(f'max profit was achieved with threshold: {thresholds[j]} for model {name}')






LogisticRegression 0.0 [1785.0408]
LogisticRegression 0.025 [1785.0408]
LogisticRegression 0.05 [1779.9491]
LogisticRegression 0.075 [1816.8564]
LogisticRegression 0.1 [1843.8317]
max profit was achieved with threshold: 0.1 for model LogisticRegression
LogisticRegressionWithPCA 0.0 [1785.0408]
LogisticRegressionWithPCA 0.025 [1785.0408]
LogisticRegressionWithPCA 0.05 [1779.9491]
LogisticRegressionWithPCA 0.075 [1816.8564]
LogisticRegressionWithPCA 0.1 [1843.8317]
max profit was achieved with threshold: 0.1 for model LogisticRegressionWithPCA
problogreg 0.0 [1061.554]
problogreg 0.025 [2073.671]
problogreg 0.05 [3523.3242]
problogreg 0.075 [2482.1956]
problogreg 0.1 [4580.431]
max profit was achieved with threshold: 0.1 for model problogreg


On the validation we find the best confidence thresholds to use when betting for each model. Now let's train Our custom model on the entire development data with this 0.1 treshold.

In [154]:
max_epochs=1
k = "final_results"
dev_X = pd.concat([train_X,val_X])
dev_y = pd.concat([train_y,val_y])


trainers[k] = pl.Trainer(max_epochs=max_epochs, default_root_dir=f"checkpoint/{k}/")
trainer = trainers[k]
dev_x_np = dev_X.to_numpy()
dev_y_np = dev_y.to_numpy()

test_x_np = test_X.to_numpy()
test_y_np = test_y.to_numpy()

dev_data = MLDataset2(dev_x_np, dev_y_np)
test_data = MLDataset2(test_x_np, test_y_np )

dev_loader = DataLoader(dev_data)
test_loader = DataLoader(test_data)

model = ProbLogReg(dev_x_np.shape[1])

models[k] = model

trainer.fit(model, dev_loader)

logits = model(torch.Tensor(test_x_np)).detach().numpy()
pred = logits > 0.5
probs = test_x_np[:, 0]

results['ModelName'].append(k)
results['TestAccuracy'].append(accuracy_score(test_y_np, pred))
results['TestProfit'].append(get_timed_profit(model, test_data, I=1000, threshold=0.1)[-1])
results['ROI'].append(roi(model, test_data, 1000, threshold=0.1))

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
  rank_zero_warn(
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 9.2 K 
-------------------------------------
9.2 K     Trainable params
0         Non-trainable params
9.2 K     Total params
0.037     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


In [157]:
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,ModelName,TestAccuracy,TestProfit,ROI
0,LogisticRegression,0.66882,[tensor(1785.0408)],tensor(0.0105)
1,LogisticRegressionWithPCA,0.66882,[tensor(1785.0408)],tensor(0.0105)
2,problogreg,0.657663,[tensor(10615.5391)],tensor(0.0062)
5,final_results,0.662361,[tensor(43663.2656)],tensor(0.0256)


In [151]:
profit_over_time = {}
test_data = MLDataset2(test_X.to_numpy(), test_y.to_numpy())
for name, model in models.items():
  profit_over_time[name] = get_timed_profit(model, test_data, I=1000, threshold=0.05)
  plt.plot(np.arange(len(test_data)), profit_over_time[name])
  plt.xlabel('Time')
  plt.ylabel('Profit')
  plt.title(f'Profit over time for {k}')
  plt.savefig(f'{name}_profit.png')
  plt.clf()



<Figure size 432x288 with 0 Axes>

In [152]:
from joblib import dump, load
dump(filename='CustomLogisticRegression.pkl', value=models['problogreg'])

['CustomLogisticRegression.pkl']