# Libraries

In [89]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [90]:
# Load the match table from the CSV that sits next to this notebook and preview it.
fixture = pd.read_csv(filepath_or_buffer='fixture.csv')
fixture

Unnamed: 0,year,round,date,time,region,venue,hteam,ateam,hscore,ascore,home_win,hdiff
0,2017,1,2017-03-26,14:50:00,SA,Adelaide Oval,Adelaide,Greater Western Sydney,147,91,1,56
1,2017,1,2017-03-23,19:20:00,VIC,M.C.G.,Carlton,Richmond,89,132,0,-43
2,2017,1,2017-03-24,19:50:00,VIC,M.C.G.,Collingwood,Western Bulldogs,86,100,0,-14
3,2017,1,2017-03-25,19:25:00,VIC,M.C.G.,Essendon,Hawthorn,116,91,1,25
4,2017,1,2017-03-26,16:40:00,WA,Subiaco,Fremantle,Geelong,73,115,0,-42
...,...,...,...,...,...,...,...,...,...,...,...,...
1408,2023,26,2023-09-15,19:50:00,VIC,M.C.G.,Melbourne,Carlton,71,73,0,-2
1409,2023,26,2023-09-16,19:10:00,SA,Adelaide Oval,Port Adelaide,Greater Western Sydney,70,93,0,-23
1410,2023,27,2023-09-23,17:15:00,QLD,Gabba,Brisbane Lions,Carlton,79,63,1,16
1411,2023,27,2023-09-22,19:50:00,VIC,M.C.G.,Collingwood,Greater Western Sydney,58,57,1,1


# Modeling

## Logistic Regression Model

In [91]:
# Preview only the rows whose `year` is 2023.
fixture[fixture['year'].eq(2023)]

Unnamed: 0,year,round,date,time,region,venue,hteam,ateam,hscore,ascore,home_win,hdiff
1197,2023,1,2023-03-17,19:40:00,VIC,M.C.G.,Geelong,Collingwood,103,125,0,-22
1198,2023,1,2023-03-18,19:00:00,QLD,Carrara,Gold Coast,Sydney,61,110,0,-49
1199,2023,1,2023-03-19,13:10:00,NSW,Sydney Showground,Greater Western Sydney,Adelaide,106,90,1,16
1200,2023,1,2023-03-19,15:20:00,VIC,M.C.G.,Hawthorn,Essendon,65,124,0,-59
1201,2023,1,2023-03-18,19:25:00,VIC,M.C.G.,Melbourne,Western Bulldogs,115,65,1,50
...,...,...,...,...,...,...,...,...,...,...,...,...
1408,2023,26,2023-09-15,19:50:00,VIC,M.C.G.,Melbourne,Carlton,71,73,0,-2
1409,2023,26,2023-09-16,19:10:00,SA,Adelaide Oval,Port Adelaide,Greater Western Sydney,70,93,0,-23
1410,2023,27,2023-09-23,17:15:00,QLD,Gabba,Brisbane Lions,Carlton,79,63,1,16
1411,2023,27,2023-09-22,19:50:00,VIC,M.C.G.,Collingwood,Greater Western Sydney,58,57,1,1


In [113]:
# Parse the match date, then derive the calendar month and the weekday name from it.
fixture['date'] = pd.to_datetime(fixture['date'])
match_date = fixture['date']
fixture['month'] = match_date.dt.month
fixture['day'] = match_date.dt.day_name()

# Encode the start time as an HHMM integer: strip the colons from 'HH:MM:SS'
# to get HHMMSS, divide by 100 and truncate to drop the seconds.
hhmmss = fixture['time'].str.replace(':', '').astype(int)
fixture['time_int'] = (hhmmss / 100).astype(int)

# One-hot encode the categorical columns as 0/1 integer indicators.
categorical_cols = ['region', 'hteam', 'ateam', 'day']
fixture_oh = pd.get_dummies(data=fixture, columns=categorical_cols, dtype='int')
fixture_oh

Unnamed: 0,year,round,date,time,venue,hscore,ascore,home_win,hdiff,month,...,ateam_Sydney,ateam_West Coast,ateam_Western Bulldogs,day_Friday,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday
0,2017,1,2017-03-26,14:50:00,Adelaide Oval,147,91,1,56,3,...,0,0,0,0,0,0,1,0,0,0
1,2017,1,2017-03-23,19:20:00,M.C.G.,89,132,0,-43,3,...,0,0,0,0,0,0,0,1,0,0
2,2017,1,2017-03-24,19:50:00,M.C.G.,86,100,0,-14,3,...,0,0,1,1,0,0,0,0,0,0
3,2017,1,2017-03-25,19:25:00,M.C.G.,116,91,1,25,3,...,0,0,0,0,0,1,0,0,0,0
4,2017,1,2017-03-26,16:40:00,Subiaco,73,115,0,-42,3,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1408,2023,26,2023-09-15,19:50:00,M.C.G.,71,73,0,-2,9,...,0,0,0,1,0,0,0,0,0,0
1409,2023,26,2023-09-16,19:10:00,Adelaide Oval,70,93,0,-23,9,...,0,0,0,0,0,1,0,0,0,0
1410,2023,27,2023-09-23,17:15:00,Gabba,79,63,1,16,9,...,0,0,0,0,0,1,0,0,0,0
1411,2023,27,2023-09-22,19:50:00,M.C.G.,58,57,1,1,9,...,0,0,0,1,0,0,0,0,0,0


In [114]:
# Target is the `home_win` label. The feature matrix excludes the label itself,
# the score-derived columns (ascore, hscore, hdiff) and the raw date/time/venue
# columns, which are not numeric model inputs.
y = fixture_oh['home_win']
X = fixture_oh.drop(columns=['home_win', 'ascore', 'hscore', 'hdiff', 'date', 'time', 'venue'])

In [115]:
# Hold out the final 20% of rows as the test set. shuffle=False keeps the rows
# in their existing order, so the test set is the tail of the table.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [116]:
# Display the held-out feature rows to sanity-check which matches ended up in the test set.
X_test

Unnamed: 0,year,round,month,time_int,region_ACT,region_CHN,region_NSW,region_NT,region_QLD,region_SA,...,ateam_Sydney,ateam_West Coast,ateam_Western Bulldogs,day_Friday,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday
1130,2022,17,7,1900,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1131,2022,17,7,1925,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1132,2022,17,7,1950,0,0,1,0,0,0,...,0,0,1,1,0,0,0,0,0,0
1133,2022,17,7,1440,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1134,2022,18,7,1315,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1408,2023,26,9,1950,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1409,2023,26,9,1910,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1410,2023,27,9,1715,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
1411,2023,27,9,1950,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [120]:
# Baseline classifier: default logistic regression settings, with the
# iteration cap raised to 500.
model = LogisticRegression(max_iter=500)
model.fit(X=X_train, y=y_train)

In [121]:
# Score the baseline on the held-out rows and print per-class precision/recall/F1.
y_pred = model.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.56      0.47      0.51       120
           1       0.65      0.72      0.69       163

    accuracy                           0.62       283
   macro avg       0.61      0.60      0.60       283
weighted avg       0.61      0.62      0.61       283



In [122]:
def objective(trial):
    """Objective for hyperparameter search: validation accuracy of a logistic regression.

    Samples ``C``, ``penalty`` and ``solver`` from ``trial``, fits a
    ``LogisticRegression`` on the leading part of the module-level
    ``X_train`` / ``y_train`` and scores it on the trailing part. The
    hold-out ``X_test`` / ``y_test`` is deliberately not used here, so the
    final test score stays an unbiased estimate after tuning.

    Parameters
    ----------
    trial
        Object exposing ``suggest_float`` and ``suggest_categorical``
        (presumably an Optuna trial; the study setup is not in this cell).

    Returns
    -------
    float
        Accuracy on the validation slice, or ``nan`` when the sampled
        solver/penalty pair is one scikit-learn does not support.
    """
    # Hyperparameters to be tuned. The three base suggestions keep fixed names
    # and fixed choice lists, so the search space is the same for every trial.
    C = trial.suggest_float('C', 1e-4, 1e4, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet', 'none'])
    solver = trial.suggest_categorical('solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])

    # Penalties each solver accepts (per the LogisticRegression documentation).
    supported_penalties = {
        'newton-cg': {'l2', 'none'},
        'lbfgs': {'l2', 'none'},
        'liblinear': {'l1', 'l2'},
        'sag': {'l2', 'none'},
        'saga': {'l1', 'l2', 'elasticnet', 'none'},
    }
    if penalty not in supported_penalties[solver]:
        # Fitting would raise ValueError. Returning NaN lets the search record
        # the trial as failed and move on (Optuna treats NaN objectives this
        # way) instead of aborting the whole study.
        return float('nan')

    params = {'max_iter': 500, 'solver': solver}
    if penalty == 'none':
        # The string 'none' is deprecated in scikit-learn; None is the
        # supported spelling (scikit-learn >= 1.2). C is ignored without a
        # penalty, so it is not passed, which avoids a warning.
        params['penalty'] = None
    else:
        params['penalty'] = penalty
        params['C'] = C
        if penalty == 'elasticnet':
            # elasticnet requires an explicit L1/L2 mixing ratio.
            params['l1_ratio'] = trial.suggest_float('l1_ratio', 0.0, 1.0)

    # Model definition
    model = LogisticRegression(**params)

    # Carve a validation slice off the end of the training data. shuffle=False
    # mirrors how the train/test split was made, so validation rows always
    # come after the rows the model was fitted on.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, shuffle=False
    )

    # Fit model
    model.fit(X_fit, y_fit)

    # Predict and calculate accuracy on the validation slice
    y_val_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_val_pred)

    return accuracy