In [14]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [15]:
# List of common encodings to try
encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']

data = None
for encoding in encodings:
    try:
        data = pd.read_csv("RETIRED/GOAL_DATA.CSV", encoding=encoding)
        print(f"Successfully read the file with encoding: {encoding}")
        break
    except UnicodeDecodeError:
        print(f"Failed to decode with encoding: {encoding}")

# if data is not None:
#     # View the data to ensure it was read correctly
#     print(data.head())
# else:
#     print("Unable to read the file with the tested encodings.")

data


Failed to decode with encoding: utf-8
Successfully read the file with encoding: latin1


Unnamed: 0,country,league,alert,verif,sezonul,datameci,orameci,etapa,txtechipa1,place1t,...,wdlh3,wdlh4,wdlh5,wdla1,wdla2,wdla3,wdla4,wdla5,codechipa1,codechipa2
0,England,Premier L,,,21,12/01/2021,2015,1,Burnley,16.0,...,0,0,0,0,0,0,0,0,1030,1020
1,England,Premier L,,,21,13/09/2020,1630,1,Tottenham,1.0,...,0,0,0,0,0,0,0,0,1001,1017
2,England,Premier L,,,21,12/09/2020,2000,1,West Ham,14.0,...,0,0,0,0,0,0,0,0,1022,1016
3,England,Premier L,,,21,13/09/2020,1400,1,West Bromwich,5.0,...,0,0,0,0,0,0,0,0,1006,1042
4,England,Premier L,,,21,20/01/2021,1800,1,Man City,2.0,...,0,0,0,0,0,0,0,0,1009,1003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54943,Mexico,Primera Divisio,,,24,11/11/2023,2300,17,Queretaro,15.0,...,0,0,0,0,0,0,0,0,41011,41018
54944,Mexico,Primera Divisio,,,24,12/11/2023,100,17,Pumas UNAM,5.0,...,0,0,0,0,0,0,0,0,41014,41013
54945,Mexico,Primera Divisio,,,24,12/11/2023,310,17,Tigres,3.0,...,0,0,0,0,0,0,0,0,41005,41012
54946,Mexico,Primera Divisio,,,24,13/11/2023,0,17,Cruz Azul,16.0,...,0,0,0,0,0,0,0,0,41003,41017


In [16]:
col_dict = {
    "country": "Country",
    "league": "League",
    "datameci": "Date",
    "etapa": "Round",
    "txtechipa1": "home_team",
    "txtechipa2": "away_team",
    "place1t": "Home_team_place_total",
    "place1a": "Home_team_place_home",
    "place2t": "Away_team_place_total",
    "place2d": "Away_team_place_away",
    "customh": "o2.5_avg_season",
    "customa": "o2.5_avg_past5",
    "custom3": "o3.5_avg_season",
    "custom4": "o3.5_avg_past5",
    "home_val": "H_ELO_avg",
    "home_val_2": "H_ELO_avg_opp",
    "home_val_3": "H_win_avg",
    "home_val_4": "H_win_1.5_avg",
    "home_val_5": "H_gg_avg",
    "away_val": "A_ELO_avg",
    "away_val_2": "A_ELO_avg_opp",
    "away_val_3": "A_win_avg",
    "away_val_4": "A_win_1.5_avg",
    "away_val_5": "A_gg_avg",
    "scor1": "home_goals",
    "scor2": "away_goals",
    "cotao": "o2.5_odds",
}

data = data.rename(columns=col_dict).filter(items=col_dict.values())

In [17]:
# Convert Date column to datetime type
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')

# Order by date
data = data.sort_values(by='Date')

data = data[data["Round"]>=8]
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,Country,League,Date,Round,home_team,away_team,Home_team_place_total,Home_team_place_home,Away_team_place_total,Away_team_place_away,...,H_win_1.5_avg,H_gg_avg,A_ELO_avg,A_ELO_avg_opp,A_win_avg,A_win_1.5_avg,A_gg_avg,home_goals,away_goals,o2.5_odds
0,Mexico,Primera Divisio,2020-09-03,8,Club America,Mazatlan,3.0,9,16.0,14,...,66.667,66.667,1952.7,1984.7,0.000,0.000,66.667,3,1,1.57
1,Mexico,Primera Divisio,2020-09-04,8,Pachuca,San Luis,7.0,11,17.0,9,...,0.000,50.000,1939.3,1931.3,33.333,33.333,66.667,3,1,1.90
2,Mexico,Primera Divisio,2020-09-04,8,Queretaro,Toluca,13.0,6,6.0,11,...,33.333,66.667,1924.0,1945.0,25.000,0.000,100.000,4,1,1.79
3,Mexico,Primera Divisio,2020-09-05,8,Necaxa,Leon,12.0,9,2.0,6,...,0.000,50.000,2083.3,2003.7,33.333,0.000,0.000,0,2,1.76
4,Mexico,Primera Divisio,2020-09-05,8,Juarez,Santos Laguna,15.0,16,11.0,16,...,0.000,0.000,2092.7,2041.7,0.000,0.000,100.000,1,1,1.81
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43144,Brazil,Serie A,2024-11-27,36,Atletico Mineiro,Juventude RS,10.0,15,15.0,19,...,17.647,64.706,1723.1,1777.4,5.882,0.000,76.471,2,3,2.50
43145,Argentina,Primera Divisio,2024-11-27,24,Sarmiento,Platense,26.0,25,11.0,3,...,8.333,25.000,1944.3,1993.9,33.333,25.000,41.667,1,0,3.60
43146,Argentina,Primera Divisio,2024-11-27,24,Union de Santa Fe,Talleres,6.0,4,4.0,4,...,33.333,16.667,2102.4,1959.9,33.333,16.667,41.667,2,3,3.30
43147,Brazil,Serie A,2024-11-28,35,Cruzeiro,Gremio,7.0,7,14.0,11,...,35.294,52.941,1767.4,1794.1,17.647,11.765,47.059,1,1,2.16


In [18]:
data['total_goals'] = data['home_goals'] + data['away_goals']
data['over_2.5_goals'] = data['total_goals'].apply(lambda x: 1 if x > 2.5 else 0)

In [19]:
data_ready = data.drop(columns=['home_team', 'away_team', 'home_goals', 'away_goals', 'total_goals', 'o2.5_odds'])
# Apply one-hot encoding to 'Country' and 'League'
data_ready = pd.get_dummies(data_ready, columns=['Country', 'League'])
data_ready

Unnamed: 0,Date,Round,Home_team_place_total,Home_team_place_home,Away_team_place_total,Away_team_place_away,o2.5_avg_season,o2.5_avg_past5,o3.5_avg_season,o3.5_avg_past5,...,League_Segunda,League_Serie A,League_Serie B,League_Super L,League_Super League,League_Super Lig,League_Superettan,League_Superliga,League_Superligaen,League_Urvalsdeild
0,2020-09-03,8,3.0,9,16.0,14,85.71,80.0,57.14,60.0,...,False,False,False,False,False,False,False,False,False,False
1,2020-09-04,8,7.0,11,17.0,9,28.57,20.0,14.29,20.0,...,False,False,False,False,False,False,False,False,False,False
2,2020-09-04,8,13.0,6,6.0,11,42.86,40.0,28.57,20.0,...,False,False,False,False,False,False,False,False,False,False
3,2020-09-05,8,12.0,9,2.0,6,42.86,40.0,0.00,0.0,...,False,False,False,False,False,False,False,False,False,False
4,2020-09-05,8,15.0,16,11.0,16,28.57,40.0,14.29,20.0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43144,2024-11-27,36,10.0,15,15.0,19,57.14,20.0,22.86,20.0,...,False,True,False,False,False,False,False,False,False,False
43145,2024-11-27,24,26.0,25,11.0,3,21.74,0.0,13.04,0.0,...,False,False,False,False,False,False,False,False,False,False
43146,2024-11-27,24,6.0,4,4.0,4,30.43,40.0,13.04,20.0,...,False,False,False,False,False,False,False,False,False,False
43147,2024-11-28,35,7.0,7,14.0,11,44.12,60.0,14.71,0.0,...,False,True,False,False,False,False,False,False,False,False


In [20]:
# Split the data into training and testing sets by date
cut_off_date = data_ready['Date'].quantile(0.8)
train_data = data_ready[data_ready['Date'] <= cut_off_date]
y_train = train_data["over_2.5_goals"]
train_data = train_data.drop(columns=['Date', 'over_2.5_goals'])
test_data = data_ready[data_ready['Date'] > cut_off_date]
y_test = test_data["over_2.5_goals"]
test_data = test_data.drop(columns=['Date', 'over_2.5_goals'])

In [21]:
# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(train_data)

# Transform the training and testing data
train_data_scaled = scaler.transform(train_data)
test_data_scaled = scaler.transform(test_data)

In [22]:
# Define parameter grid for multiple models
param_grid = [
    {
        'model': [LogisticRegression()],
        'model__C': [0.1, 1, 10, 100],
        'model__solver': ['liblinear', 'lbfgs'],
        'model__max_iter': [100, 200],
    },
    # {
    #     'model': [RandomForestClassifier()],
    #     'model__n_estimators': [50, 100, 200],
    #     'model__max_depth': [None, 10, 20],
    #     'model__min_samples_split': [2, 5, 10],
    # },
    # {
    #     'model': [SVC()],
    #     'model__C': [0.1, 1, 10, 100],
    #     'model__kernel': ['linear', 'rbf', 'poly'],
    #     'model__degree': [3, 4],
    # }
]

# Initialize GridSearchCV with multiple models
grid_search = GridSearchCV(estimator=Pipeline([('model', LogisticRegression())]),
                           param_grid=param_grid,
                           cv=3,
                           scoring='accuracy',
                           verbose=10,
                           n_jobs=-1)

# Fit the GridSearch to the training data
grid_search.fit(train_data_scaled, y_train)

# Retrieve the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(test_data_scaled)

# Evaluate the model with suitable metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print confusion matrix and metrics
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Calculate betting results
profit = 0
for index, pred in enumerate(y_pred):
    if pred == 1:
        # Simulate placing a bet with potential profit or loss
        odds = data.iloc[test_data.index[index]]['o2.5_odds']
        if y_test.iloc[index] == 1:
            profit += (odds * 1) - 1  # Calculate profit
        else:
            profit -= 1  # Lose the bet

# Total number of bets placed
total_stake = y_pred.sum()
roi = profit/total_stake

# Print betting results
print(f"\nTotal Stake: £{total_stake}")
print(f"Net Profit: £{profit:.2f}")
print(f"ROI: {roi:.1f}%")

# Print the best parameters found
print("\nBest parameters:", grid_search.best_params_)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Confusion Matrix:
[[2354 1785]
 [2122 2327]]

Accuracy: 0.5450628784350257
Precision: 0.5659046692607004
Recall: 0.5230388851427287
F1 Score: 0.5436280808316786

Total Stake: £4112
Net Profit: £-251.43
ROI: -0.1%

Best parameters: {'model': LogisticRegression(), 'model__C': 1, 'model__max_iter': 100, 'model__solver': 'lbfgs'}


In [23]:
# Merge the test data with the predictions and odds to calculate and track profit/loss for each bet
test_results = test_data.copy()  # Start with the test data
test_results['o2.5_odds'] = data.loc[test_data.index, 'o2.5_odds']
test_results['Prediction'] = y_pred
test_results['Actual'] = y_test.values
test_results['Profit/Loss'] = test_results.apply(
    lambda row: (row['o2.5_odds'] - 1) if row['Prediction'] == 1 and row['Actual'] == 1 else
                (-1 if row['Prediction'] == 1 and row['Actual'] == 0 else 0), axis=1)

test_results

Unnamed: 0,Round,Home_team_place_total,Home_team_place_home,Away_team_place_total,Away_team_place_away,o2.5_avg_season,o2.5_avg_past5,o3.5_avg_season,o3.5_avg_past5,H_ELO_avg,...,League_Super League,League_Super Lig,League_Superettan,League_Superliga,League_Superligaen,League_Urvalsdeild,o2.5_odds,Prediction,Actual,Profit/Loss
34561,14,13.0,6,10.0,16,41.67,20.0,25.00,0.0,1863.2,...,False,False,False,False,False,False,1.93,1,0,-1.0
34562,14,15.0,15,16.0,18,46.15,40.0,7.69,0.0,1745.4,...,False,False,False,False,False,False,1.99,0,0,0.0
34563,14,4.0,4,9.0,6,23.08,0.0,15.38,0.0,1883.5,...,False,False,False,False,False,False,1.85,1,0,-1.0
34564,14,3.0,2,12.0,10,69.23,40.0,46.15,20.0,1848.5,...,False,False,False,False,False,False,1.55,1,0,-1.0
34565,14,8.0,13,1.0,1,53.85,20.0,15.38,0.0,1693.8,...,False,False,False,False,False,False,1.54,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43144,36,10.0,15,15.0,19,57.14,20.0,22.86,20.0,1844.6,...,False,False,False,False,False,False,2.50,0,1,0.0
43145,24,26.0,25,11.0,3,21.74,0.0,13.04,0.0,1907.5,...,False,False,False,False,False,False,3.60,0,0,0.0
43146,24,6.0,4,4.0,4,30.43,40.0,13.04,20.0,1975.8,...,False,False,False,False,False,False,3.30,0,1,0.0
43147,35,7.0,7,14.0,11,44.12,60.0,14.71,0.0,1804.2,...,False,False,False,False,False,False,2.16,0,0,0.0


In [24]:
y_pred.sum()

np.int64(4112)

In [26]:
for i in range(1,10):
    print(i)

1
2
3
4
5
6
7
8
9
