In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [2]:
# List of encodings to try
encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']

# Variables to store data
data1, data2 = None, None

# Attempt to read both files with each encoding
for encoding in encodings:
    try:
        data1 = pd.read_csv("../GOAL_DATA_PAST_SEASON.CSV", encoding=encoding, low_memory=False)
        data2 = pd.read_csv("../GOAL_DATA_PAST_5.CSV", encoding=encoding, low_memory=False)
        print(f"Successfully read the files with encoding: {encoding}")
        break
    except UnicodeDecodeError:
        print(f"Failed to decode with encoding: {encoding}")


Failed to decode with encoding: utf-8


  data1 = pd.read_csv("GOAL_DATA_PAST_SEASON.CSV", encoding=encoding)


Successfully read the files with encoding: latin1


  data2 = pd.read_csv("GOAL_DATA_PAST_5.CSV", encoding=encoding)


In [3]:
data1

Unnamed: 0,country,league,alert,verif,sezonul,datameci,orameci,etapa,txtechipa1,place1t,...,wdlh3,wdlh4,wdlh5,wdla1,wdla2,wdla3,wdla4,wdla5,codechipa1,codechipa2
0,England,Premier L,,,14,17/08/2013,1245,1,Liverpool,2.0,...,0,0,0,0,0,0,0,0,1002,1028
1,England,Premier L,,,14,17/08/2013,1500,1,Arsenal,12.0,...,0,0,0,0,0,0,0,0,1018,1003
2,England,Premier L,,,14,17/08/2013,1500,1,West Ham,16.0,...,0,0,0,0,0,0,0,0,1022,1039
3,England,Premier L,,,14,17/08/2013,1500,1,Norwich,8.0,...,0,0,0,0,0,0,0,0,1011,1017
4,England,Premier L,,,14,17/08/2013,1500,1,West Bromwich,5.0,...,0,0,0,0,0,0,0,0,1006,1004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147052,Mexico,Primera Divisio,,,25,10/11/2024,100,17,Toluca,3.0,...,0,0,0,0,0,0,0,0,41007,41012
147053,Mexico,Primera Divisio,,,25,10/11/2024,305,17,Cruz Azul,1.0,...,0,0,0,0,0,0,0,0,41003,41005
147054,Mexico,Primera Divisio,,,25,10/11/2024,2300,17,Necaxa,12.0,...,0,0,0,0,0,0,0,0,41009,41002
147055,Mexico,Primera Divisio,,,25,11/11/2024,105,17,Monterrey,5.0,...,0,0,0,0,0,0,0,0,41018,41016


In [4]:
data2

Unnamed: 0,country,league,alert,verif,sezonul,datameci,orameci,etapa,txtechipa1,place1t,...,wdlh3,wdlh4,wdlh5,wdla1,wdla2,wdla3,wdla4,wdla5,codechipa1,codechipa2
0,England,Premier L,,,15,16/08/2014,1245,1,Man Utd,12.0,...,0,0,0,0,0,0,0,0,1020,1034
1,England,Premier L,,,15,16/08/2014,1500,1,West Bromwich,5.0,...,0,0,0,0,0,0,0,0,1006,1021
2,England,Premier L,,,15,16/08/2014,1500,1,Leicester,20.0,...,0,0,0,0,0,0,0,0,1042,1017
3,England,Premier L,,,15,16/08/2014,1500,1,Stoke,15.0,...,0,0,0,0,0,0,0,0,1028,1003
4,England,Premier L,,,15,16/08/2014,1500,1,QPR,18.0,...,0,0,0,0,0,0,0,0,1033,1029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135034,Mexico,Primera Divisio,,,25,10/11/2024,100,17,Toluca,3.0,...,0,0,0,0,0,0,0,0,41007,41012
135035,Mexico,Primera Divisio,,,25,10/11/2024,305,17,Cruz Azul,1.0,...,0,0,0,0,0,0,0,0,41003,41005
135036,Mexico,Primera Divisio,,,25,10/11/2024,2300,17,Necaxa,12.0,...,0,0,0,0,0,0,0,0,41009,41002
135037,Mexico,Primera Divisio,,,25,11/11/2024,105,17,Monterrey,5.0,...,0,0,0,0,0,0,0,0,41018,41016


In [5]:
col_dict_1 = {
    "country": "Country",
    "league": "League",
    "datameci": "Date",
    "etapa": "Round",
    "txtechipa1": "home_team",
    "txtechipa2": "away_team",
    "place1t": "Home_team_place_total",
    "place1a": "Home_team_place_home",
    "place2t": "Away_team_place_total",
    "place2d": "Away_team_place_away",
    "customh": "ELO_home",
    "customa": "ELO_away",
    "custom3": "FORM_home",
    "custom4": "FORM_away",
    "home_val": "home_win",
    "home_val_2": "home_win_15",
    "home_val_3": "home_o25",
    "home_val_4": "home_o35",
    "home_val_5": "home_scored",
    "away_val": "away_win",
    "away_val_2": "away_win_15",
    "away_val_3": "away_o25",
    "away_val_4": "away_o35",
    "away_val_5": "away_scored",
    "scor1": "home_goals",
    "scor2": "away_goals",
    "cotao": "o2.5_odds",
}

col_dict_2 = {
    "country": "Country",
    "league": "League",
    "datameci": "Date",
    "etapa": "Round",
    "txtechipa1": "home_team",
    "txtechipa2": "away_team",
    "place1t": "Home_team_place_total",
    "place1a": "Home_team_place_home",
    "place2t": "Away_team_place_total",
    "place2d": "Away_team_place_away",
    "customh": "ELO_home_past_5",
    "customa": "ELO_away_past_5",
    "custom3": "FORM_home_past_5",
    "custom4": "FORM_away_past_5",
    "home_val": "home_win_past_5",
    "home_val_2": "home_win_15_past_5",
    "home_val_3": "home_o25_past_5",
    "home_val_4": "home_o35_past_5",
    "home_val_5": "home_scored_past_5",
    "away_val": "away_win_past_5",
    "away_val_2": "away_win_15_past_5",
    "away_val_3": "away_o25_past_5",
    "away_val_4": "away_o35_past_5",
    "away_val_5": "away_scored_past_5",
    "scor1": "home_goals",
    "scor2": "away_goals",
    "cotao": "o2.5_odds",
}

data1 = data1.rename(columns=col_dict_1).filter(items=col_dict_1.values())
data2 = data2.rename(columns=col_dict_2).filter(items=col_dict_2.values())

In [6]:
# Merge data1 and data2 on all columns with matching names
data = pd.merge(
    data1,
    data2,
    how='inner'  # Use 'inner' join to keep only matching columns
)

print("DataFrames merged successfully. Here is the merged data:")
data

DataFrames merged successfully. Here is the merged data:


Unnamed: 0,Country,League,Date,Round,home_team,away_team,Home_team_place_total,Home_team_place_home,Away_team_place_total,Away_team_place_away,...,home_win_past_5,home_win_15_past_5,home_o25_past_5,home_o35_past_5,home_scored_past_5,away_win_past_5,away_win_15_past_5,away_o25_past_5,away_o35_past_5,away_scored_past_5
0,England,Premier L,16/08/2014,1,Man Utd,Swansea,12.0,12,19.0,19,...,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.000,0.000,0.000
1,England,Premier L,16/08/2014,1,West Bromwich,Sunderland,5.0,5,13.0,13,...,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.000,0.000,0.000
2,England,Premier L,16/08/2014,1,Leicester,Everton,20.0,20,9.0,9,...,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.000,0.000,0.000
3,England,Premier L,16/08/2014,1,Stoke,Aston Villa,15.0,15,3.0,3,...,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.000,0.000,0.000
4,England,Premier L,16/08/2014,1,QPR,Hull,18.0,18,16.0,16,...,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135034,Mexico,Primera Divisio,10/11/2024,17,Toluca,Club America,3.0,3,6.0,7,...,60.0,50.0,87.5,62.5,3.000,20.0,12.5,62.500,37.500,1.625
135035,Mexico,Primera Divisio,10/11/2024,17,Cruz Azul,Tigres,1.0,1,2.0,3,...,100.0,62.5,50.0,25.0,2.375,40.0,37.5,50.000,25.000,1.625
135036,Mexico,Primera Divisio,10/11/2024,17,Necaxa,Atlas,12.0,12,10.0,10,...,20.0,37.5,50.0,25.0,1.625,20.0,0.0,87.500,50.000,1.125
135037,Mexico,Primera Divisio,11/11/2024,17,Monterrey,Leon,5.0,9,11.0,11,...,60.0,25.0,75.0,50.0,1.750,20.0,0.0,85.714,71.429,1.571


In [7]:
# Convert Date column to datetime type
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')

# Order by date
data = data.sort_values(by='Date')

data = data[data["Round"]>=8]
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,Country,League,Date,Round,home_team,away_team,Home_team_place_total,Home_team_place_home,Away_team_place_total,Away_team_place_away,...,home_win_past_5,home_win_15_past_5,home_o25_past_5,home_o35_past_5,home_scored_past_5,away_win_past_5,away_win_15_past_5,away_o25_past_5,away_o35_past_5,away_scored_past_5
0,Austria,Erste Liga,2014-08-29,8,Innsbruck,LASK,4.0,9,3.0,3,...,33.333,0.000,66.667,0.000,1.000,66.667,33.333,33.333,0.000,1.333
1,Austria,Erste Liga,2014-08-29,8,KSV 1919,Mattersburg,7.0,8,1.0,4,...,0.000,0.000,33.333,33.333,0.667,33.333,33.333,33.333,33.333,1.333
2,Austria,Erste Liga,2014-08-29,8,Liefering,Horn,2.0,3,6.0,7,...,66.667,33.333,33.333,33.333,2.000,25.000,0.000,50.000,0.000,0.500
3,Austria,Erste Liga,2014-08-29,8,St. Polten,Lustenau,5.0,4,8.0,8,...,50.000,25.000,50.000,25.000,1.500,33.333,0.000,33.333,0.000,0.667
4,Slovakia,Superliga,2014-08-29,8,Dunajska Streda,Zilina,5.0,5,1.0,1,...,66.667,33.333,33.333,33.333,1.333,25.000,0.000,0.000,0.000,0.500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106162,Netherlands,Eredivisie,2025-01-11,18,Zwolle,Nijmegen,13.0,12,12.0,12,...,60.000,25.000,62.500,25.000,1.375,20.000,25.000,50.000,25.000,0.875
106163,Netherlands,Eredivisie,2025-01-11,18,Eindhoven,Alkmaar,1.0,1,5.0,5,...,100.000,87.500,87.500,50.000,4.000,60.000,12.500,62.500,12.500,1.500
106164,Netherlands,Eredivisie,2025-01-11,18,Almelo,Sparta,15.0,15,16.0,9,...,20.000,11.111,55.556,44.444,1.556,0.000,0.000,37.500,0.000,0.875
106165,Spain,Segunda,2025-01-11,22,Huesca,Mirandes,7.0,8,2.0,8,...,40.000,20.000,60.000,40.000,1.800,60.000,20.000,30.000,30.000,1.100


In [8]:
data['total_goals'] = data['home_goals'] + data['away_goals']
data['over_2.5_goals'] = data['total_goals'].apply(lambda x: 1 if x > 2.5 else 0)

In [9]:
data_ready = data.drop(columns=['home_team', 'away_team', 'home_goals', 'away_goals', 'total_goals', 'o2.5_odds','FORM_away_past_5', 'FORM_home_past_5'])
# Apply one-hot encoding to 'Country' and 'League'
data_ready = pd.get_dummies(data_ready, columns=['Country', 'League'])
data_ready

Unnamed: 0,Date,Round,Home_team_place_total,Home_team_place_home,Away_team_place_total,Away_team_place_away,ELO_home,ELO_away,FORM_home,FORM_away,...,League_Segunda,League_Serie A,League_Serie B,League_Super L,League_Super League,League_Super Lig,League_Superettan,League_Superliga,League_Superligaen,League_Urvalsdeild
0,2014-08-29,8,4.0,9,3.0,3,1692.0,1806.0,16.0,8.0,...,False,False,False,False,False,False,False,False,False,False
1,2014-08-29,8,7.0,8,1.0,4,1631.0,1738.0,3.0,11.0,...,False,False,False,False,False,False,False,False,False,False
2,2014-08-29,8,2.0,3,6.0,7,1824.0,1602.0,4.0,5.0,...,False,False,False,False,False,False,False,False,False,False
3,2014-08-29,8,5.0,4,8.0,8,1647.0,1658.0,1.0,-14.0,...,False,False,False,False,False,False,False,False,False,False
4,2014-08-29,8,5.0,5,1.0,1,1852.0,1975.0,5.0,0.0,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106162,2025-01-11,18,13.0,12,12.0,12,1708.0,1746.0,-5.0,-24.0,...,False,False,False,False,False,False,False,False,False,False
106163,2025-01-11,18,1.0,1,5.0,5,2240.0,1964.0,0.0,13.0,...,False,False,False,False,False,False,False,False,False,False
106164,2025-01-11,18,15.0,15,16.0,9,1622.0,1662.0,-2.0,-16.0,...,False,False,False,False,False,False,False,False,False,False
106165,2025-01-11,22,7.0,8,2.0,8,1593.0,1636.0,13.0,6.0,...,True,False,False,False,False,False,False,False,False,False


In [20]:
# Split the data into training and testing sets by date
cut_off_date = data_ready['Date'].quantile(0.8)
train_data = data_ready[data_ready['Date'] <= cut_off_date]
y_train = train_data["over_2.5_goals"]
train_data = train_data.drop(columns=['Date', 'over_2.5_goals'])
test_data = data_ready[data_ready['Date'] > cut_off_date]
y_test = test_data["over_2.5_goals"]
test_data = test_data.drop(columns=['Date', 'over_2.5_goals'])

In [21]:
# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(train_data)

# Transform the training and testing data
train_data_scaled = scaler.transform(train_data)
test_data_scaled = scaler.transform(test_data)

In [22]:
# Define parameter grid for multiple models
param_grid = [
    {
        'model': [LogisticRegression()],
        'model__C': [0.1, 1, 10, 100],
        'model__solver': ['liblinear', 'lbfgs'],
        'model__max_iter': [100, 200],
    },
    # {
    #     'model': [RandomForestClassifier()],
    #     'model__n_estimators': [50, 100, 200],
    #     'model__max_depth': [None, 10, 20],
    #     'model__min_samples_split': [2, 5, 10],
    # },
    # {
    #     'model': [SVC()],
    #     'model__C': [0.1, 1, 10, 100],
    #     'model__kernel': ['linear', 'rbf', 'poly'],
    #     'model__degree': [3, 4],
    # }
]

# Initialize GridSearchCV with multiple models
grid_search = GridSearchCV(estimator=Pipeline([('model', LogisticRegression())]),
                           param_grid=param_grid,
                           cv=3,
                           scoring='accuracy',
                           verbose=10,
                           n_jobs=-1)

# Fit the GridSearch to the training data
grid_search.fit(train_data_scaled, y_train)

# Retrieve the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(test_data_scaled)

# Evaluate the model with suitable metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print confusion matrix and metrics
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Calculate betting results
profit = 0
for index, pred in enumerate(y_pred):
    if pred == 1:
        # Simulate placing a bet with potential profit or loss
        odds = data.iloc[test_data.index[index]]['o2.5_odds']
        if y_test.iloc[index] == 1:
            profit += (odds * 1) - 1  # Calculate profit
        else:
            profit -= 1  # Lose the bet

# Total number of bets placed
total_stake = y_pred.sum()
roi = profit/total_stake

# Print betting results
print(f"\nTotal Stake: £{total_stake}")
print(f"Net Profit: £{profit:.2f}")
print(f"ROI: {roi:.1f}%")

# Print the best parameters found
print("\nBest parameters:", grid_search.best_params_)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Confusion Matrix:
[[2354 1785]
 [2122 2327]]

Accuracy: 0.5450628784350257
Precision: 0.5659046692607004
Recall: 0.5230388851427287
F1 Score: 0.5436280808316786

Total Stake: £4112
Net Profit: £-251.43
ROI: -0.1%

Best parameters: {'model': LogisticRegression(), 'model__C': 1, 'model__max_iter': 100, 'model__solver': 'lbfgs'}


In [23]:
# Merge the test data with the predictions and odds to calculate and track profit/loss for each bet
test_results = test_data.copy()  # Start with the test data
test_results['o2.5_odds'] = data.loc[test_data.index, 'o2.5_odds']
test_results['Prediction'] = y_pred
test_results['Actual'] = y_test.values
test_results['Profit/Loss'] = test_results.apply(
    lambda row: (row['o2.5_odds'] - 1) if row['Prediction'] == 1 and row['Actual'] == 1 else
                (-1 if row['Prediction'] == 1 and row['Actual'] == 0 else 0), axis=1)

test_results

Unnamed: 0,Round,Home_team_place_total,Home_team_place_home,Away_team_place_total,Away_team_place_away,o2.5_avg_season,o2.5_avg_past5,o3.5_avg_season,o3.5_avg_past5,H_ELO_avg,...,League_Super League,League_Super Lig,League_Superettan,League_Superliga,League_Superligaen,League_Urvalsdeild,o2.5_odds,Prediction,Actual,Profit/Loss
34561,14,13.0,6,10.0,16,41.67,20.0,25.00,0.0,1863.2,...,False,False,False,False,False,False,1.93,1,0,-1.0
34562,14,15.0,15,16.0,18,46.15,40.0,7.69,0.0,1745.4,...,False,False,False,False,False,False,1.99,0,0,0.0
34563,14,4.0,4,9.0,6,23.08,0.0,15.38,0.0,1883.5,...,False,False,False,False,False,False,1.85,1,0,-1.0
34564,14,3.0,2,12.0,10,69.23,40.0,46.15,20.0,1848.5,...,False,False,False,False,False,False,1.55,1,0,-1.0
34565,14,8.0,13,1.0,1,53.85,20.0,15.38,0.0,1693.8,...,False,False,False,False,False,False,1.54,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43144,36,10.0,15,15.0,19,57.14,20.0,22.86,20.0,1844.6,...,False,False,False,False,False,False,2.50,0,1,0.0
43145,24,26.0,25,11.0,3,21.74,0.0,13.04,0.0,1907.5,...,False,False,False,False,False,False,3.60,0,0,0.0
43146,24,6.0,4,4.0,4,30.43,40.0,13.04,20.0,1975.8,...,False,False,False,False,False,False,3.30,0,1,0.0
43147,35,7.0,7,14.0,11,44.12,60.0,14.71,0.0,1804.2,...,False,False,False,False,False,False,2.16,0,0,0.0


In [24]:
y_pred.sum()

np.int64(4112)

In [26]:
for i in range(1,10):
    print(i)

1
2
3
4
5
6
7
8
9
