In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Load the training dataset. Forward slashes resolve on Windows, macOS and
# Linux, whereas a backslash separator only works on Windows.
df = pd.read_csv('Data/Algorithm_Dataset.csv')

# Two regression targets (home and away score); everything else stays in `x`.
y_home = df['home_score']
y_away = df['away_score']
x = df.drop(columns=['home_score', 'away_score'])

# Split the features and BOTH targets in a single call so the three pieces are
# guaranteed to share one shuffle, instead of relying on two separate calls
# happening to use the same random_state.
(X_train, X_test,
 y_home_train, y_home_test,
 y_away_train, y_away_test) = train_test_split(
    x, y_home, y_away, test_size=0.2, random_state=42
)

# Columns that identify a fixture (plus 'result') are kept aside for reporting
# and removed from the feature matrices before the models are fitted.
id_columns = ['original_Season', 'original_home_team', 'original_away_team', 'result']

if 'original_Season' in X_test.columns:
    # .copy() makes Comparison an independent frame rather than a slice of X_test.
    Comparison = X_test[id_columns].copy()
    X_train = X_train.drop(columns=id_columns, errors='ignore')
    X_test = X_test.drop(columns=id_columns, errors='ignore')
else:
    # NOTE(review): later cells read `Comparison`; they will fail with a
    # NameError if this branch is taken.
    print("Columns 'original_Season', 'original_home_team', 'original_away_team' not found in X_test.")


In [3]:
# Both regressors share the same hyper-parameters; one predicts the home
# score, the other the away score.
rf_kwargs = {'n_estimators': 100, 'random_state': 42}
model_home = RandomForestRegressor(**rf_kwargs)
model_away = RandomForestRegressor(**rf_kwargs)

# Fit each model on the same training features with its own target.
model_home.fit(X_train, y_home_train)
model_away.fit(X_train, y_away_train)

In [4]:
# Predict both scores for the held-out fixtures and snap each prediction to
# the nearest half goal (x * 2, round, / 2).
y_home_pred = np.round(model_home.predict(X_test) * 2) / 2
y_away_pred = np.round(model_away.predict(X_test) * 2) / 2

# RMSE computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is deprecated and removed in newer scikit-learn
# releases, while this form works on every version.
home_rmse = float(np.sqrt(mean_squared_error(y_home_test, y_home_pred)))
away_rmse = float(np.sqrt(mean_squared_error(y_away_test, y_away_pred)))

print(f"Home score RMSE: {home_rmse}")
print(f"Away score RMSE: {away_rmse}")

Home score RMSE: 1.3395039504732091
Away score RMSE: 1.1439378261076953




In [5]:

# Team names of every held-out fixture, in the same row order as the
# prediction arrays and the test targets.
fixture_teams = list(zip(Comparison['original_home_team'],
                         Comparison['original_away_team']))

# Winner implied by the predicted scores: the side with the higher score,
# or 'draw' when neither score is strictly greater.
predicted_winners = [
    home if home_goals > away_goals else away if home_goals < away_goals else 'draw'
    for (home, away), home_goals, away_goals
    in zip(fixture_teams, y_home_pred, y_away_pred)
]

# Winner implied by the actual scores, using the same rule.
actual_winners = [
    home if home_goals > away_goals else away if home_goals < away_goals else 'draw'
    for (home, away), home_goals, away_goals
    in zip(fixture_teams, y_home_test, y_away_test)
]

# Share of fixtures where the predicted outcome equals the actual outcome.
total_matches = len(predicted_winners)
correct_predictions = sum(
    1 for predicted, actual in zip(predicted_winners, actual_winners)
    if predicted == actual
)

accuracy = (correct_predictions / total_matches) * 100
print(f"Winner Accuracy: {accuracy:.2f}%")


Winner Accuracy: 55.21%


Predict the group stage of Euro 2024

In [6]:
# Load the group-stage fixtures to score. Forward slashes keep the path
# usable outside Windows as well.
pred_df = pd.read_csv('Data/Prediction_Data.csv')

if 'original_Season' in pred_df.columns:
    # Keep the fixture identifiers as an independent copy: this frame gets new
    # columns later, and writing into a slice of pred_df would trigger
    # pandas' SettingWithCopyWarning.
    Euro_2024_predictions = pred_df[['original_Season', 'original_home_team', 'original_away_team', 'result']].copy()
    # Remove identifiers and target columns so only model features remain.
    pred_df = pred_df.drop(
        columns=['original_Season', 'original_home_team', 'original_away_team',
                 'home_score', 'away_score', 'result'],
        errors='ignore',
    )
else:
    print("Columns 'original_Season', 'original_home_team', 'original_away_team' not found in pred_df.")

# Predict both scores and snap them to the nearest half goal.
pred_home = np.round(model_home.predict(pred_df) * 2) / 2
pred_away = np.round(model_away.predict(pred_df) * 2) / 2

#pred_home = np.round(pred_home)
#pred_away = np.round(pred_away)


In [7]:
# Label every group-stage fixture with its predicted winner.
# The comparison is done on whole arrays, so rows are matched by position.
# The earlier loop read with positional .iloc[i] but wrote with label-based
# .loc[i], which is only correct for a default 0..n-1 index.
gs_home_teams = Euro_2024_predictions['original_home_team'].to_numpy()
gs_away_teams = Euro_2024_predictions['original_away_team'].to_numpy()

gs_winner_labels = np.select(
    [pred_home > pred_away, pred_home < pred_away],
    [gs_home_teams, gs_away_teams],
    default='draw',  # neither predicted score is strictly greater
)

# assign() builds a new frame, so nothing is written into a slice of pred_df.
Euro_2024_predictions = Euro_2024_predictions.assign(
    predicted_winners=gs_winner_labels,
    stage='GS',
)

# Rows are left in fixture order. sort_values() returns a new frame, so its
# result must be assigned if a view sorted by home team is wanted.
print(Euro_2024_predictions)


   original_Season original_home_team original_away_team  result  \
0        2023-2024            Germany           Scotland     NaN   
1        2023-2024            Hungary        Switzerland     NaN   
2        2023-2024            Germany            Hungary     NaN   
3        2023-2024           Scotland        Switzerland     NaN   
4        2023-2024        Switzerland            Germany     NaN   
5        2023-2024           Scotland            Hungary     NaN   
6        2023-2024              Spain            Croatia     NaN   
7        2023-2024              Italy            Albania     NaN   
8        2023-2024            Croatia            Albania     NaN   
9        2023-2024              Spain              Italy     NaN   
10       2023-2024              Spain            Albania     NaN   
11       2023-2024            Croatia              Italy     NaN   
12       2023-2024             Serbia            England     NaN   
13       2023-2024           Slovenia           

Round of 16 Winners Prediction 

In [8]:
# Load the round-of-16 fixtures to score. Forward slashes keep the path
# usable outside Windows as well.
R16_pred_df = pd.read_csv('Data/R16_Pred_Data.csv')

if 'original_Season' in R16_pred_df.columns:
    # Independent copy of the fixture identifiers: this frame gets new columns
    # later, and writing into a slice would trigger SettingWithCopyWarning.
    Euro_2024_R16_predictions = R16_pred_df[['original_Season', 'original_home_team', 'original_away_team', 'result']].copy()
    # Remove identifiers and target columns so only model features remain.
    R16_pred_df = R16_pred_df.drop(
        columns=['original_Season', 'original_home_team', 'original_away_team',
                 'home_score', 'away_score', 'result'],
        errors='ignore',
    )
else:
    print("Columns 'original_Season', 'original_home_team', 'original_away_team' not found in R16_pred_df.")

# Predict both scores and snap them to the nearest half goal.
R16_pred_home = np.round(model_home.predict(R16_pred_df) * 2) / 2
R16_pred_away = np.round(model_away.predict(R16_pred_df) * 2) / 2


In [9]:
# Label every round-of-16 fixture with its predicted winner.
# Whole-array comparison matches rows by position and avoids mixing
# positional .iloc[i] reads with label-based .loc[i] writes.
r16_home_teams = Euro_2024_R16_predictions['original_home_team'].to_numpy()
r16_away_teams = Euro_2024_R16_predictions['original_away_team'].to_numpy()

# NOTE(review): 'draw' means the two predicted scores were equal. A knockout
# tie presumably needs a winner, so a tie-breaker may be wanted here.
r16_winner_labels = np.select(
    [R16_pred_home > R16_pred_away, R16_pred_home < R16_pred_away],
    [r16_home_teams, r16_away_teams],
    default='draw',
)

# assign() builds a new frame, so nothing is written into a slice of the
# loaded prediction data.
Euro_2024_R16_predictions = Euro_2024_R16_predictions.assign(
    predicted_winners=r16_winner_labels,
    stage='R16',
)

print(Euro_2024_R16_predictions)


  original_Season original_home_team original_away_team  result  \
0       2023-2024           Scotland            Croatia     NaN   
1       2023-2024            Germany            Denmark     NaN   
2       2023-2024            England            Austria     NaN   
3       2023-2024              Spain            Hungary     NaN   
4       2023-2024        Netherlands           Slovakia     NaN   
5       2023-2024           Portugal              Italy     NaN   
6       2023-2024            Belgium           Slovenia     NaN   
7       2023-2024             France             Turkey     NaN   

  predicted_winners stage  
0              draw   R16  
1           Germany   R16  
2           England   R16  
3             Spain   R16  
4       Netherlands   R16  
5          Portugal   R16  
6           Belgium   R16  
7            France   R16  


Quarter Finals

In [10]:
# Load the quarter-final fixtures to score. Forward slashes keep the path
# usable outside Windows as well.
QF_pred_df = pd.read_csv('Data/QF_Pred_Data.csv')

if 'original_Season' in QF_pred_df.columns:
    # Independent copy of the fixture identifiers: this frame gets new columns
    # later, and writing into a slice would trigger SettingWithCopyWarning.
    Euro_2024_QF_predictions = QF_pred_df[['original_Season', 'original_home_team', 'original_away_team', 'result']].copy()
    # Remove identifiers and target columns so only model features remain.
    QF_pred_df = QF_pred_df.drop(
        columns=['original_Season', 'original_home_team', 'original_away_team',
                 'home_score', 'away_score', 'result'],
        errors='ignore',
    )
else:
    print("Columns 'original_Season', 'original_home_team', 'original_away_team' not found in QF_pred_df.")

# Raw model outputs are printed first so the unrounded values stay visible.
QF_pred_home = model_home.predict(QF_pred_df)
QF_pred_away = model_away.predict(QF_pred_df)
print(QF_pred_home)
print(QF_pred_away)

# Snap the predictions to the nearest half goal for the winner comparison.
QF_pred_home = np.round(QF_pred_home * 2) / 2
QF_pred_away = np.round(QF_pred_away * 2) / 2




[0.65 1.59 1.47 1.01]
[1.97   1.23   1.31   1.7675]


In [11]:
# Label every quarter-final fixture with its predicted winner.
# Whole-array comparison matches rows by position and avoids mixing
# positional .iloc[i] reads with label-based .loc[i] writes.
qf_home_teams = Euro_2024_QF_predictions['original_home_team'].to_numpy()
qf_away_teams = Euro_2024_QF_predictions['original_away_team'].to_numpy()

# NOTE(review): 'draw' means the two predicted scores were equal. A knockout
# tie presumably needs a winner, so a tie-breaker may be wanted here.
qf_winner_labels = np.select(
    [QF_pred_home > QF_pred_away, QF_pred_home < QF_pred_away],
    [qf_home_teams, qf_away_teams],
    default='draw',
)

# assign() builds a new frame, so nothing is written into a slice of the
# loaded prediction data.
Euro_2024_QF_predictions = Euro_2024_QF_predictions.assign(
    predicted_winner=qf_winner_labels,
    stage='QF',
)

print(Euro_2024_QF_predictions)

  original_Season original_home_team original_away_team  result  \
0       2023-2024            Croatia            Germany     NaN   
1       2023-2024            England              Spain     NaN   
2       2023-2024        Netherlands           Portugal     NaN   
3       2023-2024            Belgium             France     NaN   

  predicted_winner stage  
0          Germany    QF  
1          England    QF  
2             draw    QF  
3           France    QF  


Semi Finals

In [12]:
# Load the semi-final fixtures to score. Forward slashes keep the path
# usable outside Windows as well.
SF_pred_df = pd.read_csv('Data/SF_Pred_Data.csv')

if 'original_Season' in SF_pred_df.columns:
    # Independent copy of the fixture identifiers: this frame gets new columns
    # later, and writing into a slice would trigger SettingWithCopyWarning.
    Euro_2024_SF_predictions = SF_pred_df[['original_Season', 'original_home_team', 'original_away_team', 'result']].copy()
    # Remove identifiers and target columns so only model features remain.
    SF_pred_df = SF_pred_df.drop(
        columns=['original_Season', 'original_home_team', 'original_away_team',
                 'home_score', 'away_score', 'result'],
        errors='ignore',
    )
else:
    print("Columns 'original_Season', 'original_home_team', 'original_away_team' not found in SF_pred_df.")

# Predictions are kept unrounded in this cell.
# NOTE(review): the group-stage, R16 and QF cells round to the nearest half
# goal before comparing; confirm the difference is intentional.
SF_pred_home = model_home.predict(SF_pred_df)
SF_pred_away = model_away.predict(SF_pred_df)
print(SF_pred_home)
print(SF_pred_away)


[1.92833333 1.07      ]
[1.03933333 1.09      ]


In [13]:
# Label every semi-final fixture with its predicted winner.
# Whole-array comparison matches rows by position and avoids mixing
# positional .iloc[i] reads with label-based .loc[i] writes.
sf_home_teams = Euro_2024_SF_predictions['original_home_team'].to_numpy()
sf_away_teams = Euro_2024_SF_predictions['original_away_team'].to_numpy()

# 'draw' is only produced when the two predicted scores are exactly equal.
sf_winner_labels = np.select(
    [SF_pred_home > SF_pred_away, SF_pred_home < SF_pred_away],
    [sf_home_teams, sf_away_teams],
    default='draw',
)

# assign() builds a new frame, so nothing is written into a slice of the
# loaded prediction data.
Euro_2024_SF_predictions = Euro_2024_SF_predictions.assign(
    predicted_winner=sf_winner_labels,
    stage='SF',
)

print(Euro_2024_SF_predictions)

  original_Season original_home_team original_away_team  result  \
0       2023-2024            Germany              Spain     NaN   
1       2023-2024        Netherlands             France     NaN   

  predicted_winner stage  
0          Germany    SF  
1           France    SF  


Final

In [14]:
# Load the final fixture to score. Forward slashes keep the path usable
# outside Windows as well.
F_pred_df = pd.read_csv('Data/F_Pred_Data.csv')

if 'original_Season' in F_pred_df.columns:
    # Independent copy of the fixture identifiers: this frame gets new columns
    # later, and writing into a slice would trigger SettingWithCopyWarning.
    Euro_2024_F_predictions = F_pred_df[['original_Season', 'original_home_team', 'original_away_team', 'result']].copy()
    # Remove identifiers and target columns so only model features remain.
    F_pred_df = F_pred_df.drop(
        columns=['original_Season', 'original_home_team', 'original_away_team',
                 'home_score', 'away_score', 'result'],
        errors='ignore',
    )
else:
    print("Columns 'original_Season', 'original_home_team', 'original_away_team' not found in F_pred_df.")

# Predictions are kept unrounded in this cell.
F_pred_home = model_home.predict(F_pred_df)
F_pred_away = model_away.predict(F_pred_df)


In [15]:
# Label the final with its predicted winner.
# Whole-array comparison matches rows by position and avoids mixing
# positional .iloc[i] reads with label-based .loc[i] writes.
f_home_teams = Euro_2024_F_predictions['original_home_team'].to_numpy()
f_away_teams = Euro_2024_F_predictions['original_away_team'].to_numpy()

# 'draw' is only produced when the two predicted scores are exactly equal.
f_winner_labels = np.select(
    [F_pred_home > F_pred_away, F_pred_home < F_pred_away],
    [f_home_teams, f_away_teams],
    default='draw',
)

# assign() builds a new frame, so nothing is written into a slice of the
# loaded prediction data.
Euro_2024_F_predictions = Euro_2024_F_predictions.assign(
    predicted_winner=f_winner_labels,
    stage='F',
)

print(Euro_2024_F_predictions)

  original_Season original_home_team original_away_team  result  \
0       2023-2024            Germany             France     NaN   

  predicted_winner stage  
0          Germany     F  


Combine the predictions from all stages

In [16]:
# Per-stage prediction frames, in tournament order.
stage_frames = [
    Euro_2024_predictions,
    Euro_2024_R16_predictions,
    Euro_2024_QF_predictions,
    Euro_2024_SF_predictions,
    Euro_2024_F_predictions,
]

# The group-stage and round-of-16 frames call the winner column
# 'predicted_winners' while the later rounds use 'predicted_winner'.
# Concatenating them as-is yields two half-empty columns, so the name is
# unified first (rename is a no-op for frames that already use it).
dfs = [
    frame.rename(columns={'predicted_winners': 'predicted_winner'})
    for frame in stage_frames
]

combined_df = pd.concat(dfs, ignore_index=True)
print(combined_df)

   original_Season original_home_team original_away_team  result  \
0        2023-2024            Germany           Scotland     NaN   
1        2023-2024            Hungary        Switzerland     NaN   
2        2023-2024            Germany            Hungary     NaN   
3        2023-2024           Scotland        Switzerland     NaN   
4        2023-2024        Switzerland            Germany     NaN   
5        2023-2024           Scotland            Hungary     NaN   
6        2023-2024              Spain            Croatia     NaN   
7        2023-2024              Italy            Albania     NaN   
8        2023-2024            Croatia            Albania     NaN   
9        2023-2024              Spain              Italy     NaN   
10       2023-2024              Spain            Albania     NaN   
11       2023-2024            Croatia              Italy     NaN   
12       2023-2024             Serbia            England     NaN   
13       2023-2024           Slovenia           