# Euro prediction

## 0. Settings

In [2]:
import os

import pandas as pd
import numpy as np
from dotenv import load_dotenv

from psycopg2 import sql, connect
from sqlalchemy import create_engine, text

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from train_test.general import train_test_bookies_comparison, display_train_test_bookies_comparison, train_test
from train_test.split import train_test_split_expanding_windows, train_test_split_sliding_windows
from feature_eng.odds import bookie_prediction, prob_by_bookies
from train_test.metrics import accuracy_fn, classwise_ECE_fn, f_mesure_fn, log_loss_fn, mse_loss_fn, precision_fn, recall_fn
from feature_eng.team_stats import goal_stats, elo_scores, glicko2_scores, trueskill_scores
import datetime


### 0.1 Connection 

In [3]:
load_dotenv()
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_HOST = os.getenv('DB_HOST')
DB_PORT = os.getenv('DB_PORT')
DB_NAME = os.getenv('DB_NAME')

DB_TN_FBREF_RESULTS = os.getenv('DB_TN_FBREF_RESULTS')
DB_TN_SOFIFA_TEAMS_STATS = os.getenv('DB_TN_SOFIFA_TEAMS_STATS')

In [4]:
connection_url = f'postgresql+psycopg2://{(DB_USER)}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
engine = create_engine(connection_url)

with engine.connect() as connection:
    query = text(f"SELECT * FROM {DB_TN_FBREF_RESULTS}")
    fbref_results_df = pd.read_sql(query, connection)
    sofifa_teams_stats_df = pd.read_sql(f'SELECT * FROM {DB_TN_SOFIFA_TEAMS_STATS}', connection) 

In [5]:
rule_fifa_edition = (sofifa_teams_stats_df["fifa_edition"] == "FC 24")
rule_league_int = (sofifa_teams_stats_df["league"] == "INT")
sofifa_teams_stats_df[rule_fifa_edition & rule_league_int].sort_values(by="update", ascending=False).groupby("team").head(1).sort_values(by="team")

Unnamed: 0,league,team,overall,attack,midfield,defence,transfer_budget,club_worth,build_up_speed,build_up_dribbling,...,defence_pressure,defence_team_width,defence_defender_line,defence_domestic_prestige,international_prestige,players,starting_xi_average_age,whole_team_average_age,fifa_edition,update
2886,INT,Albania,73,66,73,74,0,0.0,Slow,Little,...,Deep,Narrow,Cover,10,2,23,27.09,26.78,FC 24,2024-06-12
106599,INT,Argentina,83,85,83,82,0,0.0,Slow,Little,...,Deep,Narrow,Cover,10,9,26,28.64,27.96,FC 24,2024-06-12
106600,INT,Belgium,81,82,81,77,0,0.0,Slow,Little,...,Deep,Narrow,Cover,10,8,26,27.73,26.23,FC 24,2024-06-12
106601,INT,Croatia,79,77,82,77,0,0.0,Slow,Little,...,Deep,Narrow,Cover,10,6,26,27.18,26.46,FC 24,2024-06-12
106602,INT,Czech Republic,75,74,75,75,0,0.0,Slow,Little,...,Deep,Narrow,Cover,10,5,26,25.45,25.62,FC 24,2024-06-12
106603,INT,Denmark,79,76,78,79,0,0.0,Slow,Little,...,Deep,Narrow,Cover,10,6,26,27.27,26.35,FC 24,2024-06-12
106604,INT,England,85,87,86,83,0,0.0,Slow,Little,...,Deep,Narrow,Cover,10,8,26,26.27,25.69,FC 24,2024-06-12
106605,INT,Finland,71,70,71,67,0,0.0,Slow,Little,...,Deep,Narrow,Cover,10,3,26,28.73,26.5,FC 24,2024-06-12
106606,INT,France,84,86,85,83,0,0.0,Slow,Little,...,Deep,Narrow,Cover,10,10,26,27.27,25.65,FC 24,2024-06-12
3777,INT,Georgia,71,75,70,68,0,0.0,Slow,Little,...,Deep,Narrow,Cover,10,2,23,26.27,26.17,FC 24,2024-06-12


In [6]:
sofifa_teams_stats_df[sofifa_teams_stats_df["team"] == "France"].sort_values("update", ascending=False)

Unnamed: 0,league,team,overall,attack,midfield,defence,transfer_budget,club_worth,build_up_speed,build_up_dribbling,...,defence_pressure,defence_team_width,defence_defender_line,defence_domestic_prestige,international_prestige,players,starting_xi_average_age,whole_team_average_age,fifa_edition,update
106606,INT,France,84,86,85,83,0,0.0,Slow,Little,...,Deep,Narrow,Cover,10,10,26,27.27,25.65,FC 24,2024-06-12
85789,INT,France,84,86,85,83,0,0.0,Slow,Little,...,Deep,Narrow,Cover,10,10,26,27.27,25.65,FC 24,2024-05-22
86134,INT,France,84,86,85,83,0,0.0,Slow,Little,...,Deep,Narrow,Cover,10,10,26,27.27,25.65,FC 24,2024-05-08
85683,INT,France,84,86,85,83,0,0.0,Slow,Little,...,Deep,Narrow,Cover,10,10,26,27.27,25.65,FC 24,2024-05-07
86107,INT,France,84,86,85,83,0,0.0,Slow,Little,...,Deep,Narrow,Cover,10,10,26,27.27,25.65,FC 24,2024-04-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85841,INT,France,84,86,85,83,0,0.0,Slow,Little,...,Deep,Narrow,Cover,10,10,26,27.27,25.65,FIFA 09,2008-08-30
86082,INT,France,84,86,85,83,0,0.0,Slow,Little,...,Deep,Narrow,Cover,10,10,26,27.27,25.65,FIFA 08,2008-02-22
85854,INT,France,84,86,85,83,0,0.0,Slow,Little,...,Deep,Narrow,Cover,10,10,26,27.27,25.65,FIFA 08,2007-08-30
85736,INT,France,84,86,85,83,0,0.0,Slow,Little,...,Deep,Narrow,Cover,10,10,26,27.27,25.65,FIFA 07,2007-02-22


In [7]:
cond_euro = (fbref_results_df["league"] == "INT-European Championships")
cond_is_coming = (fbref_results_df["date"] >= datetime.date.today())
cond_is_current_season = (fbref_results_df["season"] == "2425")
fbref_results_df[cond_euro & cond_is_current_season]

Unnamed: 0,game_id,league,season,game,round,week,day,date,time,home_team,...,attendance,venue,referee,match_report,notes,index,away_g,home_g,away_sat,home_sat
85253,b996c4de,INT-European Championships,2425,2024-06-15 Spain-Croatia,Group stage,1.0,Sat,2024-06-15,18:00:00,Spain,...,68844.0,Olympiastadion Berlin,Michael Oliver,/en/matches/b996c4de/Spain-Croatia-June-15-202...,,6360372,0.0,3.0,,
85254,923de563,INT-European Championships,2425,2024-06-16 Poland-Netherlands,Group stage,1.0,Sun,2024-06-16,15:00:00,Poland,...,48117.0,Volksparkstadion,Artur Soares Dias,/en/matches/923de563/Poland-Netherlands-June-1...,,16295875,2.0,1.0,,
85255,94c752b7,INT-European Championships,2425,2024-06-16 Serbia-England,Group stage,1.0,Sun,2024-06-16,21:00:00,Serbia,...,48953.0,Veltins-Arena,Daniele Orsato,/en/matches/94c752b7/Serbia-England-June-16-20...,,11528464,1.0,0.0,,
85256,3d8961eb,INT-European Championships,2425,2024-06-17 Austria-France,Group stage,1.0,Mon,2024-06-17,21:00:00,Austria,...,46425.0,Merkur Spielarena,Jesús Gil,/en/matches/3d8961eb/Austria-France-June-17-20...,,1556110,1.0,0.0,,
85257,bd775264,INT-European Championships,2425,2024-06-14 Germany-Scotland,Group stage,1.0,Fri,2024-06-14,21:00:00,Germany,...,65052.0,Allianz Arena,Clément Turpin,/en/matches/bd775264/Germany-Scotland-June-14-...,,2483190,1.0,5.0,,
85258,c15ec4e7,INT-European Championships,2425,2024-06-15 Hungary-Switzerland,Group stage,1.0,Sat,2024-06-15,15:00:00,Hungary,...,41676.0,RheinEnergieSTADION,Slavko Vinčič,/en/matches/c15ec4e7/Hungary-Switzerland-June-...,,11705248,3.0,1.0,,
85259,4891e62c,INT-European Championships,2425,2024-06-15 Italy-Albania,Group stage,1.0,Sat,2024-06-15,21:00:00,Italy,...,60512.0,Signal Iduna Park,Felix Zwayer,/en/matches/4891e62c/Italy-Albania-June-15-202...,,14297509,1.0,2.0,,
85260,,INT-European Championships,2425,2024-06-25 Netherlands-Austria,Group stage,3.0,Tue,2024-06-25,18:00:00,Netherlands,...,,Olympiastadion Berlin,,,,12042657,,,,
85261,,INT-European Championships,2425,2024-06-26 Czechia-Türkiye,Group stage,3.0,Wed,2024-06-26,21:00:00,Czechia,...,,Volksparkstadion,,,,2598241,,,,
85262,,INT-European Championships,2425,2024-06-26 Georgia-Portugal,Group stage,3.0,Wed,2024-06-26,21:00:00,Georgia,...,,Veltins-Arena,,,,10115670,,,,


In [8]:
mapping = {
    'VfL Bochum 1848': 'Bochum',
    'Tottenham Hotspur': 'Tottenham',
    'Paris Saint Germain': 'Paris S-G',
    'FC Köln': 'Köln',
    'Real Zaragoza': 'Zaragoza',
    'Wolverhampton Wanderers': 'Wolves',
    'Sheffield United': 'Sheffield Utd',
    'Amiens SC': 'Amiens',
    'FSV Mainz 05': 'Mainz 05',
    'Paderborn': 'Paderborn 07',
    'Bolton Wanderers': 'Bolton',
    'Huddersfield Town': 'Huddersfield',
    'Olympique de Marseille': 'Marseille',
    'LOSC Lille': 'Lille',
    'Grenoble Foot 38': 'Grenoble',
    'Racing Santander': 'Racing Sant',
    'Eintracht Frankfurt': 'Eint Frankfurt',
    'Fortuna Düsseldorf': 'Düsseldorf',
    'Queens Park Rangers': 'QPR',
    'SC Freiburg': 'Freiburg',
    'DSC Arminia Bielefeld': 'Arminia',
    'Republic of Ireland': 'Rep. of Ireland',
    'Evian TG': 'Evian',
    'FC Barcelona': 'Barcelona',
    'Brighton & Hove Albion': 'Brighton',
    'Deportivo La Coruña': 'La Coruña',
    'Angers SCO': 'Angers',
    'West Ham United': 'West Ham',
    'VfL Wolfsburg': 'Wolfsburg',
    'FC Augsburg': 'Augsburg',
    'India': 'India',
    'Bari 1908': 'Bari',
    'Czech Republic': 'Czechia',
    'Nottingham Forest': "Nott'ham Forest",
    'Newcastle United': 'Newcastle Utd',
    'Borussia Dortmund': 'Dortmund',
    'AFC Bournemouth': 'Bournemouth',
    'Iran': 'IR Iran',
    'Borussia Mönchengladbach': 'Gladbach',
    'Olympique Lyonnais': 'Lyon',
    'Venezuela': 'Venezuela',
    'TSG Hoffenheim': 'Hoffenheim',
    'SD Eibar': 'Eibar',
    'West Bromwich Albion': 'West Brom',
    'VfB Stuttgart': 'Stuttgart',
    'Arles': 'Arles-Avignon',
    'Stade de Reims ': 'Reims',
    'Stade Brestois 29': 'Brest',
    'Real Valladolid': 'Valladolid',
    'Clermont': 'Clermont Foot',
    'FC Union Berlin': 'Union Berlin',
    'Manchester United': 'Manchester Utd',
    'Deportivo Alavés': 'Alavés',
    'Celta de Vigo': 'Celta Vigo',
    'Bayer 04 Leverkusen': 'Leverkusen',
    'FC Bayern München': 'Bayern Munich',
    'SpVgg Greuther Fürth': 'Greuther Fürth',
    'Ingolstadt': 'Ingolstadt 04',
    'Eintracht Braunschweig': 'Braunschweig',
    'Blackburn Rovers': 'Blackburn'
}

# Appliquer le mapping au DataFrame sofifa_teams_stats_df
sofifa_teams_stats_df['team'] = sofifa_teams_stats_df['team'].replace(mapping)
                                                                      
fbref_results_df['date'] = pd.to_datetime(fbref_results_df['date'])
fbref_results_df_date = fbref_results_df[fbref_results_df["date"] >= min(sofifa_teams_stats_df["update"])]

In [9]:
cond_euro = (fbref_results_df["league"] == "INT-European Championships")
cond_is_coming = (fbref_results_df["date"] >= datetime.datetime.now())
cond_is_current_season = (fbref_results_df["season"] == "2425")
fbref_results_df[cond_euro & cond_is_current_season]["league"].count()

36

In [10]:
# Display teams that are not in sofifa_teams_stats_df
cond_euro = (fbref_results_df["league"] == "INT-European Championships")
cond_is_coming = (fbref_results_df["date"] >= datetime.datetime.now())
cond_is_current_season = (fbref_results_df["season"] == "2425")
team_euro_fbref = set(fbref_results_df[cond_euro & cond_is_current_season]['home_team'].unique()).union(set(fbref_results_df[cond_euro & cond_is_current_season]['away_team'].unique()))

In [11]:
len(team_euro_fbref)

24

In [12]:
sofifa_teams = set(sofifa_teams_stats_df[sofifa_teams_stats_df["league"] == "INT"]['team'].unique())
sofifa_teams


{'Albania',
 'Argentina',
 'Australia',
 'Austria',
 'Belgium',
 'Bolivia',
 'Brazil',
 'Bulgaria',
 'Cameroon',
 'Canada',
 'Chile',
 'China PR',
 'Colombia',
 'Costa Rica',
 'Croatia',
 'Czechia',
 "Côte d'Ivoire",
 'Denmark',
 'Ecuador',
 'Egypt',
 'England',
 'Finland',
 'France',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Hungary',
 'IR Iran',
 'Iceland',
 'India',
 'Italy',
 'Japan',
 'Korea Republic',
 'Mexico',
 'Morocco',
 'Netherlands',
 'New Zealand',
 'Northern Ireland',
 'Norway',
 'Paraguay',
 'Peru',
 'Poland',
 'Portugal',
 'Qatar',
 'Rep. of Ireland',
 'Romania',
 'Russia',
 'Saudi Arabia',
 'Scotland',
 'Senegal',
 'Serbia',
 'Slovakia',
 'Slovenia',
 'South Africa',
 'Spain',
 'Sweden',
 'Switzerland',
 'Tunisia',
 'Türkiye',
 'Ukraine',
 'United States',
 'Uruguay',
 'Venezuela',
 'Wales'}

In [13]:
[team for team in team_euro_fbref if team not in sofifa_teams]

[]

In [14]:
# Extraire la liste des équipes du DataFrame sofifa_teams_stats_df
sofifa_teams = set(sofifa_teams_stats_df['team'].unique())

# Filtrer les lignes de fbref_results_df
fbref_df_date_filtered = fbref_results_df_date[
    (fbref_results_df_date['home_team'].isin(sofifa_teams)) & 
    (fbref_results_df_date['away_team'].isin(sofifa_teams))
]

# Afficher les informations du DataFrame filtré
fbref_df_date_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31751 entries, 15862 to 87040
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   game_id       31741 non-null  object        
 1   league        31751 non-null  object        
 2   season        31751 non-null  object        
 3   game          31751 non-null  object        
 4   round         6089 non-null   object        
 5   week          31618 non-null  float64       
 6   day           31751 non-null  object        
 7   date          31751 non-null  datetime64[ns]
 8   time          18191 non-null  object        
 9   home_team     31751 non-null  object        
 10  home_xg       12513 non-null  float64       
 11  score         31640 non-null  object        
 12  away_xg       12513 non-null  float64       
 13  away_team     31751 non-null  object        
 14  attendance    18470 non-null  float64       
 15  venue         26712 non-null  object 

In [15]:
# Display teams that are not in sofifa_teams_stats_df
cond_euro = (fbref_df_date_filtered["league"] == "INT-European Championships")
cond_is_coming = (fbref_df_date_filtered["date"] >= datetime.datetime.now())
cond_is_current_season = (fbref_df_date_filtered["season"] == "2425")
len(set(fbref_df_date_filtered[cond_euro & cond_is_current_season]['home_team'].unique()))



24

In [16]:
cond_euro = (fbref_df_date_filtered["league"] == "INT-European Championships")
cond_is_coming = (fbref_df_date_filtered["date"] >= datetime.datetime.now())
cond_is_current_season = (fbref_df_date_filtered["season"] == "2425")
fbref_df_date_filtered[cond_euro & cond_is_current_season]["league"].count()

36

In [17]:
import pandas as pd

# Assurez-vous que les dates sont correctement formatées
fbref_df_date_filtered['date'] = pd.to_datetime(fbref_df_date_filtered['date'])
sofifa_teams_stats_df['update'] = pd.to_datetime(sofifa_teams_stats_df['update'])

# Trier les dataframes pour la jointure asynchrone
fbref_df_date_filtered = fbref_df_date_filtered.sort_values(by='date')
sofifa_teams_stats_df = sofifa_teams_stats_df.sort_values(by='update')

# Effectuer une jointure asynchrone pour les équipes à domicile
home_stats = pd.merge_asof(
    fbref_df_date_filtered[['home_team', 'date']].rename(columns={'home_team': 'team', 'date': 'match_date'}),
    sofifa_teams_stats_df,
    left_on='match_date',
    right_on='update',
    by='team',
    direction='backward'
).drop(columns=['team', 'match_date'])

# Renommer les colonnes pour éviter les conflits
home_stats.columns = ['home_' + col for col in home_stats.columns]

# Effectuer une jointure asynchrone pour les équipes à l'extérieur
away_stats = pd.merge_asof(
    fbref_df_date_filtered[['away_team', 'date']].rename(columns={'away_team': 'team', 'date': 'match_date'}),
    sofifa_teams_stats_df,
    left_on='match_date',
    right_on='update',
    by='team',
    direction='backward'
).drop(columns=['team', 'match_date'])

# Renommer les colonnes pour éviter les conflits
away_stats.columns = ['away_' + col for col in away_stats.columns]

# Fusionner les stats avec le dataframe initial
fbref_df_date_filtered_concat = pd.concat([fbref_df_date_filtered.reset_index(drop=True), home_stats.reset_index(drop=True), away_stats.reset_index(drop=True)], axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fbref_df_date_filtered['date'] = pd.to_datetime(fbref_df_date_filtered['date'])


In [18]:
fbref_df_date_filtered_concat['FTR'] = fbref_df_date_filtered_concat.apply(lambda x: 1 if x['home_g'] > x['away_g'] else 0 if x['home_g'] == x['away_g'] else -1 if x['home_g'] < x['away_g'] else None, axis=1)

In [19]:
# Drop rows if overall_away is nan or overall_home is nan
import datetime
start_euro = datetime.datetime(2024, 6, 14)
fbref_df_date_filtered_concat_no_nan = fbref_df_date_filtered_concat.dropna(subset=['home_overall', 'away_overall'])
fbref_df_date_filtered_concat_no_nan = fbref_df_date_filtered_concat_no_nan[(fbref_df_date_filtered_concat_no_nan['FTR'].notnull()) | (fbref_df_date_filtered_concat_no_nan['date'] >= start_euro)]
fbref_df_date_filtered_concat_no_nan

Unnamed: 0,game_id,league,season,game,round,week,day,date,time,home_team,...,away_defence_team_width,away_defence_defender_line,away_defence_domestic_prestige,away_international_prestige,away_players,away_starting_xi_average_age,away_whole_team_average_age,away_fifa_edition,away_update,FTR
0,880e137f,ENG-Premier League,0607,2006-09-09 Everton-Liverpool,,4.0,Sat,2006-09-09,,Everton,...,Narrow,Cover,9.0,9.0,33.0,26.27,23.76,FIFA 07,2006-08-30,1.0
1,60c88132,FRA-Ligue 1,0607,2006-09-09 Nantes-Lille,,5.0,Sat,2006-09-09,,Nantes,...,Narrow,Cover,6.0,4.0,26.0,23.91,24.27,FIFA 07,2006-08-30,0.0
7,4742f790,ENG-Premier League,0607,2006-09-09 Newcastle Utd-Fulham,,4.0,Sat,2006-09-09,,Newcastle Utd,...,Narrow,Cover,5.0,2.0,31.0,26.55,25.29,FIFA 07,2006-08-30,-1.0
9,dc45f23c,ENG-Premier League,0607,2006-09-09 Manchester Utd-Tottenham,,4.0,Sat,2006-09-09,,Manchester Utd,...,Narrow,Cover,7.0,7.0,31.0,24.55,23.74,FIFA 07,2006-08-30,1.0
14,36cd8447,ITA-Serie A,0607,2006-09-09 Fiorentina-Inter,,1.0,Sat,2006-09-09,,Fiorentina,...,Narrow,Cover,10.0,8.0,26.0,28.82,27.77,FIFA 07,2006-08-30,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31746,,INT-European Championships,2425,2024-06-25 Netherlands-Austria,Group stage,3.0,Tue,2024-06-25,18:00:00,Netherlands,...,Narrow,Cover,10.0,4.0,23.0,25.82,25.87,FIFA 23,2023-09-11,
31747,,INT-European Championships,2425,2024-06-26 Czechia-Türkiye,Group stage,3.0,Wed,2024-06-26,21:00:00,Czechia,...,Narrow,Cover,10.0,6.0,23.0,25.18,24.09,FIFA 21,2021-09-25,
31748,,INT-European Championships,2425,2024-06-26 Slovakia-Romania,Group stage,3.0,Wed,2024-06-26,18:00:00,Slovakia,...,Narrow,Cover,10.0,5.0,26.0,26.45,26.27,FC 24,2024-06-12,
31749,,INT-European Championships,2425,2024-06-26 Ukraine-Belgium,Group stage,3.0,Wed,2024-06-26,18:00:00,Ukraine,...,Narrow,Cover,10.0,8.0,26.0,27.73,26.23,FC 24,2024-06-12,


In [20]:
cond_euro = (fbref_df_date_filtered_concat["league"] == "INT-European Championships")
cond_is_coming = (fbref_df_date_filtered_concat["date"] >= datetime.datetime.now())
cond_is_current_season = (fbref_df_date_filtered_concat["season"] == "2425")
fbref_df_date_filtered_concat[cond_euro & cond_is_current_season]

Unnamed: 0,game_id,league,season,game,round,week,day,date,time,home_team,...,away_defence_team_width,away_defence_defender_line,away_defence_domestic_prestige,away_international_prestige,away_players,away_starting_xi_average_age,away_whole_team_average_age,away_fifa_edition,away_update,FTR
31715,bd775264,INT-European Championships,2425,2024-06-14 Germany-Scotland,Group stage,1.0,Fri,2024-06-14,21:00:00,Germany,...,Narrow,Cover,10.0,4.0,26.0,26.73,26.85,FC 24,2024-06-12,1.0
31716,b996c4de,INT-European Championships,2425,2024-06-15 Spain-Croatia,Group stage,1.0,Sat,2024-06-15,18:00:00,Spain,...,Narrow,Cover,10.0,6.0,26.0,27.18,26.46,FC 24,2024-06-12,1.0
31717,c15ec4e7,INT-European Championships,2425,2024-06-15 Hungary-Switzerland,Group stage,1.0,Sat,2024-06-15,15:00:00,Hungary,...,Narrow,Cover,10.0,6.0,26.0,28.09,26.62,FIFA 23,2022-11-20,-1.0
31718,4891e62c,INT-European Championships,2425,2024-06-15 Italy-Albania,Group stage,1.0,Sat,2024-06-15,21:00:00,Italy,...,Narrow,Cover,10.0,2.0,23.0,27.09,26.78,FC 24,2024-06-12,1.0
31719,923de563,INT-European Championships,2425,2024-06-16 Poland-Netherlands,Group stage,1.0,Sun,2024-06-16,15:00:00,Poland,...,Narrow,Cover,10.0,8.0,26.0,26.55,26.0,FC 24,2024-06-12,-1.0
31720,94c752b7,INT-European Championships,2425,2024-06-16 Serbia-England,Group stage,1.0,Sun,2024-06-16,21:00:00,Serbia,...,Narrow,Cover,10.0,8.0,26.0,26.27,25.69,FC 24,2024-06-12,-1.0
31721,e58cc259,INT-European Championships,2425,2024-06-16 Slovenia-Denmark,Group stage,1.0,Sun,2024-06-16,18:00:00,Slovenia,...,Narrow,Cover,10.0,6.0,26.0,27.27,26.35,FC 24,2024-06-12,0.0
31722,5bda83ea,INT-European Championships,2425,2024-06-17 Belgium-Slovakia,Group stage,1.0,Mon,2024-06-17,18:00:00,Belgium,...,Narrow,Cover,10.0,3.0,23.0,29.64,27.96,FC 24,2024-06-12,-1.0
31723,3d8961eb,INT-European Championships,2425,2024-06-17 Austria-France,Group stage,1.0,Mon,2024-06-17,21:00:00,Austria,...,Narrow,Cover,10.0,10.0,26.0,27.27,25.65,FC 24,2024-06-12,-1.0
31724,67c17ad9,INT-European Championships,2425,2024-06-17 Romania-Ukraine,Group stage,1.0,Mon,2024-06-17,15:00:00,Romania,...,Narrow,Cover,10.0,6.0,26.0,24.27,26.23,FC 24,2024-06-12,1.0


In [21]:
fbref_df_date_filtered_concat_no_nan[fbref_df_date_filtered_concat_no_nan['score'].isnull()]

Unnamed: 0,game_id,league,season,game,round,week,day,date,time,home_team,...,away_defence_team_width,away_defence_defender_line,away_defence_domestic_prestige,away_international_prestige,away_players,away_starting_xi_average_age,away_whole_team_average_age,away_fifa_edition,away_update,FTR
31741,,INT-European Championships,2425,2024-06-24 Albania-Spain,Group stage,3.0,Mon,2024-06-24,21:00:00,Albania,...,Narrow,Cover,10.0,9.0,26.0,27.0,26.69,FC 24,2024-06-12,
31742,,INT-European Championships,2425,2024-06-24 Croatia-Italy,Group stage,3.0,Mon,2024-06-24,21:00:00,Croatia,...,Narrow,Cover,10.0,9.0,26.0,28.18,27.04,FC 24,2024-06-12,
31743,,INT-European Championships,2425,2024-06-25 England-Slovenia,Group stage,3.0,Tue,2024-06-25,21:00:00,England,...,Narrow,Cover,10.0,5.0,23.0,26.64,26.3,FIFA 21,2021-09-25,
31744,,INT-European Championships,2425,2024-06-25 France-Poland,Group stage,3.0,Tue,2024-06-25,18:00:00,France,...,Narrow,Cover,10.0,5.0,26.0,26.91,26.5,FC 24,2024-06-12,
31745,,INT-European Championships,2425,2024-06-25 Denmark-Serbia,Group stage,3.0,Tue,2024-06-25,21:00:00,Denmark,...,Narrow,Cover,10.0,6.0,26.0,26.36,26.46,FIFA 23,2022-11-20,
31746,,INT-European Championships,2425,2024-06-25 Netherlands-Austria,Group stage,3.0,Tue,2024-06-25,18:00:00,Netherlands,...,Narrow,Cover,10.0,4.0,23.0,25.82,25.87,FIFA 23,2023-09-11,
31747,,INT-European Championships,2425,2024-06-26 Czechia-Türkiye,Group stage,3.0,Wed,2024-06-26,21:00:00,Czechia,...,Narrow,Cover,10.0,6.0,23.0,25.18,24.09,FIFA 21,2021-09-25,
31748,,INT-European Championships,2425,2024-06-26 Slovakia-Romania,Group stage,3.0,Wed,2024-06-26,18:00:00,Slovakia,...,Narrow,Cover,10.0,5.0,26.0,26.45,26.27,FC 24,2024-06-12,
31749,,INT-European Championships,2425,2024-06-26 Ukraine-Belgium,Group stage,3.0,Wed,2024-06-26,18:00:00,Ukraine,...,Narrow,Cover,10.0,8.0,26.0,27.73,26.23,FC 24,2024-06-12,
31750,,INT-European Championships,2425,2024-06-26 Georgia-Portugal,Group stage,3.0,Wed,2024-06-26,21:00:00,Georgia,...,Narrow,Cover,10.0,8.0,26.0,26.91,27.0,FC 24,2024-06-12,


In [22]:
date_stop = datetime.datetime(2024, 6, 13)
elo_scores(fbref_df_date_filtered_concat_no_nan, home_team_id_col='home_team', away_team_id_col='away_team', home_team_goal_col='home_g', away_team_goal_col='away_g', date_stop=date_stop)
goal_stats(fbref_df_date_filtered_concat_no_nan, home_team_id_col='home_team', away_team_id_col='away_team', home_team_goal_col='home_g', away_team_goal_col='away_g', date_stop=date_stop)
glicko2_scores(fbref_df_date_filtered_concat_no_nan, home_team_id_col='home_team', away_team_id_col='away_team', home_team_goal_col='home_g', away_team_goal_col='away_g', date_stop=date_stop)
trueskill_scores(fbref_df_date_filtered_concat_no_nan, home_team_id_col='home_team', away_team_id_col='away_team', home_team_goal_col='home_g', away_team_goal_col='away_g', date_stop=date_stop)

In [23]:
cond_euro = (fbref_df_date_filtered_concat_no_nan["league"] == "INT-European Championships")
cond_is_coming = (fbref_df_date_filtered_concat_no_nan["date"] >= datetime.datetime.now())
cond_is_current_season = (fbref_df_date_filtered_concat_no_nan["season"] == "2425")
cols = ['game', 'week', 'date',
       'time', 'home_team', 'score', 'away_team',
       'FTR', 'elo_home_before', 'elo_away_before',
       'home_team_goals_season_to_date_before_match',
       'away_team_goals_season_to_date_before_match',
       'home_team_number_of_match_played', 'away_team_number_of_match_played',
       'avg_home_team_goals_season_to_date_before_match',
       'avg_away_team_goals_season_to_date_before_match',
       'glicko2_home_before', 'glicko2_away_before', 'glicko2_rd_home_before',
       'glicko2_rd_away_before', 'glicko2_vol_home_before',
       'glicko2_vol_away_before', 'trueskill_home_before',
       'trueskill_away_before']
fbref_df_date_filtered_concat_no_nan[cond_euro & cond_is_current_season][cols]

Unnamed: 0,game,week,date,time,home_team,score,away_team,FTR,elo_home_before,elo_away_before,...,avg_home_team_goals_season_to_date_before_match,avg_away_team_goals_season_to_date_before_match,glicko2_home_before,glicko2_away_before,glicko2_rd_home_before,glicko2_rd_away_before,glicko2_vol_home_before,glicko2_vol_away_before,trueskill_home_before,trueskill_away_before
31715,2024-06-14 Germany-Scotland,1.0,2024-06-14,21:00:00,Germany,5–1,Scotland,1.0,1505.563111,1470.063691,...,0.0,0.0,1526.719791,1473.671067,27.909337,49.860495,0.059982,0.059995,0.0,0.0
31716,2024-06-15 Spain-Croatia,1.0,2024-06-15,18:00:00,Spain,3–0,Croatia,1.0,1493.73826,1465.548158,...,0.0,0.0,1526.232249,1527.140576,28.147176,29.021807,0.059958,0.059969,0.0,0.0
31717,2024-06-15 Hungary-Switzerland,1.0,2024-06-15,15:00:00,Hungary,1–3,Switzerland,-1.0,1463.386813,1447.57776,...,0.0,0.0,1498.703488,1509.682545,33.523677,29.32791,0.059986,0.059976,0.0,0.0
31718,2024-06-15 Italy-Albania,1.0,2024-06-15,21:00:00,Italy,2–1,Albania,1.0,1517.280675,1500.0,...,0.0,0.0,1519.102653,1500.0,29.092133,350.0,0.059972,0.06,0.0,0.0
31719,2024-06-16 Poland-Netherlands,1.0,2024-06-16,15:00:00,Poland,1–2,Netherlands,-1.0,1427.974245,1532.563435,...,0.0,0.0,1483.375706,1535.631558,29.246638,28.332196,0.059975,0.059987,0.0,0.0
31720,2024-06-16 Serbia-England,1.0,2024-06-16,21:00:00,Serbia,0–1,England,-1.0,1474.117751,1495.214962,...,0.0,0.0,1472.577393,1512.768331,49.891746,28.374,0.059995,0.059975,0.0,0.0
31721,2024-06-16 Slovenia-Denmark,1.0,2024-06-16,18:00:00,Slovenia,1–1,Denmark,0.0,1484.707592,1468.659266,...,0.0,0.0,1489.878534,1483.186058,59.785389,27.845977,0.059998,0.059988,0.0,0.0
31722,2024-06-17 Belgium-Slovakia,1.0,2024-06-17,18:00:00,Belgium,0–1,Slovakia,-1.0,1510.766558,1500.0,...,0.0,0.0,1541.754917,1500.0,28.037093,350.0,0.059992,0.06,0.0,0.0
31723,2024-06-17 Austria-France,1.0,2024-06-17,21:00:00,Austria,0–1,France,-1.0,1466.715556,1513.898104,...,0.0,0.0,1485.499373,1516.037108,34.885691,29.078495,0.059995,0.059974,0.0,0.0
31724,2024-06-17 Romania-Ukraine,1.0,2024-06-17,15:00:00,Romania,3–0,Ukraine,1.0,1472.938627,1413.081791,...,0.0,0.0,1490.574717,1481.169557,36.677593,31.867851,0.059993,0.059998,0.0,0.0


In [28]:
fbref_df_date_filtered_concat_no_nan.columns

Index(['game_id', 'league', 'season', 'game', 'round', 'week', 'day', 'date',
       'time', 'home_team', 'home_xg', 'score', 'away_xg', 'away_team',
       'attendance', 'venue', 'referee', 'match_report', 'notes', 'index',
       'away_g', 'home_g', 'away_sat', 'home_sat', 'home_league',
       'home_overall', 'home_attack', 'home_midfield', 'home_defence',
       'home_transfer_budget', 'home_club_worth', 'home_build_up_speed',
       'home_build_up_dribbling', 'home_build_up_passing',
       'home_build_up_positioning', 'home_chance_creation_crossing',
       'home_chance_creation_passing', 'home_chance_creation_shooting',
       'home_chance_creation_positioning', 'home_defence_aggression',
       'home_defence_pressure', 'home_defence_team_width',
       'home_defence_defender_line', 'home_defence_domestic_prestige',
       'home_international_prestige', 'home_players',
       'home_starting_xi_average_age', 'home_whole_team_average_age',
       'home_fifa_edition', 'home_update'

In [29]:
fbref_df_date_filtered_concat_no_nan__train = fbref_df_date_filtered_concat_no_nan[(fbref_df_date_filtered_concat_no_nan['FTR'].notnull()) & (fbref_df_date_filtered_concat_no_nan['season'] != '2425')]
fbref_df_date_filtered_concat_no_nan__train

Unnamed: 0,game_id,league,season,game,round,week,day,date,time,home_team,...,avg_home_team_goals_season_to_date_before_match,avg_away_team_goals_season_to_date_before_match,glicko2_home_before,glicko2_away_before,glicko2_rd_home_before,glicko2_rd_away_before,glicko2_vol_home_before,glicko2_vol_away_before,trueskill_home_before,trueskill_away_before
0,880e137f,ENG-Premier League,0607,2006-09-09 Everton-Liverpool,,4.0,Sat,2006-09-09,,Everton,...,0.000000,0.000000,1500.000000,1500.000000,350.000000,350.000000,0.060000,0.060000,0.000000,0.000000
1,60c88132,FRA-Ligue 1,0607,2006-09-09 Nantes-Lille,,5.0,Sat,2006-09-09,,Nantes,...,0.000000,0.000000,1500.000000,1500.000000,350.000000,350.000000,0.060000,0.060000,0.000000,0.000000
7,4742f790,ENG-Premier League,0607,2006-09-09 Newcastle Utd-Fulham,,4.0,Sat,2006-09-09,,Newcastle Utd,...,0.000000,0.000000,1500.000000,1500.000000,350.000000,350.000000,0.060000,0.060000,0.000000,0.000000
9,dc45f23c,ENG-Premier League,0607,2006-09-09 Manchester Utd-Tottenham,,4.0,Sat,2006-09-09,,Manchester Utd,...,0.000000,0.000000,1500.000000,1500.000000,350.000000,350.000000,0.060000,0.060000,0.000000,0.000000
14,36cd8447,ITA-Serie A,0607,2006-09-09 Fiorentina-Inter,,1.0,Sat,2006-09-09,,Fiorentina,...,0.000000,0.000000,1500.000000,1500.000000,350.000000,350.000000,0.060000,0.060000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31710,2ee9be16,ITA-Serie A,2324,2024-05-26 Empoli-Roma,,38.0,Sun,2024-05-26,20:45:00,Empoli,...,0.729730,1.729730,1451.783789,1613.749022,28.165572,28.315661,0.059686,0.059418,21.080298,24.392905
31711,723ffa45,GER-Bundesliga,2324,2024-05-27 Düsseldorf-Bochum,German 1/2 Relegation/Promotion Play-offs,,Mon,2024-05-27,20:30:00,Düsseldorf,...,3.000000,1.200000,1467.347180,1445.023712,28.594862,28.662918,0.059902,0.059850,20.755207,20.932579
31712,febd4e01,FRA-Ligue 1,2324,2024-05-30 Saint-Étienne-Metz,French 1/2 Relegation/Promotion Play-off,,Thu,2024-05-30,20:30:00,Saint-Étienne,...,0.000000,1.029412,1500.995626,1428.135762,28.911646,27.119836,0.059508,0.059740,22.420056,21.203970
31713,7f01f697,FRA-Ligue 1,2324,2024-06-02 Metz-Saint-Étienne,French 1/2 Relegation/Promotion Play-off,,Sun,2024-06-02,17:00:00,Metz,...,1.028571,2.000000,1426.418607,1502.892889,27.466291,28.865399,0.059739,0.059507,21.126454,22.490834


In [30]:
fbref_df_date_filtered_concat_no_nan__infer = fbref_df_date_filtered_concat_no_nan[fbref_df_date_filtered_concat_no_nan['season'] == '2425']
fbref_df_date_filtered_concat_no_nan__infer

Unnamed: 0,game_id,league,season,game,round,week,day,date,time,home_team,...,avg_home_team_goals_season_to_date_before_match,avg_away_team_goals_season_to_date_before_match,glicko2_home_before,glicko2_away_before,glicko2_rd_home_before,glicko2_rd_away_before,glicko2_vol_home_before,glicko2_vol_away_before,trueskill_home_before,trueskill_away_before
31715,bd775264,INT-European Championships,2425,2024-06-14 Germany-Scotland,Group stage,1.0,Fri,2024-06-14,21:00:00,Germany,...,0.0,0.0,1526.719791,1473.671067,27.909337,49.860495,0.059982,0.059995,0.0,0.0
31716,b996c4de,INT-European Championships,2425,2024-06-15 Spain-Croatia,Group stage,1.0,Sat,2024-06-15,18:00:00,Spain,...,0.0,0.0,1526.232249,1527.140576,28.147176,29.021807,0.059958,0.059969,0.0,0.0
31717,c15ec4e7,INT-European Championships,2425,2024-06-15 Hungary-Switzerland,Group stage,1.0,Sat,2024-06-15,15:00:00,Hungary,...,0.0,0.0,1498.703488,1509.682545,33.523677,29.32791,0.059986,0.059976,0.0,0.0
31718,4891e62c,INT-European Championships,2425,2024-06-15 Italy-Albania,Group stage,1.0,Sat,2024-06-15,21:00:00,Italy,...,0.0,0.0,1519.102653,1500.0,29.092133,350.0,0.059972,0.06,0.0,0.0
31719,923de563,INT-European Championships,2425,2024-06-16 Poland-Netherlands,Group stage,1.0,Sun,2024-06-16,15:00:00,Poland,...,0.0,0.0,1483.375706,1535.631558,29.246638,28.332196,0.059975,0.059987,0.0,0.0
31720,94c752b7,INT-European Championships,2425,2024-06-16 Serbia-England,Group stage,1.0,Sun,2024-06-16,21:00:00,Serbia,...,0.0,0.0,1472.577393,1512.768331,49.891746,28.374,0.059995,0.059975,0.0,0.0
31721,e58cc259,INT-European Championships,2425,2024-06-16 Slovenia-Denmark,Group stage,1.0,Sun,2024-06-16,18:00:00,Slovenia,...,0.0,0.0,1489.878534,1483.186058,59.785389,27.845977,0.059998,0.059988,0.0,0.0
31722,5bda83ea,INT-European Championships,2425,2024-06-17 Belgium-Slovakia,Group stage,1.0,Mon,2024-06-17,18:00:00,Belgium,...,0.0,0.0,1541.754917,1500.0,28.037093,350.0,0.059992,0.06,0.0,0.0
31723,3d8961eb,INT-European Championships,2425,2024-06-17 Austria-France,Group stage,1.0,Mon,2024-06-17,21:00:00,Austria,...,0.0,0.0,1485.499373,1516.037108,34.885691,29.078495,0.059995,0.059974,0.0,0.0
31724,67c17ad9,INT-European Championships,2425,2024-06-17 Romania-Ukraine,Group stage,1.0,Mon,2024-06-17,15:00:00,Romania,...,0.0,0.0,1490.574717,1481.169557,36.677593,31.867851,0.059993,0.059998,0.0,0.0


In [31]:
from sklearn.feature_selection import SelectKBest, f_classif

pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression())
    ])
X_col_sofifa =  ['home_overall', 'home_attack', 
         'home_midfield', 'home_defence', 'home_transfer_budget', 'home_club_worth', 'home_defence_domestic_prestige', 'home_international_prestige', 
         'home_players', 'home_starting_xi_average_age', 'home_whole_team_average_age', 'away_overall', 'away_attack', 'away_midfield', 'away_defence', 
         'away_transfer_budget', 'away_club_worth', 'away_defence_domestic_prestige', 'away_international_prestige', 'away_players', 
         'away_starting_xi_average_age', 'away_whole_team_average_age']

X_col_scores = [
       'home_team_number_of_match_played', 'away_team_number_of_match_played',
       'glicko2_home_before', 'glicko2_away_before', 'glicko2_rd_home_before',
       'glicko2_rd_away_before', 'glicko2_vol_home_before',
       'glicko2_vol_away_before', 'trueskill_home_before',
       'trueskill_away_before']

Y_col = 'FTR'

X_col = X_col_sofifa + X_col_scores

train_test_split_fn = lambda df : train_test_split_expanding_windows(df, split=5, test_prop=0.2, date_col="date")
result_df_all_splits = False
m = 10
beta = 1

metrics_mean, metrics, _ = train_test(fbref_df_date_filtered_concat_no_nan__train, pipeline, X_col, Y_col, train_test_split_fn, result_df_all_splits, m, beta)
pd.DataFrame({'metrics': metrics_mean.keys(), 'values': metrics_mean.values()})

Unnamed: 0,metrics,values
0,accuracy,0.526958
1,weighted_accuracy,0.668501
2,accuracy_home,0.594718
3,accuracy_draw,0.74153
4,accuracy_away,0.717668
5,recall_all,1.346776
6,weighted_recall,0.526958
7,balanced_accuracy,0.448925
8,recall_home,0.844371
9,recall_draw,0.0


In [32]:
from sklearn.feature_selection import SelectKBest, f_classif

pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression())
    ])
X_col_sofifa =  ['home_overall', 'home_attack', 
         'home_midfield', 'home_defence', 'home_transfer_budget', 'home_club_worth', 'home_defence_domestic_prestige', 'home_international_prestige', 
         'home_players', 'home_starting_xi_average_age', 'home_whole_team_average_age', 'away_overall', 'away_attack', 'away_midfield', 'away_defence', 
         'away_transfer_budget', 'away_club_worth', 'away_defence_domestic_prestige', 'away_international_prestige', 'away_players', 
         'away_starting_xi_average_age', 'away_whole_team_average_age']

X_col_scores = [
       'home_team_number_of_match_played', 'away_team_number_of_match_played',
       'glicko2_home_before', 'glicko2_away_before', 'glicko2_rd_home_before',
       'glicko2_rd_away_before', 'glicko2_vol_home_before',
       'glicko2_vol_away_before', 'trueskill_home_before',
       'trueskill_away_before']


Y_col = 'FTR'

X_col = X_col_sofifa + X_col_scores

pipeline.fit(fbref_df_date_filtered_concat_no_nan__train[X_col], fbref_df_date_filtered_concat_no_nan__train[Y_col])
fbref_df_date_filtered_concat_no_nan__train['pred'] = pipeline.predict(fbref_df_date_filtered_concat_no_nan__train[X_col])
fbref_df_date_filtered_concat_no_nan__train['proba_home_win'] = pipeline.predict_proba(fbref_df_date_filtered_concat_no_nan__train[X_col])[:, 2]
fbref_df_date_filtered_concat_no_nan__train['proba_draw'] = pipeline.predict_proba(fbref_df_date_filtered_concat_no_nan__train[X_col])[:, 1]
fbref_df_date_filtered_concat_no_nan__train['proba_away_win'] = pipeline.predict_proba(fbref_df_date_filtered_concat_no_nan__train[X_col])[:, 0]

accucacy = accuracy_fn(fbref_df_date_filtered_concat_no_nan__train, 'FTR', 'pred')[0]
log_loss = log_loss_fn(fbref_df_date_filtered_concat_no_nan__train['FTR'], fbref_df_date_filtered_concat_no_nan__train['proba_home_win'], fbref_df_date_filtered_concat_no_nan__train['proba_draw'], fbref_df_date_filtered_concat_no_nan__train['proba_away_win'])

pd.DataFrame({'metrics': ['accuracy', 'log_loss'], 'values': [accucacy, log_loss]})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fbref_df_date_filtered_concat_no_nan__train['pred'] = pipeline.predict(fbref_df_date_filtered_concat_no_nan__train[X_col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fbref_df_date_filtered_concat_no_nan__train['proba_home_win'] = pipeline.predict_proba(fbref_df_date_filtered_concat_no_nan__train[X_col])[:, 2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/st

Unnamed: 0,metrics,values
0,accuracy,0.521996
1,log_loss,0.987674


In [33]:
fbref_df_date_filtered_concat_no_nan__infer[X_col[10:30]]

Unnamed: 0,home_whole_team_average_age,away_overall,away_attack,away_midfield,away_defence,away_transfer_budget,away_club_worth,away_defence_domestic_prestige,away_international_prestige,away_players,away_starting_xi_average_age,away_whole_team_average_age,home_team_number_of_match_played,away_team_number_of_match_played,glicko2_home_before,glicko2_away_before,glicko2_rd_home_before,glicko2_rd_away_before,glicko2_vol_home_before,glicko2_vol_away_before
31715,28.04,76.0,72.0,77.0,76.0,0.0,0.0,10.0,4.0,26.0,26.73,26.85,0.0,0.0,1526.719791,1473.671067,27.909337,49.860495,0.059982,0.059995
31716,26.69,79.0,77.0,82.0,77.0,0.0,0.0,10.0,6.0,26.0,27.18,26.46,0.0,0.0,1526.232249,1527.140576,28.147176,29.021807,0.059958,0.059969
31717,26.19,79.0,77.0,78.0,78.0,0.0,0.0,10.0,6.0,26.0,28.09,26.62,0.0,0.0,1498.703488,1509.682545,33.523677,29.32791,0.059986,0.059976
31718,27.04,73.0,66.0,73.0,74.0,0.0,0.0,10.0,2.0,23.0,27.09,26.78,0.0,0.0,1519.102653,1500.0,29.092133,350.0,0.059972,0.06
31719,26.5,82.0,82.0,82.0,83.0,0.0,0.0,10.0,8.0,26.0,26.55,26.0,0.0,0.0,1483.375706,1535.631558,29.246638,28.332196,0.059975,0.059987
31720,26.46,85.0,87.0,86.0,83.0,0.0,0.0,10.0,8.0,26.0,26.27,25.69,0.0,0.0,1472.577393,1512.768331,49.891746,28.374,0.059995,0.059975
31721,26.3,79.0,76.0,78.0,79.0,0.0,0.0,10.0,6.0,26.0,27.27,26.35,0.0,0.0,1489.878534,1483.186058,59.785389,27.845977,0.059998,0.059988
31722,26.23,75.0,73.0,76.0,75.0,0.0,0.0,10.0,3.0,23.0,29.64,27.96,0.0,0.0,1541.754917,1500.0,28.037093,350.0,0.059992,0.06
31723,25.87,84.0,86.0,85.0,83.0,0.0,0.0,10.0,10.0,26.0,27.27,25.65,0.0,0.0,1485.499373,1516.037108,34.885691,29.078495,0.059995,0.059974
31724,26.27,77.0,81.0,77.0,74.0,0.0,0.0,10.0,6.0,26.0,24.27,26.23,0.0,0.0,1490.574717,1481.169557,36.677593,31.867851,0.059993,0.059998


In [34]:
fbref_df_date_filtered_concat_no_nan__infer['pred'] = pipeline.predict(fbref_df_date_filtered_concat_no_nan__infer[X_col])
fbref_df_date_filtered_concat_no_nan__infer['proba_home_win'] = pipeline.predict_proba(fbref_df_date_filtered_concat_no_nan__infer[X_col])[:, 2]
fbref_df_date_filtered_concat_no_nan__infer['proba_draw'] = pipeline.predict_proba(fbref_df_date_filtered_concat_no_nan__infer[X_col])[:, 1]
fbref_df_date_filtered_concat_no_nan__infer['proba_away_win'] = pipeline.predict_proba(fbref_df_date_filtered_concat_no_nan__infer[X_col])[:, 0]

fbref_df_date_filtered_concat_no_nan__infer[['date', 'home_team', 'away_team', 'proba_home_win', 'proba_draw', 'proba_away_win', 'pred']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fbref_df_date_filtered_concat_no_nan__infer['pred'] = pipeline.predict(fbref_df_date_filtered_concat_no_nan__infer[X_col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fbref_df_date_filtered_concat_no_nan__infer['proba_home_win'] = pipeline.predict_proba(fbref_df_date_filtered_concat_no_nan__infer[X_col])[:, 2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/st

Unnamed: 0,date,home_team,away_team,proba_home_win,proba_draw,proba_away_win,pred
31715,2024-06-14,Germany,Scotland,0.586792,0.238198,0.17501,1.0
31716,2024-06-15,Spain,Croatia,0.48298,0.244418,0.272602,1.0
31717,2024-06-15,Hungary,Switzerland,0.292353,0.282151,0.425496,-1.0
31718,2024-06-15,Italy,Albania,0.475029,0.242307,0.282665,1.0
31719,2024-06-16,Poland,Netherlands,0.226127,0.23276,0.541114,-1.0
31720,2024-06-16,Serbia,England,0.245067,0.216695,0.538238,-1.0
31721,2024-06-16,Slovenia,Denmark,0.280203,0.282893,0.436904,-1.0
31722,2024-06-17,Belgium,Slovakia,0.444244,0.232932,0.322824,1.0
31723,2024-06-17,Austria,France,0.253983,0.241335,0.504681,-1.0
31724,2024-06-17,Romania,Ukraine,0.272765,0.264514,0.462721,-1.0


In [35]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Suppose the target columns are 'home_goals' and 'away_goals'
Y_col_home = 'home_g'
Y_col_away = 'away_g'

X_col_sofifa =  ['home_overall', 'home_attack', 
                 'home_midfield', 'home_defence', 'home_transfer_budget', 'home_club_worth', 'home_defence_domestic_prestige', 'home_international_prestige', 
                 'home_players', 'home_starting_xi_average_age', 'home_whole_team_average_age', 'away_overall', 'away_attack', 'away_midfield', 'away_defence', 
                 'away_transfer_budget', 'away_club_worth', 'away_defence_domestic_prestige', 'away_international_prestige', 'away_players', 
                 'away_starting_xi_average_age', 'away_whole_team_average_age']

X_col_scores = [
       'home_team_number_of_match_played', 'away_team_number_of_match_played',
       'glicko2_home_before', 'glicko2_away_before', 'glicko2_rd_home_before',
       'glicko2_rd_away_before', 'glicko2_vol_home_before',
       'glicko2_vol_away_before', 'trueskill_home_before',
       'trueskill_away_before']

X_col = X_col_sofifa + X_col_scores

# Define the pipeline
pipeline_home = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LinearRegression())
    ])

pipeline_away = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LinearRegression())
    ])

# Fit the models
pipeline_home.fit(fbref_df_date_filtered_concat_no_nan__train[X_col], fbref_df_date_filtered_concat_no_nan__train[Y_col_home])
pipeline_away.fit(fbref_df_date_filtered_concat_no_nan__train[X_col], fbref_df_date_filtered_concat_no_nan__train[Y_col_away])

# Predict goals
fbref_df_date_filtered_concat_no_nan__train['pred_home_goals'] = pipeline_home.predict(fbref_df_date_filtered_concat_no_nan__train[X_col])
fbref_df_date_filtered_concat_no_nan__train['pred_away_goals'] = pipeline_away.predict(fbref_df_date_filtered_concat_no_nan__train[X_col])

# Calculate metrics
mse_home = mean_squared_error(fbref_df_date_filtered_concat_no_nan__train[Y_col_home], fbref_df_date_filtered_concat_no_nan__train['pred_home_goals'])
mse_away = mean_squared_error(fbref_df_date_filtered_concat_no_nan__train[Y_col_away], fbref_df_date_filtered_concat_no_nan__train['pred_away_goals'])
r2_home = r2_score(fbref_df_date_filtered_concat_no_nan__train[Y_col_home], fbref_df_date_filtered_concat_no_nan__train['pred_home_goals'])
r2_away = r2_score(fbref_df_date_filtered_concat_no_nan__train[Y_col_away], fbref_df_date_filtered_concat_no_nan__train['pred_away_goals'])

# Display metrics
pd.DataFrame({
    'metrics': ['mse_home', 'mse_away', 'r2_home', 'r2_away'],
    'values': [mse_home, mse_away, r2_home, r2_away]
})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fbref_df_date_filtered_concat_no_nan__train['pred_home_goals'] = pipeline_home.predict(fbref_df_date_filtered_concat_no_nan__train[X_col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fbref_df_date_filtered_concat_no_nan__train['pred_away_goals'] = pipeline_away.predict(fbref_df_date_filtered_concat_no_nan__train[X_col])


Unnamed: 0,metrics,values
0,mse_home,1.485994
1,mse_away,1.212352
2,r2_home,0.12223
3,r2_away,0.094871


In [166]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Suppose the target columns are 'home_goals' and 'away_goals'
Y_col_home = 'home_g'
Y_col_away = 'away_g'

X_col_sofifa =  ['home_overall', 'home_attack', 
                 'home_midfield', 'home_defence', 'home_transfer_budget', 'home_club_worth', 'home_defence_domestic_prestige', 'home_international_prestige', 
                 'home_players', 'home_starting_xi_average_age', 'home_whole_team_average_age', 'away_overall', 'away_attack', 'away_midfield', 'away_defence', 
                 'away_transfer_budget', 'away_club_worth', 'away_defence_domestic_prestige', 'away_international_prestige', 'away_players', 
                 'away_starting_xi_average_age', 'away_whole_team_average_age']

X_col_scores = [
       'home_team_number_of_match_played', 'away_team_number_of_match_played',
       'glicko2_home_before', 'glicko2_away_before', 'glicko2_rd_home_before',
       'glicko2_rd_away_before', 'glicko2_vol_home_before',
       'glicko2_vol_away_before', 'trueskill_home_before',
       'trueskill_away_before']

X_col = X_col_sofifa + X_col_scores

# Define the pipeline
pipeline_home = Pipeline([
        ('scaler', StandardScaler()),
        ('model', RandomForestClassifier())
    ])

pipeline_away = Pipeline([
        ('scaler', StandardScaler()),
        ('model', RandomForestClassifier())
    ])

split = train_test_split_expanding_windows(fbref_df_date_filtered_concat_no_nan__train, split=5, test_prop=0.2, date_col="date")

mse_home = []
mse_away = []
r2_home = []
r2_away = []

for train, test in split:
    # Fit the models
    pipeline_home.fit(train[X_col], train[Y_col_home])
    pipeline_away.fit(train[X_col], train[Y_col_away])

    # Predict goals
    test['pred_home_goals'] = pipeline_home.predict(test[X_col])
    test['pred_away_goals'] = pipeline_away.predict(test[X_col])

    # Calculate metrics
    mse_home.append(mean_squared_error(test[Y_col_home], test['pred_home_goals']))
    mse_away.append(mean_squared_error(test[Y_col_away], test['pred_away_goals']))
    r2_home.append(r2_score(test[Y_col_home], test['pred_home_goals']))
    r2_away.append(r2_score(test[Y_col_away], test['pred_away_goals']))

# Display metrics
pd.DataFrame({
    'metrics': ['mse_home', 'mse_away', 'r2_home', 'r2_away'],
    'values': [np.mean(mse_home), np.mean(mse_away), np.mean(r2_home), np.mean(r2_away)]
})


Unnamed: 0,metrics,values
0,mse_home,1.922222
1,mse_away,1.676503
2,r2_home,-0.164873
3,r2_away,-0.27917


In [62]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Suppose the target columns are 'home_goals' and 'away_goals'
Y_col_home = 'home_g'
Y_col_away = 'away_g'

X_col_sofifa =  ['home_overall', 'home_attack', 
                 'home_midfield', 'home_defence', 'home_transfer_budget', 'home_club_worth', 'home_defence_domestic_prestige', 'home_international_prestige', 
                 'home_players', 'home_starting_xi_average_age', 'home_whole_team_average_age', 'away_overall', 'away_attack', 'away_midfield', 'away_defence', 
                 'away_transfer_budget', 'away_club_worth', 'away_defence_domestic_prestige', 'away_international_prestige', 'away_players', 
                 'away_starting_xi_average_age', 'away_whole_team_average_age']

X_col_scores = [
       'home_team_number_of_match_played', 'away_team_number_of_match_played',
       'glicko2_home_before', 'glicko2_away_before', 'glicko2_rd_home_before',
       'glicko2_rd_away_before', 'glicko2_vol_home_before',
       'glicko2_vol_away_before', 'trueskill_home_before',
       'trueskill_away_before']

X_col = X_col_sofifa + X_col_scores

# Define the pipeline
pipeline_home = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LinearRegression())
    ])

pipeline_away = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LinearRegression())
    ])

split = train_test_split_expanding_windows(fbref_df_date_filtered_concat_no_nan__train, split=5, test_prop=0.2, date_col="date")

mse_home = []
mse_away = []
r2_home = []
r2_away = []

for train, test in split:
    # Fit the models
    pipeline_home.fit(train[X_col], train[Y_col_home])
    pipeline_away.fit(train[X_col], train[Y_col_away])

    # Predict goals
    test['pred_home_goals'] = pipeline_home.predict(test[X_col])
    test['pred_away_goals'] = pipeline_away.predict(test[X_col])

    # Calculate metrics
    mse_home.append(mean_squared_error(test[Y_col_home], test['pred_home_goals']))
    mse_away.append(mean_squared_error(test[Y_col_away], test['pred_away_goals']))
    r2_home.append(r2_score(test[Y_col_home], test['pred_home_goals']))
    r2_away.append(r2_score(test[Y_col_away], test['pred_away_goals']))

# Display metrics
pd.DataFrame({
    'metrics': ['mse_home', 'mse_away', 'r2_home', 'r2_away'],
    'values': [np.mean(mse_home), np.mean(mse_away), np.mean(r2_home), np.mean(r2_away)]
})


Unnamed: 0,metrics,values
0,mse_home,1.435947
1,mse_away,1.189226
2,r2_home,0.129387
3,r2_away,0.092697


In [63]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Suppose the target columns are 'home_goals' and 'away_goals'
Y_col_home = 'home_g'
Y_col_away = 'away_g'

X_col_sofifa =  ['home_overall', 'home_attack', 
                 'home_midfield', 'home_defence', 'home_transfer_budget', 'home_club_worth', 'home_defence_domestic_prestige', 'home_international_prestige', 
                 'home_players', 'home_starting_xi_average_age', 'home_whole_team_average_age', 'away_overall', 'away_attack', 'away_midfield', 'away_defence', 
                 'away_transfer_budget', 'away_club_worth', 'away_defence_domestic_prestige', 'away_international_prestige', 'away_players', 
                 'away_starting_xi_average_age', 'away_whole_team_average_age']

X_col_scores = [
       'glicko2_home_before', 'glicko2_away_before'
]

X_col = X_col_sofifa + X_col_scores

# Define the pipeline
pipeline_home = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LinearRegression())
    ])

pipeline_away = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LinearRegression())
    ])

split = train_test_split_expanding_windows(fbref_df_date_filtered_concat_no_nan__train, split=5, test_prop=0.2, date_col="date")

mse_home = []
mse_away = []
r2_home = []
r2_away = []

for train, test in split:
    # Fit the models
    pipeline_home.fit(train[X_col], train[Y_col_home])
    pipeline_away.fit(train[X_col], train[Y_col_away])

    # Predict goals
    test['pred_home_goals'] = pipeline_home.predict(test[X_col])
    test['pred_away_goals'] = pipeline_away.predict(test[X_col])

    # Calculate metrics
    mse_home.append(mean_squared_error(test[Y_col_home], test['pred_home_goals']))
    mse_away.append(mean_squared_error(test[Y_col_away], test['pred_away_goals']))
    r2_home.append(r2_score(test[Y_col_home], test['pred_home_goals']))
    r2_away.append(r2_score(test[Y_col_away], test['pred_away_goals']))

# Display metrics
pd.DataFrame({
    'metrics': ['mse_home', 'mse_away', 'r2_home', 'r2_away'],
    'values': [np.mean(mse_home), np.mean(mse_away), np.mean(r2_home), np.mean(r2_away)]
})


Unnamed: 0,metrics,values
0,mse_home,1.438988
1,mse_away,1.191142
2,r2_home,0.127564
3,r2_away,0.091075


In [64]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
#adaboost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Suppose the target columns are 'home_goals' and 'away_goals'
Y_col_home = 'home_g'
Y_col_away = 'away_g'

X_col_sofifa =  ['home_overall', 'home_attack', 
                 'home_midfield', 'home_defence', 'home_transfer_budget', 'home_club_worth', 'home_defence_domestic_prestige', 'home_international_prestige', 
                 'home_players', 'home_starting_xi_average_age', 'home_whole_team_average_age', 'away_overall', 'away_attack', 'away_midfield', 'away_defence', 
                 'away_transfer_budget', 'away_club_worth', 'away_defence_domestic_prestige', 'away_international_prestige', 'away_players', 
                 'away_starting_xi_average_age', 'away_whole_team_average_age']

X_col_scores = [
       'home_team_number_of_match_played', 'away_team_number_of_match_played',
       'glicko2_home_before', 'glicko2_away_before', 'glicko2_rd_home_before',
       'glicko2_rd_away_before', 'glicko2_vol_home_before',
       'glicko2_vol_away_before', 'trueskill_home_before',
       'trueskill_away_before']

X_col = X_col_sofifa + X_col_scores

# Define the pipeline
pipeline_home = Pipeline([
        ('scaler', StandardScaler()),
        ('model', AdaBoostClassifier())
    ])

pipeline_away = Pipeline([
        ('scaler', StandardScaler()),
        ('model', AdaBoostClassifier())
    ])

split = train_test_split_expanding_windows(fbref_df_date_filtered_concat_no_nan__train, split=5, test_prop=0.2, date_col="date")

mse_home = []
mse_away = []
r2_home = []
r2_away = []

for train, test in split:
    # Fit the models
    pipeline_home.fit(train[X_col], train[Y_col_home])
    pipeline_away.fit(train[X_col], train[Y_col_away])

    # Predict goals
    test['pred_home_goals'] = pipeline_home.predict(test[X_col])
    test['pred_away_goals'] = pipeline_away.predict(test[X_col])

    # Calculate metrics
    mse_home.append(mean_squared_error(test[Y_col_home], test['pred_home_goals']))
    mse_away.append(mean_squared_error(test[Y_col_away], test['pred_away_goals']))
    r2_home.append(r2_score(test[Y_col_home], test['pred_home_goals']))
    r2_away.append(r2_score(test[Y_col_away], test['pred_away_goals']))

# Display metrics
pd.DataFrame({
    'metrics': ['mse_home', 'mse_away', 'r2_home', 'r2_away'],
    'values': [np.mean(mse_home), np.mean(mse_away), np.mean(r2_home), np.mean(r2_away)]
})




KeyboardInterrupt: 

In [36]:
fbref_df_date_filtered_concat_no_nan__infer["pred_home_goals"] = pipeline_home.predict(fbref_df_date_filtered_concat_no_nan__infer[X_col])
fbref_df_date_filtered_concat_no_nan__infer["pred_away_goals"] = pipeline_away.predict(fbref_df_date_filtered_concat_no_nan__infer[X_col])

fbref_df_date_filtered_concat_no_nan__infer[['date', 'home_team', 'away_team', 'proba_home_win', 'proba_draw', 'proba_away_win', 'pred', 'pred_home_goals', 'pred_away_goals']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fbref_df_date_filtered_concat_no_nan__infer["pred_home_goals"] = pipeline_home.predict(fbref_df_date_filtered_concat_no_nan__infer[X_col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fbref_df_date_filtered_concat_no_nan__infer["pred_away_goals"] = pipeline_away.predict(fbref_df_date_filtered_concat_no_nan__infer[X_col])


Unnamed: 0,date,home_team,away_team,proba_home_win,proba_draw,proba_away_win,pred,pred_home_goals,pred_away_goals
31715,2024-06-14,Germany,Scotland,0.586792,0.238198,0.17501,1.0,1.677864,0.835086
31716,2024-06-15,Spain,Croatia,0.48298,0.244418,0.272602,1.0,1.467555,1.109838
31717,2024-06-15,Hungary,Switzerland,0.292353,0.282151,0.425496,-1.0,0.952899,1.321227
31718,2024-06-15,Italy,Albania,0.475029,0.242307,0.282665,1.0,1.383499,0.915947
31719,2024-06-16,Poland,Netherlands,0.226127,0.23276,0.541114,-1.0,0.903779,1.602924
31720,2024-06-16,Serbia,England,0.245067,0.216695,0.538238,-1.0,0.993241,1.651812
31721,2024-06-16,Slovenia,Denmark,0.280203,0.282893,0.436904,-1.0,0.830808,1.287112
31722,2024-06-17,Belgium,Slovakia,0.444244,0.232932,0.322824,1.0,1.440901,1.075845
31723,2024-06-17,Austria,France,0.253983,0.241335,0.504681,-1.0,0.92529,1.523602
31724,2024-06-17,Romania,Ukraine,0.272765,0.264514,0.462721,-1.0,0.867528,1.414744


In [68]:
fbref_df_date_filtered_concat_no_nan__infer.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28 entries, 31715 to 31750
Data columns (total 99 columns):
 #   Column                                           Non-Null Count  Dtype         
---  ------                                           --------------  -----         
 0   game_id                                          10 non-null     object        
 1   league                                           28 non-null     object        
 2   season                                           28 non-null     object        
 3   game                                             28 non-null     object        
 4   round                                            28 non-null     object        
 5   week                                             28 non-null     float64       
 6   day                                              28 non-null     object        
 7   date                                             28 non-null     datetime64[ns]
 8   time                                    