In [64]:
import soccerdata as sd
import pandas as pd
import sklearn as sk
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, deque 
import requests
from bs4 import BeautifulSoup

from sklearn.preprocessing import StandardScaler

# SoccerData Scraper

In [65]:
# Create a scraper class instance for  for the 2020/21 & 2021/22
mh = sd.MatchHistory(leagues=['GER-Bundesliga'], seasons=['2020-2021'])
mh2= sd.MatchHistory(leagues=['GER-Bundesliga'], seasons=['2021-2022'])
fbref1 = sd.FBref(leagues=['GER-Bundesliga'], seasons=['2020-2021'])
# Get match data of each team
match_stats= mh.read_games()
test_data = mh2.read_games()
additional_train = fbref1.read_team_match_stats()
additional_test = pd.read_csv('/Users/kevinwildprett/Desktop/Hochschule/5. Semester/Projektarbeit/Neuer Ordner/additional_train.csv', sep=';') ## !!!!!!

# Beautiful Soup Scraper

### Get team value for train data

In [66]:
url= 'https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=2020'

page = requests.get(url)

# Header imitates browser request
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}

# HTTP-Request send
response = requests.get(url, headers=headers)

soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find(id= 'yw1')
teams = table.find_all('td', {'class': 'hauptlink no-border-links'})
all_teams = [team.text.replace('\xa0', ' ').strip() for team in teams]

# Find  <tbody> 
tbody = table.find('tbody')

# Find <td>-elements with class "rechts" in <tbody> 
if tbody:
    gesamtwert = tbody.find_all('td', {'class': 'rechts'})
all_values = [gesamtwert.text for i, gesamtwert in enumerate(gesamtwert) if i % 2 != 0]

df_teams = pd.DataFrame({
    'team': all_teams,
    'total_values': all_values
})

### Get team value for test data

In [67]:
url= 'https://www.transfermarkt.de/bundesliga/startseite/wettbewerb/L1/saison_id/2021'

page = requests.get(url)


headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}


response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find(id= 'yw1')
teams = table.find_all('td', {'class': 'hauptlink no-border-links'})
all_teams = [team.text.replace('\xa0', ' ').strip() for team in teams]


tbody = table.find('tbody')


if tbody:
    gesamtwert = tbody.find_all('td', {'class': 'rechts'})
all_values = [gesamtwert.text for i, gesamtwert in enumerate(gesamtwert) if i % 2 != 0]

df_teams2 = pd.DataFrame({
    'team': all_teams,
    'total_values': all_values
})

# Data Preperation

In [68]:
# Map the names to the ones in the main frame
team_mapping = {
    'FC Bayern München': 'Bayern Munich',
    'Borussia Dortmund': 'Dortmund',
    'RB Leipzig': 'RB Leipzig',
    'Bayer 04 Leverkusen': 'Leverkusen',
    'Borussia Mönchengladbach': "M\'gladbach",
    'Eintracht Frankfurt': 'Ein Frankfurt',
    'VfL Wolfsburg': 'Wolfsburg',
    'Hertha BSC': 'Hertha',
    'TSG 1899 Hoffenheim': 'Hoffenheim',
    'VfB Stuttgart': 'Stuttgart',
    '1.FSV Mainz 05': 'Mainz',
    'SC Freiburg': 'Freiburg',
    'FC Schalke 04': 'Schalke 04',
    '1.FC Köln': 'FC Koln',
    'SV Werder Bremen': 'Werder Bremen',
    'FC Augsburg': 'Augsburg',
    '1.FC Union Berlin': 'Union Berlin',
    'Arminia Bielefeld': 'Bielefeld',
    'SpVgg Greuther Fürth' : 'Greuther Furth',
    'VfL Bochum' : 'Bochum'
}


In [69]:
# Use mapping on team value dataframe
df_teams['team'] = df_teams['team'].replace(team_mapping)
df_teams2['team'] = df_teams2['team'].replace(team_mapping)

In [70]:
# Convert the 'total_values' column 
df_teams['total_values'] = df_teams['total_values'].str.replace(' Mio. €', '', regex=False).str.replace(',', '.').astype(float, errors='ignore')
df_teams2['total_values'] = df_teams2['total_values'].str.replace(' Mio. €', '', regex=False).str.replace(',', '.').astype(float, errors='ignore')


In [71]:
# Convert MultiIndex into regular one
match_stats = match_stats.reset_index()
match_stats.reset_index(drop=True, inplace=True)

test_data = test_data.reset_index()
test_data.reset_index(drop=True, inplace=True)


### Add team value to train data

In [72]:
# Merge für die home_market_value-Spalte
match_stats = pd.merge(match_stats,  df_teams[['team', 'total_values']], left_on='home_team', right_on='team').rename(columns={'total_values': 'home_value'})
match_stats = match_stats.drop(columns=['team'])

In [73]:
# Merge für die away_market_value-Spalte
match_stats = pd.merge(match_stats,  df_teams[['team', 'total_values']], left_on='away_team', right_on='team').rename(columns={'total_values': 'away_value'})
match_stats = match_stats.drop(columns=['team'])

### Add team value to test data

In [74]:
# Merge für die home_market_value-Spalte
test_data = pd.merge(test_data,  df_teams2[['team', 'total_values']], left_on='home_team', right_on='team').rename(columns={'total_values': 'home_value'})
test_data = test_data.drop(columns=['team'])

In [75]:
# Merge für die away_market_value-Spalte
test_data = pd.merge(test_data,  df_teams2[['team', 'total_values']], left_on='away_team', right_on='team').rename(columns={'total_values': 'away_value'})
test_data = test_data.drop(columns=['team'])

In [76]:
match_stats = match_stats.sort_values(by='date').reset_index(drop=True)
test_data = test_data.sort_values(by='date').reset_index(drop=True)

### Additional dataframe (Fbref) preperation

In [77]:
#Fill nan value with value above
additional_test['team'] = additional_test['team'].fillna(method='ffill')

# Fill the xGoals with 0
additional_test['xG'] = additional_test['xG'].fillna(0)
additional_test['xGA'] = additional_test['xGA'].fillna(0)

additional_train['xG'] = additional_train['xG'].fillna(0)
additional_train['xGA'] = additional_train['xGA'].fillna(0)

additional_test['xG'] = additional_test['xG'].str.replace(',', '.', regex=False)
additional_test['xGA'] = additional_test['xGA'].str.replace(',', '.', regex=False)



In [78]:
# Convert MultiIndex into regular one
additional_test = additional_test.reset_index()
additional_test.reset_index(drop=False, inplace=True)

additional_train = additional_train.reset_index()
additional_train.reset_index(drop=False, inplace=True)

In [79]:
# Clean up additional data for merging
additional_train = additional_train.sort_values(by='date')
additional_test = additional_test.sort_values(by='date')
# Keep olny matchdays rows
additional_train = additional_train[additional_train['round'].str.startswith('Matchweek', na=False)]
additional_test = additional_test[additional_test['round'].str.startswith('Matchweek', na=False)]

In [80]:
# Map the additional dataframe onto the main ones
team_mapping = {
    'Arminia': 'Bielefeld',
    'Augsburg': 'Augsburg',
    'Bayern Munich': 'Bayern Munich',
    'Dortmund': 'Dortmund',
    'Eint Frankfurt': 'Ein Frankfurt',
    'Freiburg': 'Freiburg',
    'Gladbach': "M\'gladbach",
    'Hertha BSC': 'Hertha',
    'Hoffenheim': 'Hoffenheim',
    'Köln': 'FC Koln',
    'Leverkusen': 'Leverkusen',
    'Mainz 05': 'Mainz',
    'RB Leipzig': 'RB Leipzig',
    'Schalke 04': 'Schalke 04',
    'Stuttgart': 'Stuttgart',
    'Union Berlin': 'Union Berlin',
    'Werder Bremen': 'Werder Bremen',
    'Wolfsburg': 'Wolfsburg',
    'Greuther Fürth': 'Greuther Furth' 
    
}

In [81]:
columns_to_replace = ['team', 'opponent']

# Use mapping
additional_train[columns_to_replace] = additional_train[columns_to_replace].replace(team_mapping)
additional_test[columns_to_replace] = additional_test[columns_to_replace].replace(team_mapping)

### Merge expected goals to train dataset 

In [82]:
# For home_team column
match_stats = pd.merge(
    match_stats,
    additional_train[['team', 'opponent', 'xG']],
    left_on=['home_team', 'away_team'],
    right_on=['team', 'opponent'],
    how='left'
   
)
match_stats = match_stats.drop(columns=['team', 'opponent'])

In [83]:
#match_stats = match_stats.drop_duplicates(subset=['home_team', 'away_team'])
#print(match_stats.duplicated(subset=['home_team', 'away_team']).sum())

In [84]:
#For away_team

match_stats = pd.merge(
    match_stats,
    additional_train[['team', 'opponent', 'xGA']],
    left_on=['home_team', 'away_team'],  
    right_on=['team', 'opponent'],      
    how='left'
)
match_stats = match_stats.drop(columns=['team', 'opponent'])

In [85]:
#match_stats = match_stats.drop_duplicates(subset=['home_team', 'away_team'])
#print(match_stats.duplicated(subset=['home_team', 'away_team']).sum())

In [86]:
#match_stats.columns.values

### Merge expected goals to test data 

In [87]:
#For home_team
test_data = pd.merge(
    test_data,
    additional_test[['team', 'opponent', 'xG']],
    left_on=['home_team', 'away_team'],
    right_on=['team', 'opponent'],
    how='left'
)

test_data = test_data.drop(columns=['team', 'opponent'])

In [88]:
#For away_team

test_data = pd.merge(
    test_data,
    additional_test[['team', 'opponent', 'xGA']],
    left_on=['home_team', 'away_team'],  
    right_on=['team', 'opponent'],      
    how='left'
)
test_data = test_data.drop(columns=['team', 'opponent'])

In [89]:
#test_data = test_data.drop_duplicates(subset=['home_team', 'away_team'])
#print(test_data.duplicated(subset=['home_team', 'away_team']).sum())

In [90]:
#test_data.columns.values

# Feature-Engineering

Matchdays (possitive impact)

In [91]:
# Function to calculate matchdays for each team
def update_matchday(row,team_matchday_counts):
    home_team = row['home_team']
    away_team = row['away_team']
    
    # Increment matchday count for home and away teams
    team_matchday_counts[home_team] += 1
    team_matchday_counts[away_team] += 1
    
    # Assign matchday counts to the row
    row['home_team_matchday'] = team_matchday_counts[home_team]
    row['away_team_matchday'] = team_matchday_counts[away_team]
    
    return row
# Apply the function to the first DataFrame with a separate dictionary
team_matchday_counts_df1 = defaultdict(int)
match_stats = match_stats.apply(lambda row: update_matchday(row, team_matchday_counts_df1), axis=1)

# Apply the function to the second DataFrame with a new dictionary
team_matchday_counts_df2 = defaultdict(int)
test_data = test_data.apply(lambda row: update_matchday(row, team_matchday_counts_df2), axis=1)

Cumulative Points (possitive impact)

In [92]:
# Function to update cumulative points for each team
def update_cumulative_points(row, team_points):
    home_team = row['home_team']
    away_team = row['away_team']
    home_matchday = row['home_team_matchday']
    away_matchday = row['away_team_matchday']
    
    # Initialize cumulative points as 0 for the first matchday
    if home_matchday == 1:
        row['home_cumulative_points'] = 0
    else:
        row['home_cumulative_points'] = team_points[home_team]
    
    if away_matchday == 1:
        row['away_cumulative_points'] = 0
    else:
        row['away_cumulative_points'] = team_points[away_team]
    
    # Update points based on match result (add to history after setting current cumulative points)
    if row['FTR'] == 'H':  # Home team wins
        team_points[home_team] += 3  # 3 points for home team
        team_points[away_team] += 0  # 0 points for away team
    elif row['FTR'] == 'A':  # Away team wins
        team_points[home_team] += 0  # 0 points for home team
        team_points[away_team] += 3  # 3 points for away team
    else:  # Draw
        team_points[home_team] += 1  # 1 point for home team
        team_points[away_team] += 1  # 1 point for away team
    
    return row

In [93]:
# Apply the function to the first DataFrame with a separate dictionary
team_points_df1 = defaultdict(int)
match_stats = match_stats.apply(lambda row: update_cumulative_points(row, team_points_df1), axis=1)

# Apply the function to the second DataFrame with a new dictionary
team_points_df2 = defaultdict(int)
test_data = test_data.apply(lambda row: update_cumulative_points(row, team_points_df2), axis=1)

Wins in last 5 matchdays (negative impact)

In [94]:
# Function to update the last 5 match history and count wins
def update_last_5_wins(row, team_last_5_results):
    home_team = row['home_team']
    away_team = row['away_team']
    
    # Calculate the number of wins in the last 5 matches or fewer if not enough matches have been played
    row['home_last_5_wins'] = sum(team_last_5_results[home_team])
    row['away_last_5_wins'] = sum(team_last_5_results[away_team])
    
    # Add current match result to history (after calculation)
    if row['FTR'] == 'H':  # Home team wins
        team_last_5_results[home_team].append(1)
        team_last_5_results[away_team].append(0)
    elif row['FTR'] == 'A':  # Away team wins
        team_last_5_results[home_team].append(0)
        team_last_5_results[away_team].append(1)
    else:  # Draw
        team_last_5_results[home_team].append(0)
        team_last_5_results[away_team].append(0)
    
    return row



In [95]:
# Apply the function to the first DataFrame with a separate dictionary
team_last_5_results_df1 = defaultdict(lambda: deque(maxlen=5))
match_stats = match_stats.apply(lambda row: update_last_5_wins(row, team_last_5_results_df1), axis=1)

# Apply the function to the second DataFrame with a new dictionary
team_last_5_results_df2 = defaultdict(lambda: deque(maxlen=5))
test_data = test_data.apply(lambda row: update_last_5_wins(row, team_last_5_results_df2), axis=1)

Shots on Goal (negative impact)

In [96]:
# Function to update the number of shots on target for each team in the last 5 matches
def update_last_5_shots(row, team_last_5_shots):
    home_team = row['home_team']
    away_team = row['away_team']
    
    # Calculate the total shots on target in the last 5 matches or fewer if not enough matches have been played
    row['home_last_5_shots'] = sum(team_last_5_shots[home_team])
    row['away_last_5_shots'] = sum(team_last_5_shots[away_team])
    
    # Add current match shots on target to history
    team_last_5_shots[home_team].append(row['HST'])  # Home shots on target
    team_last_5_shots[away_team].append(row['AST'])  # Away shots on target
    
    return row

In [97]:
# Apply the function to the first DataFrame with a separate dictionary
team_last_5_shots_df1 = defaultdict(lambda: deque(maxlen=5))
match_stats = match_stats.apply(lambda row: update_last_5_shots(row, team_last_5_shots_df1), axis=1)

# Apply the function to the second DataFrame with a new dictionary
team_last_5_shots_df2 = defaultdict(lambda: deque(maxlen=5))
test_data = test_data.apply(lambda row: update_last_5_shots(row, team_last_5_shots_df2), axis=1)

Goaldifference (possitive impact)

In [98]:
# Function to update the goal difference for each team in the last 5 matches
def update_last_5_goal_diff(row, team_last_5_goal_diff):
    home_team = row['home_team']
    away_team = row['away_team']
    
    # Calculate the current match goal difference
    home_goal_diff = row['FTHG'] - row['FTAG']
    away_goal_diff = -home_goal_diff  # Invert for the away team

    # Calculate the total goal difference in the last 5 matches or fewer if not enough matches have been played
    row['home_last_5_goal_diff'] = sum(team_last_5_goal_diff[home_team])
    row['away_last_5_goal_diff'] = sum(team_last_5_goal_diff[away_team])
    
    # Add current match goal difference to history
    team_last_5_goal_diff[home_team].append(home_goal_diff)
    team_last_5_goal_diff[away_team].append(away_goal_diff)
    
    return row

In [99]:
# Apply the function to the first DataFrame with a separate dictionary
team_last_5_goal_diff_df1 = defaultdict(lambda: deque(maxlen=5))
match_stats = match_stats.apply(lambda row: update_last_5_goal_diff(row, team_last_5_goal_diff_df1), axis=1)

# Apply the function to the second DataFrame with a new dictionary
team_last_5_goal_diff_df2 = defaultdict(lambda: deque(maxlen=5))
test_data = test_data.apply(lambda row: update_last_5_goal_diff(row, team_last_5_goal_diff_df2), axis=1)

Amount halftime goals (possitive impact)

In [100]:
# Function to calculate and update the sum of half-time goals for the last 5 matches for each team
def calculate_last_5_half_time_goals(row, team_last_5_half_time_goals):
    home_team = row['home_team']
    away_team = row['away_team']
    
    # Calculate the sum of half-time goals in the last 5 matches or fewer if not enough matches have been played
    row['home_last_5_half_time_goals'] = sum(team_last_5_half_time_goals[home_team])
    row['away_last_5_half_time_goals'] = sum(team_last_5_half_time_goals[away_team])
    
    # Add the current match half-time goals to history
    team_last_5_half_time_goals[home_team].append(row['HTHG'])  # Home half-time goals
    team_last_5_half_time_goals[away_team].append(row['HTAG'])  # Away half-time goals
    
    return row

In [101]:
# Apply the function to the first DataFrame with a separate dictionary
team_last_5_half_time_goals_df1 = defaultdict(lambda: deque(maxlen=5))
match_stats = match_stats.apply(lambda row: calculate_last_5_half_time_goals(row, team_last_5_half_time_goals_df1), axis=1)

# Apply the function to the second DataFrame with a new dictionary
team_last_5_half_time_goals_df2 = defaultdict(lambda: deque(maxlen=5))
test_data = test_data.apply(lambda row: calculate_last_5_half_time_goals(row, team_last_5_half_time_goals_df2), axis=1)


Corner shots (possitive impact)

In [102]:
# Function to calculate and update the sum of shot corners for the last 5 matches for each team
def calculate_last_5_corners_shot(row, team_last_5_corners_shot):
    home_team = row['home_team']
    away_team = row['away_team']
    
    # Calculate the sum of Corners in the last 5 matches or fewer if not enough matches have been played
    row['home_last_5_half_corners_shot'] = sum(team_last_5_corners_shot[home_team])
    row['away_last_5_half_corners_shot'] = sum(team_last_5_corners_shot[away_team])
    
    # Add the current match corners to history
    team_last_5_corners_shot[home_team].append(row['HC'])  # Home Corners
    team_last_5_corners_shot[away_team].append(row['AC'])  # Away Corners
    
    return row


In [103]:
# Apply the function to the first DataFrame with a separate dictionary
team_last_5_corners_shot_df1 = defaultdict(lambda: deque(maxlen=5))
match_stats = match_stats.apply(lambda row: calculate_last_5_corners_shot(row, team_last_5_corners_shot_df1), axis=1)

# Apply the function to the second DataFrame with a new dictionary
team_last_5_corners_shot_df2 = defaultdict(lambda: deque(maxlen=5))
test_data = test_data.apply(lambda row: calculate_last_5_corners_shot(row, team_last_5_corners_shot_df2), axis=1)

ELO-Ratings (possitive impact)

In [104]:
# Dictionary to keep track of current Elo ratings for each team
# Reset Elo ratings and ensure initial value is 1500 for all teams
# Re-initialize Elo calculation to ensure correct data
team_elo = defaultdict(lambda: 1500)
K=32

# Lists to store Elo ratings for current matchdays
current_home_elo_list = []
current_away_elo_list = []

# Iterate over each match to calculate Elo ratings correctly
for index, row in match_stats.iterrows():
    home_team = row['home_team']
    away_team = row['away_team']

    # Get the current Elo ratings from the dictionary
    current_home_elo = team_elo[home_team]
    current_away_elo = team_elo[away_team]

    # Append the current Elo ratings to the lists
    current_home_elo_list.append(current_home_elo)
    current_away_elo_list.append(current_away_elo)

    # Calculate expected outcomes
    expected_home = 1 / (1 + 10 ** ((current_away_elo - current_home_elo) / 400))
    expected_away = 1 / (1 + 10 ** ((current_home_elo - current_away_elo) / 400))

    # Determine actual match outcome
    if row['FTHG'] > row['FTAG']:  # Home win
        actual_home = 1
        actual_away = 0
    elif row['FTHG'] < row['FTAG']:  # Away win
        actual_home = 0
        actual_away = 1
    else:  # Draw
        actual_home = 0.5
        actual_away = 0.5

    # Update Elo ratings using the K factor
    new_home_elo = current_home_elo + K * (actual_home - expected_home)
    new_away_elo = current_away_elo + K * (actual_away - expected_away)

    # Update the dictionary with the new Elo ratings for the next match
    team_elo[home_team] = new_home_elo
    team_elo[away_team] = new_away_elo

# Add the current Elo ratings to the DataFrame
match_stats['current_home_elo'] = current_home_elo_list
match_stats['current_away_elo'] = current_away_elo_list

  match_stats['current_home_elo'] = current_home_elo_list
  match_stats['current_away_elo'] = current_away_elo_list


In [105]:
# 4 test data
# Dictionary to keep track of current Elo ratings for each team
# Reset Elo ratings and ensure initial value is 1500 for all teams
# Re-initialize Elo calculation to ensure correct data
team_elo = defaultdict(lambda: 1500)
K=32

# Lists to store Elo ratings for current matchdays
current_home_elo_list = []
current_away_elo_list = []

# Iterate over each match to calculate Elo ratings correctly
for index, row in test_data.iterrows():
    home_team = row['home_team']
    away_team = row['away_team']

    # Get the current Elo ratings from the dictionary
    current_home_elo = team_elo[home_team]
    current_away_elo = team_elo[away_team]

    # Append the current Elo ratings to the lists
    current_home_elo_list.append(current_home_elo)
    current_away_elo_list.append(current_away_elo)

    # Calculate expected outcomes
    expected_home = 1 / (1 + 10 ** ((current_away_elo - current_home_elo) / 400))
    expected_away = 1 / (1 + 10 ** ((current_home_elo - current_away_elo) / 400))

    # Determine actual match outcome
    if row['FTHG'] > row['FTAG']:  # Home win
        actual_home = 1
        actual_away = 0
    elif row['FTHG'] < row['FTAG']:  # Away win
        actual_home = 0
        actual_away = 1
    else:  # Draw
        actual_home = 0.5
        actual_away = 0.5

    # Update Elo ratings using the K factor
    new_home_elo = current_home_elo + K * (actual_home - expected_home)
    new_away_elo = current_away_elo + K * (actual_away - expected_away)

    # Update the dictionary with the new Elo ratings for the next match
    team_elo[home_team] = new_home_elo
    team_elo[away_team] = new_away_elo

# Add the current Elo ratings to the DataFrame
test_data['current_home_elo'] = current_home_elo_list
test_data['current_away_elo'] = current_away_elo_list

  test_data['current_home_elo'] = current_home_elo_list
  test_data['current_away_elo'] = current_away_elo_list


streak (possitive impact)

In [106]:
# Function to calculate and update the winning streak for each team
def calculate_streak(row, team_streaks):
    home_team = row['home_team']
    away_team = row['away_team']
    
    # Store the current winning streak before updating it, so it appears for the next matchday
    row['home_team_streak'] = team_streaks[home_team]
    row['away_team_streak'] = team_streaks[away_team]
    
    # Update the winning streak based on the result of the current matchday
    if row['FTR'] == 'H':  # Home team wins
        team_streaks[home_team] += 1  # Increment home team winning streak
        team_streaks[away_team] = 0   # Reset away team streak
    elif row['FTR'] == 'A':  # Away team wins
        team_streaks[away_team] += 1  # Increment away team winning streak
        team_streaks[home_team] = 0   # Reset home team streak
    else:  # Draw
        team_streaks[home_team] = 0   # Reset both streaks
        team_streaks[away_team] = 0

    return row

In [107]:
# Create a dictionary to track the winning streaks for each team
team_streaks_df1 = defaultdict(int)

# Use the function on the first DataFrame 
match_stats = match_stats.apply(lambda row: calculate_streak(row, team_streaks_df1), axis=1)

# Create for the second DataFrame a new Dictionary 
team_streaks_df2 = defaultdict(int)
test_data = test_data.apply(lambda row: calculate_streak(row, team_streaks_df2), axis=1)

goals last 5 matches (negative impact)

In [108]:
# Function to calculate the total goals scored in the last 5 matches
def calculate_last_5_goals(row, team_goals_history):
    home_team = row['home_team']
    away_team = row['away_team']
    
    # Calculate the total goals for the last 5 matches
    row['home_last_5_goals'] = sum(team_goals_history[home_team])
    row['away_last_5_goals'] = sum(team_goals_history[away_team])
    
    # Update the goals history for the current match
    team_goals_history[home_team].append(row['FTHG'])  # Full Time Home Goals
    team_goals_history[away_team].append(row['FTAG'])  # Full Time Away Goals
    
    return row


In [109]:
# Create a dictionary to track the last 5 goals for each team for the first DataFrame
team_goals_history_df1 = defaultdict(lambda: deque(maxlen=5))

# Apply the function to the first DataFrame
match_stats = match_stats.apply(lambda row: calculate_last_5_goals(row, team_goals_history_df1), axis=1)

# For the second DataFrame, use a new dictionary
team_goals_history_df2 = defaultdict(lambda: deque(maxlen=5))
test_data = test_data.apply(lambda row: calculate_last_5_goals(row, team_goals_history_df2), axis=1)

average xG (possitive impact)

In [110]:
# Important, because of csv import of expected goals 2021/22
test_data['xG'] = pd.to_numeric(test_data['xG'], errors='coerce')
test_data['xGA'] = pd.to_numeric(test_data['xGA'], errors='coerce')

In [111]:
def calculate_avg_xg_xga(row, team_stats):
    # Hometeam Data
    home_team = row['home_team']
    home_matchday = row['home_team_matchday']
    
    # Awayteam Data
    away_team = row['away_team']
    away_matchday = row['away_team_matchday']
    
    # Initialise xG & matche
    if home_team not in team_stats:
        team_stats[home_team] = {'xG_total': 0, 'matches': 0}
    if away_team not in team_stats:
        team_stats[away_team] = {'xG_total': 0, 'matches': 0}
    
    # Average xG for the Hometeam 
    if home_matchday > 1:
        row['home_avg_xG'] = team_stats[home_team]['xG_total'] / team_stats[home_team]['matches']
    else:
        row['home_avg_xG'] = 0  # Initalise 1. Day

    # Average xGA for Awayteam  
    if away_matchday > 1:
        row['away_avg_xG'] = team_stats[away_team]['xG_total'] / team_stats[away_team]['matches']
    else:
        row['away_avg_xG'] = 0  # Initalise 1. Day
    
    # After Calculation: Add Game to Teamstatistic 
    team_stats[home_team]['xG_total'] += row['xG']
    team_stats[home_team]['matches'] += 1
    
    team_stats[away_team]['xG_total'] += row['xGA']
    team_stats[away_team]['matches'] += 1
    
    return row

# Initalize Team-Statistic
team_stats = defaultdict(lambda: {'xG_sum': 0, 'matches': 0})
team_stats2 = defaultdict(lambda: {'xG_sum': 0, 'matches': 0})
# Use the function on the DataFrame
match_stats = match_stats.apply(lambda row: calculate_avg_xg_xga(row, team_stats), axis=1)
test_data = test_data.apply(lambda row: calculate_avg_xg_xga(row, team_stats2), axis=1)


select Dataframes for Model 

In [112]:
 prediction_df = match_stats[['current_home_elo', 'current_away_elo', 'FTR',
                            'home_last_5_half_corners_shot', 'away_last_5_half_corners_shot', 
                             'home_value', 'away_value','home_last_5_goal_diff', 'away_last_5_goal_diff',
                             'home_cumulative_points', 'away_cumulative_points',
                            'home_last_5_half_time_goals', 'away_last_5_half_time_goals',
                            'home_team_matchday','away_team_matchday',  'home_team_streak', 'away_team_streak',
                              'home_avg_xG', 'away_avg_xG'
                             ]]


In [113]:
test_df = test_data[[ 'current_home_elo', 'current_away_elo', 'FTR', 
                     'home_last_5_half_corners_shot', 'away_last_5_half_corners_shot', 
                     'home_value', 'away_value', 'home_last_5_goal_diff', 'away_last_5_goal_diff',   
                     'home_cumulative_points', 'away_cumulative_points', 
                    'home_last_5_half_time_goals', 'away_last_5_half_time_goals',
                    'home_team_matchday','away_team_matchday',  'home_team_streak', 'away_team_streak',
                     'home_avg_xG', 'away_avg_xG', 
                     
                    ]]

In [114]:
# Scale Data
le = LabelEncoder()
scaler = StandardScaler()
# Test/ Train split
y_train = le.fit_transform(prediction_df['FTR'])
X_train = prediction_df.drop(columns=['FTR'])   
y_test = le.fit_transform(test_df['FTR'])
X_test = test_df.drop(columns=['FTR']) 

# Create Korrelationsmatrix 
correlation_matrix = X_train.corr()

# Clone data for boxplot
X_train_box = X_train

# Skale data
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)


label_mapping = dict(zip(le.transform(le.classes_), le.classes_))
print("Label mapping:", label_mapping)

Label mapping: {0: 'A', 1: 'D', 2: 'H'}


## Implementing Random Forest + Training

In [115]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    max_depth=10,
    class_weight='balanced'
)

# Train the model on the initial training data
rf_model.fit(X_train, y_train)


# Calculate initial accuracy on the test data without modifying it
initial_y_pred = rf_model.predict(X_test)
initial_accuracy = accuracy_score(y_test, initial_y_pred)
print(f"Initial Accuracy on Test Data: {initial_accuracy:.2f}")

gameday = 1
accuracies = []  # List to store accuracy for each matchday
all_game_data = []  # List to store odds and team data for each matchday

# Iterative process
while X_test.size > 0:  # Check for NumPy array or Pandas DataFrame size
    # Extract the test data for the current matchday (9 matches)
    X_test_batch = X_test.iloc[:9]
    y_test_batch = y_test[:9]
    teams_batch = test_data.iloc[:9][['home_team', 'away_team']].reset_index(drop=True)

    # Train the model on the current training data
    rf_model.fit(X_train, y_train)
    
    # Make predictions and calculate probabilities (for Quotas)
    y_pred = rf_model.predict(X_test_batch)
    y_proba = rf_model.predict_proba(X_test_batch)

    # Calculate and store accuracy
    accuracy = accuracy_score(y_test_batch, y_pred)
    accuracies.append(accuracy)
    print(f"Gameday {gameday}: Accuracy on Testdata = {accuracy:.2f}")

    # Analyze each match of the current matchday
    for i, probas in enumerate(y_proba):
        home_team = teams_batch.iloc[i]['home_team']
        away_team = teams_batch.iloc[i]['away_team']

        # Calculate odds
        odds = {
            0: round(1 / probas[0], 2) if probas[0] > 0 else 'N/A',
            1: round(1 / probas[1], 2) if probas[1] > 0 else 'N/A',
            2: round(1 / probas[2], 2) if probas[2] > 0 else 'N/A'
        }

        # Output match information
        print(f"\nMatch {i + 1}: {home_team} vs {away_team}")
        print(f"Probabilities: Home Win ({probas[2]:.2f}), Draw ({probas[1]:.2f}), Away Win ({probas[0]:.2f})")
        print(f"Odds: Home ({odds[2]}), Draw ({odds[1]}), Away ({odds[0]})")

        # Save matchday data
        all_game_data.append({
            'Gameday': gameday,
            'HomeTeam': home_team,
            'AwayTeam': away_team,
            'Probabilities': probas,
            'Classes': rf_model.classes_,
            'HomeOdds': odds[2],
            'AwayOdds': odds[0],
            'DrawOdds': odds[1]
        })
  

    # Add test data to training data
    X_train = np.vstack([X_train, X_test_batch])
    y_train = np.concatenate([y_train, y_test_batch])

    # Remove the used test data
    X_test = X_test.iloc[9:]
    y_test = y_test[9:]
    test_data = test_data.iloc[9:]

    gameday += 1

# Calculate overall accuracy
overall_accuracy = np.mean(accuracies)
print(f"\nOverall Accuracy: {overall_accuracy}")




ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

## Evaluation ##

In [None]:
print("\nMatchday and Odds:")
for game in all_game_data:
    print(f"Matchday {game['Gameday']} {game['HomeTeam']} {game['HomeOdds']} : {game['AwayTeam']} {game['AwayOdds']} (Draw: {game['DrawOdds']})")

## Corelation Matrix of Features

In [None]:
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)


  
plt.title("Correlation Matrix")
plt.xticks(fontsize=15)  
plt.yticks(fontsize=15) 
plt.show()
plt.close()



## Accuracy over Matchdays

In [None]:
# Matchday axis (x-axis) and accuracy values (y-axis)
gamedays = list(range(1, len(accuracies) + 1))  # Matchdays (1, 2, ..., n)

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(gamedays, accuracies, marker='o', linestyle='-', linewidth=2)

# Add labels to the plot
plt.title('Accuracy Trend Over Matchdays', fontsize=16)
plt.xlabel('Matchday', fontsize=14)
plt.ylabel('Accuracy', fontsize=14)
plt.xticks(gamedays)  # Optional: Display each matchday as a tick
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()

# Display the plot
plt.show()

## Comparasion scaled vs unscaled data

In [None]:
df_scaled = pd.DataFrame(X_train, columns=list(X_train_box.columns))

In [None]:
df_current_elo = X_train_box[['current_home_elo']]
df_scaled_current_elo = df_scaled[['current_home_elo']]
# Prepare data (Unscaled)
df_unscaled_melted = df_current_elo.melt(var_name='Feature', value_name='Value')
df_unscaled_melted['Type'] = 'Unscaled'

# Prepare data (Scaled)
df_scaled_melted = df_scaled_current_elo.melt(var_name='Feature', value_name='Value')
df_scaled_melted['Type'] = 'Scaled'

# Combine data
df_combined = pd.concat([df_unscaled_melted, df_scaled_melted])

# Create boxplot
plt.figure(figsize=(8, 6))
sns.boxplot(x='Feature', y='Value', hue='Type', data=df_combined, palette='Set2')
plt.title("Comparison of Unscaled and Scaled Home Elo", fontsize=16)
plt.ylabel("Values", fontsize=14)
plt.xlabel("Feature", fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Print the results
print("Values Range for 'current_home_elo':")
print(f"Unscaled: Min = {df_current_elo['current_home_elo'].min()}, Max = {df_current_elo['current_home_elo'].max()}")
print(f"Scaled: Min = {df_scaled_current_elo['current_home_elo'].min():.2f}, Max = {df_scaled_current_elo['current_home_elo'].max():.2f}")