In [38]:
# import of the needed packages
import pandas as pd
import numpy as np
import re
import math
import csv

# variable for the file with the raw dara
file = 'cumulated_data_bl1_v3_with_mw_and_form.csv'

In [39]:
# function to load the raw data and add the necessary columns for our calculated features
def load_data(file):
    # load the raw data
    data = pd.read_csv(file, delimiter=',')
    # adding the columns for the calculated features
    data['HomeElo'] = 0
    data['AwayElo'] = 0
    data['HomeAttack'] = 0
    data['HomeDefend'] = 0
    data['AwayAttack'] = 0
    data['AwayDefend'] = 0
    data['HomeEloOld'] = 0
    data['AwayEloOld'] = 0
    data['HomeAttackOld'] = 0
    data['HomeDefendOld'] = 0
    data['AwayAttackOld'] = 0
    data['AwayDefendOld'] = 0
    data['DiffEloOld'] = 0
    data['DiffAttackOld'] = 0
    data['DiffDefendOld'] = 0
    data['DiffElo'] = 0
    data['DiffAttack'] = 0
    data['DiffDefend'] = 0
    data['QuotEloOld'] = 0
    data['QuotAttackOld'] = 0
    data['QuotDefendOld'] = 0
    data['QuotElo'] = 0
    data['QuotAttack'] = 0
    data['QuotDefend'] = 0
    
    return data

In [40]:
# function to get the names of all teams in the data frame
def get_all_teams(data):
    teams = []
    # loop to store the team names into a list
    for x in data['HomeTeam']:
        if x not in teams:
            teams.append(x)
            
    return teams

In [41]:
# function to create a dictionary to store the home/away elo and attack/defend values for each team
def create_elo_dictionary(teams):

    teams_elo_avstaerke = {}
    # loop for generating the dictionary to store the calculated values
    # setting standard values for home/away elo, attack/defend values to 
    # avoid problems with divering values for the teams that are promoted to the BL and would start with 0
    for x in teams:
        team_stats = {x: [1000, 1000, 1.34, -1.34]}
        teams_elo_avstaerke.update(team_stats)
        
    return teams_elo_avstaerke

In [42]:
# function to calculate the difference between the elos
def diffelo(homeelo, awayelo):
    diffelo = homeelo - awayelo
    
    return diffelo

In [43]:
# function to calucalate the elo gain for the teams in the match
def elogain_2(z, data, hometeam, awayteam, teams_elo_avstaerke):
    # loading the elo values for the home and the away team from the dictionary
    homeelo = teams_elo_avstaerke[hometeam][0]
    awayelo = teams_elo_avstaerke[awayteam][0]

    # getting the true outcome of the game from the data frame
    trueoutcome = outcometrue_2(z, data)

    # calculating the possible outcome and a margin of victory based on the elo values of the teams
    expectedoutcome = outcomeexpected_2(z, homeelo, awayelo)
    ofvictorymargin = marginofvictory_2(z, data, homeelo, awayelo)

    # calculating the elo gain for both teams and adding it to their previous value while taking the marginofvictory into account
    elogain = 20 * (trueoutcome - expectedoutcome) * ofvictorymargin
    elohome = homeelo + elogain
    eloaway = awayelo - elogain

    # updating the dictionary with the newly calculated elo values
    teams_elo_avstaerke[hometeam][0] = elohome
    teams_elo_avstaerke[awayteam][1] = eloaway

    return elohome, eloaway, homeelo, awayelo

In [44]:
# function for calculating the margin of victory
# the margin of victory is used to incorporate the difference of the teams
# as its easier for good teams to score against bad teams
# scoring against good teams is honored, scoring against bad teams gets downgraded 
def marginofvictory_2(z, data, homeelo, awayelo):
    # getting the elo and the goal difference between the teams
    elodiff = diffelo(homeelo, awayelo)
    diffgoal = data.FTHG[z] - data.FTAG[z]

    if diffgoal <= 1:
        marginofvictory = 1
    else:
        # calculating the margin of victory to honor scoring against good teams
        marginofvictory = (math.log2(1.7 * diffgoal) * 2) / (2 +
                                                             0.001 * elodiff)
    return marginofvictory

In [45]:
# function to get the outcome of the game and transferring it into numerical values
def outcometrue_2(z, data):
    if data.FTR[z] == "H":
        outcometrue = 1
    elif data.FTR[z] == "D":
        outcometrue = 0.5
    else:
        outcometrue = 0
    return outcometrue

In [46]:
# function to calculate a possible outcome for the game
def outcomeexpected_2(z, homeelo, awayelo):
    elodiff = diffelo(homeelo, awayelo)
    # calculating the game outcome based on the elo difference
    outcomeexpected = 1 / (1 + 10**(-elodiff / 400))
    return outcomeexpected

In [47]:
# function to calculate the attack value for the home team
def home_attack(z, data, alpha, ratio, hometeam, awayteam,
                teams_elo_avstaerke):
    # get the match goals and the old attack and defend values
    goalsshot = data.FTHG[z]
    attack_old = teams_elo_avstaerke[hometeam][2]
    otherteamdefend = teams_elo_avstaerke[awayteam][3]
    
    # calculate the new attack value
    homeattack = attack_old + ((goalsshot - attack_old) * ratio +
                               (goalsshot + otherteamdefend) *
                               (1 - ratio)) * alpha
    # update the attack value in the dictionary
    teams_elo_avstaerke[hometeam][2] = homeattack
    return homeattack, attack_old

In [48]:
# function to calculate the defend value for the home team
def home_defend(z, alpha, ratio, data, hometeam, awayteam,
                teams_elo_avstaerke):
    # get the match goals and the old attack and defend values
    goalsgot = data.FTAG[z]
    defend_old = teams_elo_avstaerke[hometeam][3]
    otherteamattack = teams_elo_avstaerke[awayteam][2]
    # calculate the new defend value
    homedefend = defend_old - ((goalsgot + defend_old) * ratio +
                               (goalsgot - otherteamattack) *
                               (1 - ratio)) * alpha
    # update the defend value in the dictionary
    teams_elo_avstaerke[hometeam][3] = homedefend
    return homedefend, defend_old

In [49]:
# function to calculate the attack value for the away team
def away_attack(z, alpha, ratio, data, hometeam, awayteam, teams_elo_avstaerke):
    # get the match goals and the old attack and defend values
    goalsshot = data.FTAG[z]
    attack_old = teams_elo_avstaerke[awayteam][2]
    otherteamdefend = teams_elo_avstaerke[hometeam][3]
    # calculate the new attack value
    awayattack = attack_old + ((goalsshot - attack_old) * ratio +
                               (goalsshot + otherteamdefend) *
                               (1 - ratio)) * alpha
    # update the attack value in the dictionary
    teams_elo_avstaerke[awayteam][2] = awayattack
    return awayattack, attack_old

In [50]:
# function to calculate the defend value for the away team
def away_defend(z, alpha, ratio, data, hometeam, awayteam, teams_elo_avstaerke):
    # get the match goals and the old attack and defend values
    goalsgot = data.FTHG[z]
    defend_old = teams_elo_avstaerke[awayteam][3]
    otherteamattack = teams_elo_avstaerke[hometeam][2]
    # calculate the new defend value
    awaydefend = defend_old - ((goalsgot + defend_old) * ratio +
                               (goalsgot - otherteamattack) *
                               (1 - ratio)) * alpha
    # update the attack value in the dictionary
    teams_elo_avstaerke[awayteam][3] = awaydefend
    return awaydefend, defend_old

In [51]:
def add_new_columns1(data):
    data['HomeElo'] = 0
    data['AwayElo'] = 0
    data['HomeAttack'] = 0
    data['HomeDefend'] = 0
    data['AwayAttack'] = 0
    data['AwayDefend'] = 0
    data['HomeEloOld'] = 0
    data['AwayEloOld'] = 0
    data['HomeAttackOld'] = 0
    data['HomeDefendOld'] = 0
    data['AwayAttackOld'] = 0
    data['AwayDefendOld'] = 0

In [52]:
def add_new_columns2(data):
    data['DiffEloOld'] = 0
    data['DiffAttackOld'] = 0
    data['DiffDefendOld'] = 0
    data['DiffElo'] = 0
    data['DiffAttack'] = 0
    data['DiffDefend'] = 0
    data['QuotEloOld'] = 0
    data['QuotAttackOld'] = 0
    data['QuotDefendOld'] = 0
    data['QuotElo'] = 0
    data['QuotAttack'] = 0
    data['QuotDefend'] = 0

In [53]:
# function to use all functions to calculate the elos and the attack/defend values for a game
def calculate_elo_attack_defense(data, teams_elo_avstaerke):
    z = 0
    # setting the values for alpha and ratio
    # explanation for what we use them!!!
    alpha = 0.5
    ratio = 0.75
    # loop to calculate the values for the whole data frame
    while z < len(data):
        # getting the home and away team for the game
        hometeam = data.HomeTeam[z]
        awayteam = data.AwayTeam[z]

        try:
            # calling the function to calculate the elo gains
            homeelo, awayelo, homeeloold, awayeloold = elogain_2(
            z, data, hometeam, awayteam, teams_elo_avstaerke)
            # calling the function to calculate the attack value for the home team
            homeattack1, homeattackold = home_attack(z, data, alpha, ratio,
                                                     hometeam, awayteam,
                                                     teams_elo_avstaerke)
            # calling the function to calculate the defend value for the home team
            homedefend1, homedefendold = home_defend(z, alpha, ratio, data,
                                                     hometeam, awayteam,
                                                     teams_elo_avstaerke)
            # calling the function to calculate the attack value for the away team
            awayattack1, awayattackold = away_attack(z, alpha, ratio, data,
                                                     hometeam, awayteam,
                                                     teams_elo_avstaerke)
            # calling the function to calculate the defend value for the away team
            awaydefend1, awaydefendold = away_defend(z, alpha, ratio, data,
                                                     hometeam, awayteam,
                                                     teams_elo_avstaerke)
        except:
            print(f"Can't find data for team: {hometeam, awayteam}")
            break
        # adding the calucalted values to the data frame
        data['HomeElo'][z] = homeelo
        data['AwayElo'][z] = awayelo
        data['HomeAttack'][z] = homeattack1
        data['HomeDefend'][z] = homedefend1
        data['AwayAttack'][z] = awayattack1
        data['AwayDefend'][z] = awaydefend1
        data['HomeEloOld'][z] = homeeloold
        data['AwayEloOld'][z] = awayeloold
        data['HomeAttackOld'][z] = homeattackold
        data['HomeDefendOld'][z] = homedefendold
        data['AwayAttackOld'][z] = awayattackold
        data['AwayDefendOld'][z] = awaydefendold
        z+=1
    return data
    # return homeelo, awayelo, homeeloold, awayeloold, homeattack1, homeattackold, homedefend1, homedefendold, awayattack1, awayattackold, awaydefend1, awaydefendold


In [54]:
def add_elo_attack_defense(homeelo, awayelo, homeeloold, awayeloold, homeattack1, homeattackold, homedefend1, homedefendold, awayattack1, awayattackold, awaydefend1, awaydefendold):
    data['HomeElo'][z] = homeelo
    data['AwayElo'][z] = awayelo
    data['HomeAttack'][z] = homeattack1
    data['HomeDefend'][z] = homedefend1
    data['AwayAttack'][z] = awayattack1
    data['AwayDefend'][z] = awaydefend1
    data['HomeEloOld'][z] = homeeloold
    data['AwayEloOld'][z] = awayeloold
    data['HomeAttackOld'][z] = homeattackold
    data['HomeDefendOld'][z] = homedefendold
    data['AwayAttackOld'][z] = awayattackold
    data['AwayDefendOld'][z] = awaydefendold


In [55]:
# function for adding the difference values for the elo, attack and defend values
def add_differences(data):
    data['DiffEloOld'] = data['HomeEloOld'] - data['AwayEloOld']
    data['DiffAttackOld'] = data['HomeAttackOld'] - data['AwayAttackOld']
    data['DiffDefendOld'] = data['HomeDefendOld'] - data['AwayDefendOld']
    data['DiffElo'] = data['HomeElo'] - data['AwayElo']
    data['DiffAttack'] = data['HomeAttack'] - data['AwayAttack']
    data['DiffDefend'] = data['HomeDefend'] - data['AwayDefend']
    return data

In [56]:
# function for adding the quotient values for the elo, attack and defend values
def add_qoutient(data):
    data['QuotEloOld'] = (data['HomeEloOld']) / (data['AwayEloOld'])
    data['QuotAttackOld'] = (data['HomeAttackOld']) / (data['AwayAttackOld'])
    data['QuotDefendOld'] = (data['HomeDefendOld']) / (data['AwayDefendOld'])
    data['QuotElo'] = (data['HomeElo']) / (data['AwayElo'])
    data['QuotAttack'] = (data['HomeAttack']) / (data['AwayAttack'])
    data['QuotDefend'] = (data['HomeDefend']) / (data['AwayDefend'])
    return data

In [57]:
# function to get the direct diffenrence betwenn the teams of a game
def direct_comparison(data):
    # get direct comparison of the two teams

    direct_comparison_hg = []
    direct_comparison_ag = []
    direct_comparison_hp = []
    direct_comparison_ap = []

    for counter, rows in data.iterrows():
        data_games_played_until_now = data.iloc[:counter]
        direct_comparison_teams_temp = data_games_played_until_now[(data_games_played_until_now.HomeTeam == rows.HomeTeam)&(data_games_played_until_now.AwayTeam == rows.AwayTeam)|
        (data_games_played_until_now.HomeTeam == rows.AwayTeam)&(data_games_played_until_now.AwayTeam == rows.HomeTeam)]
        if len(direct_comparison_teams_temp) >= 3:
            if len(direct_comparison_teams_temp) >= 5:
                # take only the last 5 matches
                direct_comparison_teams_temp = direct_comparison_teams_temp.iloc[-5:]
                #direct_comparison_teams_temp.iloc[-5:].value_counts(["HomeTeam", "FTAG"])
                number_of_games = 5
            else:
                number_of_games = len(direct_comparison_teams_temp)

            # get average number of goals, that the hometeam has scored in the last 5 direct matches
            home_goals = (direct_comparison_teams_temp[direct_comparison_teams_temp.HomeTeam == rows.HomeTeam].FTHG.sum() + \
                direct_comparison_teams_temp[direct_comparison_teams_temp.AwayTeam == rows.HomeTeam].FTAG.sum()) / number_of_games

            # get average number of goals, that the awayteam has scored in the last 5 direct matches
            away_goals = (direct_comparison_teams_temp[direct_comparison_teams_temp.HomeTeam == rows.AwayTeam].FTHG.sum() + \
                direct_comparison_teams_temp[direct_comparison_teams_temp.AwayTeam == rows.AwayTeam].FTAG.sum()) / number_of_games

        else:
            # if the direct comparison is not possible yet (because both teams haven't played against each other) the scored goals of the last 10 matches would be measured to avoid null values
            goals_scored_ht = data_games_played_until_now[(data_games_played_until_now.HomeTeam == rows.HomeTeam)|(data_games_played_until_now.AwayTeam == rows.HomeTeam)].iloc[-10:]
            goals_scored_at = data_games_played_until_now[(data_games_played_until_now.HomeTeam == rows.AwayTeam)|(data_games_played_until_now.AwayTeam == rows.AwayTeam)].iloc[-10:]

            if len(goals_scored_ht) >= 10:
                home_goals = (goals_scored_ht[goals_scored_ht.HomeTeam == rows.HomeTeam].FTHG.sum() + \
                    goals_scored_ht[goals_scored_ht.AwayTeam == rows.HomeTeam].FTAG.sum()) / 10
                away_goals = (goals_scored_at[goals_scored_at.HomeTeam == rows.AwayTeam].FTHG.sum() + \
                    goals_scored_at[goals_scored_at.AwayTeam == rows.AwayTeam].FTAG.sum()) / 10
            else:
                home_goals = None
                away_goals = None

        direct_comparison_hg.append(home_goals)
        direct_comparison_ag.append(away_goals)

    # adding the calculated values to the data frame
    data['DirectComparisonHG'] = direct_comparison_hg
    data['DirectComparisonAG'] = direct_comparison_ag
    data['DirectComparisonGoalDiff'] = data[
            'DirectComparisonHG'] - data['DirectComparisonAG']
    data['DirectComparisonGoalQuot'] = (data['DirectComparisonHG']) / (data[
    'DirectComparisonAG'])
    return data


In [58]:
# function to reduce the data frame to only the columns the model needs
def select_necessary_columns(data):
    data_reduced = data[[
        "Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "FTR", "HTHG", "HTAG", "HTR", "HS", "AS",
        "HST", "AST", "HF", "AF", "HC", "AC", "HY", "AY", "HR", "AR", "PointsLast3MatchesHT",
        "PointsLast3MatchesAT", "PointsLast5MatchesHT", "PointsLast5MatchesAT",
        "PointsLast10MatchesHT", "PointsLast10MatchesAT", "PointsLastAllMatchesHT",
        "PointsLastAllMatchesAT", "PDiff3Matches", "PQuot3Matches", "PDiff5Matches",
        "PQuot5Matches", "PDiff10Matches", "PQuot10Matches", "PDiffAllMatches",
        "PQuotAllMatches", "MarketValueHT", "MarketValueAT", "MarketValueDiff",
        "MarketValueQuot", "HomeElo", "AwayElo", "HomeAttack", "HomeDefend", "AwayAttack",
        "AwayDefend", "HomeEloOld", "AwayEloOld", "HomeAttackOld", "HomeDefendOld",
        "AwayAttackOld", "AwayDefendOld", "DiffEloOld", "DiffAttackOld", "DiffDefendOld",
        "DiffElo", "DiffAttack", "DiffDefend", "QuotEloOld", "QuotAttackOld",
        "QuotDefendOld", "QuotElo", "QuotAttack", "QuotDefend", "DirectComparisonHG",
        "DirectComparisonAG", "DirectComparisonGoalDiff", "DirectComparisonGoalQuot"
    ]]
    return data_reduced


In [59]:
# function to save the data frame to a csv file
def save_to_csv(data_reduced):
    data_reduced.to_csv("./Data/preprocessed_dataframe_with_elo_mw_form_3_v3.csv")

In [60]:
# function to run all functions with one click
def run_all(file):
    data = load_data(file)
    teams = get_all_teams(data)
    teams_elo_avstaerke = create_elo_dictionary(teams)
    data = calculate_elo_attack_defense(data, teams_elo_avstaerke)
    data = add_differences(data)
    data = add_qoutient(data)
    data = direct_comparison(data)
    data_reduced = select_necessary_columns(data)
    # functions for the points over the last games
    # function for the market value
    save_to_csv(data_reduced)
    return data_reduced


data = run_all(file)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['HomeElo'][z] = homeelo
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['AwayElo'][z] = awayelo
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['HomeAttack'][z] = homeattack1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['HomeDefend'][z] = homedefend1
A value is trying to be set on a copy 