# Model to calculate each team's strength

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from statistics import mean

In [3]:
# Historical international match results.
# Source (Kaggle): https://www.kaggle.com/datasets/martj42/international-football-results-from-1872-to-2017
# Columns used later in this notebook: date, home_team, away_team, home_score, away_score.
Historical_results = pd.read_csv("data/results.csv")

# FIFA World Cup 2022 teams and their groups.
# Schedule reference: https://digitalhub.fifa.com/m/6a616c6cf19bc57a/original/FWC-2022-Match-Schedule.pdf
# Columns used later in this notebook: Squad, Group, Group_Position.
Groups = pd.read_csv("data/World_Cup_Groups_temp.csv")

## Data Preprocessing
There are a few ways to preprocess the data to better fit the theory in Maher's model:
1. Only consider games between the 32 teams qualified for the 2022 World Cup
2. Only consider games in which a team plays an opponent with a high world ranking (Ghana is the lowest-ranked team, at No. 60)
3. Consider all games

In this model, we only considered games played since 2014 between the 32 qualified teams (the first option above). The code for the second option is also included, but commented out.

In [4]:
# First calendar year (inclusive) of the matches used to estimate team strength.
MODERN_ERA_START_YEAR = 2014

# Parse the YYYY-MM-DD date strings so the match year can be compared numerically.
Historical_results["date"] = pd.to_datetime(Historical_results["date"], format='%Y-%m-%d')

# Keep matches from the start year onward. reset_index returns a new frame;
# drop=True renumbers its rows from 0 without copying the old row labels into
# a stray "index" column.
Modern_era = Historical_results[
    Historical_results["date"].dt.year >= MODERN_ERA_START_YEAR
].reset_index(drop=True)

In [None]:
# Keep only matches in which BOTH sides appear in the World Cup squad list.
both_sides_qualified = (
    Modern_era['home_team'].isin(Groups["Squad"])
    & Modern_era['away_team'].isin(Groups["Squad"])
)
# drop=True renumbers the rows from 0 without adding the previous row labels
# as an extra column of the result.
Aggregate = Modern_era[both_sides_qualified].reset_index(drop=True)

# NOTE: despite its name, 'GF' holds the absolute goal margin |home - away| of each match.
Aggregate['GF'] = (Aggregate['home_score'] - Aggregate['away_score']).abs()
# Optional (disabled): exclude one-sided results with a margin of 5 or more goals.
#Aggregate.drop(Aggregate[Aggregate['GF']>=5].index, inplace=True)

In [6]:
# Disabled alternative (preprocessing option 2 in the text above): instead of
# filtering on the World Cup squads, keep matches where both sides are among
# the first 60 rows of data/ranking.csv. Un-commenting this replaces Aggregate.
#Ranking = pd.read_csv('data/ranking.csv')
#Ranking = Ranking.head(60)
#Aggregate = Modern_era[Modern_era['home_team'].isin(Ranking["Country"]) & Modern_era['away_team'].isin(Ranking["Country"])].copy().reset_index()

In [7]:
# Mean goals per match for the nominal home and away sides of the filtered matches.
home_goal_mean = Aggregate['home_score'].mean()
away_goal_mean = Aggregate['away_score'].mean()

# end="\n \n" emits the single-space separator line between the two figures.
print("Average home score: ", home_goal_mean, end="\n \n")
print("Average away score: ", away_goal_mean)

Average home score:  1.44
 
Average away score:  1.1447619047619049


## Calculating the attack and defense strength score for each team

Maher's model distinguishes between home and away games. The World Cup, however, is played on neutral ground for every team except the host nation. Therefore, we combined the home and away match outcomes in the calculation.

In [9]:
# League-wide reference values: mean goals per match scored by the nominal
# home side and by the nominal away side over all filtered matches.
overal_scored_home = Aggregate['home_score'].mean()
overal_scored_away = Aggregate['away_score'].mean()

# What the home side concedes is exactly what the away side scores, and vice versa.
overal_conceded_home, overal_conceded_away = overal_scored_away, overal_scored_home

# Venue-neutral reference: the midpoint of the home and away scoring means.
overal_avg = (overal_scored_home + overal_scored_away) / 2

# Persist the three reference values, one per line, without header or index.
reference_values = [overal_scored_home, overal_scored_away, overal_avg]
Output_overall = pd.DataFrame(reference_values)
Output_overall.to_csv('data/reference_score.csv', header=False, index=False)

In [10]:
# Per-team goal averages; every list gets one entry per row of Groups, in the same order.
scored_home, scored_away = [], []
conceded_home, conceded_away = [], []
scored_ovr, conceded_ovr = [], []
# Scratch holders for the most recent team's combined home + away goal series.
scored_ovr_team = []
conceded_ovr_team = []

for squad in Groups["Squad"]:
    # Filter the team's home and away matches once and reuse both slices below.
    home_games = Aggregate[Aggregate['home_team'] == squad]
    away_games = Aggregate[Aggregate['away_team'] == squad]

    # Venue-specific averages (NaN when the team has no match at that venue).
    scored_home.append(home_games['home_score'].mean())
    scored_away.append(away_games['away_score'].mean())
    conceded_home.append(home_games['away_score'].mean())
    conceded_away.append(away_games['home_score'].mean())

    # Venue-neutral averages: pool the home and away matches before averaging.
    scored_ovr_team = pd.concat([home_games['home_score'], away_games['away_score']])
    conceded_ovr_team = pd.concat([home_games['away_score'], away_games['home_score']])
    scored_ovr.append(scored_ovr_team.mean())
    conceded_ovr.append(conceded_ovr_team.mean())

In [11]:
# Disabled variant, kept as a bare string literal so none of it is executed.
# It derives the home/away strengths from the home-only and away-only averages
# (divided by the matching overall home/away means) and also exports the
# ovr_* columns to data/input_file.csv.
'''
Strength_score = pd.DataFrame(list(zip(scored_home,scored_away,scored_ovr,conceded_home,conceded_away,conceded_ovr)), columns=["scored_home","scored_away","scored_ovr","conceded_home","conceded_away","conceded_ovr"])
Strength_score["Country"] = Groups[["Squad"]].copy()
Strength_score["home_attack_strength"] = Strength_score["scored_home"]/overal_scored_home
Strength_score["away_attack_strength"] = Strength_score["scored_away"]/overal_scored_away
Strength_score["ovr_attack_strength"] = Strength_score["scored_ovr"]/overal_avg
Strength_score["home_defense_strength"] = Strength_score["conceded_home"]/overal_conceded_home
Strength_score["away_defense_strength"] = Strength_score["conceded_away"]/overal_conceded_away
Strength_score["ovr_defense_strength"] = Strength_score["conceded_ovr"]/overal_avg
Strength_score = Strength_score[["Country","ovr_attack_strength","ovr_defense_strength","home_attack_strength","home_defense_strength","away_attack_strength","away_defense_strength","scored_home","scored_away","conceded_home","conceded_away"]]
Strength_score["Group"] = Groups[["Group"]].copy()
Strength_score["Group_Position"] = Groups[["Group_Position"]].copy()

Output_file = Strength_score[["Country","Group","ovr_attack_strength","ovr_defense_strength","home_attack_strength","home_defense_strength","away_attack_strength","away_defense_strength","Group_Position"]].copy()
Output_file.to_csv('data/input_file.csv')
'''

In [16]:
# Venue-neutral strengths: each team's pooled (home + away) scoring and
# conceding averages relative to the overall reference average. A value above
# 1 means the team scores (attack) or concedes (defense) more than average.
attack_strength = pd.Series(scored_ovr, dtype="float64") / overal_avg
defense_strength = pd.Series(conceded_ovr, dtype="float64") / overal_avg

# Build the table in one step, already in its final column order. The home_*
# and away_* strength columns are deliberately identical: the same
# venue-neutral value is stored under both names. The per-team lists were
# filled in Groups row order, so the Groups columns are attached by position.
Strength_score = pd.DataFrame({
    "Country": Groups["Squad"].to_numpy(),
    "home_attack_strength": attack_strength,
    "home_defense_strength": defense_strength,
    "away_attack_strength": attack_strength,
    "away_defense_strength": defense_strength,
    "scored_home": scored_home,
    "scored_away": scored_away,
    "conceded_home": conceded_home,
    "conceded_away": conceded_away,
    "Group": Groups["Group"].to_numpy(),
    "Group_Position": Groups["Group_Position"].to_numpy(),
})

# Export only the identifying columns and the strengths. The row index is
# written as the first (unnamed) CSV column, exactly as before.
output_columns = [
    "Country",
    "Group",
    "home_attack_strength",
    "home_defense_strength",
    "away_attack_strength",
    "away_defense_strength",
    "Group_Position",
]
Output_file = Strength_score[output_columns].copy()
Output_file.to_csv('data/input_file.csv')

## Output

Note: the table below keeps the distinction between home and away columns, but the values in both are identical, as explained above.

In [17]:
# Display the exported strength table (the same frame written to data/input_file.csv).
Output_file

Unnamed: 0,Country,Group,home_attack_strength,home_defense_strength,away_attack_strength,away_defense_strength,Group_Position
0,Qatar,A,0.746131,1.326455,0.746131,1.326455,1
1,Ecuador,A,0.864797,1.433742,0.864797,1.433742,2
2,Senegal,A,0.952327,0.714245,0.952327,0.714245,3
3,Netherlands,A,1.246622,0.92422,1.246622,0.92422,4
4,England,B,1.002378,0.703423,1.002378,0.703423,1
5,Iran,B,0.628685,0.531964,0.628685,0.531964,2
6,United States,B,1.072721,0.949621,1.072721,0.949621,3
7,Argentina,C,1.115662,0.701787,1.115662,0.701787,3
8,Saudi Arabia,C,0.325796,1.221735,0.325796,1.221735,4
9,Mexico,C,1.093945,1.0139,1.093945,1.0139,1
