# FIFA World Cup 2022 Simulation

Import libraries

In [93]:
from datetime import datetime

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import plotly.express as px
import plotly.offline as py

from sklearn.model_selection import train_test_split

Common data

In [50]:
groups = pd.DataFrame.from_dict({
    'A': ['Qatar', 'Ecuador', 'Senegal', 'Netherlands'],
    'B': ['England', 'Iran', 'United States', 'Wales'],
    'C': ['Argentina', 'Saudi Arabia', 'Mexico', 'Poland'],
    'D': ['France', 'Australia', 'Denmark', 'Tunisia'],
    'E': ['Spain', 'Costa Rica', 'Germany', 'Japan'],
    'F': ['Belgium', 'Canada', 'Morocco', 'Croatia'],
    'G': ['Brazil', 'Serbia', 'Switzerland', 'Cameroon'],
    'H': ['Portugal', 'Ghana', 'Uruguay', 'South Korea'],
})
groups

Unnamed: 0,A,B,C,D,E,F,G,H
0,Qatar,England,Argentina,France,Spain,Belgium,Brazil,Portugal
1,Ecuador,Iran,Saudi Arabia,Australia,Costa Rica,Canada,Serbia,Ghana
2,Senegal,United States,Mexico,Denmark,Germany,Morocco,Switzerland,Uruguay
3,Netherlands,Wales,Poland,Tunisia,Japan,Croatia,Cameroon,South Korea


## Exploratory Data Analysis

### Results dataset

Read dataset

In [2]:
results = pd.read_csv('results.csv')

In [3]:
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False


In [10]:
results.describe()

Unnamed: 0,home_score,away_score
count,44059.0,44059.0
mean,1.740507,1.178306
std,1.747525,1.394815
min,0.0,0.0
25%,1.0,0.0
50%,1.0,1.0
75%,2.0,2.0
max,31.0,21.0


In [12]:
results.dtypes

date           object
home_team      object
away_team      object
home_score    float64
away_score    float64
tournament     object
city           object
country        object
neutral          bool
dtype: object

Convert date to datime

In [15]:
results['date'] = pd.to_datetime(results['date'])

date          datetime64[ns]
home_team             object
away_team             object
home_score           float64
away_score           float64
tournament            object
city                  object
country               object
neutral                 bool
dtype: object

#### Teams

In [37]:
px.histogram(results.sort_values('home_team'), x="home_team")

Keep only classified countries

In [71]:
results = results[
    results['home_team'].isin(groups.values.reshape(-1,).tolist())
    & results['away_team'].isin(groups.values.reshape(-1,).tolist())
]
results

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
10,1879-01-18,England,Wales,2.0,1.0,Friendly,London,England,False
14,1880-03-15,Wales,England,2.0,3.0,Friendly,Wrexham,Wales,False
16,1881-02-26,England,Wales,0.0,1.0,Friendly,Blackburn,England,False
22,1882-03-13,Wales,England,5.0,3.0,Friendly,Wrexham,Wales,False
24,1883-02-03,England,Wales,5.0,0.0,Friendly,London,England,False
...,...,...,...,...,...,...,...,...,...
44028,2022-09-27,Ecuador,Japan,0.0,0.0,Kirin Challenge Cup,Düsseldorf,Germany,True
44034,2022-09-27,Iran,Senegal,1.0,1.0,Friendly,Maria Enzersdorf,Austria,True
44036,2022-09-27,South Korea,Cameroon,1.0,0.0,Friendly,Seoul,South Korea,False
44046,2022-09-27,Saudi Arabia,United States,0.0,0.0,Friendly,Murcia,Spain,True


#### Dates

In [72]:
px.histogram(results.sort_values('date'), x="date")

In [80]:
results = results[results['date'] > datetime(2009, 1, 1)]
results

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
31297,2009-01-05,Saudi Arabia,Qatar,0.0,0.0,Gulf Cup,Muscat,Oman,True
31391,2009-02-11,France,Argentina,0.0,2.0,Friendly,Marseille,France,False
31394,2009-02-11,Iran,South Korea,1.0,1.0,FIFA World Cup qualification,Tehran,Iran,False
31397,2009-02-11,Japan,Australia,0.0,0.0,FIFA World Cup qualification,Yokohama,Japan,False
31414,2009-02-11,Spain,England,2.0,0.0,Friendly,Seville,Spain,False
...,...,...,...,...,...,...,...,...,...
44028,2022-09-27,Ecuador,Japan,0.0,0.0,Kirin Challenge Cup,Düsseldorf,Germany,True
44034,2022-09-27,Iran,Senegal,1.0,1.0,Friendly,Maria Enzersdorf,Austria,True
44036,2022-09-27,South Korea,Cameroon,1.0,0.0,Friendly,Seoul,South Korea,False
44046,2022-09-27,Saudi Arabia,United States,0.0,0.0,Friendly,Murcia,Spain,True


#### Drop unnnecesary columns

In [81]:
results = results[['date', 'home_team', 'away_team', 'home_score', 'away_score']]
results

Unnamed: 0,date,home_team,away_team,home_score,away_score
31297,2009-01-05,Saudi Arabia,Qatar,0.0,0.0
31391,2009-02-11,France,Argentina,0.0,2.0
31394,2009-02-11,Iran,South Korea,1.0,1.0
31397,2009-02-11,Japan,Australia,0.0,0.0
31414,2009-02-11,Spain,England,2.0,0.0
...,...,...,...,...,...
44028,2022-09-27,Ecuador,Japan,0.0,0.0
44034,2022-09-27,Iran,Senegal,1.0,1.0
44036,2022-09-27,South Korea,Cameroon,1.0,0.0
44046,2022-09-27,Saudi Arabia,United States,0.0,0.0


Create output column

In [88]:
df = pd.DataFrame()

df['date'], df['home_team'], df['away_team'], df['home_score'], df['away_score'] = np.where(
    results.home_team > results.away_team,
    [results.date, results.away_team, results.home_team, results.away_score, results.home_score],
    [results.date, results.home_team, results.away_team, results.home_score, results.away_score]
)
df

Unnamed: 0,date,home_team,away_team,home_score,away_score
0,1231113600000000000,Qatar,Saudi Arabia,0.0,0.0
1,1234310400000000000,Argentina,France,2.0,0.0
2,1234310400000000000,Iran,South Korea,1.0,1.0
3,1234310400000000000,Australia,Japan,0.0,0.0
4,1234310400000000000,England,Spain,0.0,2.0
...,...,...,...,...,...
919,1664236800000000000,Ecuador,Japan,0.0,0.0
920,1664236800000000000,Iran,Senegal,1.0,1.0
921,1664236800000000000,Cameroon,South Korea,0.0,1.0
922,1664236800000000000,Saudi Arabia,United States,0.0,0.0


In [89]:
df['date'] = pd.to_datetime(df['date'])
df

Unnamed: 0,date,home_team,away_team,home_score,away_score
0,2009-01-05,Qatar,Saudi Arabia,0.0,0.0
1,2009-02-11,Argentina,France,2.0,0.0
2,2009-02-11,Iran,South Korea,1.0,1.0
3,2009-02-11,Australia,Japan,0.0,0.0
4,2009-02-11,England,Spain,0.0,2.0
...,...,...,...,...,...
919,2022-09-27,Ecuador,Japan,0.0,0.0
920,2022-09-27,Iran,Senegal,1.0,1.0
921,2022-09-27,Cameroon,South Korea,0.0,1.0
922,2022-09-27,Saudi Arabia,United States,0.0,0.0


In [92]:
df['score'] = df['home_score'] - df['away_score']
df = df[['date', 'home_team', 'away_team', 'score']]
df

Unnamed: 0,date,home_team,away_team,score
0,2009-01-05,Qatar,Saudi Arabia,0.0
1,2009-02-11,Argentina,France,2.0
2,2009-02-11,Iran,South Korea,0.0
3,2009-02-11,Australia,Japan,0.0
4,2009-02-11,England,Spain,-2.0
...,...,...,...,...
919,2022-09-27,Ecuador,Japan,0.0
920,2022-09-27,Iran,Senegal,0.0
921,2022-09-27,Cameroon,South Korea,-1.0
922,2022-09-27,Saudi Arabia,United States,0.0


## Modeling

In [94]:
X_train, X_test, y_train, y_test = train_test_split(
    df[['date', 'home_team', 'away_team']],
    df['score'],
    test_size=0.3,
    train_size=0.7
)