### **IMPORTS**

In [383]:
import pandas as pd
import pickle
from scipy.stats import poisson

In [384]:
dictionary = pickle.load(open('world_cup_groups_2022', 'rb'))
df_historical_results_data = pd.read_csv('fifa_worldcup_historical_results_data_cleaned.csv')
df_fixtures = pd.read_csv('fifa_worldcup_2022_fixtures_cleaned.csv')

### **ADD TEAM STRENGTH FEATURE**

In [385]:
dictionary["Group A"]

Unnamed: 0,Pos,Team,Pld,W,D,L,GF,GA,GD,Pts
0,1,Qatar (H),0,0,0,0,0,0,0,0
1,2,Ecuador,0,0,0,0,0,0,0,0
2,3,Senegal,0,0,0,0,0,0,0,0
3,4,Netherlands,0,0,0,0,0,0,0,0


In [386]:
#  splitting home and away data
df_home = df_historical_results_data[['home_team', 'home_goals', 'away_goals']]
df_away = df_historical_results_data[['away_team', 'away_goals', 'home_goals']]

In [387]:
df_home

Unnamed: 0,home_team,home_goals,away_goals
0,France,4,1
1,Uruguay,4,2
2,Uruguay,6,1
3,Argentina,6,1
4,Paraguay,1,0
...,...,...,...
894,Serbia,0,2
895,Serbia,1,2
896,Brazil,2,0
897,Costa Rica,0,1


In [388]:
# rename some columns for merging
df_home = df_home.rename(columns={'home_team': 'team', 'home_goals': 'goals_for', 'away_goals': 'goals_against'})
df_away = df_away.rename(columns={'away_team': 'team', 'away_goals': 'goals_for', 'home_goals': 'goals_against'})

In [389]:
# calculating average goals for and against per team
df_team_strength = pd.concat([df_home, df_away], ignore_index=True).groupby('team').mean()

### **FUNCTION FOR PREDICTION OF NUMBER OF POINTS**

In [390]:
def points_prediction(home_team, away_team):
    if home_team in df_team_strength.index and away_team in df_team_strength.index:
        lambda_home  = df_team_strength.loc[home_team, 'goals_for'] * df_team_strength.loc[away_team, 'goals_against']
        lambda_away  = df_team_strength.loc[away_team, 'goals_for'] * df_team_strength.loc[home_team, 'goals_against']
        max_goals = 10
        probabiility_home, probabiility_away, probability_draw = 0, 0, 0
        for home_goals in range(0, max_goals + 1):
            for away_goals in range(0, max_goals + 1):
                p = poisson.pmf(home_goals, lambda_home) * poisson.pmf(away_goals, lambda_away)
                if home_goals > away_goals:
                    probabiility_home += p
                elif home_goals < away_goals:
                    probabiility_away += p
                else:
                    probability_draw += p
        home_points = probabiility_home * 3 + probability_draw 
        away_points = probabiility_away * 3 + probability_draw 
        return (home_points, away_points)   
    else:
        return (0, 0)

In [391]:
#test our points prediction function
points_prediction('Argentina', 'Mexico')
points_prediction('England', 'United States')
points_prediction('Qatar (H)', 'Ecuador') # home team not in historical data

(0, 0)

### **PREDICTING WORLD CUP RESULTS**

#### **GROUP STAGE PREDICTIONS**

In [392]:
# splitting fixtures into groups and knockout stages
df_fixture_group_48 = df_fixtures[:48].copy()
df_fixture_knockout = df_fixtures[48:56].copy()
df_fixture_quarterfinals = df_fixtures[56:60].copy()
df_fixture_semifinals = df_fixtures[60:62].copy()
df_fixture_final = df_fixtures[62:].copy()

In [393]:
df_fixture_quarterfinals

Unnamed: 0,home,score,away,year
56,Winners Match 53,Match 58,Winners Match 54,2022
57,Winners Match 49,Match 57,Winners Match 50,2022
58,Winners Match 55,Match 60,Winners Match 56,2022
59,Winners Match 51,Match 59,Winners Match 52,2022


In [394]:
# simulating group stage matches
for group, table in dictionary.items():
    teams_in_group = table['Team'].values

    # select only fixtures where the home team is in this group
    df_group = df_fixture_group_48[df_fixture_group_48['home'].isin(teams_in_group)].copy()

    # compute expected points for each fixture (home and away)
    df_group[['home_pts', 'away_pts']] = df_group.apply(
        lambda r: pd.Series(points_prediction(r['home'], r['away'])),
        axis=1
    )

    # aggregate points per team (home + away)
    pts_per_team = (
        pd.concat([
            df_group[['home', 'home_pts']].rename(columns={'home': 'Team', 'home_pts': 'Pts'}),
            df_group[['away', 'away_pts']].rename(columns={'away': 'Team', 'away_pts': 'Pts'})
        ])
        .groupby('Team', as_index=False)['Pts'].sum()
    )

    # merge aggregated points back into the group table
    table = table.merge(pts_per_team, on='Team', how='left', suffixes=('', '_add'))
    table['Pts'] = table['Pts'] + table['Pts_add'].fillna(0)

    # sort teams by points, reset index, and round points
    table = table[['Team', 'Pts']].sort_values('Pts', ascending=False).reset_index(drop=True)
    table['Pts'] = table['Pts'].round(0)

    dictionary[group] = table


In [395]:
# show all groups with predicted points
for group, table in dictionary.items():
    print(f"{group}:\n{table}\n")

Group A:
          Team  Pts
0  Netherlands  4.0
1      Senegal  2.0
2      Ecuador  2.0
3    Qatar (H)  0.0

Group B:
            Team  Pts
0        England  6.0
1          Wales  5.0
2  United States  3.0
3           Iran  2.0

Group C:
           Team  Pts
0     Argentina  7.0
1        Poland  6.0
2        Mexico  4.0
3  Saudi Arabia  1.0

Group D:
        Team  Pts
0     France  7.0
1    Denmark  6.0
2    Tunisia  3.0
3  Australia  2.0

Group E:
         Team  Pts
0     Germany  7.0
1       Spain  5.0
2       Japan  3.0
3  Costa Rica  2.0

Group F:
      Team  Pts
0  Croatia  7.0
1  Belgium  6.0
2  Morocco  4.0
3   Canada  0.0

Group G:
          Team  Pts
0       Brazil  8.0
1  Switzerland  4.0
2       Serbia  3.0
3     Cameroon  2.0

Group H:
          Team  Pts
0     Portugal  6.0
1      Uruguay  5.0
2        Ghana  4.0
3  South Korea  2.0



#### **SIMULATING KNOCKOUT STAGES**

In [396]:
df_fixture_knockout

Unnamed: 0,home,score,away,year
48,Winners Group A,Match 49,Runners-up Group B,2022
49,Winners Group C,Match 50,Runners-up Group D,2022
50,Winners Group D,Match 52,Runners-up Group C,2022
51,Winners Group B,Match 51,Runners-up Group A,2022
52,Winners Group E,Match 53,Runners-up Group F,2022
53,Winners Group G,Match 54,Runners-up Group H,2022
54,Winners Group F,Match 55,Runners-up Group E,2022
55,Winners Group H,Match 56,Runners-up Group G,2022


In [397]:
# build mapping from placeholders to real team names
for group in dictionary:
    group_winner = dictionary[group].loc[0, 'Team']
    runners_up = dictionary[group].loc[1, 'Team']
    df_fixture_knockout.replace({f'Winners {group}':group_winner,
                                 f'Runners-up {group}':runners_up}, inplace=True)

df_fixture_knockout['winner'] = '?'
df_fixture_knockout

Unnamed: 0,home,score,away,year,winner
48,Netherlands,Match 49,Wales,2022,?
49,Argentina,Match 50,Denmark,2022,?
50,France,Match 52,Poland,2022,?
51,England,Match 51,Senegal,2022,?
52,Germany,Match 53,Belgium,2022,?
53,Brazil,Match 54,Uruguay,2022,?
54,Croatia,Match 55,Spain,2022,?
55,Portugal,Match 56,Switzerland,2022,?


In [398]:
# create a function to simulate a knockout match
def get_winner(df_fixture_updated):
    df = df_fixture_updated.copy()

    # compute points for each fixture (home, away)
    pts = df.apply(
        lambda r: points_prediction(r['home'], r['away']),
        axis=1
    )

    # decide winner using temporary points (not stored as columns)
    df['winner'] = [
        home if points_home > points_away else away
        for (home, away), (points_home, points_away)
        in zip(df[['home', 'away']].to_numpy(), pts.to_numpy())
    ]

    return df


In [399]:
df_fixture_knockout = get_winner(df_fixture_knockout)

#### **QUARTERFINALS STAGE**

In [400]:
df_fixture_quarterfinals

Unnamed: 0,home,score,away,year
56,Winners Match 53,Match 58,Winners Match 54,2022
57,Winners Match 49,Match 57,Winners Match 50,2022
58,Winners Match 55,Match 60,Winners Match 56,2022
59,Winners Match 51,Match 59,Winners Match 52,2022


In [401]:
df_fixture_knockout

Unnamed: 0,home,score,away,year,winner
48,Netherlands,Match 49,Wales,2022,Netherlands
49,Argentina,Match 50,Denmark,2022,Argentina
50,France,Match 52,Poland,2022,France
51,England,Match 51,Senegal,2022,England
52,Germany,Match 53,Belgium,2022,Germany
53,Brazil,Match 54,Uruguay,2022,Brazil
54,Croatia,Match 55,Spain,2022,Spain
55,Portugal,Match 56,Switzerland,2022,Portugal


In [402]:
def update_table(df_fixture_round_1, df_fixture_round_2):
    for index, row in df_fixture_round_1.iterrows():
        winner = df_fixture_round_1.loc[index, 'winner']
        match = df_fixture_round_1.loc[index, 'score']
        df_fixture_round_2.replace({f'Winners {match}':winner}, inplace=True)
    df_fixture_round_2['winner'] = '?'
    return df_fixture_round_2


In [403]:
update_table(df_fixture_knockout, df_fixture_quarterfinals)

Unnamed: 0,home,score,away,year,winner
56,Germany,Match 58,Brazil,2022,?
57,Netherlands,Match 57,Argentina,2022,?
58,Spain,Match 60,Portugal,2022,?
59,England,Match 59,France,2022,?


In [404]:
df_fixture_quarterfinals = get_winner(df_fixture_quarterfinals)

#### **SEMIFINAL STAGE**

In [405]:
df_fixture_quarterfinals

Unnamed: 0,home,score,away,year,winner
56,Germany,Match 58,Brazil,2022,Brazil
57,Netherlands,Match 57,Argentina,2022,Netherlands
58,Spain,Match 60,Portugal,2022,Portugal
59,England,Match 59,France,2022,France


In [407]:
update_table(df_fixture_quarterfinals, df_fixture_semifinals)


Unnamed: 0,home,score,away,year,winner
60,Netherlands,Match 61,Brazil,2022,?
61,France,Match 62,Portugal,2022,?


In [408]:
df_fixture_semifinals = get_winner(df_fixture_semifinals)

In [409]:
df_fixture_semifinals

Unnamed: 0,home,score,away,year,winner
60,Netherlands,Match 61,Brazil,2022,Brazil
61,France,Match 62,Portugal,2022,France


### **FINAL RESULT**

In [410]:
update_table(df_fixture_semifinals, df_fixture_final)

Unnamed: 0,home,score,away,year,winner
62,Losers Match 61,Match 63,Losers Match 62,2022,?
63,Brazil,Match 64,France,2022,?


In [411]:
get_winner(df_fixture_final)

Unnamed: 0,home,score,away,year,winner
62,Losers Match 61,Match 63,Losers Match 62,2022,Losers Match 62
63,Brazil,Match 64,France,2022,Brazil
