In [30]:
import pandas as pd
import pickle
from scipy.stats import poisson

In [31]:
df_WC_2022 = pd.read_csv('WC_Datasets/wc_matches_2022_crawl.csv')

In [32]:
dict_table = pickle.load(open('WC_Datasets/dict_table','rb'))
df_historical_matches = pd.read_csv('WC_Datasets/clean_fifa_worldcup_matches.csv')
df_fixture = pd.read_csv('WC_Datasets/clean_fifa_worldcup_fixture.csv')

In [33]:
# df_historical_data = df_historical_matches.append(df_WC_2022, ignore_index=True)

df_historical_data = pd.concat([df_historical_matches,df_WC_2022])

## Calculate Team Strength

In [34]:
df_home = df_historical_data[['HomeTeam', 'HomeGoals', 'AwayGoals']]
df_away = df_historical_data[['AwayTeam', 'HomeGoals', 'AwayGoals']]

df_home = df_home.rename(columns={'HomeTeam':'Team', 'HomeGoals': 'GoalsScored', 'AwayGoals': 'GoalsConceded'})
df_away = df_away.rename(columns={'AwayTeam':'Team', 'HomeGoals': 'GoalsConceded', 'AwayGoals': 'GoalsScored'})

df_team_strength = pd.concat([df_home, df_away], ignore_index=True).groupby(['Team']).mean()
df_team_strength

Unnamed: 0_level_0,GoalsScored,GoalsConceded
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Algeria,1.000000,1.461538
Angola,0.333333,0.666667
Argentina,1.666667,1.595238
Australia,2.736842,1.842105
Austria,1.482759,1.620690
...,...,...
Winners Match 60,0.000000,62.000000
Winners Match 61,64.000000,0.000000
Winners Match 62,0.000000,64.000000
Yugoslavia,1.666667,1.272727


## Function Predict_points

In [35]:
def predict_points(home, away):
    if home in df_team_strength.index and away in df_team_strength.index:
        # goals_scored * goals_conceded
        lamb_home = df_team_strength.at[home,'GoalsScored'] * df_team_strength.at[away,'GoalsConceded']
        lamb_away = df_team_strength.at[away,'GoalsScored'] * df_team_strength.at[home,'GoalsConceded']
        prob_home, prob_away, prob_draw = 0, 0, 0
        for x in range(0,11): #number of goals home team
            for y in range(0, 11): #number of goals away team
                p = poisson.pmf(x, lamb_home) * poisson.pmf(y, lamb_away)
                if x == y:
                    prob_draw += p
                elif x > y:
                    prob_home += p
                else:
                    prob_away += p
        
        points_home = 3 * prob_home + prob_draw
        points_away = 3 * prob_away + prob_draw
        return (points_home, points_away)
    else:
        return (0, 0)

## Testing Function

In [36]:
print(predict_points('England', 'Wales'))
print(predict_points('Argentina', 'Saudi Arabia'))
print(predict_points('Qatar (H)', 'Ecuador')) # Qatar vs Team X -> 0 points to both

(2.5560754339242426, 0.31136514177050184)
(1.1137940340176105, 1.7257150726654809)
(0, 0)


In [37]:
df_fixture_group_48 = df_fixture[:48].copy()
df_fixture_knockout = df_fixture[48:56].copy()
df_fixture_quarter = df_fixture[56:60].copy()
df_fixture_semi = df_fixture[60:62].copy()
df_fixture_final = df_fixture[62:].copy()

In [38]:
for group in dict_table:
    teams_in_group = dict_table[group]['Team'].values
    df_fixture_group_6 = df_fixture_group_48[df_fixture_group_48['home'].isin(teams_in_group)]
    for index, row in df_fixture_group_6.iterrows():
        home, away = row['home'], row['away']
        points_home, points_away = predict_points(home, away)
        dict_table[group].loc[dict_table[group]['Team'] == home, 'Pts'] += points_home
        dict_table[group].loc[dict_table[group]['Team'] == away, 'Pts'] += points_away

    dict_table[group] = dict_table[group].sort_values('Pts', ascending=False).reset_index()
    dict_table[group] = dict_table[group][['Team', 'Pts']]
    dict_table[group] = dict_table[group].round(0)

In [39]:
dict_table['Group A']

Unnamed: 0,Team,Pts
0,Netherlands,6.0
1,Ecuador,3.0
2,Senegal,0.0
3,Qatar (H),0.0


In [40]:
dict_table['Group B']

Unnamed: 0,Team,Pts
0,England,8.0
1,Wales,4.0
2,Iran,3.0
3,United States,2.0


In [41]:
dict_table['Group C']

Unnamed: 0,Team,Pts
0,Poland,7.0
1,Saudi Arabia,5.0
2,Argentina,4.0
3,Mexico,1.0


In [42]:
dict_table['Group D']

Unnamed: 0,Team,Pts
0,Tunisia,7.0
1,Australia,5.0
2,France,4.0
3,Denmark,1.0


In [43]:
dict_table['Group E']

Unnamed: 0,Team,Pts
0,Japan,6.0
1,Costa Rica,6.0
2,Germany,3.0
3,Spain,2.0


In [44]:
dict_table['Group F']

Unnamed: 0,Team,Pts
0,Croatia,5.0
1,Belgium,3.0
2,Canada,2.0
3,Morocco,0.0


In [45]:
dict_table['Group G']

Unnamed: 0,Team,Pts
0,Brazil,4.0
1,Cameroon,4.0
2,Serbia,4.0
3,Switzerland,0.0


In [46]:
dict_table['Group H']

Unnamed: 0,Team,Pts
0,Ghana,6.0
1,South Korea,4.0
2,Uruguay,2.0
3,Portugal,2.0


## Knock Out

In [47]:
df_fixture_knockout

Unnamed: 0,home,score,away,year
48,Winners Group A,Match 49,Runners-up Group B,2022
49,Winners Group C,Match 50,Runners-up Group D,2022
50,Winners Group D,Match 52,Runners-up Group C,2022
51,Winners Group B,Match 51,Runners-up Group A,2022
52,Winners Group E,Match 53,Runners-up Group F,2022
53,Winners Group G,Match 54,Runners-up Group H,2022
54,Winners Group F,Match 55,Runners-up Group E,2022
55,Winners Group H,Match 56,Runners-up Group G,2022


In [48]:
for group in dict_table:
    group_winner = dict_table[group].loc[0, 'Team']
    runners_up = dict_table[group].loc[1, 'Team']
    df_fixture_knockout.replace({f'Winners {group}':group_winner,
                                 f'Runners-up {group}':runners_up}, inplace=True)

df_fixture_knockout['winner'] = '?'
df_fixture_knockout

Unnamed: 0,home,score,away,year,winner
48,Netherlands,Match 49,Wales,2022,?
49,Poland,Match 50,Australia,2022,?
50,Tunisia,Match 52,Saudi Arabia,2022,?
51,England,Match 51,Ecuador,2022,?
52,Japan,Match 53,Belgium,2022,?
53,Brazil,Match 54,South Korea,2022,?
54,Croatia,Match 55,Costa Rica,2022,?
55,Ghana,Match 56,Cameroon,2022,?


In [49]:
def get_winner(df_fixture_updated):
    for index, row in df_fixture_updated.iterrows():
        home, away = row['home'], row['away']
        points_home, points_away = predict_points(home, away)
        if points_home > points_away:
            winner = home
        else:
            winner = away
        df_fixture_updated.loc[index, 'winner'] = winner
    return df_fixture_updated

In [50]:
get_winner(df_fixture_knockout)

Unnamed: 0,home,score,away,year,winner
48,Netherlands,Match 49,Wales,2022,Netherlands
49,Poland,Match 50,Australia,2022,Poland
50,Tunisia,Match 52,Saudi Arabia,2022,Tunisia
51,England,Match 51,Ecuador,2022,England
52,Japan,Match 53,Belgium,2022,Japan
53,Brazil,Match 54,South Korea,2022,Brazil
54,Croatia,Match 55,Costa Rica,2022,Croatia
55,Ghana,Match 56,Cameroon,2022,Ghana


## Quarter Final

In [51]:
def update_table(df_fixture_round_1, df_fixture_round_2):
    for index, row in df_fixture_round_1.iterrows():
        winner = df_fixture_round_1.loc[index, 'winner']
        match = df_fixture_round_1.loc[index, 'score']
        df_fixture_round_2.replace({f'Winners {match}':winner}, inplace=True)
    df_fixture_round_2['winner'] = '?'
    return df_fixture_round_2

In [52]:

update_table(df_fixture_knockout, df_fixture_quarter)

Unnamed: 0,home,score,away,year,winner
56,Japan,Match 58,Brazil,2022,?
57,Netherlands,Match 57,Poland,2022,?
58,Croatia,Match 60,Ghana,2022,?
59,England,Match 59,Tunisia,2022,?


In [53]:
get_winner(df_fixture_quarter)

Unnamed: 0,home,score,away,year,winner
56,Japan,Match 58,Brazil,2022,Japan
57,Netherlands,Match 57,Poland,2022,Netherlands
58,Croatia,Match 60,Ghana,2022,Ghana
59,England,Match 59,Tunisia,2022,England


## SemiFinal

In [54]:
update_table(df_fixture_quarter, df_fixture_semi)

Unnamed: 0,home,score,away,year,winner
60,Netherlands,Match 61,Japan,2022,?
61,England,Match 62,Ghana,2022,?


In [55]:
get_winner(df_fixture_semi)

Unnamed: 0,home,score,away,year,winner
60,Netherlands,Match 61,Japan,2022,Netherlands
61,England,Match 62,Ghana,2022,Ghana


## Final

In [56]:
update_table(df_fixture_semi, df_fixture_final)

Unnamed: 0,home,score,away,year,winner
62,Losers Match 61,Match 63,Losers Match 62,2022,?
63,Netherlands,Match 64,Ghana,2022,?


In [57]:
get_winner(df_fixture_final)

Unnamed: 0,home,score,away,year,winner
62,Losers Match 61,Match 63,Losers Match 62,2022,Losers Match 62
63,Netherlands,Match 64,Ghana,2022,Ghana
