In [1]:
import pandas as pd
import pickle
from scipy.stats import poisson

In [2]:
dict_table = pickle.load(open('data/dict_table', 'rb'))
df_historical_data = pd.read_csv('data/fifa_worldcup_matches_clean.csv')
df_fixture = pd.read_csv('data/fifa_worldcup_fixture_clean.csv')

## 1. Calcular team strength (fuerza de un equipo)

In [3]:
dict_table

{'Group A':    Pos         Team  Pld  W  D  L  GF  GA  GD  Pts
 0    1    Qatar (H)    0  0  0  0   0   0   0    0
 1    2      Ecuador    0  0  0  0   0   0   0    0
 2    3      Senegal    0  0  0  0   0   0   0    0
 3    4  Netherlands    0  0  0  0   0   0   0    0,
 'Group B':    Pos           Team  Pld  W  D  L  GF  GA  GD  Pts
 0    1        England    0  0  0  0   0   0   0    0
 1    2           Iran    0  0  0  0   0   0   0    0
 2    3  United States    0  0  0  0   0   0   0    0
 3    4          Wales    0  0  0  0   0   0   0    0,
 'Group C':    Pos          Team  Pld  W  D  L  GF  GA  GD  Pts
 0    1     Argentina    0  0  0  0   0   0   0    0
 1    2  Saudi Arabia    0  0  0  0   0   0   0    0
 2    3        Mexico    0  0  0  0   0   0   0    0
 3    4        Poland    0  0  0  0   0   0   0    0,
 'Group D':    Pos       Team  Pld  W  D  L  GF  GA  GD  Pts
 0    1     France    0  0  0  0   0   0   0    0
 1    2  Australia    0  0  0  0   0   0   0    0
 2    3 

In [4]:
df_historical_data

Unnamed: 0,local_team,visit_team,year,local_goals,visit_goals,total_goals
0,Yugoslavia,Brazil,1930,2,1,3
1,Uruguay,Yugoslavia,1930,6,1,7
2,Argentina,United States,1930,6,1,7
3,Paraguay,Belgium,1930,1,0,1
4,United States,Paraguay,1930,3,0,3
...,...,...,...,...,...,...
895,Brazil,Costa Rica,2018,2,0,2
896,Serbia,Switzerland,2018,1,2,3
897,Serbia,Brazil,2018,0,2,2
898,Germany,Mexico,2018,0,1,1


In [5]:
# creamos dos dataframe para equipo local y equipo visitante
df_local = df_historical_data[['local_team', 'local_goals', 'visit_goals']]
df_visit = df_historical_data[['visit_team', 'local_goals', 'visit_goals']]

In [6]:
# renombramos las columnas
df_local = df_local.rename(columns={'local_team': 'team', 'local_goals': 'goals_scored', 'visit_goals': 'goals_conceded'})
df_visit = df_visit.rename(columns={'visit_team': 'team', 'local_goals': 'goals_conceded', 'visit_goals': 'goals_scored'})

In [7]:
df_local

Unnamed: 0,team,goals_scored,goals_conceded
0,Yugoslavia,2,1
1,Uruguay,6,1
2,Argentina,6,1
3,Paraguay,1,0
4,United States,3,0
...,...,...,...
895,Brazil,2,0
896,Serbia,1,2
897,Serbia,0,2
898,Germany,0,1


In [8]:
# unimos ambos, agrupamos por equipo y hallamos el promedio
df_team_strength = pd.concat([df_local, df_visit], ignore_index=True)

In [9]:
df_team_strength = df_team_strength.groupby('team').mean()

In [10]:
df_team_strength.index = df_team_strength.index.str.strip()

In [11]:
df_team_strength = df_team_strength.groupby('team').mean()

## 2. Function predict_points

In [12]:
def predict_points(local, visit):
    if local in df_team_strength.index and visit in df_team_strength.index:
        lamb_local = df_team_strength.at[local, 'goals_scored'] * df_team_strength.at[visit, 'goals_conceded']
        lamb_visit = df_team_strength.at[visit, 'goals_scored'] * df_team_strength.at[local, 'goals_conceded']
        prob_local, prob_visit, prob_draw = 0, 0, 0
        for x in range(0, 11):
            for y in range(0, 11):
                p = poisson.pmf(x, lamb_local) * poisson.pmf(y, lamb_visit)
                if x == y:
                    prob_draw += p
                elif x > y:
                    prob_local += p
                else:
                    prob_visit += p

        points_local = 3 * prob_local + prob_draw
        points_visit = 3 * prob_visit + prob_draw

        return points_local, points_visit
    else:
        return 0, 0

## 3. Probando la funcion

In [13]:
predict_points('Argentina', 'Mexico')
predict_points('England', 'United States')

(2.185111845736514, 0.6271253198165162)

In [14]:
df_team_strength.index

Index(['Algeria', 'Angola', 'Argentina', 'Australia', 'Austria', 'Belgium',
       'Bolivia', 'Bosnia and Herzegovina', 'Brazil', 'Bulgaria', 'Cameroon',
       'Canada', 'Chile', 'China', 'Colombia', 'Costa Rica', 'Croatia', 'Cuba',
       'Czech Republic', 'Czechoslovakia', 'Denmark', 'Dutch East Indies',
       'East Germany', 'Ecuador', 'Egypt', 'El Salvador', 'England',
       'FR Yugoslavia', 'France', 'Germany', 'Ghana', 'Greece', 'Haiti',
       'Honduras', 'Hungary', 'Iceland', 'Iran', 'Iraq', 'Israel', 'Italy',
       'Ivory Coast', 'Jamaica', 'Japan', 'Kuwait', 'Mexico', 'Morocco',
       'Netherlands', 'New Zealand', 'Nigeria', 'North Korea',
       'Northern Ireland', 'Norway', 'Panama', 'Paraguay', 'Peru', 'Poland',
       'Portugal', 'Republic of Ireland', 'Romania', 'Russia', 'Saudi Arabia',
       'Scotland', 'Senegal', 'Serbia', 'Serbia and Montenegro', 'Slovakia',
       'Slovenia', 'South Africa', 'South Korea', 'Soviet Union', 'Spain',
       'Sweden', 'Switzerland

## 3. Prediccion del mundial

In [15]:
df_fixture_group_48 = df_fixture[:48].copy()
df_fixture_knockout = df_fixture[48:56].copy()
df_fixture_quarter = df_fixture[56:60].copy()
df_fixture_semi = df_fixture[60:62].copy()
df_fixture_final = df_fixture[62:].copy()

In [16]:
for group in dict_table:
    teams_in_group = dict_table[group]['Team'].values
    df_fixture_group_6 = df_fixture_group_48[df_fixture_group_48['home'].isin(teams_in_group)]
    for index, row in df_fixture_group_6.iterrows():
        home, away = row['home'], row['away']
        points_home, points_away = predict_points(home, away)
        dict_table[group].loc[dict_table[group]['Team'] == home, 'Pts'] += points_home
        dict_table[group].loc[dict_table[group]['Team'] == away, 'Pts'] += points_away

    dict_table[group] = dict_table[group].sort_values('Pts', ascending=False).reset_index()
    dict_table[group] = dict_table[group][['Team', 'Pts']]
    dict_table[group] = dict_table[group].round(0)

In [17]:
df_fixture_group_48

Unnamed: 0,home,score,away,year
0,Qatar,Match 1,Ecuador,2022
1,Senegal,Match 2,Netherlands,2022
2,Qatar,Match 18,Senegal,2022
3,Netherlands,Match 19,Ecuador,2022
4,Ecuador,Match 35,Senegal,2022
5,Netherlands,Match 36,Qatar,2022
6,England,Match 3,Iran,2022
7,United States,Match 4,Wales,2022
8,Wales,Match 17,Iran,2022
9,England,Match 20,United States,2022


In [18]:
# Mostrar tabla actualizada
dict_table['Group D']

Unnamed: 0,Team,Pts
0,France,7.0
1,Denmark,5.0
2,Tunisia,3.0
3,Australia,2.0


In [19]:
df_fixture_knockout

Unnamed: 0,home,score,away,year
48,Winners Group A,Match 49,Runners-up Group B,2022
49,Winners Group C,Match 50,Runners-up Group D,2022
50,Winners Group D,Match 52,Runners-up Group C,2022
51,Winners Group B,Match 51,Runners-up Group A,2022
52,Winners Group E,Match 53,Runners-up Group F,2022
53,Winners Group G,Match 54,Runners-up Group H,2022
54,Winners Group F,Match 55,Runners-up Group E,2022
55,Winners Group H,Match 56,Runners-up Group G,2022


In [20]:
# actualizar el fixture de octavos con el 1 puesto (group winner) y 2 puesto (runners up)
for group in dict_table:
    group_winner = dict_table[group].loc[0, 'Team']
    runners_up = dict_table[group].loc[1, 'Team']
    df_fixture_knockout.replace({
        f'Winners {group}': group_winner,
        f'Runners-up {group}': runners_up
    }, inplace=True)

df_fixture_knockout['winner'] = '?'

In [21]:
df_fixture_knockout

Unnamed: 0,home,score,away,year,winner
48,Netherlands,Match 49,Wales,2022,?
49,Argentina,Match 50,Denmark,2022,?
50,France,Match 52,Poland,2022,?
51,England,Match 51,Senegal,2022,?
52,Germany,Match 53,Belgium,2022,?
53,Brazil,Match 54,Uruguay,2022,?
54,Croatia,Match 55,Spain,2022,?
55,Portugal,Match 56,Switzerland,2022,?


In [22]:
# crear la function para buscar el ganador
def get_winner(df_fixture_update):
    for index, row in df_fixture_update.iterrows():
        home, away = row['home'], row['away']
        points_home, points_away = predict_points(home, away)
        if points_home > points_away:
            winner = home
        else:
            winner = away
        df_fixture_update.loc[index, 'winner'] = winner
    return df_fixture_update

In [23]:
get_winner(df_fixture_knockout)

Unnamed: 0,home,score,away,year,winner
48,Netherlands,Match 49,Wales,2022,Netherlands
49,Argentina,Match 50,Denmark,2022,Argentina
50,France,Match 52,Poland,2022,France
51,England,Match 51,Senegal,2022,England
52,Germany,Match 53,Belgium,2022,Germany
53,Brazil,Match 54,Uruguay,2022,Brazil
54,Croatia,Match 55,Spain,2022,Spain
55,Portugal,Match 56,Switzerland,2022,Portugal


In [24]:
df_fixture_quarter

Unnamed: 0,home,score,away,year
56,Winners Match 53,Match 58,Winners Match 54,2022
57,Winners Match 49,Match 57,Winners Match 50,2022
58,Winners Match 55,Match 60,Winners Match 56,2022
59,Winners Match 51,Match 59,Winners Match 52,2022


In [25]:
def update_table(df_fixture_round_1, df_fixture_round_2):
    for index, row in df_fixture_round_1.iterrows():
        winner = df_fixture_round_1.loc[index, 'winner']
        match = df_fixture_round_1.loc[index, 'score']
        df_fixture_round_2.replace({f'Winners {match}': winner}, inplace=True)
    df_fixture_round_2['winner'] = '?'
    return df_fixture_round_2

In [26]:
update_table(df_fixture_knockout, df_fixture_quarter)

Unnamed: 0,home,score,away,year,winner
56,Germany,Match 58,Brazil,2022,?
57,Netherlands,Match 57,Argentina,2022,?
58,Spain,Match 60,Portugal,2022,?
59,England,Match 59,France,2022,?


In [27]:
get_winner(df_fixture_quarter)

Unnamed: 0,home,score,away,year,winner
56,Germany,Match 58,Brazil,2022,Germany
57,Netherlands,Match 57,Argentina,2022,Netherlands
58,Spain,Match 60,Portugal,2022,Spain
59,England,Match 59,France,2022,France


In [28]:
update_table(df_fixture_quarter, df_fixture_semi)

Unnamed: 0,home,score,away,year,winner
60,Netherlands,Match 61,Germany,2022,?
61,France,Match 62,Spain,2022,?


In [29]:
get_winner(df_fixture_semi)

Unnamed: 0,home,score,away,year,winner
60,Netherlands,Match 61,Germany,2022,Germany
61,France,Match 62,Spain,2022,France


In [30]:
update_table(df_fixture_semi, df_fixture_final)

Unnamed: 0,home,score,away,year,winner
62,Losers Match 61,Match 63,Losers Match 62,2022,?
63,Germany,Match 64,France,2022,?


In [31]:
get_winner(df_fixture_final)

Unnamed: 0,home,score,away,year,winner
62,Losers Match 61,Match 63,Losers Match 62,2022,Losers Match 62
63,Germany,Match 64,France,2022,Germany


In [32]:
print(f"The world cup winner is {df_fixture_final['winner'].iloc[-1]} 🎉")

The world cup winner is Germany 🎉
