In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import matplotlib.ticker as plticker
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
world_cup = pd.read_csv("WorldCupMatches.csv")
world_cup.head()

Unnamed: 0,Year,Date,Stage,Stadium,City,Home_Team,Home Team Goals,Away Team Goals,Away_Team,Win conditions
0,1930,13 Jul 1930 - 15:00,Group 1,Pocitos,Montevideo,France,4,1,Mexico,
1,1930,13 Jul 1930 - 15:00,Group 4,Parque Central,Montevideo,Belgium,0,3,USA,
2,1930,14 Jul 1930 - 12:45,Group 2,Parque Central,Montevideo,Yugoslavia,2,1,Brazil,
3,1930,14 Jul 1930 - 14:50,Group 3,Pocitos,Montevideo,Romania,3,1,Peru,
4,1930,15 Jul 1930 - 16:00,Group 1,Parque Central,Montevideo,Argentina,1,0,France,


In [3]:
world_cup.shape

(916, 10)

In [4]:
world_cup.isnull().sum()

Year                0
Date                0
Stage               0
Stadium             0
City                0
Home_Team           0
Home Team Goals     0
Away Team Goals     0
Away_Team           0
Win conditions     65
dtype: int64

In [5]:
world_cup.dtypes

Year                int64
Date               object
Stage              object
Stadium            object
City               object
Home_Team          object
Home Team Goals     int64
Away Team Goals     int64
Away_Team          object
Win conditions     object
dtype: object

In [6]:
def replace_country_name(world_cup):
    if(world_cup['Home_Team'] in ['German DR', 'Germany FR']):
        world_cup['Home_Team'] = 'Germany'
    elif(world_cup['Home_Team'] == 'Soviet Union'):
        world_cup['Home_Team'] = 'Russia' 
    elif(world_cup['Home_Team'] == 'rn">Serbia and Montenegro'):
        world_cup['Home_Team'] = 'Serbia'   
    elif(world_cup['Home_Team'] == 'South Korea'):
        world_cup['Home_Team'] = 'Korea Republic'    
    if(world_cup['Away_Team'] in ['German DR', 'Germany FR']):
        world_cup['Away_Team'] = 'Germany'
    elif(world_cup['Away_Team'] == 'Soviet Union'):
        world_cup['Away_Team'] = 'Russia' 
    elif(world_cup['Away_Team'] == 'rn">Serbia and Montenegro'):
        world_cup['Away_Team'] = 'Serbia' 
    elif(world_cup['Away_Team'] == 'South Korea'):
        world_cup['Away_Team'] = 'Korea Republic'    
    return world_cup
world_cup= world_cup.apply(replace_country_name, axis = 1)

In [7]:
world_cup["Home_Team"].value_counts()

Brazil                      80
Germany                     75
Italy                       54
Argentina                   52
England                     39
                            ..
Jamaica                      1
Norway                       1
Haiti                        1
rn">United Arab Emirates     1
Panama                       1
Name: Home_Team, Length: 77, dtype: int64

In [8]:
winner = []
for i in range (len(world_cup['Home_Team'])):
    if world_cup['Home Team Goals'][i]>world_cup['Away Team Goals'][i]:
        winner.append(world_cup['Home_Team'][i])
    elif world_cup['Home Team Goals'][i]<world_cup['Away Team Goals'][i]:
        winner.append(world_cup['Away_Team'][i])
    else:
        winner.append('Draw')
world_cup['Winning_Team'] = winner 
world_cup['Goal Difference'] = np.absolute(world_cup['Home Team Goals'] - world_cup['Away Team Goals'])
world_cup.head(10)

Unnamed: 0,Year,Date,Stage,Stadium,City,Home_Team,Home Team Goals,Away Team Goals,Away_Team,Win conditions,Winning_Team,Goal Difference
0,1930,13 Jul 1930 - 15:00,Group 1,Pocitos,Montevideo,France,4,1,Mexico,,France,3
1,1930,13 Jul 1930 - 15:00,Group 4,Parque Central,Montevideo,Belgium,0,3,USA,,USA,3
2,1930,14 Jul 1930 - 12:45,Group 2,Parque Central,Montevideo,Yugoslavia,2,1,Brazil,,Yugoslavia,1
3,1930,14 Jul 1930 - 14:50,Group 3,Pocitos,Montevideo,Romania,3,1,Peru,,Romania,2
4,1930,15 Jul 1930 - 16:00,Group 1,Parque Central,Montevideo,Argentina,1,0,France,,Argentina,1
5,1930,16 Jul 1930 - 14:45,Group 1,Parque Central,Montevideo,Chile,3,0,Mexico,,Chile,3
6,1930,17 Jul 1930 - 12:45,Group 2,Parque Central,Montevideo,Bolivia,0,4,Yugoslavia,,Yugoslavia,4
7,1930,17 Jul 1930 - 14:45,Group 4,Parque Central,Montevideo,Paraguay,0,3,USA,,USA,3
8,1930,18 Jul 1930 - 14:30,Group 3,Estadio Centenario,Montevideo,Uruguay,1,0,Peru,,Uruguay,1
9,1930,19 Jul 1930 - 12:50,Group 1,Estadio Centenario,Montevideo,France,0,1,Chile,,Chile,1


In [9]:
worldcup_teams = ['Qatar','Ecuador','Senegal','Netherlands','England',
                 'Iran','USA','Wales','Argentina','Saudi Arabia','Mexico',
                 'Poland','France','Australia','Denmark','Tunisia','Spain',
                 'Costa Rica','Germany','Japan','Belgium','Canada','Morocco',
                 'Croatia','Brazil','Serbia','Switzerland','Cameroon','Portugal',
                 'Ghana','Uruguay','Korea Republic']

world_cup_home = world_cup[world_cup['Home_Team'].isin(worldcup_teams)]
world_cup_away = world_cup[world_cup['Away_Team'].isin(worldcup_teams)]
world_cup_teams = pd.concat((world_cup_home,world_cup_away))
world_cup_teams.drop_duplicates()
world_cup_teams.count()

Year               1132
Date               1132
Stage              1132
Stadium            1132
City               1132
Home_Team          1132
Home Team Goals    1132
Away Team Goals    1132
Away_Team          1132
Win conditions     1032
Winning_Team       1132
Goal Difference    1132
dtype: int64

In [10]:
world_cup_teams.drop(['Stage','Year','Date','Stadium','City','Home Team Goals','Away Team Goals'
                     ,'Win conditions','Goal Difference'],axis=1,inplace=True)
world_cup_teams.head()

Unnamed: 0,Home_Team,Away_Team,Winning_Team
0,France,Mexico,France
1,Belgium,USA,USA
4,Argentina,France,Argentina
8,Uruguay,Peru,Uruguay
9,France,Chile,Chile


In [None]:
# Building the model
world_cup_teams = world_cup_teams.reset_index(drop=True)
world_cup_teams.loc[world_cup_teams.Winning_Team == world_cup_teams.Home_Team,'Winning_Team']=2
world_cup_teams.loc[world_cup_teams.Winning_Team == 'Draw','Winning_Team']=0
world_cup_teams.loc[world_cup_teams.Winning_Team == world_cup_teams.Away_Team,'Winning_Team']=1
world_cup_teams.head()

In [14]:
world_cup_teams = pd.get_dummies(world_cup_teams, prefix=['Home_Team', 'Away_Team'], columns=['Home_Team', 'Away_Team'])
world_cup_teams

Unnamed: 0,Winning_Team,Home_Team_Algeria,Home_Team_Angola,Home_Team_Argentina,Home_Team_Australia,Home_Team_Austria,Home_Team_Belgium,Home_Team_Bolivia,Home_Team_Brazil,Home_Team_Bulgaria,...,Away_Team_Turkey,Away_Team_USA,Away_Team_Ukraine,Away_Team_Uruguay,Away_Team_Wales,Away_Team_Yugoslavia,"Away_Team_rn"">Bosnia and Herzegovina","Away_Team_rn"">Republic of Ireland","Away_Team_rn"">Trinidad and Tobago","Away_Team_rn"">United Arab Emirates"
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,2,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1127,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1128,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1129,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1130,2,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
X = world_cup_teams.drop(['Winning_Team'],axis=1)
y = world_cup_teams['Winning_Team']
y = y.astype('int')

In [16]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=42)

In [26]:
y_train.value_counts()

2    377
1    252
0    163
Name: Winning_Team, dtype: int64

In [27]:
y_test.value_counts()

2    146
1    113
0     81
Name: Winning_Team, dtype: int64

In [29]:
from sklearn.model_selection import cross_val_score

In [39]:
n1=cross_val_score(LogisticRegression(),X,y,cv=5)
print(n1)

[0.55506608 0.54185022 0.53097345 0.45575221 0.5840708 ]


In [40]:
nr=cross_val_score(LogisticRegression(),X,y,cv=10)
print(nr)

[0.53508772 0.52631579 0.51327434 0.52212389 0.51327434 0.54867257
 0.44247788 0.48672566 0.5840708  0.55752212]


In [41]:
np.mean(n1)

0.5335425519472925

In [42]:
np.mean(nr)

0.5229545101692283

In [43]:
from sklearn.metrics import confusion_matrix , classification_report 

In [44]:
lr = LogisticRegression().fit(X_train,y_train)
predict = lr.predict(X_test)

In [45]:
from sklearn.utils.multiclass import unique_labels
unique_labels(y_test)

array([0, 1, 2])

In [46]:
#confussion matrix
def plot(y_true,y_pred):
 labels=unique_labels(y_test)
 column=[f'Predicted-{label}' for label in labels]
 inde=[f'Actual-{label}' for label in labels]
 table = pd.DataFrame(confusion_matrix(y_true,y_pred),
                     columns=column,index=inde)
 return table

In [47]:
plot(y_test,predict)

Unnamed: 0,Predicted-0,Predicted-1,Predicted-2
Actual-0,6,29,46
Actual-1,7,66,40
Actual-2,9,26,111


In [116]:
score = cross_val_score(LogisticRegression(),X_train,y_train)

In [117]:
np.mean(score)

0.5315978027227132

In [118]:
print(classification_report(y_test,predict))

              precision    recall  f1-score   support

           0       0.27      0.07      0.12        81
           1       0.55      0.58      0.56       113
           2       0.56      0.76      0.65       146

    accuracy                           0.54       340
   macro avg       0.46      0.47      0.44       340
weighted avg       0.49      0.54      0.49       340



In [86]:
score1 = cross_val_score(LogisticRegression(),X_test,y_test)

In [87]:
np.mean(score1)

0.4764705882352941

In [48]:
from sklearn.ensemble import RandomForestClassifier

In [49]:
n=cross_val_score(RandomForestClassifier(n_estimators=40),X,y)
print(n)

[0.62995595 0.66519824 0.6460177  0.5840708  0.73893805]


In [50]:
nf=cross_val_score(RandomForestClassifier(n_estimators=40),X,y,cv=10)
print(nf)

[0.5877193  0.62280702 0.63716814 0.7079646  0.68141593 0.62831858
 0.51327434 0.6460177  0.7079646  0.7699115 ]


In [51]:
np.mean(n)

0.6528361467389185

In [52]:
np.mean(nf)

0.6502561714019562

In [53]:
rf = RandomForestClassifier(n_estimators=40)
rf.fit(X_train,y_train)
predict1 = rf.predict(X_test)

In [54]:
plot(y_test,predict1)

Unnamed: 0,Predicted-0,Predicted-1,Predicted-2
Actual-0,30,15,36
Actual-1,9,72,32
Actual-2,16,21,109


In [55]:
print(classification_report(y_test,predict1))

              precision    recall  f1-score   support

           0       0.55      0.37      0.44        81
           1       0.67      0.64      0.65       113
           2       0.62      0.75      0.67       146

    accuracy                           0.62       340
   macro avg       0.61      0.58      0.59       340
weighted avg       0.62      0.62      0.61       340



In [56]:
score2 = cross_val_score(RandomForestClassifier(n_estimators=40),X_train,y_train)

In [57]:
np.mean(score2)

0.5833611973568984

In [58]:
score3 = cross_val_score(RandomForestClassifier(n_estimators=40),X_test,y_test)

In [59]:
np.mean(score3)

0.5323529411764707

In [60]:
from sklearn.tree import DecisionTreeClassifier

In [61]:
n2=cross_val_score(DecisionTreeClassifier(),X,y)
print(n2)

[0.59471366 0.65198238 0.63716814 0.57079646 0.69469027]


In [62]:
nd=cross_val_score(DecisionTreeClassifier(),X,y,cv=10)
print(nd)

[0.54385965 0.64035088 0.65486726 0.69026549 0.65486726 0.61946903
 0.54867257 0.6460177  0.7079646  0.67256637]


In [65]:
np.mean(n2)

0.6298701804997855

In [66]:
np.mean(nd)

0.6378900791802515

In [67]:
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)
predict2 = dt.predict(X_test)

In [68]:
plot(y_test,predict2)

Unnamed: 0,Predicted-0,Predicted-1,Predicted-2
Actual-0,34,14,33
Actual-1,17,68,28
Actual-2,20,25,101


In [69]:
print(classification_report(y_test,predict2))

              precision    recall  f1-score   support

           0       0.48      0.42      0.45        81
           1       0.64      0.60      0.62       113
           2       0.62      0.69      0.66       146

    accuracy                           0.60       340
   macro avg       0.58      0.57      0.57       340
weighted avg       0.59      0.60      0.59       340



In [70]:
score4 = cross_val_score(DecisionTreeClassifier(),X_train,y_train)

In [71]:
np.mean(score4)

0.5568585303717857

In [72]:
score5 = cross_val_score(DecisionTreeClassifier(),X_test,y_test)

In [73]:
np.mean(score5)

0.5176470588235295

In [74]:
ranking = pd.read_csv('fifa_rankings.csv') 
fixtures = pd.read_csv('fixtures.csv')
pred_set = []

In [155]:
fixtures.insert(1, 'first_position', fixtures['Home_Team'].map(ranking.set_index('Team')['Position']))
fixtures.insert(2, 'second_position', fixtures['Away_Team'].map(ranking.set_index('Team')['Position']))
fixtures = fixtures.iloc[:48, :]
fixtures.tail()

Unnamed: 0,Round Number,first_position,second_position,Home_Team,Away_Team,Group,Result
43,3,11.0,34.0,Germany,Costa Rica,Group E,
44,3,9.0,28.0,Portugal,Korea Republic,Group H,
45,3,13.0,60.0,Uruguay,Ghana,Group H,
46,3,16.0,25.0,Switzerland,Serbia,Group G,
47,3,1.0,38.0,Brazil,Cameroon,Group G,


In [156]:
for index, row in fixtures.iterrows():
    if row['first_position'] < row['second_position']:
        pred_set.append({'Home_Team': row['Home_Team'], 'Away_Team': row['Away_Team'], 'Winning_Team': None})
    else:
        pred_set.append({'Home_Team': row['Away_Team'], 'Away_Team': row['Home_Team'], 'Winning_Team': None})
        
pred_set = pd.DataFrame(pred_set)
backup_pred_set = pred_set

pred_set.head()

Unnamed: 0,Home_Team,Away_Team,Winning_Team
0,Netherlands,Senegal,
1,Ecuador,Qatar,
2,England,Iran,
3,USA,Wales,
4,Argentina,Saudi Arabia,


In [157]:
pred_set = pd.get_dummies(pred_set, prefix=['Home_Team', 'Away_Team'], columns=['Home_Team', 'Away_Team'])

missing_cols = set(world_cup_teams.columns) - set(pred_set.columns)
for c in missing_cols:
    pred_set[c] = 0
pred_set = pred_set[world_cup_teams.columns]

pred_set = pred_set.drop(['Winning_Team'], axis=1)

pred_set.head()

  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0


Unnamed: 0,Home_Team_Algeria,Home_Team_Angola,Home_Team_Argentina,Home_Team_Australia,Home_Team_Austria,Home_Team_Belgium,Home_Team_Bolivia,Home_Team_Brazil,Home_Team_Bulgaria,Home_Team_Cameroon,...,Away_Team_Turkey,Away_Team_USA,Away_Team_Ukraine,Away_Team_Uruguay,Away_Team_Wales,Away_Team_Yugoslavia,"Away_Team_rn"">Bosnia and Herzegovina","Away_Team_rn"">Republic of Ireland","Away_Team_rn"">Trinidad and Tobago","Away_Team_rn"">United Arab Emirates"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [158]:
predictions = rf.predict(pred_set)
for i in range(fixtures.shape[0]):
    print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
    if predictions[i] == 1:
        print("Winner: " + backup_pred_set.iloc[i, 1])
    elif predictions[i] == 0:
        print("Draw")
    elif predictions[i] == 2:
        print("Winner: " + backup_pred_set.iloc[i, 0])
    print('Probability of ' + backup_pred_set.iloc[i, 1] + ' winning: ', '%.3f'%(rf.predict_proba(pred_set)[i][1]))
    print('Probability of Draw: ', '%.3f'%(rf.predict_proba(pred_set)[i][0]))
    print('Probability of ' + backup_pred_set.iloc[i, 0] + ' winning: ', '%.3f'%(rf.predict_proba(pred_set)[i][2]))
    print("")

Senegal and Netherlands
Winner: Netherlands
Probability of Senegal winning:  0.250
Probability of Draw:  0.175
Probability of Netherlands winning:  0.575

Qatar and Ecuador
Winner: Ecuador
Probability of Qatar winning:  0.242
Probability of Draw:  0.163
Probability of Ecuador winning:  0.596

Iran and England
Winner: England
Probability of Iran winning:  0.146
Probability of Draw:  0.173
Probability of England winning:  0.681

Wales and USA
Winner: Wales
Probability of Wales winning:  0.431
Probability of Draw:  0.394
Probability of USA winning:  0.175

Saudi Arabia and Argentina
Winner: Argentina
Probability of Saudi Arabia winning:  0.100
Probability of Draw:  0.050
Probability of Argentina winning:  0.850

Tunisia and Denmark
Winner: Denmark
Probability of Tunisia winning:  0.125
Probability of Draw:  0.100
Probability of Denmark winning:  0.775

Poland and Mexico
Winner: Poland
Probability of Poland winning:  0.925
Probability of Draw:  0.025
Probability of Mexico winning:  0.050



In [159]:
round_of_16 = [('Netherlands','England'),
              ('Argentina','Denmark'),
              ('Wales','Ecuador'),
              ('France','Poland'),
              ('Germany','Belgium'),
              ('Brazil','Uruguay'),
              ('Croatia','Spain'),
              ('Ghana','Switzerland')]

In [160]:
def clean_and_predict(matches, ranking, world_cup_teams, rf):
    positions = []
    for match in matches:
        positions.append(ranking.loc[ranking['Team'] == match[0],'Position'].iloc[0])
        positions.append(ranking.loc[ranking['Team'] == match[1],'Position'].iloc[0])
    pred_set = []
    i = 0
    j = 0
    while i < len(positions):
        dict1 = {}
        if positions[i] < positions[i + 1]:
            dict1.update({'Home_Team': matches[j][0], 'Away_Team': matches[j][1]})
        else:
            dict1.update({'Home_Team': matches[j][1], 'Away_Team': matches[j][0]})
        pred_set.append(dict1)
        i += 2
        j += 1
    pred_set = pd.DataFrame(pred_set)
    backup_pred_set = pred_set
    
    pred_set = pd.get_dummies(pred_set, prefix=['Home_Team', 'Away_Team'], columns=['Home_Team', 'Away_Team'])

    missing_cols2 = set(world_cup_teams.columns) - set(pred_set.columns)
    for c in missing_cols2:
        pred_set[c] = 0
    pred_set = pred_set[world_cup_teams.columns]

    pred_set = pred_set.drop(['Winning_Team'], axis=1)

    predictions = rf.predict(pred_set)
    for i in range(len(pred_set)):
        print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
        if predictions[i] == 1:
            print("Winner: " + backup_pred_set.iloc[i, 1])
        elif predictions[i] == 0:
            print("Draw")
        elif predictions[i] == 2:
            print("Winner: " + backup_pred_set.iloc[i, 0])
        print('Probability of ' + backup_pred_set.iloc[i, 1] + ' winning: ' , '%.3f'%(rf.predict_proba(pred_set)[i][1]))
        print('Probability of Draw: ', '%.3f'%(rf.predict_proba(pred_set)[i][0])) 
        print('Probability of ' + backup_pred_set.iloc[i, 0] + ' winning: ', '%.3f'%(rf.predict_proba(pred_set)[i][2]))
        print("")

In [161]:
clean_and_predict(round_of_16, ranking, world_cup_teams, rf)

Netherlands and England
Draw
Probability of Netherlands winning:  0.125
Probability of Draw:  0.725
Probability of England winning:  0.150

Denmark and Argentina
Winner: Argentina
Probability of Denmark winning:  0.250
Probability of Draw:  0.100
Probability of Argentina winning:  0.650

Ecuador and Wales
Winner: Ecuador
Probability of Ecuador winning:  0.550
Probability of Draw:  0.000
Probability of Wales winning:  0.450

Poland and France
Winner: Poland
Probability of Poland winning:  0.470
Probability of Draw:  0.127
Probability of France winning:  0.403

Germany and Belgium
Winner: Germany
Probability of Germany winning:  0.950
Probability of Draw:  0.025
Probability of Belgium winning:  0.025

Uruguay and Brazil
Winner: Brazil
Probability of Uruguay winning:  0.059
Probability of Draw:  0.125
Probability of Brazil winning:  0.816

Croatia and Spain
Winner: Croatia
Probability of Croatia winning:  0.535
Probability of Draw:  0.113
Probability of Spain winning:  0.352

Ghana and Sw

  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0


In [165]:
quarter_final = [('England','Argentina'),
                ('Ecuador','Poland'),
                ('Germany','Brazil'),
                ('Croatia','Ghana')]
clean_and_predict(quarter_final, ranking, world_cup_teams, rf)

England and Argentina
Winner: England
Probability of England winning:  0.430
Probability of Draw:  0.386
Probability of Argentina winning:  0.184

Ecuador and Poland
Winner: Ecuador
Probability of Ecuador winning:  0.718
Probability of Draw:  0.025
Probability of Poland winning:  0.257

Germany and Brazil
Winner: Brazil
Probability of Germany winning:  0.496
Probability of Draw:  0.000
Probability of Brazil winning:  0.504

Ghana and Croatia
Draw
Probability of Ghana winning:  0.400
Probability of Draw:  0.400
Probability of Croatia winning:  0.200



  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0


In [166]:
semi_final = [('England','Ecuador'),
             ('Brazil','Ghana')]
clean_and_predict(semi_final, ranking, world_cup_teams, rf)

Ecuador and England
Winner: England
Probability of Ecuador winning:  0.075
Probability of Draw:  0.075
Probability of England winning:  0.850

Ghana and Brazil
Winner: Brazil
Probability of Ghana winning:  0.025
Probability of Draw:  0.023
Probability of Brazil winning:  0.952



  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0


In [167]:
final = [('England','Brazil')]
clean_and_predict(final, ranking, world_cup_teams, rf)

England and Brazil
Winner: Brazil
Probability of England winning:  0.078
Probability of Draw:  0.075
Probability of Brazil winning:  0.847



  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
  pred_set[c] = 0
