In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('FIFAallMatchBoxData.csv', sep=';')
data['result'] = np.where(data['hgoals'] > data['agoals'],
                          data['hname'], 
                          np.where(data['hgoals'] < data['agoals'], 
                                   data['aname'], 
                                   'Tie'))

countries = list(set(data['hname'].tolist() + data['aname'].tolist()))
print('Tengo {} paises'.format(len(countries)))
data.head()

Tengo 128 paises


Unnamed: 0,year,hname,aname,hgoals,agoals,hPossesion,aPossesion,hshotsOnTarget,ashotsOnTarget,hshots,ashots,hyellowCards,ayellowCards,hredCards,aredCards,hfouls,afouls,hsaves,asaves,result
0,2012,Mexico,Costa Rica,1,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,Mexico
1,2012,Antigua and Barbuda,United States,1,2,24,76,2,4,14,17,1,1,0,0,11,13,2,1,United States
2,2012,United States,Guatemala,3,1,80,20,3,1,5,2,0,0,0,0,1,3,0,1,United States
3,2013,Honduras,United States,2,1,57,43,4,2,12,11,0,0,0,0,10,12,1,2,Honduras
4,2013,Panama,Costa Rica,2,2,47,53,5,3,13,11,1,1,0,0,7,10,1,3,Tie


In [3]:
def vectorize(row):
    countries_dict = {c:0 for c in countries}
    countries_dict[row.get('hname')] = 1
    countries_dict[row.get('aname')] = 1
    countries_dict['result'] = row.get('result')
    countries_dict['year'] = row.get('year')
    return countries_dict

list_of_matches = []
for _, row in data.iterrows():
    list_of_matches.append(vectorize(row))
    
matches = pd.DataFrame(list_of_matches)
matches.sample(2)

Unnamed: 0,Andorra,Bermuda,Denmark,Republic of Ireland,Curacao,Finland,Lithuania,Panama,Zambia,Kosovo,...,Serbia & Montenegro,Spain,Cameroon,Bahamas,Czech Republic,Senegal,Hungary,Switzerland,result,year
607,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Belgium,2017
442,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,Bosnia and Herzegovina,2013


In [4]:
from lightgbm import LGBMClassifier

model = LGBMClassifier()
model.fit(matches[countries], 
          matches['result'])

LGBMClassifier()

In [5]:
matches['pred'] = model.predict(matches[countries])

In [7]:
countries

['Andorra',
 'Bermuda',
 'Denmark',
 'Republic of Ireland',
 'Curacao',
 'Finland',
 'Lithuania',
 'Panama',
 'Zambia',
 'Kosovo',
 'Italy',
 'Ukraine',
 'Congo DR',
 'Montserrat',
 'Ecuador',
 'Angola',
 'Trinidad and Tobago',
 'Germany',
 'Scotland',
 'Slovenia',
 'Algeria',
 'Cape Verde Islands',
 'Uganda',
 'Russia',
 'Suriname',
 'Estonia',
 'Turkey',
 'Moldova',
 'North Korea',
 'Gabon',
 'Poland',
 'Mexico',
 'Latvia',
 'Ivory Coast',
 'Dominican Republic',
 'South Africa',
 'Romania',
 'S',
 'Chile',
 'Guinea',
 'Colombia',
 'Congo',
 'China',
 'Brazil',
 'Australia',
 'Netherlands',
 'South Korea',
 'Belarus',
 'Aruba',
 'Luxembourg',
 'Saudi Arabia',
 'Morocco',
 'San Marino',
 'Malta',
 'Barbados',
 'Azerbaijan',
 'Gibraltar',
 'Guatemala',
 'Mali',
 'Venezuela',
 'Northern Ireland',
 'Argentina',
 'Bolivia',
 'Armenia',
 'Serbia',
 'IR Iran',
 'Grenada',
 'Burkina Faso',
 'Peru',
 'Jamaica',
 'Togo',
 'Nigeria',
 'Cayman Islands',
 'Ghana',
 'Egypt',
 'Cyprus',
 'Sweden',
 

In [8]:
def predict(country_1, country_2):
    countries_dict = {c:0 for c in countries}
    countries_dict[country_1] = 1
    countries_dict[country_2] = 1
    predictions = pd.DataFrame(model.predict_proba(pd.DataFrame([countries_dict])), columns = model._classes)
    return predictions[[country_1, country_2, 'Tie']].rename({'Tie':'Empate'}, axis=1) * 100




In [9]:
predict('England', 'Tunisia')

Unnamed: 0,England,Tunisia,Empate
0,61.499054,0.087324,36.215986


In [10]:
predict('Argentina', 'Mexico')

Unnamed: 0,Argentina,Mexico,Empate
0,93.843703,2.939386,3.03729


In [11]:
predict('Argentina', 'Saudi Arabia')

Unnamed: 0,Argentina,Saudi Arabia,Empate
0,89.19177,0.008788,10.117161


In [12]:
predict('Argentina', 'Poland')

Unnamed: 0,Argentina,Poland,Empate
0,72.368944,9.888699,13.266255


In [13]:
predict('Argentina', 'Brazil')

Unnamed: 0,Argentina,Brazil,Empate
0,4.859732,48.710143,45.714918


In [14]:
predict('Argentina', 'Netherlands')

Unnamed: 0,Argentina,Netherlands,Empate
0,10.058514,2.07962,81.161977


In [15]:
predict('Argentina', 'United States')

Unnamed: 0,Argentina,United States,Empate
0,60.603982,17.581582,15.371379


In [16]:
predict('Argentina', 'Portugal')

Unnamed: 0,Argentina,Portugal,Empate
0,63.838158,25.513888,10.208362


In [17]:
features=pd.concat([pd.DataFrame(matches[countries].columns),
                        pd.DataFrame(model.booster_.feature_importance(importance_type='gain')).astype(int)], axis=1)
features.columns=['feature', 'gain']
features.sort_values(by='gain', ascending=False).head(20)

Unnamed: 0,feature,gain
17,Germany,2082
121,Spain,1784
43,Brazil,1759
45,Netherlands,1719
84,England,1517
113,France,1499
61,Argentina,1449
85,Belgium,1436
10,Italy,1337
127,Switzerland,1253
