In [50]:
#!pip install scikit-optimize --user --quiet

In [60]:
import pandas as pd
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix, mean_squared_error, make_scorer, mean_absolute_error

import numpy as np

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import classification_report

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
from sklearn.manifold import LocallyLinearEmbedding

from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

from sklearn.model_selection import GridSearchCV,train_test_split, cross_val_score, TimeSeriesSplit

from yellowbrick.classifier.rocauc import roc_auc
from yellowbrick.target.feature_correlation import feature_correlation
from yellowbrick.classifier import precision_recall_curve

import warnings

from skopt import BayesSearchCV

pd.set_option('display.precision', 3)


In [42]:
from IPython.display import display, HTML
show_html = lambda html: display(HTML(html))

In [43]:
df = pd.read_csv('international_matches_clean.csv')
df.head()

Unnamed: 0,home_team_continent,away_team_continent,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,shoot_out,home_team_goalkeeper_score,away_team_goalkeeper_score,home_team_mean_defense_score,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score,home_team_score,away_team_score
0,2,2,3,67,0,0,0,94.0,84.0,86.5,89.3,89.5,80.2,79.7,81.8,1,1
1,2,2,90,7,0,0,0,83.0,88.0,76.2,73.0,74.0,90.5,88.7,91.2,2,2
2,2,2,25,76,0,0,0,77.0,74.0,80.5,78.7,79.0,71.8,75.7,70.2,3,0
3,2,2,80,41,0,0,0,78.0,78.0,68.8,77.0,69.2,70.5,79.7,78.5,1,3
4,2,2,9,38,0,0,0,97.0,79.0,91.8,92.3,87.5,79.2,81.3,79.0,2,1


In [44]:
X = df[df.columns.difference(['home_team_score', 'away_team_score'])]
y = df[['home_team_score', 'away_team_score']]

In [45]:
X_train, X_test, y_trainF, y_testF = train_test_split(X, y, test_size=0.3, random_state=0)

In [46]:
y_train = y_trainF['home_team_score'] - y_trainF['away_team_score']
y_test = y_testF['home_team_score'] - y_testF['away_team_score']


# **Random Forest**

In [47]:
param = {'n_estimators': [5,10,25,40, 50, 75,100, 200], 
         'criterion':['gini', 'entropy'], 
         'max_depth':[None, 1, 2, 3,  5,  8, 9,10,15],
         'min_samples_leaf':[1,2,3,5,10]}

cv = 5

rf =  RandomForestClassifier(random_state=0)
rf_bs =  BayesSearchCV(rf,param,n_iter=iter, cv=cv, n_jobs=-1, refit=True, random_state=0)
rf_bs.fit(X_train, y_train)

BayesSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0), n_iter=40,
              n_jobs=-1, random_state=0,
              search_spaces={'criterion': ['gini', 'entropy'],
                             'max_depth': [None, 1, 2, 3, 5, 8, 9, 10, 15],
                             'min_samples_leaf': [1, 2, 3, 5, 10],
                             'n_estimators': [5, 10, 25, 40, 50, 75, 100, 200]})

In [48]:
show_html(pd.DataFrame(rf_bs.cv_results_).loc[:,['params', 'mean_test_score','rank_test_score']].sort_values(by='rank_test_score').head().to_html())

Unnamed: 0,params,mean_test_score,rank_test_score
30,"{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 3, 'n_estimators': 100}",0.295,1
16,"{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 2, 'n_estimators': 100}",0.293,2
31,"{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 3, 'n_estimators': 75}",0.292,3
9,"{'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 2, 'n_estimators': 100}",0.291,4
12,"{'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 1, 'n_estimators': 100}",0.291,5


In [53]:
print(classification_report(rf_bs.predict(X_test), y_test))

              precision    recall  f1-score   support

          -5       0.00      0.00      0.00         0
          -4       0.00      0.00      0.00         0
          -3       0.00      0.00      0.00         0
          -2       0.00      0.00      0.00         0
          -1       0.15      0.33      0.21       101
           0       0.59      0.30      0.40       692
           1       0.48      0.27      0.35       497
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0

    accuracy                           0.29      1291
   macro avg       0.09      0.07      0.07      1291
weighted avg       0.51      0.29      0.36      1291



# **SVM con kernel RBF**

In [54]:
param = {'C':10**np.linspace(-3,3,101), 'gamma':['scale','auto']}

niter = 15

rbsvc =  SVC(kernel='rbf', max_iter=25000, random_state=0)
rbsvc_gs = BayesSearchCV(rbsvc,param,n_iter=niter, cv=cv, n_jobs=-1, refit=True, random_state=0)
rbsvc_gs.fit(X_train, y_train);

In [55]:
show_html(pd.DataFrame(rbsvc_gs.cv_results_).loc[:,['params', 'mean_test_score','rank_test_score']].sort_values(by='rank_test_score').head().to_html())

Unnamed: 0,params,mean_test_score,rank_test_score
6,"{'C': 660.6934480075951, 'gamma': 'scale'}",0.277,1
5,"{'C': 109.64781961431851, 'gamma': 'scale'}",0.276,2
12,"{'C': 6.918309709189363, 'gamma': 'scale'}",0.267,3
7,"{'C': 1.1481536214968828, 'gamma': 'scale'}",0.267,4
1,"{'C': 0.03630780547701014, 'gamma': 'scale'}",0.266,5


In [56]:
print(classification_report(rbsvc_gs.predict(X_test), y_test))

              precision    recall  f1-score   support

          -5       0.00      0.00      0.00         0
          -4       0.00      0.00      0.00         0
          -3       0.00      0.00      0.00         0
          -2       0.00      0.00      0.00         0
          -1       0.02      0.57      0.04         7
           0       0.69      0.28      0.40       865
           1       0.38      0.25      0.30       419
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0

    accuracy                           0.27      1291
   macro avg       0.08      0.09      0.06      1291
weighted avg       0.58      0.27      0.37      1291



# **MLP**

In [57]:
sdscaler = StandardScaler()

X_train_sd = sdscaler.fit_transform(X_train)
X_test_sd = sdscaler.transform(X_test)

In [58]:
mlp = MLPClassifier(max_iter=10000, early_stopping=True, n_iter_no_change=15, random_state=0)
print(np.mean(cross_val_score(mlp,X_train_sd,y_train,cv=10)))

0.2739015643220171


In [59]:
param = {'hidden_layer_sizes':[10, 50, 100, 200], 
         'activation':['relu', 'logistic', 'identity'], 
         'learning_rate_init': [0.001, 0.01, 0.1]  }

mlp =  MLPClassifier(max_iter=10000, early_stopping=True, n_iter_no_change=20,learning_rate='adaptive',random_state=0)
mlp_gs =  GridSearchCV(mlp,param,cv=cv, n_jobs=-1, refit=True)
mlp_gs.fit(X_train_sd, y_train);

In [61]:
show_html(pd.DataFrame(mlp_gs.cv_results_).loc[:,['params', 'mean_test_score','rank_test_score']].sort_values(by='rank_test_score').head().to_html())

Unnamed: 0,params,mean_test_score,rank_test_score
16,"{'activation': 'logistic', 'hidden_layer_sizes': 50, 'learning_rate_init': 0.01}",0.29,1
13,"{'activation': 'logistic', 'hidden_layer_sizes': 10, 'learning_rate_init': 0.01}",0.285,2
15,"{'activation': 'logistic', 'hidden_layer_sizes': 50, 'learning_rate_init': 0.001}",0.285,3
28,"{'activation': 'identity', 'hidden_layer_sizes': 50, 'learning_rate_init': 0.01}",0.284,4
26,"{'activation': 'identity', 'hidden_layer_sizes': 10, 'learning_rate_init': 0.1}",0.283,5
