In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import category_encoders as cs
from sklearn.pipeline import FeatureUnion

In [3]:
# Load the match table (the first CSV column becomes the index) and discard two
# columns: 'Date', which is assumed to be irrelevant for the match results, and
# 'Team1_Ball_Possession(%)' (no rationale is recorded for this one).
worldcup = pd.read_csv("2018 worldcup.csv", index_col=0).drop(
    ['Date', 'Team1_Ball_Possession(%)'], axis=1)
# The slice spans every row, so train_wc holds the complete table.
train_wc = worldcup[:len(worldcup)]


In [4]:
# Positional column layout of train_wc (28 columns in total):
#   0..25 -> world cup attributes used as model inputs
#   26    -> world cup goal result
#   27    -> world cup match result
w_features = train_wc.iloc[:, list(range(26))].copy()
w_goals = train_wc.iloc[:, 26].copy()
w_results = train_wc.iloc[:, 27].copy()

# Displayed as the cell output to confirm the column count.
len(train_wc.columns)


28

In [130]:
# Column-picking transformer: lets a pipeline start from a DataFrame and work on
# only the numerical or only the categorical columns.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select a fixed set of DataFrame columns and expose them as an array.

    attribute_names: the column label(s) used to index the DataFrame passed
    to ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; the selector itself is returned."""
        return self

    def transform(self, X):
        """Return ``.values`` of the configured columns of ``X``."""
        selected = X[self.attribute_names]
        return selected.values

In [131]:
def feature_process(features, categorical_columns=None):
    """Build the (unfitted) preprocessing pipeline for the match features.

    Columns listed in ``categorical_columns`` are one-hot encoded; every other
    column of ``features`` is treated as numeric, median-imputed and scaled
    without centering (``with_mean=False``). Both branches are combined
    side by side in a ``FeatureUnion``.

    Parameters
    ----------
    features : pandas.DataFrame
        Only its column labels are read here; no data is transformed.
    categorical_columns : sequence of str, optional
        Labels of the categorical columns. Defaults to the seven categorical
        columns this notebook has always used.

    Returns
    -------
    sklearn.pipeline.FeatureUnion
        Call ``fit_transform`` / ``transform`` on it with a DataFrame that has
        the same columns as ``features``.

    Raises
    ------
    KeyError
        If a requested categorical column is not present in ``features``.
    """
    if categorical_columns is None:
        categorical_columns = ['Location', 'Phase', 'Team1', 'Team2',
                               'Team1_Continent', 'Team2_Continent', 'Normal_Time']
    categorical_columns = list(categorical_columns)

    # Fail early with a clear message instead of deep inside a pipeline step.
    missing = [col for col in categorical_columns if col not in features.columns]
    if missing:
        raise KeyError("categorical columns not found in features: %r" % (missing,))

    # Everything that is not categorical is numeric, in the frame's column order.
    numeric_columns = [col for col in features.columns if col not in categorical_columns]

    num_pipeline = Pipeline([
            ('selector', DataFrameSelector(numeric_columns)),
            ('imputer', Imputer(strategy="median")),
            ('std_scaler', StandardScaler(with_mean = False)),
        ])

    cat_pipeline = Pipeline([
            ('selector', DataFrameSelector(categorical_columns)),
            ('cat_encoder', cs.OneHotEncoder(drop_invariant=True)),
        ])

    full_pipeline = FeatureUnion(transformer_list=[
            ("num_pipeline", num_pipeline),
            ("cat_pipeline", cat_pipeline),
        ])
    return full_pipeline

In [132]:
full_pipeline = feature_process(w_features)

# NOTE(review): the pipeline is fitted on every row, and the train/test split is
# only taken in later cells, so the imputer/scaler/encoder see the test rows too.
# Reuse the source index instead of assuming labels 1..N, so that the concat
# below lines up row-for-row with w_goals / w_results whatever the CSV index is.
feature_prepared = pd.DataFrame(
    data=full_pipeline.fit_transform(w_features),
    index=w_features.index,
)

worldcup_cleaned = pd.concat(
    [feature_prepared, w_goals.to_frame(), w_results.to_frame()], axis=1)

In [133]:
#Logistic Regression Model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Unshuffled hold-out: the first 80% of the rows train the model, the rest test it.
# The split position is computed once so features and targets cannot drift apart.
n_train = int(0.8 * len(feature_prepared))
train_data = feature_prepared[:n_train]
test_data = feature_prepared[n_train:]
train_goals = w_goals[:n_train]
test_goals = w_goals[n_train:]

# NOTE(review): max_iter=3 caps the solver at three iterations - confirm this is intentional.
model = LogisticRegression(max_iter = 3)
model.fit(train_data,train_goals)
T_predict = model.predict(test_data)   # predictions on the held-out rows
W_predict = model.predict(train_data)  # predictions on the training rows

# print() call form: same output on Python 2, and valid syntax on Python 3.
print(model)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=3, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [134]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the current model's predictions on both splits first, then print the report.
# r2_score is the coefficient of determination (1.0 is a perfect fit); the report
# labels it "Variance score".
test_mse = mean_squared_error(test_goals, T_predict)
test_r2 = r2_score(test_goals, T_predict)
train_mse = mean_squared_error(train_goals, W_predict)
train_r2 = r2_score(train_goals, W_predict)

print(' ')
print('_________________###################____________________')
print("Mean squared error for testing data: %.2f" % test_mse)
print('Variance score for testing data: %.2f' % test_r2)
print('******************************************************* ')
print("Mean squared error for training data: %.2f" % train_mse)
print('Variance score for training data: %.2f' % train_r2)

 
_________________###################____________________
Mean squared error for testing data: 2.15
Variance score for testing data: -0.05
******************************************************* 
Mean squared error for training data: 2.22
Variance score for training data: 0.13


In [135]:
# Linear Regression Model
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the same train/test split as the previous model.
model = LinearRegression(n_jobs = None)
model.fit(train_data, train_goals)
T_predict = model.predict(test_data)   # predictions on the held-out rows
W_predict = model.predict(train_data)  # predictions on the training rows
# print() call form: same output on Python 2, and valid syntax on Python 3.
print(model)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)


In [136]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the current model's predictions on both splits first, then print the report.
# r2_score is the coefficient of determination (1.0 is a perfect fit); the report
# labels it "Variance score".
test_mse = mean_squared_error(test_goals, T_predict)
test_r2 = r2_score(test_goals, T_predict)
train_mse = mean_squared_error(train_goals, W_predict)
train_r2 = r2_score(train_goals, W_predict)

print(' ')
print('_________________###################____________________')
print("Mean squared error for testing data: %.2f" % test_mse)
print('Variance score for testing data: %.2f' % test_r2)
print('******************************************************* ')
print("Mean squared error for training data: %.2f" % train_mse)
print('Variance score for training data: %.2f' % train_r2)


 
_________________###################____________________
Mean squared error for testing data: 5.29
Variance score for testing data: -1.57
******************************************************* 
Mean squared error for training data: 0.00
Variance score for training data: 1.00


In [137]:
# Linear Ridge Regression
from sklearn.linear_model import Ridge

# Ridge regression (alpha=1.0) on the same train/test split as the previous models.
model = Ridge(alpha=1.0)
model.fit(train_data, train_goals)
T_predict = model.predict(test_data)   # predictions on the held-out rows
W_predict = model.predict(train_data)  # predictions on the training rows
# print() call form: same output on Python 2, and valid syntax on Python 3.
print(model)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)


In [138]:
# Score the current model's predictions on both splits first, then print the report.
# r2_score is the coefficient of determination (1.0 is a perfect fit); the report
# labels it "Variance score". mean_squared_error / r2_score were imported by an
# earlier cell.
test_mse = mean_squared_error(test_goals, T_predict)
test_r2 = r2_score(test_goals, T_predict)
train_mse = mean_squared_error(train_goals, W_predict)
train_r2 = r2_score(train_goals, W_predict)

print(' ')
print('_________________###################____________________')
print("Mean squared error for testing data: %.2f" % test_mse)
print('Variance score for testing data: %.2f' % test_r2)
print('******************************************************* ')
print("Mean squared error for training data: %.2f" % train_mse)
print('Variance score for training data: %.2f' % train_r2)

 
_________________###################____________________
Mean squared error for testing data: 3.98
Variance score for testing data: -0.93
******************************************************* 
Mean squared error for training data: 0.15
Variance score for training data: 0.94


In [139]:
# Decision Tree For Classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Unshuffled hold-out on the match result target: first 80% of rows train, rest test.
# The split position is computed once so features and targets cannot drift apart.
n_train = int(0.8 * len(feature_prepared))
train_data = feature_prepared[:n_train]
test_data = feature_prepared[n_train:]
train_results = w_results[:n_train]
test_results = w_results[n_train:]

# Exhaustive search over tree size and split threshold; the tree is seeded for repeatability.
params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1, verbose=1)
grid_search_cv.fit(train_data, train_results)
# print() call form: same output on Python 2, and valid syntax on Python 3.
print(grid_search_cv.best_estimator_)


T_predict = grid_search_cv.predict(test_data)
print("The prediction accuracy using the decision tree is : {:.2f}%.".format(100*accuracy_score(test_results, T_predict)))


Fitting 3 folds for each of 294 candidates, totalling 882 fits


[Parallel(n_jobs=-1)]: Done 428 tasks      | elapsed:    1.9s


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=2, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=42, splitter='best')
The prediction accuracy using the decision tree is : 53.85%.


[Parallel(n_jobs=-1)]: Done 882 out of 882 | elapsed:    3.1s finished


In [148]:
# Random Forest For Classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Unshuffled hold-out on the match result target: first 80% of rows train, rest test.
# The split position is computed once so features and targets cannot drift apart.
n_train = int(0.8 * len(feature_prepared))
train_data = feature_prepared[:n_train]
test_data = feature_prepared[n_train:]
train_results = w_results[:n_train]
test_results = w_results[n_train:]

# Same search grid as the single decision tree; the forest is seeded for repeatability.
params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}
grid_search_cv = GridSearchCV(RandomForestClassifier(random_state=42), params, n_jobs=-1, verbose=1)
grid_search_cv.fit(train_data, train_results)
# print() call form: same output on Python 2, and valid syntax on Python 3.
print(grid_search_cv.best_estimator_)

T_predict = grid_search_cv.predict(test_data)
print("The prediction accuracy using the random forest is : {:.2f}%.".format(100*accuracy_score(test_results, T_predict)))



Fitting 3 folds for each of 294 candidates, totalling 882 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:    7.2s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=8,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
The prediction accuracy using the random forest is : 38.46%.


[Parallel(n_jobs=-1)]: Done 882 out of 882 | elapsed:   16.5s finished


In [147]:
# Naive Bayes
from sklearn.naive_bayes import MultinomialNB

# NOTE(review): MultinomialNB rejects negative feature values; this presumably
# relies on the scaler in feature_process being built with with_mean=False - confirm.
model = MultinomialNB()
model.fit(train_data, train_results)
# (A stray pasted repr that constructed and discarded a second MultinomialNB was removed here.)

T_predict = model.predict(test_data)
# print() call form: same output on Python 2, and valid syntax on Python 3.
print(model)
print("The prediction accuracy using the Navie Bayes is : {:.2f}%.".format(100*accuracy_score(test_results, T_predict)))


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
The prediction accuracy using the Navie Bayes is : 61.54%.


In [153]:
# Perceptron
from sklearn.linear_model import Perceptron

# NOTE(review): alpha only multiplies the regularization term, and no penalty is
# set here (the printed repr shows penalty=None), so alpha=1 looks like a no-op -
# confirm whether a penalty was intended.
model = Perceptron(alpha=1)
model.fit(train_data, train_results)

T_predict = model.predict(test_data)
# print() call form: same output on Python 2, and valid syntax on Python 3.
print(model)
print("The prediction accuracy using the Perceptron is : {:.2f}%.".format(100*accuracy_score(test_results, T_predict)))




Perceptron(alpha=1, class_weight=None, eta0=1.0, fit_intercept=True,
      max_iter=None, n_iter=None, n_jobs=1, penalty=None, random_state=0,
      shuffle=True, tol=None, verbose=0, warm_start=False)
The prediction accuracy using the Perceptron is : 23.08%.


In [159]:
# Nearest Neighbour Classifier

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier


# NOTE(review): per the scikit-learn docs leaf_size tunes the speed/memory of the
# BallTree/KDTree lookup, so searching over it is unlikely to change accuracy and
# multiplies the number of fits by 40 - consider searching n_neighbors only.
params = {'leaf_size': list(range(10, 50)), 'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9]}
grid_search_cv = GridSearchCV(KNeighborsClassifier(), params, n_jobs=-1, verbose=1)
grid_search_cv.fit(train_data, train_results)
# print() call form: same output on Python 2, and valid syntax on Python 3.
print(grid_search_cv.best_estimator_)


T_predict = grid_search_cv.predict(test_data)
# The message previously said "Perceptron" (copy-paste slip); it now names this model.
print("The prediction accuracy using the nearest neighbour classifier is : {:.2f}%.".format(100*accuracy_score(test_results, T_predict)))



Fitting 3 folds for each of 320 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Done 712 tasks      | elapsed:    2.4s


KNeighborsClassifier(algorithm='auto', leaf_size=10, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')
The prediction accuracy using the Perceptron is : 15.38%.


[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:    2.7s finished


In [163]:
# SVC

# KFold was used below without ever being imported, which raises NameError on a
# fresh kernel; GridSearchCV is imported here too so the cell is self-contained.
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.svm import SVC

# RBF kernel with C swept over 1, 2, 4, 8, 16, 32.
param_grid = [
        {'kernel': ['rbf'], 'C': [ 2**x for x in range(0,6) ]},
    ]
# Shuffled, seeded 3-fold split used for the inner model selection.
inner_cv = KFold(n_splits=3, shuffle=True, random_state=1)
grid_search = GridSearchCV(SVC(), param_grid, cv=inner_cv,  n_jobs=1, scoring='accuracy',verbose=3)
grid_search.fit(train_data, train_results)
clf=grid_search.best_estimator_


T_predict = grid_search.predict(test_data)
# print() call form: same output on Python 2, and valid syntax on Python 3.
print(grid_search)
# The message previously said "Perceptron" (copy-paste slip); it now names this model.
print("The prediction accuracy using the SVC is : {:.2f}%.".format(100*accuracy_score(test_results, T_predict)))




Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] kernel=rbf, C=1 .................................................
[CV] ............ kernel=rbf, C=1, score=0.470588235294, total=   0.0s
[CV] kernel=rbf, C=1 .................................................
[CV] ............ kernel=rbf, C=1, score=0.294117647059, total=   0.0s
[CV] kernel=rbf, C=1 .................................................
[CV] ............ kernel=rbf, C=1, score=0.352941176471, total=   0.0s
[CV] kernel=rbf, C=2 .................................................
[CV] ............ kernel=rbf, C=2, score=0.352941176471, total=   0.0s
[CV] kernel=rbf, C=2 .................................................
[CV] ............ kernel=rbf, C=2, score=0.176470588235, total=   0.0s
[CV] kernel=rbf, C=2 .................................................
[CV] ............ kernel=rbf, C=2, score=0.294117647059, total=   0.0s
[CV] kernel=rbf, C=4 .................................................
[CV] ............

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    0.1s finished
