In [73]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import category_encoders as cs
from sklearn.pipeline import FeatureUnion

In [74]:
# Load the match table (first CSV column becomes the row index) and discard
# two columns: the match date, assumed irrelevant for the match results, and
# Team1's ball-possession percentage.
worldcup = pd.read_csv("2018 worldcup.csv", index_col=0).drop(
    ['Date', 'Team1_Ball_Possession(%)'], axis=1
)

n_matches = len(worldcup)
# NOTE(review): this slice keeps every row, so train_wc is the full table
# (there is no 0.8 factor here) -- confirm whether an 80% cut was intended.
train_wc = worldcup[:n_matches]
# Last 20% of the rows, taken positionally.
test_wc = worldcup[int(0.8 * n_matches):]


In [75]:
# World cup attributes: the first 26 columns (positions 0-25).
w_features = train_wc.iloc[:, :26].copy()
# World cup goal result: column at position 26.
w_goals = train_wc.iloc[:, 26].copy()
# World cup match result: column at position 27.
w_results = train_wc.iloc[:, 27].copy()


In [76]:
# Create a class to select numerical or categorical columns,
# since Scikit-Learn doesn't handle DataFrames this way yet.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that extracts a fixed set of columns from a DataFrame.

    Parameters
    ----------
    attribute_names
        Column labels to pick in ``transform``; stored unchanged on the
        instance under the same name.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Return the configured columns of ``X`` via ``.values``."""
        selected = X[self.attribute_names]
        return selected.values

In [77]:
def feature_process(features, categorical_columns=None):
    """Build the unfitted preprocessing pipeline for the match features.

    Every column of ``features`` that is not categorical goes through median
    imputation followed by standard scaling; the categorical columns are
    one-hot encoded. Both branches are joined with a ``FeatureUnion``.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature table. Only its column labels are inspected here; no data is
        copied or fitted.
    categorical_columns : sequence of str, optional
        Labels of the categorical columns. When omitted, the seven columns
        this notebook has always treated as categorical are used.

    Returns
    -------
    sklearn.pipeline.FeatureUnion
        Unfitted transformer combining the numerical and categorical branches.

    Raises
    ------
    KeyError
        If a categorical column is not present in ``features``.
    """
    # ``sklearn.preprocessing.Imputer`` was deprecated in scikit-learn 0.20 and
    # removed in 0.22; ``SimpleImputer`` is its replacement. Fall back to the
    # old class only on installations that predate ``sklearn.impute``.
    try:
        from sklearn.impute import SimpleImputer as MedianImputer
    except ImportError:
        from sklearn.preprocessing import Imputer as MedianImputer

    if categorical_columns is None:
        categorical_columns = ['Location', 'Phase', 'Team1', 'Team2',
                               'Team1_Continent', 'Team2_Continent', 'Normal_Time']
    categorical_columns = list(categorical_columns)

    # Fail early with the same exception type a missing label raised before.
    missing = [name for name in categorical_columns if name not in features.columns]
    if missing:
        raise KeyError("%s not found in features" % missing)

    # Numerical columns are whatever is left, in their original order.
    numerical_columns = [name for name in features.columns
                         if name not in categorical_columns]

    num_pipeline = Pipeline([
            ('selector', DataFrameSelector(numerical_columns)),
            ('imputer', MedianImputer(strategy="median")),
            ('std_scaler', StandardScaler()),
        ])

    cat_pipeline = Pipeline([
            ('selector', DataFrameSelector(categorical_columns)),
            # drop_invariant=True is passed through to category_encoders' OneHotEncoder.
            ('cat_encoder', cs.OneHotEncoder(drop_invariant=True)),
        ])

    full_pipeline = FeatureUnion(transformer_list=[
            ("num_pipeline", num_pipeline),
            ("cat_pipeline", cat_pipeline),
        ])
    return full_pipeline

In [78]:
# Build the preprocessing pipeline and fit it on the feature table.
# NOTE(review): the pipeline is fitted on every row of w_features, i.e. before
# the 80/20 split applied further down, so the imputer/scaler statistics also
# see the rows later used for testing -- confirm this is intended.
full_pipeline = feature_process(w_features)

# Reuse the feature table's own row labels instead of assuming they are 1..N:
# the column-wise concat below aligns on the index, so a guessed index would
# silently misalign features and targets for any other CSV index.
feature_prepared = pd.DataFrame(
    data=full_pipeline.fit_transform(w_features),
    index=w_features.index,
)

worldcup_cleaned = pd.concat(
    [feature_prepared, w_goals.to_frame(), w_results.to_frame()],
    axis=1,
)

In [92]:
# Logistic Regression Model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Positional hold-out: the first 80% of the rows train the model, the rest test it.
feature_cut = int(0.8 * len(feature_prepared))
goal_cut = int(0.8 * len(w_goals))

train_data = feature_prepared.iloc[:feature_cut]
test_data = feature_prepared.iloc[feature_cut:]
train_goals = w_goals.iloc[:goal_cut]
test_goals = w_goals.iloc[goal_cut:]

# NOTE(review): LogisticRegression is a classifier, so the goal values are
# presumably treated as class labels here -- confirm that is the intent.
model = LogisticRegression().fit(train_data, train_goals)

# Predictions on the training rows and on the held-out rows.
W_predict = model.predict(train_data)
T_predict = model.predict(test_data)



In [94]:
from sklearn.metrics import mean_squared_error, r2_score

# Compute all four scores first, then print the report.
# r2_score is the coefficient of determination (1 is a perfect prediction);
# the report labels it "Variance score".
test_mse = mean_squared_error(test_goals, T_predict)
test_r2 = r2_score(test_goals, T_predict)
train_mse = mean_squared_error(train_goals, W_predict)
train_r2 = r2_score(train_goals, W_predict)

print(' ')
# The coefficients
#print('Coefficients and Intercept are: ', model.coef_,"   ",model.intercept_,' respectively')
print('_________________###################____________________')
print("Mean squared error for testing data: %.2f" % test_mse)
print('Variance score for testing data: %.2f' % test_r2)
print('******************************************************* ')
print("Mean squared error for training data: %.2f" % train_mse)
print('Variance score for training data: %.2f' % train_r2)

 
_________________###################____________________
Mean squared error for testing data: 8.23
Variance score for testing data: -3.00
******************************************************* 
Mean squared error for training data: 0.02
Variance score for training data: 0.99


In [95]:
# Linear Regression Model
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the same train split as the previous model;
# `model`, `T_predict` and `W_predict` are rebound to the new results.
model = LinearRegression(n_jobs=None).fit(train_data, train_goals)

W_predict = model.predict(train_data)
T_predict = model.predict(test_data)


In [97]:
from sklearn.metrics import mean_squared_error, r2_score

# Assemble the report line by line, then emit it in one print call.
# r2_score is the coefficient of determination (1 is a perfect prediction);
# the report labels it "Variance score".
# The coefficients
#print('Coefficients and Intercept are: ', model.coef_,"   ",model.intercept_,' respectively')
report_lines = [
    ' ',
    '_________________###################____________________',
    "Mean squared error for testing data: %.2f"
    % mean_squared_error(test_goals, T_predict),
    'Variance score for testing data: %.2f' % r2_score(test_goals, T_predict),
    '******************************************************* ',
    "Mean squared error for training data: %.2f"
    % mean_squared_error(train_goals, W_predict),
    'Variance score for training data: %.2f' % r2_score(train_goals, W_predict),
]
print("\n".join(report_lines))


 
_________________###################____________________
Mean squared error for testing data: 5.29
Variance score for testing data: -1.57
******************************************************* 
Mean squared error for training data: 0.00
Variance score for training data: 1.00
