In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split


In [2]:
liga_pre = pd.read_csv('LIGA05-18_wValues.csv')

In [3]:
liga_pre.drop(['Unnamed: 0'], axis = 1, inplace = True)
liga_pre['Date'] = liga_pre['Date'].astype('datetime64[ns]')

In [4]:
liga_pre.head()
liga_pre.drop(['Date'], axis=1, inplace=True) #dropping DATE at early ML.

In [6]:
liga_pre.drop(['HTR'], axis=1, inplace=True) #dropping result at Half Time 

In [7]:
liga_pre.dtypes

HomeTeam      object
HomeValue    float64
AwayTeam      object
AwayValue    float64
B365H        float64
B365D        float64
B365A        float64
BWH          float64
BWD          float64
BWA          float64
IWH          float64
IWD          float64
IWA          float64
VCH          float64
VCD          float64
VCA          float64
WHH          float64
WHD          float64
WHA          float64
FTR           object
dtype: object

In [8]:
to_encode = liga_pre.select_dtypes(include=[object])
to_encode.head()

Unnamed: 0,HomeTeam,AwayTeam,FTR
0,Alaves,Barcelona,D
1,Valencia,Betis,H
2,Ath Madrid,Zaragoza,D
3,Cadiz,Real Madrid,A
4,Celta,Malaga,H


In [9]:
# 1. INSTANTIATE
# encode labels with value between 0 and n_classes-1.
le = preprocessing.LabelEncoder()


# 2/3. FIT AND TRANSFORM
# use df.apply() to apply le.fit_transform to all columns
features_encoded = to_encode.apply(le.fit_transform)
features_encoded.head(10)

Unnamed: 0,HomeTeam,AwayTeam,FTR
0,0,4,1
1,33,5,2
2,3,38,1
3,6,26,0
4,7,21,2
5,11,12,0
6,22,17,0
7,25,36,2
8,29,28,2
9,5,25,2


In [10]:
liga_pre.drop(['HomeTeam', 'AwayTeam', 'FTR'], axis = 1, inplace = True)

In [11]:
liga_pre.head()

Unnamed: 0,HomeValue,AwayValue,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,VCH,VCD,VCA,WHH,WHD,WHA
0,20950000.0,275230000.0,7.0,3.75,1.5,7.0,3.7,1.45,5.4,3.5,1.5,6.5,3.75,1.45,6.0,3.4,1.5
1,21090000.0,78100000.0,2.0,3.25,3.25,1.75,3.3,4.4,1.9,3.0,3.6,1.85,3.25,3.75,1.95,3.1,3.5
2,113700000.0,52750000.0,1.72,3.4,4.0,1.65,3.4,4.9,1.7,3.1,4.4,1.65,3.4,4.5,1.7,3.2,4.5
3,1650000.0,283750000.0,7.5,4.0,1.44,8.0,4.25,1.35,5.4,3.5,1.5,6.5,3.6,1.45,6.5,3.6,1.44
4,36350000.0,21100000.0,2.1,3.25,3.0,2.1,3.1,3.3,2.1,2.9,3.2,2.0,3.2,3.25,2.15,3.1,3.0


In [12]:
result = pd.concat([liga_pre.stack(), features_encoded.stack()], axis=0).unstack()
result.head()

Unnamed: 0,AwayTeam,AwayValue,B365A,B365D,B365H,BWA,BWD,BWH,FTR,HomeTeam,HomeValue,IWA,IWD,IWH,VCA,VCD,VCH,WHA,WHD,WHH
0,4.0,275230000.0,1.5,3.75,7.0,1.45,3.7,7.0,1.0,0.0,20950000.0,1.5,3.5,5.4,1.45,3.75,6.5,1.5,3.4,6.0
1,5.0,78100000.0,3.25,3.25,2.0,4.4,3.3,1.75,2.0,33.0,21090000.0,3.6,3.0,1.9,3.75,3.25,1.85,3.5,3.1,1.95
2,38.0,52750000.0,4.0,3.4,1.72,4.9,3.4,1.65,1.0,3.0,113700000.0,4.4,3.1,1.7,4.5,3.4,1.65,4.5,3.2,1.7
3,26.0,283750000.0,1.44,4.0,7.5,1.35,4.25,8.0,0.0,6.0,1650000.0,1.5,3.5,5.4,1.45,3.6,6.5,1.44,3.6,6.5
4,21.0,21100000.0,3.0,3.25,2.1,3.3,3.1,2.1,2.0,7.0,36350000.0,3.2,2.9,2.1,3.25,3.2,2.0,3.0,3.1,2.15


In [13]:
cols = result.columns.tolist()
cols

['AwayTeam',
 'AwayValue',
 'B365A',
 'B365D',
 'B365H',
 'BWA',
 'BWD',
 'BWH',
 'FTR',
 'HomeTeam',
 'HomeValue',
 'IWA',
 'IWD',
 'IWH',
 'VCA',
 'VCD',
 'VCH',
 'WHA',
 'WHD',
 'WHH']

In [15]:
df = result [['AwayTeam', 'AwayValue', 'HomeTeam', 'HomeValue', 'B365A', 'B365D', 'B365H', 'BWA',
                  'BWD', 'BWH','IWA', 'IWD', 'IWH', 'VCA', 'VCD', 'VCH', 'WHA', 'WHD', 'WHH', 'FTR']]

In [16]:
df.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5161,5162,5163,5164,5165,5166,5167,5168,5169,5170
AwayTeam,4.0,5.0,38.0,26.0,21.0,12.0,17.0,36.0,28.0,25.0,...,9.0,14.0,35.0,30.0,36.0,3.0,2.0,33.0,4.0,5.0
AwayValue,275230000.0,78100000.0,52750000.0,283750000.0,21100000.0,15950000.0,118700000.0,70000000.0,22850000.0,24000000.0,...,53800000.0,35600000.0,58750000.0,196300000.0,222300000.0,872250000.0,208900000.0,442650000.0,1160000000.0,180330000.0
HomeTeam,0.0,33.0,3.0,6.0,7.0,11.0,22.0,25.0,29.0,5.0,...,36.0,0.0,7.0,11.0,12.0,20.0,29.0,34.0,9.0,26.0
HomeValue,20950000.0,21090000.0,113700000.0,1650000.0,36350000.0,55500000.0,39900000.0,24000000.0,73250000.0,78100000.0,...,222300000.0,71050000.0,181700000.0,82300000.0,69500000.0,72500000.0,268500000.0,37350000.0,53800000.0,1090000000.0
B365A,1.5,3.25,4.0,1.44,3.0,5.0,2.3,2.3,4.5,4.5,...,5.0,3.1,7.0,3.0,5.25,1.95,4.33,1.36,1.9,4.75
B365D,3.75,3.25,3.4,4.0,3.25,3.4,3.2,3.2,3.4,3.4,...,4.33,3.5,4.75,3.6,4.2,3.75,3.4,5.25,4.0,4.75
B365H,7.0,2.0,1.72,7.5,2.1,1.61,2.7,2.7,1.66,1.66,...,1.6,2.25,1.45,2.3,1.61,3.75,1.85,8.0,3.6,1.57
BWA,1.45,4.4,4.9,1.35,3.3,4.75,2.5,2.4,4.7,5.0,...,5.0,3.0,6.5,3.0,5.25,2.0,4.6,1.4,1.87,5.0
BWD,3.7,3.3,3.4,4.25,3.1,3.5,3.1,3.15,3.5,3.35,...,4.4,3.5,4.75,3.7,4.1,3.75,3.4,4.75,4.1,4.75
BWH,7.0,1.75,1.65,8.0,2.1,1.65,2.65,2.7,1.65,1.65,...,1.6,2.3,1.45,2.25,1.62,3.5,1.87,8.0,3.7,1.57


In [17]:
y = df.FTR # define the target variable (dependent variable) as y
df = df.drop(['FTR'],axis=1)

In [18]:
# create training and testing vars
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2)

In [19]:
print (X_train.shape)
print (y_train.shape)
print (X_test.shape)
print (y_test.shape)


(4136, 19)
(4136,)
(1035, 19)
(1035,)


In [20]:
columnas_xtrain = X_train.columns.tolist()
columnas_xtrain

['AwayTeam',
 'AwayValue',
 'HomeTeam',
 'HomeValue',
 'B365A',
 'B365D',
 'B365H',
 'BWA',
 'BWD',
 'BWH',
 'IWA',
 'IWD',
 'IWH',
 'VCA',
 'VCD',
 'VCH',
 'WHA',
 'WHD',
 'WHH']

## MODELS

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
from sklearn import preprocessing
import numpy as np
from scipy.stats import f
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn import ensemble
from sklearn import metrics


In [22]:
'''#Create a Gaussian Classifier
clf = GradientBoostingClassifier(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=9,
                          min_weight_fraction_leaf=0.0, n_estimators=450,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)'''

"#Create a Gaussian Classifier\nclf = GradientBoostingClassifier(alpha=0.9, criterion='friedman_mse', init=None,\n                          learning_rate=0.1, loss='ls', max_depth=3,\n                          max_features=None, max_leaf_nodes=None,\n                          min_impurity_decrease=0.0, min_impurity_split=None,\n                          min_samples_leaf=1, min_samples_split=9,\n                          min_weight_fraction_leaf=0.0, n_estimators=450,\n                          n_iter_no_change=None, presort='auto',\n                          random_state=None, subsample=1.0, tol=0.0001,\n                          validation_fraction=0.1, verbose=0, warm_start=False)"

In [26]:
'''clf_v1 = GradientBoostingClassifier (loss='deviance', 
learning_rate=0.1, n_estimators=100, subsample=1.0, 
criterion='friedman_mse', min_samples_split=2, 
min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, 
init=None, random_state=None, max_features=None, verbose=0, 
max_leaf_nodes=None, warm_start=False, n_iter_no_change=None, tol=0.0001)'''

"clf_v1 = GradientBoostingClassifier (loss='deviance', \nlearning_rate=0.1, n_estimators=100, subsample=1.0, \ncriterion='friedman_mse', min_samples_split=2, \nmin_samples_leaf=1, min_weight_fraction_leaf=0.0, \nmax_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, \ninit=None, random_state=None, max_features=None, verbose=0, \nmax_leaf_nodes=None, warm_start=False, n_iter_no_change=None, tol=0.0001)"

In [37]:
parameters = {
    "loss":["deviance"],
    "learning_rate": [0.01, 0.1, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample":[0.5, 1.0],
    "n_estimators":[10]
    }

clf_v2 = GridSearchCV(GradientBoostingClassifier(), parameters, cv=10, n_jobs=-1)


In [38]:
#Train the model using the training sets y_pred=clf.predict(X_test)
#clf_v1.fit(X_train,y_train) #Accuracy: 0.52
clf_v2.fit(X_train,y_train) #



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_n...
                         'min_samples_leaf': array([0.1       , 0.13

In [39]:
y_pred=clf_v2.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5391304347826087


In [26]:
'''def grid_model(modelo, params, X_train, y_train, X_test, cv = 5):
    grid = GridSearchCV(modelo, params, cv=cv)
    grid.fit(X_train, y_train)
    best_m = grid.best_estimator_.fit(X_train, y_train)
    y_pred = best_m.predict(X_test)
    return y_pred, best_m

modelo = ensemble.GradientBoostingRegressor()

params = {'n_estimators' : [450, 750, 1000, 1200], 'max_depth' : [3,4,5,6], 
          'min_samples_split' : [5,7,8,9], 'learning_rate' : [0.1]}

y_pred = grid_model(modelo, params, X_train, y_train, X_test, cv = 5)'''

"def grid_model(modelo, params, X_train, y_train, X_test, cv = 5):\n    grid = GridSearchCV(modelo, params, cv=cv)\n    grid.fit(X_train, y_train)\n    best_m = grid.best_estimator_.fit(X_train, y_train)\n    y_pred = best_m.predict(X_test)\n    return y_pred, best_m\n\nmodelo = ensemble.GradientBoostingRegressor()\n\nparams = {'n_estimators' : [450, 750, 1000, 1200], 'max_depth' : [3,4,5,6], \n          'min_samples_split' : [5,7,8,9], 'learning_rate' : [0.1]}\n\ny_pred = grid_model(modelo, params, X_train, y_train, X_test, cv = 5)"

In [27]:
to_predict = pd.read_csv('//home/jacoboj/Escritorio/data_football/prueba19.csv')

In [28]:
new_df = to_predict[['AwayTeam', 'HomeTeam', 'B365A', 'B365D', 'B365H', 'BWA',
                  'BWD', 'BWH','IWA', 'IWD', 'IWH', 'VCA', 'VCD', 'VCH', 'WHA', 'WHD', 'WHH', 'HTR', 'FTR']]

In [29]:
new_df.head()

Unnamed: 0,AwayTeam,HomeTeam,B365A,B365D,B365H,BWA,BWD,BWH,IWA,IWD,IWH,VCA,VCD,VCH,WHA,WHD,WHH,HTR,FTR
0,Barcelona,Ath Bilbao,1.65,3.8,5.25,1.65,3.8,5.5,1.7,3.8,5.0,1.75,3.8,5.0,1.7,3.8,5.0,D,H
1,Real Madrid,Celta,1.65,4.2,4.75,1.72,4.2,4.4,1.6,4.2,5.3,1.73,4.2,4.75,1.6,4.2,5.25,A,A
2,Sociedad,Valencia,5.5,3.75,1.66,5.5,3.75,1.67,5.3,3.75,1.67,5.75,3.9,1.67,5.25,3.8,1.67,D,D
3,Eibar,Mallorca,2.6,3.2,2.8,2.6,3.1,2.95,2.6,3.1,2.9,2.7,3.13,2.9,2.62,3.1,2.9,H,H
4,Osasuna,Leganes,4.2,3.2,2.0,3.9,3.25,2.05,4.05,3.1,2.05,4.1,3.2,2.1,4.0,3.2,2.05,D,A


In [30]:
HomeValue = pd.Series([]) # Create Serie to add market value team

#Market Value Teams 2018
for i in range(len(new_df)):
    if new_df['HomeTeam'][i] == 'Granada':
        HomeValue[i] = 33500000
    elif new_df['HomeTeam'][i] == 'Ath Bilbao':
        HomeValue[i] = 224000000
    elif new_df['HomeTeam'][i] == 'Valencia':
        HomeValue[i] = 495500000
    elif new_df['HomeTeam'][i] == 'Ath Madrid':
        HomeValue[i] = 872500000
    elif new_df['HomeTeam'][i] == 'Eibar':
        HomeValue[i] = 69600000
    elif new_df['HomeTeam'][i] == 'Sociedad':
        HomeValue[i] = 309800000
    elif new_df['HomeTeam'][i] == 'Espanol':
        HomeValue[i] = 150300000
    elif new_df['HomeTeam'][i] == 'Villarreal':
        HomeValue[i] = 211700000
    elif new_df['HomeTeam'][i] == 'Valladolid':
        HomeValue[i] = 91300000
    elif new_df['HomeTeam'][i] == 'Sevilla':
        HomeValue[i] = 28300000
    elif new_df['HomeTeam'][i] == 'Alaves':
        HomeValue[i] = 90200000
    elif new_df['HomeTeam'][i] == 'Mallorca':
        HomeValue[i] = 58550000
    elif new_df['HomeTeam'][i] == 'Real Madrid':
        HomeValue[i] = 1190000000
    elif new_df['HomeTeam'][i] == 'Barcelona':
        HomeValue[i] = 1180000000
    elif new_df['HomeTeam'][i] == 'Leganes':
        HomeValue[i] = 106800000
    elif new_df['HomeTeam'][i] == 'Betis':
        HomeValue[i] = 292500000
    elif new_df['HomeTeam'][i] == 'Getafe':
        HomeValue[i] = 151200000
    elif new_df['HomeTeam'][i] == 'Osasuna':
        HomeValue[i] = 43500000
    elif new_df['HomeTeam'][i] == 'Celta':
        HomeValue[i] = 226300000
    elif new_df['HomeTeam'][i] == 'Levante':
        HomeValue[i] = 102400000

new_df.insert(2, "HomeValue", HomeValue) # Execute

In [31]:
new_df.head()

Unnamed: 0,AwayTeam,HomeTeam,HomeValue,B365A,B365D,B365H,BWA,BWD,BWH,IWA,IWD,IWH,VCA,VCD,VCH,WHA,WHD,WHH,HTR,FTR
0,Barcelona,Ath Bilbao,224000000,1.65,3.8,5.25,1.65,3.8,5.5,1.7,3.8,5.0,1.75,3.8,5.0,1.7,3.8,5.0,D,H
1,Real Madrid,Celta,226300000,1.65,4.2,4.75,1.72,4.2,4.4,1.6,4.2,5.3,1.73,4.2,4.75,1.6,4.2,5.25,A,A
2,Sociedad,Valencia,495500000,5.5,3.75,1.66,5.5,3.75,1.67,5.3,3.75,1.67,5.75,3.9,1.67,5.25,3.8,1.67,D,D
3,Eibar,Mallorca,58550000,2.6,3.2,2.8,2.6,3.1,2.95,2.6,3.1,2.9,2.7,3.13,2.9,2.62,3.1,2.9,H,H
4,Osasuna,Leganes,106800000,4.2,3.2,2.0,3.9,3.25,2.05,4.05,3.1,2.05,4.1,3.2,2.1,4.0,3.2,2.05,D,A


In [32]:
AwayValue = pd.Series([]) # Create Serie to add market value team

#Market Value Teams 2019
for i in range(len(new_df)):
    if new_df['AwayTeam'][i] == 'Granada':
        AwayValue[i] = 33500000
    elif new_df['AwayTeam'][i] == 'Ath Bilbao':
        AwayValue[i] = 224000000
    elif new_df['AwayTeam'][i] == 'Valencia':
        AwayValue[i] = 495500000
    elif new_df['AwayTeam'][i] == 'Ath Madrid':
        AwayValue[i] = 872500000
    elif new_df['AwayTeam'][i] == 'Eibar':
        AwayValue[i] = 69600000
    elif new_df['AwayTeam'][i] == 'Sociedad':
        AwayValue[i] = 309800000
    elif new_df['AwayTeam'][i] == 'Espanol':
        AwayValue[i] = 150300000
    elif new_df['AwayTeam'][i] == 'Villarreal':
        AwayValue[i] = 211700000
    elif new_df['AwayTeam'][i] == 'Valladolid':
        AwayValue[i] = 91300000
    elif new_df['AwayTeam'][i] == 'Sevilla':
        AwayValue[i] = 28300000
    elif new_df['AwayTeam'][i] == 'Alaves':
        AwayValue[i] = 90200000
    elif new_df['AwayTeam'][i] == 'Mallorca':
        AwayValue[i] = 58550000
    elif new_df['AwayTeam'][i] == 'Real Madrid':
        AwayValue[i] = 1190000000
    elif new_df['AwayTeam'][i] == 'Barcelona':
        AwayValue[i] = 1180000000
    elif new_df['AwayTeam'][i] == 'Leganes':
        AwayValue[i] = 106800000
    elif new_df['AwayTeam'][i] == 'Betis':
        AwayValue[i] = 292500000
    elif new_df['AwayTeam'][i] == 'Getafe':
        AwayValue[i] = 151200000
    elif new_df['AwayTeam'][i] == 'Osasuna':
        AwayValue[i] = 43500000
    elif new_df['AwayTeam'][i] == 'Celta':
        AwayValue[i] = 226300000
    elif new_df['AwayTeam'][i] == 'Levante':
        AwayValue[i] = 102400000

new_df.insert(4, "AwayValue", AwayValue) # Execute

In [33]:
new_df.head()

Unnamed: 0,AwayTeam,HomeTeam,HomeValue,B365A,AwayValue,B365D,B365H,BWA,BWD,BWH,...,IWD,IWH,VCA,VCD,VCH,WHA,WHD,WHH,HTR,FTR
0,Barcelona,Ath Bilbao,224000000,1.65,1180000000,3.8,5.25,1.65,3.8,5.5,...,3.8,5.0,1.75,3.8,5.0,1.7,3.8,5.0,D,H
1,Real Madrid,Celta,226300000,1.65,1190000000,4.2,4.75,1.72,4.2,4.4,...,4.2,5.3,1.73,4.2,4.75,1.6,4.2,5.25,A,A
2,Sociedad,Valencia,495500000,5.5,309800000,3.75,1.66,5.5,3.75,1.67,...,3.75,1.67,5.75,3.9,1.67,5.25,3.8,1.67,D,D
3,Eibar,Mallorca,58550000,2.6,69600000,3.2,2.8,2.6,3.1,2.95,...,3.1,2.9,2.7,3.13,2.9,2.62,3.1,2.9,H,H
4,Osasuna,Leganes,106800000,4.2,43500000,3.2,2.0,3.9,3.25,2.05,...,3.1,2.05,4.1,3.2,2.1,4.0,3.2,2.05,D,A


In [34]:
new_df = new_df[['AwayTeam', 'AwayValue', 'HomeTeam', 'HomeValue', 'B365A', 'B365D', 'B365H', 'BWA',
                  'BWD', 'BWH','IWA', 'IWD', 'IWH', 'VCA', 'VCD', 'VCH', 'WHA', 'WHD', 'WHH', 'HTR', 'FTR']]

In [35]:
new_df = new_df[['AwayTeam','AwayValue', 'HomeTeam', 'HomeValue', 'B365A', 'B365D', 'B365H', 'BWA', 'BWD', 'BWH', 'IWA', 'IWD',
 'IWH', 'VCA', 'VCD', 'VCH', 'WHA', 'WHD', 'WHH', 'HTR', 'FTR']]

In [36]:
new_df.head()

Unnamed: 0,AwayTeam,AwayValue,HomeTeam,HomeValue,B365A,B365D,B365H,BWA,BWD,BWH,...,IWD,IWH,VCA,VCD,VCH,WHA,WHD,WHH,HTR,FTR
0,Barcelona,1180000000,Ath Bilbao,224000000,1.65,3.8,5.25,1.65,3.8,5.5,...,3.8,5.0,1.75,3.8,5.0,1.7,3.8,5.0,D,H
1,Real Madrid,1190000000,Celta,226300000,1.65,4.2,4.75,1.72,4.2,4.4,...,4.2,5.3,1.73,4.2,4.75,1.6,4.2,5.25,A,A
2,Sociedad,309800000,Valencia,495500000,5.5,3.75,1.66,5.5,3.75,1.67,...,3.75,1.67,5.75,3.9,1.67,5.25,3.8,1.67,D,D
3,Eibar,69600000,Mallorca,58550000,2.6,3.2,2.8,2.6,3.1,2.95,...,3.1,2.9,2.7,3.13,2.9,2.62,3.1,2.9,H,H
4,Osasuna,43500000,Leganes,106800000,4.2,3.2,2.0,3.9,3.25,2.05,...,3.1,2.05,4.1,3.2,2.1,4.0,3.2,2.05,D,A


In [37]:
to_encode_pred = new_df.select_dtypes(include=[object])
to_encode_pred.head()
new_df.drop(['AwayTeam', 'HomeTeam', 'HTR', 'FTR'], axis=1, inplace=True)

In [38]:
# 1. INSTANTIATE
# encode labels with value between 0 and n_classes-1.
le = preprocessing.LabelEncoder()


# 2/3. FIT AND TRANSFORM
# use df.apply() to apply le.fit_transform to all columns
features_encoded_pred = to_encode_pred.apply(le.fit_transform)
features_encoded_pred.head(10)

Unnamed: 0,AwayTeam,HomeTeam,HTR,FTR
0,3,1,1,2
1,14,5,0,0
2,16,17,1,1
3,6,12,2,2
4,13,10,1,0
5,9,19,1,1
6,11,0,1,2
7,15,7,0,0
8,18,4,1,0
9,8,2,2,2


In [39]:
result_pred = pd.concat([new_df.stack(), features_encoded_pred.stack()], axis=0).unstack()


In [40]:
result_pred.head()

Unnamed: 0,AwayTeam,AwayValue,B365A,B365D,B365H,BWA,BWD,BWH,FTR,HTR,...,HomeValue,IWA,IWD,IWH,VCA,VCD,VCH,WHA,WHD,WHH
0,3.0,1180000000.0,1.65,3.8,5.25,1.65,3.8,5.5,2.0,1.0,...,224000000.0,1.7,3.8,5.0,1.75,3.8,5.0,1.7,3.8,5.0
1,14.0,1190000000.0,1.65,4.2,4.75,1.72,4.2,4.4,0.0,0.0,...,226300000.0,1.6,4.2,5.3,1.73,4.2,4.75,1.6,4.2,5.25
2,16.0,309800000.0,5.5,3.75,1.66,5.5,3.75,1.67,1.0,1.0,...,495500000.0,5.3,3.75,1.67,5.75,3.9,1.67,5.25,3.8,1.67
3,6.0,69600000.0,2.6,3.2,2.8,2.6,3.1,2.95,2.0,2.0,...,58550000.0,2.6,3.1,2.9,2.7,3.13,2.9,2.62,3.1,2.9
4,13.0,43500000.0,4.2,3.2,2.0,3.9,3.25,2.05,0.0,1.0,...,106800000.0,4.05,3.1,2.05,4.1,3.2,2.1,4.0,3.2,2.05


In [41]:
result_pred = result_pred[['AwayTeam','AwayValue', 'HomeTeam', 'HomeValue', 'B365A', 'B365D', 'B365H', 'BWA', 'BWD', 'BWH', 'IWA', 'IWD',
 'IWH', 'VCA', 'VCD', 'VCH', 'WHA', 'WHD', 'WHH', 'HTR', 'FTR']]

In [42]:
result_pred.head()

Unnamed: 0,AwayTeam,AwayValue,HomeTeam,HomeValue,B365A,B365D,B365H,BWA,BWD,BWH,...,IWD,IWH,VCA,VCD,VCH,WHA,WHD,WHH,HTR,FTR
0,3.0,1180000000.0,1.0,224000000.0,1.65,3.8,5.25,1.65,3.8,5.5,...,3.8,5.0,1.75,3.8,5.0,1.7,3.8,5.0,1.0,2.0
1,14.0,1190000000.0,5.0,226300000.0,1.65,4.2,4.75,1.72,4.2,4.4,...,4.2,5.3,1.73,4.2,4.75,1.6,4.2,5.25,0.0,0.0
2,16.0,309800000.0,17.0,495500000.0,5.5,3.75,1.66,5.5,3.75,1.67,...,3.75,1.67,5.75,3.9,1.67,5.25,3.8,1.67,1.0,1.0
3,6.0,69600000.0,12.0,58550000.0,2.6,3.2,2.8,2.6,3.1,2.95,...,3.1,2.9,2.7,3.13,2.9,2.62,3.1,2.9,2.0,2.0
4,13.0,43500000.0,10.0,106800000.0,4.2,3.2,2.0,3.9,3.25,2.05,...,3.1,2.05,4.1,3.2,2.1,4.0,3.2,2.05,1.0,0.0


In [43]:
asd = result_pred.FTR

In [44]:
result_pred.drop(['FTR'], axis=1, inplace=True)

In [45]:
result_pred.head()

Unnamed: 0,AwayTeam,AwayValue,HomeTeam,HomeValue,B365A,B365D,B365H,BWA,BWD,BWH,IWA,IWD,IWH,VCA,VCD,VCH,WHA,WHD,WHH,HTR
0,3.0,1180000000.0,1.0,224000000.0,1.65,3.8,5.25,1.65,3.8,5.5,1.7,3.8,5.0,1.75,3.8,5.0,1.7,3.8,5.0,1.0
1,14.0,1190000000.0,5.0,226300000.0,1.65,4.2,4.75,1.72,4.2,4.4,1.6,4.2,5.3,1.73,4.2,4.75,1.6,4.2,5.25,0.0
2,16.0,309800000.0,17.0,495500000.0,5.5,3.75,1.66,5.5,3.75,1.67,5.3,3.75,1.67,5.75,3.9,1.67,5.25,3.8,1.67,1.0
3,6.0,69600000.0,12.0,58550000.0,2.6,3.2,2.8,2.6,3.1,2.95,2.6,3.1,2.9,2.7,3.13,2.9,2.62,3.1,2.9,2.0
4,13.0,43500000.0,10.0,106800000.0,4.2,3.2,2.0,3.9,3.25,2.05,4.05,3.1,2.05,4.1,3.2,2.1,4.0,3.2,2.05,1.0


In [46]:
y_pred_2=clf2.predict(result_pred)


In [47]:
print("Accuracy:",metrics.accuracy_score(asd, y_pred_2))

Accuracy: 0.6


0        0.0
1       33.0
2        3.0
3        6.0
4        7.0
        ... 
5166    20.0
5167    29.0
5168    34.0
5169     9.0
5170    26.0
Name: HomeTeam, Length: 5171, dtype: float64