In [32]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split


In [3]:
liga_pre = pd.read_csv('LIGA05-18_wValues.csv')

In [4]:
liga_pre.drop(['Unnamed: 0'], axis = 1, inplace = True)
liga_pre['Date'] = liga_pre['Date'].astype('datetime64[ns]')

In [5]:
liga_pre.head()
liga_pre.drop(['Date'], axis=1, inplace=True)

In [6]:
liga_pre.dtypes

HomeTeam      object
HomeValue    float64
AwayTeam      object
AwayValue    float64
B365H        float64
B365D        float64
B365A        float64
BWH          float64
BWD          float64
BWA          float64
IWH          float64
IWD          float64
IWA          float64
VCH          float64
VCD          float64
VCA          float64
WHH          float64
WHD          float64
WHA          float64
HTR           object
FTR           object
dtype: object

In [7]:
to_encode = liga_pre.select_dtypes(include=[object])
to_encode.head()

Unnamed: 0,HomeTeam,AwayTeam,HTR,FTR
0,Alaves,Barcelona,D,D
1,Valencia,Betis,D,H
2,Ath Madrid,Zaragoza,D,D
3,Cadiz,Real Madrid,A,A
4,Celta,Malaga,H,H


In [8]:
# 1. INSTANTIATE
# encode labels with value between 0 and n_classes-1.
le = preprocessing.LabelEncoder()


# 2/3. FIT AND TRANSFORM
# use df.apply() to apply le.fit_transform to all columns
features_encoded = to_encode.apply(le.fit_transform)
features_encoded.head(10)

Unnamed: 0,HomeTeam,AwayTeam,HTR,FTR
0,0,4,1,1
1,33,5,1,2
2,3,38,1,1
3,6,26,0,0
4,7,21,2,2
5,11,12,1,0
6,22,17,0,0
7,25,36,2,2
8,29,28,2,2
9,5,25,1,2


In [13]:
liga_pre.drop(['HomeTeam', 'AwayTeam', 'HTR', 'FTR'], axis = 1, inplace = True)

In [14]:
liga_pre.head()

Unnamed: 0,HomeValue,AwayValue,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,VCH,VCD,VCA,WHH,WHD,WHA
0,20950000.0,275230000.0,7.0,3.75,1.5,7.0,3.7,1.45,5.4,3.5,1.5,6.5,3.75,1.45,6.0,3.4,1.5
1,21090000.0,78100000.0,2.0,3.25,3.25,1.75,3.3,4.4,1.9,3.0,3.6,1.85,3.25,3.75,1.95,3.1,3.5
2,113700000.0,52750000.0,1.72,3.4,4.0,1.65,3.4,4.9,1.7,3.1,4.4,1.65,3.4,4.5,1.7,3.2,4.5
3,1650000.0,283750000.0,7.5,4.0,1.44,8.0,4.25,1.35,5.4,3.5,1.5,6.5,3.6,1.45,6.5,3.6,1.44
4,36350000.0,21100000.0,2.1,3.25,3.0,2.1,3.1,3.3,2.1,2.9,3.2,2.0,3.2,3.25,2.15,3.1,3.0


In [20]:
result = pd.concat([liga_pre.stack(), features_encoded.stack()], axis=0).unstack()
result.head()

Unnamed: 0,AwayTeam,AwayValue,B365A,B365D,B365H,BWA,BWD,BWH,FTR,HTR,...,HomeValue,IWA,IWD,IWH,VCA,VCD,VCH,WHA,WHD,WHH
0,4.0,275230000.0,1.5,3.75,7.0,1.45,3.7,7.0,1.0,1.0,...,20950000.0,1.5,3.5,5.4,1.45,3.75,6.5,1.5,3.4,6.0
1,5.0,78100000.0,3.25,3.25,2.0,4.4,3.3,1.75,2.0,1.0,...,21090000.0,3.6,3.0,1.9,3.75,3.25,1.85,3.5,3.1,1.95
2,38.0,52750000.0,4.0,3.4,1.72,4.9,3.4,1.65,1.0,1.0,...,113700000.0,4.4,3.1,1.7,4.5,3.4,1.65,4.5,3.2,1.7
3,26.0,283750000.0,1.44,4.0,7.5,1.35,4.25,8.0,0.0,0.0,...,1650000.0,1.5,3.5,5.4,1.45,3.6,6.5,1.44,3.6,6.5
4,21.0,21100000.0,3.0,3.25,2.1,3.3,3.1,2.1,2.0,2.0,...,36350000.0,3.2,2.9,2.1,3.25,3.2,2.0,3.0,3.1,2.15


In [21]:
cols = result.columns.tolist()
cols

['AwayTeam',
 'AwayValue',
 'B365A',
 'B365D',
 'B365H',
 'BWA',
 'BWD',
 'BWH',
 'FTR',
 'HTR',
 'HomeTeam',
 'HomeValue',
 'IWA',
 'IWD',
 'IWH',
 'VCA',
 'VCD',
 'VCH',
 'WHA',
 'WHD',
 'WHH']

In [27]:
df = result [['AwayTeam', 'AwayValue', 'HomeTeam', 'HomeValue', 'B365A', 'B365D', 'B365H', 'BWA',
                  'BWD', 'BWH','IWA', 'IWD', 'IWH', 'VCA', 'VCD', 'VCH', 'WHA', 'WHD', 'WHH', 'HTR', 'FTR']]

In [29]:
df.head()

Unnamed: 0,AwayTeam,AwayValue,HomeTeam,HomeValue,B365A,B365D,B365H,BWA,BWD,BWH,...,IWD,IWH,VCA,VCD,VCH,WHA,WHD,WHH,HTR,FTR
0,4.0,275230000.0,0.0,20950000.0,1.5,3.75,7.0,1.45,3.7,7.0,...,3.5,5.4,1.45,3.75,6.5,1.5,3.4,6.0,1.0,1.0
1,5.0,78100000.0,33.0,21090000.0,3.25,3.25,2.0,4.4,3.3,1.75,...,3.0,1.9,3.75,3.25,1.85,3.5,3.1,1.95,1.0,2.0
2,38.0,52750000.0,3.0,113700000.0,4.0,3.4,1.72,4.9,3.4,1.65,...,3.1,1.7,4.5,3.4,1.65,4.5,3.2,1.7,1.0,1.0
3,26.0,283750000.0,6.0,1650000.0,1.44,4.0,7.5,1.35,4.25,8.0,...,3.5,5.4,1.45,3.6,6.5,1.44,3.6,6.5,0.0,0.0
4,21.0,21100000.0,7.0,36350000.0,3.0,3.25,2.1,3.3,3.1,2.1,...,2.9,2.1,3.25,3.2,2.0,3.0,3.1,2.15,2.0,2.0


In [30]:
y = df.FTR # define the target variable (dependent variable) as y

In [41]:
# create training and testing vars
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2)

In [42]:
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(4136, 21) (4136,)
(1035, 21) (1035,)


## MODELS

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [58]:
#Create a Gaussian Classifier
clf = RandomForestClassifier(n_estimators=100, max_depth=3,
                             random_state=0)


In [59]:
#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=3, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [60]:
# prediction on test set
y_pred=clf.predict(X_test)



In [63]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9594202898550724
