In [5]:
import pandas as pd
import numpy as np

In [9]:
# Load the training table and the hold-out test table from the shared data folder.
traindf = pd.read_csv('../FinalData/finaldata.csv')
testdf = pd.read_csv('../FinalData/Testfinaldata.csv')

# Print the (rows, columns) shape of each table as a quick sanity check.
print(
    "Training Dataset Shape:{}".format(traindf.shape),
    "Test Dataset Shape:{}".format(testdf.shape),
    sep="\n",
)

Training Dataset Shape:(12776, 27)
Test Dataset Shape:(1530, 26)


In [10]:
# Collect the training table's column names and print them under a short header.
columns = traindf.columns.tolist()
print("Columns In the dataset", "-" * 15, columns, sep="\n")

Columns In the dataset
---------------
['AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'AwayTeam', 'FTR', 'HC', 'HF', 'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HY', 'HomeTeam', 'league', 'timestamp', 'ht_label', 'at_label', 'league_label', 'ftr_label', 'HTCT', 'ATCT', 'HTWP', 'ATWP']


In [11]:
# Columns fed to the models; the raw text columns and the target are left out.
input_features = (
    ['AC', 'AF', 'AR', 'AS', 'AST', 'AY']
    + ['HC', 'HF', 'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HY']
    + ['ht_label', 'at_label', 'league_label', 'HTCT', 'ATCT', 'HTWP', 'ATWP']
)

# Name of the encoded target column.
label = 'ftr_label'

# Show how many rows each full-time result (FTR) value has.
print("Total Value Counts in the dataset", traindf["FTR"].value_counts(), sep="\n")

Total Value Counts in the dataset
H    5930
A    3564
D    3282
Name: FTR, dtype: int64


The next cells use the imbalanced-learn library. To install it:

       pip install -U imbalanced-learn

I will upsample the D class data using the SMOTE algorithm. It creates synthetic observations of the minority class by

    1. Finding the K-nearest neighbors of each minority class observation
    2. Randomly choosing one of the K-nearest neighbors and using it to create a similar, but randomly tweaked, new observation

In [12]:
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.datasets import make_imbalance
from imblearn.metrics import classification_report_imbalanced
from collections import Counter
#convert into train, test
from sklearn.model_selection import train_test_split

# Draw a fixed number of rows per class (3000 / 3280 / 3000 for labels 0 / 1 / 2)
# so the three outcomes are roughly balanced before modelling.
per_class_rows = {0:3000, 1:3280, 2:3000}
X, y = make_imbalance(
    traindf[input_features],
    traindf[label],
    sampling_strategy=per_class_rows,
    random_state=12,
)
print([X.shape, y.shape])


# Hold out 10% of the resampled rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=12)
print([part.shape for part in (X_train, X_test, y_train, y_test)])


# Class counts on each side of the split.
print(
    'Training target statistics: {}'.format(Counter(y_train)),
    'Testing target statistics: {}'.format(Counter(y_test)),
    sep='\n',
)


# Persist the resampled features next to their label; the TensorFlow section
# reads this file back as its training data.
input_df = pd.DataFrame(X, columns=input_features)
label_df = pd.DataFrame(y, columns=['ftr_label'])

df = pd.concat([input_df, label_df], axis=1)
df.to_csv("smoatdf.csv", index=False, sep=',')

[(9280, 21), (9280,)]
[(8352, 21), (928, 21), (8352,), (928,)]
Training target statistics: Counter({1: 2942, 0: 2725, 2: 2685})
Testing target statistics: Counter({1: 338, 2: 315, 0: 275})


In [13]:
import xgboost as xgb 
from datetime import datetime as dt

# NearMiss (version 2) under-sampling followed by an XGBoost classifier.
pipeline = make_pipeline(
    NearMiss(version=2),
    xgb.XGBClassifier(seed = 12),
)

# Time the fit with UTC timestamps taken just before and after training.
start = dt.utcnow()
print("Started Training the Model")
pipeline.fit(X_train, y_train)
end = dt.utcnow()

print("Time Elapsed:{}".format(end - start), "-" * 15, sep="\n")

# Per-class report from imbalanced-learn, computed on the held-out split.
print("Model Metrics")
xgb_report = classification_report_imbalanced(y_test, pipeline.predict(X_test))
print(xgb_report)

Started Training the Model
Time Elapsed:0:00:06.556825
---------------
Model Metrics
                   pre       rec       spe        f1       geo       iba       sup

          0       0.63      0.67      0.83      0.65      0.75      0.55       275
          1       0.54      0.49      0.76      0.51      0.61      0.36       338
          2       0.66      0.69      0.82      0.67      0.75      0.56       315

avg / total       0.61      0.61      0.80      0.61      0.70      0.48       928



In [14]:
from sklearn.linear_model import LogisticRegression

# One-vs-rest logistic regression fitted with the L-BFGS solver.
clf = LogisticRegression(
    random_state=12,
    solver='lbfgs',
    multi_class='ovr',
)

# Same NearMiss (version 2) under-sampling step as before, now feeding `clf`.
pipeline2 = make_pipeline(NearMiss(version=2), clf)

# Time the fit with UTC timestamps taken just before and after training.
start = dt.utcnow()
print("Started Training the Model")
pipeline2.fit(X_train, y_train)
end = dt.utcnow()

print("Time Elapsed:{}".format(end - start), "-" * 15, sep="\n")

# Per-class report from imbalanced-learn, computed on the held-out split.
print("Model Metrics")
logreg_report = classification_report_imbalanced(y_test, pipeline2.predict(X_test))
print(logreg_report)

Started Training the Model




Time Elapsed:0:00:04.945947
---------------
Model Metrics
                   pre       rec       spe        f1       geo       iba       sup

          0       0.61      0.67      0.82      0.64      0.74      0.54       275
          1       0.56      0.43      0.81      0.49      0.59      0.34       338
          2       0.63      0.73      0.78      0.68      0.76      0.57       315

avg / total       0.60      0.60      0.80      0.60      0.69      0.48       928



### Applying SMOTE ###

In [15]:
from imblearn.over_sampling import SMOTE
# Over-sample only the minority class with SMOTE.
# - The strategy is passed by keyword: newer imbalanced-learn releases no longer
#   accept it positionally.
# - The sampler is seeded (12, like the rest of the notebook) so the synthetic
#   rows, and therefore the scores below, are reproducible.
# - fit_resample replaces fit_sample, a legacy alias that newer releases removed.
sm = SMOTE(sampling_strategy='minority', random_state=12)
X_res, y_res = sm.fit_resample(X_train, y_train)
# Class counts before and after over-sampling.
print(np.bincount(y_train), np.bincount(y_res))


# NOTE(review): pipeline2 still starts with NearMiss, so the SMOTE-balanced data
# is under-sampled again inside fit() -- confirm this is the intended experiment.
start = dt.utcnow()
print("Started Training the Model")
pipeline2.fit(X_res, y_res)
end = dt.utcnow()

print("Time Elapsed:{}".format(end - start))
print("-" * 15)

print("Model Metrics")
print(classification_report_imbalanced(y_test, pipeline2.predict(X_test)))


from sklearn import linear_model

# Linear model trained with stochastic gradient descent. SGD shuffles the
# training data, so it is seeded to keep the reported metrics and the later
# test-set predictions made with `lrclf` stable between runs.
lrclf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3, random_state=12)

# Build and fit the NearMiss + SGD pipeline; pipeline construction is included
# in the timed region, as in the original cell.
start = dt.utcnow()
print("Started Training the Model")
pipeline4 = make_pipeline(NearMiss(version=2),
                          lrclf)
pipeline4.fit(X_train, y_train)
end = dt.utcnow()

print("Time Elapsed:{}".format(end - start))
print("-" * 15)

print("Model Metrics")
print(classification_report_imbalanced(y_test, pipeline4.predict(X_test)))

[2725 2942 2685] [2725 2942 2942]
Started Training the Model




Time Elapsed:0:00:04.704833
---------------
Model Metrics
                   pre       rec       spe        f1       geo       iba       sup

          0       0.61      0.67      0.82      0.64      0.74      0.54       275
          1       0.57      0.45      0.81      0.51      0.60      0.35       338
          2       0.64      0.73      0.79      0.68      0.76      0.57       315

avg / total       0.61      0.61      0.80      0.60      0.70      0.48       928

Started Training the Model
Time Elapsed:0:00:05.298052
---------------
Model Metrics
                   pre       rec       spe        f1       geo       iba       sup

          0       0.58      0.69      0.79      0.63      0.74      0.54       275
          1       0.51      0.61      0.66      0.55      0.63      0.40       338
          2       0.81      0.50      0.94      0.62      0.69      0.45       315

avg / total       0.63      0.60      0.79      0.60      0.68      0.46       928



### Prediction in Test Dataset ####

### Tensorflow Estimators ###

A high-level TensorFlow API that greatly simplifies ML programming. It encapsulates the following things

1. Training
2. Evaluating
3. Prediction
4. Export for Serving

#### Structure of a Pre-Made Estimators Programs ####

It typically consists of the following 5 steps

1. Convert CSV data into Tensorflow Records
2. Define the Feature columns
3. Create a relevant Algorithm
4. Call a Training, Evaluation and Inference Method
5. Export a serving function

### Import necessary libraries ###

In [16]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import pandas as pd
import tensorflow as tf

### Load the dataset ###

In [17]:
# Paths of the resampled training table (written earlier in this notebook) and
# the hold-out test table. The test path previously differed in letter case
# ('TestFinaldata.csv') from the file that was actually read, which breaks on
# case-sensitive file systems; both reads now go through these variables.
train_filename = 'smoatdf.csv'
test_filename = '../FinalData/Testfinaldata.csv'

training_data = pd.read_csv(train_filename)
test_data = pd.read_csv(test_filename)


# Print the (rows, columns) shape of each table as a quick sanity check.
print("Training Dataset Shape:{}".format(training_data.shape))
print("Test Dataset Shape:{}".format(test_data.shape))

Training Dataset Shape:(9280, 22)
Test Dataset Shape:(1530, 26)


### csv Input function ###

In [18]:
def csv_input_fn(features, labels, batch_size):
    """Build the training input pipeline.

    Args:
        features: mapping-like object (anything ``dict()`` accepts) from
            feature name to a column of values.
        labels: target values, sliced in step with ``features``.
        batch_size: number of examples per emitted batch.

    Returns:
        A ``tf.data.Dataset`` of ``(features, labels)`` batches that is
        shuffled with a 1000-element buffer and repeats indefinitely.
    """
    feature_dict = dict(features)
    dataset = tf.data.Dataset.from_tensor_slices((feature_dict, labels))

    # Shuffle, repeat without end (the caller bounds training via `steps`), then batch.
    dataset = dataset.shuffle(1000)
    dataset = dataset.repeat()
    return dataset.batch(batch_size)

def eval_input_fn(features, labels, batch_size):
    """Build the input pipeline for evaluation or prediction.

    Args:
        features: mapping-like object (anything ``dict()`` accepts) from
            feature name to a column of values.
        labels: target values, or ``None`` when only predictions are wanted.
        batch_size: number of examples per batch; must not be ``None``.

    Returns:
        A batched ``tf.data.Dataset`` yielding ``(features, labels)`` pairs,
        or features alone when ``labels`` is ``None``. There is no shuffling
        and no repetition.

    Raises:
        AssertionError: if ``batch_size`` is ``None``.
    """
    feature_dict = dict(features)

    # Without labels the dataset carries features only (prediction mode).
    inputs = feature_dict if labels is None else (feature_dict, labels)
    dataset = tf.data.Dataset.from_tensor_slices(inputs)

    assert batch_size is not None, "Batch Size must not be None"
    return dataset.batch(batch_size)

### Build an Estimator ###

In [123]:
# Feature columns used by the TensorFlow estimator.
important_columns = [
    'AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'HC', 'HF',
    'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HY',
    'ht_label', 'at_label', 'league_label',
    'HTCT', 'ATCT', 'HTWP', 'ATWP',
]


# Separate model inputs from the encoded target.
X_all, y_all = training_data[important_columns], training_data['ftr_label']

# Stratified shuffle-split that holds out exactly 1500 rows for validation.
# Note: this rebinds X_train / X_test / y_train / y_test from the earlier
# scikit-learn split.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=1500,
    random_state=2,
    stratify=y_all,
)

In [124]:
# One numeric feature column per selected input column, in the same order.
my_feature_columns = [
    tf.feature_column.numeric_column(key=key) for key in important_columns
]

In [138]:
# Fully connected classifier with hidden layers of 30, 30 and 10 units that
# chooses between the 3 outcome classes.
classifier = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns,
    hidden_units=[30, 30, 10],
    n_classes=3,
)


def _train_input():
    """Training batches of 16 examples drawn from the training split."""
    return csv_input_fn(X_train, y_train, 16)


def _eval_input():
    """Validation batches of 8 examples drawn from the held-out split."""
    return eval_input_fn(X_test, y_test, 8)


# Train for 7600 steps, then score the held-out rows.
training_result = classifier.train(input_fn=_train_input, steps=7600)

eval_result = classifier.evaluate(input_fn=_eval_input)

print("Test Accuracy:{accuracy: 0.3f}\n".format(**eval_result))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_device_fn': None, '_log_step_count_steps': 100, '_is_chief': True, '_model_dir': 'C:\\Users\\MADHIV~1\\AppData\\Local\\Temp\\tmpcg5cixcb', '_global_id_in_cluster': 0, '_task_type': 'worker', '_service': None, '_keep_checkpoint_every_n_hours': 10000, '_num_worker_replicas': 1, '_save_summary_steps': 100, '_session_config': None, '_tf_random_seed': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000025B10462630>, '_keep_checkpoint_max': 5, '_task_id': 0, '_master': '', '_train_distribute': None, '_evaluation_master': '', '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_num_ps_replicas': 0}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 

INFO:tensorflow:global_step/sec: 251.172
INFO:tensorflow:loss = 15.135541, step = 7200 (0.398 sec)
INFO:tensorflow:global_step/sec: 257.384
INFO:tensorflow:loss = 14.111376, step = 7300 (0.389 sec)
INFO:tensorflow:global_step/sec: 253.2
INFO:tensorflow:loss = 10.052315, step = 7400 (0.395 sec)
INFO:tensorflow:global_step/sec: 255.784
INFO:tensorflow:loss = 12.641413, step = 7500 (0.391 sec)
INFO:tensorflow:Saving checkpoints for 7600 into C:\Users\MADHIV~1\AppData\Local\Temp\tmpcg5cixcb\model.ckpt.
INFO:tensorflow:Loss for final step: 10.081821.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-01-18-09:19:43
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\MADHIV~1\AppData\Local\Temp\tmpcg5cixcb\model.ckpt-7600
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-01-18-09:19:44
INFO:tensorflow:Saving dict fo

### Check the model performance ###

In [139]:
# Write the validation features to disk so the next cell can reload them.
evaldfinp = pd.DataFrame(data=X_test, columns=important_columns)
evaldfinp.to_csv(path_or_buf="evaldf.csv", sep=',', index=False)

In [140]:
# Reload the validation features and turn them into a {column name: list of
# values} dict, the shape eval_input_fn expects for prediction.
evaldf = pd.read_csv("evaldf.csv")

cols = ['AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'HC', 'HF',
       'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HY','ht_label', 'at_label', 'league_label', 
        'HTCT', 'ATCT','HTWP', 'ATWP']

impo_test_data = evaldf[cols]

# The previous loop reused `cols` as its loop variable, which left `cols` bound
# to the last column name (a string) instead of the list above. The
# comprehension keeps its variable local, so `cols` stays a list.
predict_y = {
    name: list(impo_test_data[name].values)
    for name in impo_test_data.columns
}

In [141]:
# Display the feature names that ended up as keys of the prediction input dict.
predict_y.keys()

dict_keys(['HY', 'ht_label', 'HST', 'HS', 'HC', 'at_label', 'HF', 'league_label', 'AY', 'AST', 'AF', 'HTHG', 'ATWP', 'HTCT', 'AC', 'AS', 'ATCT', 'HTAG', 'HTWP', 'HR', 'AR'])

In [142]:
# Class id -> result letter (0 = 'A', 1 = 'D', 2 = 'H').
expected = ['A', 'D', 'H']

# Predict one example per batch for the validation features; the estimator
# yields one dict per example.
predictions = classifier.predict(
    input_fn = lambda : eval_input_fn(predict_y, labels=None, batch_size=1)
)

# Display template for a single prediction. The format spec was '{:1f}'
# (minimum width 1, six decimals); '{:.1f}' gives the intended one decimal.
template = ('\n Prediction is "{}" ({:.1f}%)')
who_win, only_class_pred = [], []  # [letter, confidence %] pairs / raw class ids

for pred in predictions:
    class_id = pred['class_ids'][0]
    # Probability the model assigns to the class it picked.
    probability = pred['probabilities'][class_id]

    #print(template.format(expected[class_id], 100 * probability))
    only_class_pred.append(class_id)
    who_win.append([expected[class_id], 100 * probability])

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\MADHIV~1\AppData\Local\Temp\tmpcg5cixcb\model.ckpt-7600
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


0 --> Away win, 1 --> Draw, 2 --> Home win

### Model Stability on Validation Test Set ### 

In [143]:
from imblearn.metrics import classification_report_imbalanced

# Per-class report comparing the estimator's class ids with the validation labels.
validation_report = classification_report_imbalanced(y_test, only_class_pred)
print(validation_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.72      0.62      0.89      0.67      0.74      0.53       485
          1       0.53      0.62      0.70      0.57      0.66      0.43       530
          2       0.70      0.67      0.86      0.68      0.76      0.56       485

avg / total       0.64      0.63      0.81      0.64      0.72      0.51      1500



### Test the model stability in Test dataset ####

In [144]:
cols_to_consider = ['AC', 'AF', 'AR', 'AS', 'AST', 'AY', 'HC', 'HF',
       'HR', 'HS', 'HST', 'HTAG', 'HTHG', 'HY','ht_label', 'at_label', 'league_label', 
        'HTCT', 'ATCT','HTWP', 'ATWP']

# Feature-only view of the hold-out test table. This rebinds `testdf`, and the
# scikit-learn prediction cell further down reads `testdf.values`.
testdf = test_data[cols_to_consider]

# {column name: list of values} input for eval_input_fn. The previous loop used
# `cols` as its loop variable and so overwrote the notebook-level `cols` list
# with a string; the comprehension variable stays local.
test = {name: list(testdf[name].values) for name in testdf.columns}
modelprediction = []

testPredictions = classifier.predict(
    input_fn = lambda : eval_input_fn(test, labels=None, batch_size=1)
)

# Collect [result letter, confidence %] per test row; `expected` (class id ->
# letter) comes from the validation-prediction cell above.
for pred in testPredictions:
    class_id = pred['class_ids'][0]
    probability = pred['probabilities'][class_id]
    modelprediction.append([expected[class_id], 100 * probability])


preddf = pd.DataFrame(modelprediction, columns=['Prediction', 'Probability'])
print(preddf.shape)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\MADHIV~1\AppData\Local\Temp\tmpcg5cixcb\model.ckpt-7600
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
(1530, 2)


In [145]:
# Raw test file; it is loaded here but not used elsewhere in this cell.
rawtestdf = pd.read_csv("../data/test.csv")

# Place the predictions next to the test rows (column-wise) and save the result.
frames_side_by_side = [test_data, preddf]
concatdf = pd.concat(frames_side_by_side, axis=1)
concatdf.to_csv("output.csv", sep=",", index=False)

### Model Stability Performance ###

In [146]:
# Actual match results (2017 and 2018 matches) downloaded from the web, used to
# cross-check the model's predictions.
# NOTE(review): the recorded output of this cell shows two shapes being printed,
# but no print statement is present here -- confirm whether code was removed.
d1 = pd.read_csv("../MatchResults/D1.csv")

[(612, 64), (1530, 28)]


### Tensorflow Model Prediction ###

In [147]:
# `outputdf` was never defined in the notebook, so this cell failed with a
# NameError on a fresh kernel. It is bound here to `concatdf` (test rows plus
# the 'Prediction' / 'Probability' columns).
# NOTE(review): assumed equivalent because concatdf matches the (1530, 28)
# shape recorded earlier -- confirm.
outputdf = concatdf

# .loc slicing is label-inclusive, so [:305] selects the first 306 rows.
predd1 = outputdf.loc[:305]
mpred = list(predd1.Prediction)
# NOTE(review): d1 was recorded with 612 rows while the report below covers 306;
# confirm d1 is restricted to the matching 306 matches before this point.
actresult = list(d1.FTR)

from imblearn.metrics import classification_report_imbalanced

# Compare the predicted result letters with the actual full-time results.
print(classification_report_imbalanced(actresult, mpred))

                   pre       rec       spe        f1       geo       iba       sup

          A       0.84      0.58      0.96      0.69      0.75      0.54        84
          D       0.41      0.49      0.74      0.45      0.60      0.36        83
          H       0.73      0.78      0.76      0.76      0.77      0.60       139

avg / total       0.68      0.65      0.81      0.66      0.72      0.52       306



### Scikit-learn Model Prediction(SGD Classifier) ###

In [156]:
# Class id -> result letter (0 = 'A', 1 = 'D', 2 = 'H').
expected = ['A', 'D', 'H']
asteam = []

# Predict the test-set features with the SGD classifier fitted earlier.
testdata = testdf.values
sk_prediction = lrclf.predict(testdata)

convertedList = [expected[class_id] for class_id in sk_prediction]
# Keep only the first 306 predictions for comparison with the actual results.
split = convertedList[:306]

# Attach all predictions to the test table (adds an 'output' column in place)
# and save it.
test_data['output'] = convertedList
test_data.to_csv('sklearnOutput.csv', sep=",", index=False)

from imblearn.metrics import classification_report_imbalanced

sk_report = classification_report_imbalanced(actresult, split)
print(sk_report)

                   pre       rec       spe        f1       geo       iba       sup

          A       0.87      0.55      0.97      0.67      0.73      0.51        84
          D       0.43      0.69      0.66      0.53      0.67      0.45        83
          H       0.80      0.69      0.86      0.74      0.77      0.58       139

avg / total       0.72      0.65      0.83      0.66      0.73      0.53       306



#### Model Stability Improvement ####

Model stability improved from

| Team | Pre  | Rec | f1 |
|------|------|-----|----|
|   0  | 0.83 |0.60 |0.70|
|   1  | 0.44 |0.34 |0.38|
|   2  | 0.66 |0.86 |0.75|


        
                                                                to
    
| Team | Pre  | Rec | f1 |
|------|------|-----|----|
|   A  | 0.87 |0.55 |0.67|
|   D  | 0.43 |0.69 |0.53|
|   H  | 0.80 |0.69 |0.74|