In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import joblib
from TrainTestSplit import trainTestSplit
from sklearn.naive_bayes import GaussianNB

'------------------------------------------------- Initial Setup -------------------------------------------------'

%matplotlib inline
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from collections import OrderedDict
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import classification_report
from sklearn.model_selection import learning_curve
from sklearn.model_selection import cross_val_score

# Display NumPy floats with two digits after the decimal point.
np.set_printoptions(precision=2)

# Seed NumPy's global random number generator with a fixed value (42).
np.random.seed(42)

# Run mode for the tuning cells:
#   'grid'   -> run the grid search
#   'joblib' -> load a previously saved model from disk instead
grid_p = 'joblib'


In [2]:
## Load the train/test split for data set "A" through the project helper trainTestSplit.
## NOTE(review): presumably XTrain/XTest hold the feature columns and YTrain/YTest the
## 'FTR' target column (later cells index YTrain['FTR']) — confirm in TrainTestSplit.
XTrain, XTest, YTrain, YTest = trainTestSplit("A")

In [3]:
## Baseline model: a linear-kernel SVM with every other hyper-parameter left at its default.
Classifier = SVC(kernel = 'linear')

## Mean accuracy over 10-fold cross-validation on the training split.
## The score is kept in a one-element list so crossValidationScores stays a list.
cvScore = cross_val_score(Classifier, XTrain, YTrain['FTR'], cv = 10)
crossValidationScores = [np.mean(cvScore)]
print("Cross Validation Score : ", np.mean(crossValidationScores))

## Fit on the whole training split and report accuracy on both splits.
## The 1-D 'FTR' column is used as the target everywhere (as in the
## cross-validation call above) instead of the single-column DataFrame, which
## scikit-learn would have to flatten and warn about (DataConversionWarning).
Classifier.fit(XTrain, YTrain['FTR'])
print("Training Accuracy : ", Classifier.score(XTrain, YTrain['FTR']))

print("Testing Accuracy : ", Classifier.score(XTest, YTest['FTR']))

Cross Validation Score :  0.5371673465423465
Training Accuracy :  0.5405867274280831
Testing Accuracy :  0.5609375


In [4]:
## Hyper-parameter grid explored when grid_p == 'grid'.
## NOTE(review): Classifier is created with kernel = 'linear', and scikit-learn
## documents 'gamma' as a coefficient for the 'rbf', 'poly' and 'sigmoid'
## kernels only, so varying gamma is not expected to change the scores. The key
## is kept because the grid-score heat-map code reads params['gamma'].
parameters = { 'C' : [1, 2],
               'gamma' : [0.01, 0.02]
              }

if grid_p == 'joblib':
    ## NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    ## code — only load model files that come from a trusted source.
    Clf = joblib.load('./GridSearchObjects/LSVMSetACP1.pkl')
elif grid_p == 'grid':
    Clf = GridSearchCV(Classifier, parameters, n_jobs = 5, verbose = 4, scoring = 'accuracy')
else:
    ## Fail here with a clear message instead of leaving Clf undefined (or
    ## stale from an earlier run) and failing later in an unrelated cell.
    raise ValueError("grid_p must be 'grid' or 'joblib', got {!r}".format(grid_p))


In [5]:
## Fit Clf on the training data; YTrain is flattened to a 1-D array to serve as the target.
## In 'joblib' mode Clf is the object loaded from disk, so this call fits that loaded object on XTrain.
Clf.fit(XTrain, YTrain.values.ravel())

In [6]:
## Report the outcome of the hyper-parameter search (grid-search mode only):
## first the best cross-validated score, then the estimator that achieved it.
if grid_p == 'grid':
    for resultName in ('best_score_', 'best_estimator_'):
        print(getattr(Clf, resultName))

In [7]:
## Select the model to evaluate: the best estimator found by the grid search,
## or the object loaded from disk when running in 'joblib' mode.
if grid_p == 'grid':
    Classifier = Clf.best_estimator_
elif grid_p == 'joblib':
    Classifier = Clf

## Mean accuracy over 10-fold cross-validation on the training split.
## The score is kept in a one-element list so crossValidationScores stays a list.
cvScore = cross_val_score(Classifier, XTrain, YTrain['FTR'], cv = 10)
crossValidationScores = [np.mean(cvScore)]
print("Cross Validation Score : ", np.mean(crossValidationScores))

## Fit on the whole training split and report accuracy on both splits.
## The 1-D 'FTR' column is used as the target everywhere (as in the
## cross-validation call above) instead of the single-column DataFrame, which
## scikit-learn would have to flatten and warn about (DataConversionWarning).
Classifier.fit(XTrain, YTrain['FTR'])
print("Training Accuracy : ", Classifier.score(XTrain, YTrain['FTR']))
print("Testing Accuracy : ", Classifier.score(XTest, YTest['FTR']))

Cross Validation Score :  0.5371673465423465
Training Accuracy :  0.5405867274280831
Testing Accuracy :  0.5609375


In [8]:
## Rank every feature with Recursive Feature Elimination: one feature is
## eliminated per step until a single one remains, so each feature receives
## its own rank (1 = last one standing).
Rfe = RFE(Classifier, n_features_to_select = 1, step = 1)
## Pass the 1-D 'FTR' column as the target (consistent with the
## cross-validation calls) rather than the single-column DataFrame.
Rfe.fit(XTrain, YTrain['FTR'])
Rankings = Rfe.ranking_

## Feature names, in the same column order that Rfe.ranking_ follows.
Features = list(XTrain.columns.values)

## Map each feature name to its RFE rank, then order the mapping from best
## (rank 1) to worst.
RFERanks = dict(zip(Features, Rankings))
RFERanks = OrderedDict(sorted(RFERanks.items(), key = lambda item: item[1]))
print(RFERanks)

OrderedDict([('ATGD', 1), ('HTGD', 2), ('AForm', 3), ('AStWeighted', 4), ('ASt', 5), ('HForm', 6), ('HStWeighted', 7), ('ACKPP', 8), ('HSt', 9), ('HMidfield', 10), ('HGKPP', 11), ('HSTKPP', 12), ('AOverall', 13), ('ASTKPP', 14), ('AGKPP', 15), ('AAttack', 16), ('ADefense', 17), ('HCKPP', 18), ('HOverall', 19), ('HDefense', 20), ('HAttack', 21), ('AMidfield', 22)])


In [9]:
## Drop the lowest-ranked features and keep only the top ones.
## Features whose RFE rank is at or beyond this cutoff are removed (ranks 1-14 stay).
rankCutoff = 15

print(RFERanks)
removeFeatures = [name for name, rank in RFERanks.items() if rank >= rankCutoff]

## Build new frames instead of dropping in place, and ignore columns that are
## already gone, so re-running this cell does not raise a KeyError.
XTrain = XTrain.drop(columns = removeFeatures, errors = 'ignore')
XTest = XTest.drop(columns = removeFeatures, errors = 'ignore')

OrderedDict([('ATGD', 1), ('HTGD', 2), ('AForm', 3), ('AStWeighted', 4), ('ASt', 5), ('HForm', 6), ('HStWeighted', 7), ('ACKPP', 8), ('HSt', 9), ('HMidfield', 10), ('HGKPP', 11), ('HSTKPP', 12), ('AOverall', 13), ('ASTKPP', 14), ('AGKPP', 15), ('AAttack', 16), ('ADefense', 17), ('HCKPP', 18), ('HOverall', 19), ('HDefense', 20), ('HAttack', 21), ('AMidfield', 22)])


In [10]:
## Hyper-parameter grid explored when grid_p == 'grid' (same grid as before
## feature selection).
## NOTE(review): scikit-learn documents 'gamma' as a coefficient for the 'rbf',
## 'poly' and 'sigmoid' kernels only; if Classifier still uses a linear kernel,
## varying gamma is not expected to change the scores. The key is kept because
## the grid-score heat-map code reads params['gamma'].
parameters = { 'C' : [1, 2],
               'gamma' : [0.01, 0.02]
              }

if grid_p == 'joblib':
    ## NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    ## code — only load model files that come from a trusted source.
    Clf = joblib.load('./GridSearchObjects/LSVMSetACP1.pkl')
elif grid_p == 'grid':
    Clf = GridSearchCV(Classifier, parameters, n_jobs = 5, verbose = 4, scoring = 'accuracy')
else:
    ## Fail here with a clear message instead of silently reusing a Clf left
    ## over from an earlier cell.
    raise ValueError("grid_p must be 'grid' or 'joblib', got {!r}".format(grid_p))

In [11]:
## Fit Clf on the training data, which at this point no longer contains the
## columns dropped after feature selection; YTrain is flattened to a 1-D target array.
Clf.fit(XTrain, YTrain.values.ravel())

In [12]:
## Report the outcome of the hyper-parameter search (grid-search mode only):
## first the best cross-validated score, then the estimator that achieved it.
if grid_p == 'grid':
    for resultName in ('best_score_', 'best_estimator_'):
        print(getattr(Clf, resultName))

In [13]:
## Select the model to evaluate. In 'grid' mode the best estimator found by the
## search is also written to disk, at the same path the 'joblib' mode loads from.
if grid_p == 'grid':
    Classifier = Clf.best_estimator_
    joblib.dump(Classifier,"./GridSearchObjects/LSVMSetACP1.pkl")
elif grid_p == 'joblib':
    Classifier = Clf


## Mean accuracy over 10-fold cross-validation on the training split.
## The score is kept in a one-element list so crossValidationScores stays a list.
cvScore = cross_val_score(Classifier, XTrain, YTrain['FTR'], cv = 10)
crossValidationScores = [np.mean(cvScore)]
print("Cross Validation Score : ", np.mean(crossValidationScores))

## Fit on the whole training split and report accuracy on both splits.
## The 1-D 'FTR' column is used as the target everywhere (as in the
## cross-validation call above) instead of the single-column DataFrame, which
## scikit-learn would have to flatten and warn about (DataConversionWarning).
Classifier.fit(XTrain, YTrain['FTR'])
print("Training Accuracy : ", Classifier.score(XTrain, YTrain['FTR']))
print("Testing Accuracy : ", Classifier.score(XTest, YTest['FTR']))

Cross Validation Score :  0.5368800181300182
Training Accuracy :  0.5383081743093135
Testing Accuracy :  0.565625


In [14]:
## Target classes, in the order used for the rows and columns of the matrix.
classLabels = ['H','A','D']

## Predictions of the current classifier on the held-out test features.
YPred = Classifier.predict(XTest)

## Confusion matrix of true 'FTR' labels (rows) against predictions (columns);
## the bare name on the last line makes the notebook display it.
confusionMatrix = confusion_matrix(YTest['FTR'], YPred, labels = classLabels)
confusionMatrix

array([[272,  36,   0],
       [ 84,  90,   0],
       [130,  28,   0]], dtype=int64)

In [15]:
## Per-class precision, recall and F1 as a text report, with classes listed in classLabels order.
## The 1-D 'FTR' column is passed as the ground truth, matching the confusion-matrix
## call, rather than the whole single-column DataFrame.
classificationReport = classification_report(y_true = YTest['FTR'], y_pred = YPred, labels = classLabels)
print(classificationReport)

              precision    recall  f1-score   support

           H       0.56      0.88      0.69       308
           A       0.58      0.52      0.55       174
           D       0.00      0.00      0.00       158

    accuracy                           0.57       640
   macro avg       0.38      0.47      0.41       640
weighted avg       0.43      0.57      0.48       640



In [16]:
## Heat map of the mean validation score for every (C, gamma) pair tried by
## the grid search. Only meaningful when the search was actually run.
if grid_p == 'grid':
    gridScores = Clf.cv_results_

    ## One (score, C, gamma) row per parameter combination.
    gridScoresList = [
        (meanScore, combo['C'], combo['gamma'])
        for meanScore, combo in zip(gridScores['mean_test_score'], gridScores['params'])
    ]

    df = pd.DataFrame(gridScoresList, columns = ['Validation Score', 'C', 'Gamma'])

    ## Reshape to a Gamma x C table so each cell holds one validation score.
    grid_map = df.pivot(index = 'Gamma', columns='C', values= 'Validation Score')

    sns.heatmap(grid_map, cmap = "YlGnBu")