In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

<h3> Load the Dataset </h3>

In [1]:
ls

Assignmet_2.docx                      Modelling_Knn.ipynb
Dummy estimators.ipynb                Modelling_SVM.ipynb
EDA.ipynb                             Modelling_SVM_V2.ipynb
EDA_V2_Kaggle.ipynb                   XGBOOST.ipynb
Exercise 2.pdf                        [31mdigit_recognizer_dataset.csv[m[m*
Feature Selection.ipynb               digit_recognizer_dataset_reduced.csv
Icon?                                 ~$signmet_2.docx
Modelling_Decision_Trees.ipynb


In [4]:
data = pd.read_csv("digit_recognizer_dataset_reduced.csv")

In [5]:
# Plot the first 5 records
data.head()

Unnamed: 0,label,pixel12,pixel13,pixel14,pixel15,pixel32,pixel33,pixel34,pixel35,pixel36,...,pixel770,pixel771,pixel772,pixel773,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Seperate the dataset into Features and Labels
y =data['label'].values       # Target values - numpy.ndarray
X = data.drop(['label'],axis=1).values

<h3> Apply PCA Transformation with 15 Components </h3>

In [9]:
from sklearn.decomposition import PCA

pca = PCA(n_components=15, whiten=True)
new_X = pca.fit_transform(X)

print(pca.explained_variance_ratio_)

[0.09748938 0.07160266 0.06145903 0.05379302 0.04894262 0.04303214
 0.03277051 0.02892103 0.02766902 0.02348871 0.02099324 0.02058998
 0.01702517 0.01692775 0.01581118]


<h3> Search for the Best Hyper-parameters using Random Search </h3>

In [22]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import scipy

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

import matplotlib.pyplot as plt 
%matplotlib inline

distributions = {'max_depth': [x for x in range(200)], 'min_samples_leaf': scipy.stats.expon(scale=.1),
  'criterion': ['gini', 'entropy']}

# distributions = dict(C=uniform(loc=0, scale=4), penalty=['l2', 'l1'])

clf = RandomizedSearchCV(DecisionTreeClassifier(), distributions, random_state=0)

search = clf.fit(new_X, y)

search.best_params_

{'criterion': 'entropy',
 'max_depth': 70,
 'min_samples_leaf': 0.005838467078070335}

In [None]:
# hyper - parameters

# max_depth
# min_samples_leaf
# criterion {“gini”, “entropy”}

<h3> Train and evaluate a Decision Tree using the previously determined optimum hyper-parameters </h3>

In [24]:
# 5-Fold Cross - Validation 
model = DecisionTreeClassifier(max_depth=70, criterion='entropy', min_samples_leaf = 0.005838467078070335, random_state=0)
model_scores = cross_val_score(model, new_X, y, cv=5, n_jobs=-1, scoring='accuracy')

print("Accuracy scores {}".format(model_scores))
print("Mean accuracy {}".format(model_scores.mean()))

Accuracy scores [0.7247619  0.7302381  0.72809524 0.73047619 0.73440476]
Mean accuracy 0.7295952380952382


<h3> Train and Evaluate using Random Split using the previously determined optimum hyper-parameters </h3>

In [35]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(new_X, y, test_size=0.33, random_state=42)

In [36]:
model = DecisionTreeClassifier(max_depth=70, criterion='entropy', min_samples_leaf = 0.005838467078070335, random_state=0)
model.fit(X_train,y_train)

predictions = model.predict(X_test)
conf_matrix = confusion_matrix(y_test,predictions)

print(f" Accuracy Score: {accuracy_score(y_test, predictions)}\n")
print(" Confusion Matrix \n")
print(conf_matrix)

 Accuracy Score: 0.7316738816738817

 Confusion Matrix 

[[1009    0   51   39    3   79   35   46   56   15]
 [   0 1428   34    0    0   31   12    1   14    0]
 [  27   28 1010   49   41   29  104   18   97   11]
 [  21   21   40 1012   11  162   43   15  129   17]
 [   2   25   31    2  894   26   26   63   22  267]
 [  38   22   27  102   30  735   59   39  119   34]
 [  53   26   97   21    9   29 1105    3   36   18]
 [   0   30   21   12    7   18   15 1197   36  144]
 [  23   23   78   60   16  114   25    7  918   70]
 [   4   27    7   25  181   69   22  145   35  833]]


<h3> Evaluation Metrics:  Precision - Recall - F1_Score </h3>

In [30]:
from sklearn.metrics import precision_score
precision = precision_score(y_test, predictions, average='macro') # precision_score(y_true, y_pred, average='macro')

print('Macro precision score: {0:0.2f}'.format(
      precision))

Macro precision score: 0.73


In [34]:
from sklearn.metrics import recall_score
recall = recall_score(y_test, predictions, average='macro') 

print('Macro recall score: {0:0.2f}'.format(
      recall))

Macro recall score: 0.73


In [32]:
from sklearn.metrics import f1_score
f1_s = f1_score(y_test, predictions, average='macro')

print('Macro f1 score: {0:0.2f}'.format(
      f1_s))
                    

Macro f1 score: 0.73


<h3> Feature Ranking </h3>

In [37]:
# The Recursive Feature Elimination (or RFE) works by recursively removing attributes 
# and building a model on those attributes that remain.
# It uses the model accuracy to identify which attributes 
# contribute the most to predicting the target attribute.

from pandas import read_csv
from sklearn.feature_selection import RFE


# feature extraction
# model = SVR(kernel="linear")
# Select best 3 attributes
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(new_X, y)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

Num Features: 3
Selected Features: [ True  True False  True False False False False False False False False
 False False False]
Feature Ranking: [ 1  1  5  1  3  2  4  6 11  9 13 10  7 12  8]


In [39]:
# Show the selected features
print('Selected features:')
feats = list(data.drop(['label'],axis=1).columns.values)
for i, rank in enumerate(fit.ranking_):
    if rank == 1:
        print(feats[i])

Selected features:
pixel12
pixel13
pixel15
