# Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.svm import LinearSVC, SVC
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.ensemble import BaggingClassifier, VotingClassifier

# Read data

In [2]:
# Load the MNIST train/test splits from the CSV files next to the notebook.
train, test = (pd.read_csv(csv_name) for csv_name in ('mnist_train.csv', 'mnist_test.csv'))

In [142]:
# Summarise the training frame: index range, column count, dtypes and memory use.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Columns: 785 entries, label to 28x28
dtypes: int64(785)
memory usage: 359.3 MB


In [143]:
# Preview the first rows to check the layout (a label column followed by pixel columns).
train.head()

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [145]:
# Count missing values per column and print only the columns that have any.
nan = train.isna().sum()
has_missing = nan.ne(0)
print(nan.loc[has_missing])

Series([], dtype: int64)


# Data preprocessing

In [25]:
# Separate the target column ('label') from the pixel features for both splits.
# `drop` returns a new frame, so `train` and `test` themselves stay untouched.
train_x, train_y = train.drop(columns='label'), train['label']
test_x, test_y = test.drop(columns='label'), test['label']

In [147]:
# Per-pixel summary statistics of the training features.
train_x.describe()

Unnamed: 0,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,1x10,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
count,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,...,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.200433,0.088867,0.045633,0.019283,0.015117,0.002,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.042472,3.956189,2.839845,1.68677,1.678283,0.3466,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254.0,254.0,253.0,253.0,254.0,62.0,0.0,0.0,0.0,0.0


# Trial 1 using binary features

Rescale each image so its pixel values span 0 to 255, then binarize it by thresholding at that image's own mean: pixels below the mean become 0 and all others become 255.

In [26]:
# Binarise every training image (one image per row) in a single vectorised pass.
# This replaces a per-row `.apply` loop that was very slow on the full training
# set, used index labels with `.iloc`, and divided by zero for constant images.
pixels = np.asarray(train_x, dtype=float)
row_min = pixels.min(axis=1, keepdims=True)
row_span = pixels.max(axis=1, keepdims=True) - row_min
row_span[row_span == 0] = 1  # constant image: avoid 0/0, the row simply rescales to zeros
# Step 1: stretch each image to the 0..255 range.
rescaled = 255 * (pixels - row_min) / row_span
# Step 2: threshold each image at its own mean (below -> 0, otherwise -> 255).
row_mean = rescaled.mean(axis=1, keepdims=True)
train_x = pd.DataFrame(
    np.where(rescaled < row_mean, 0, 255),
    index=train_x.index,
    columns=train_x.columns,
)

In [27]:
# Binarise every test image (one image per row) in a single vectorised pass,
# using each image's own min/max/mean exactly as is done for the training set.
# This replaces a slow per-row `.apply` loop that used index labels with `.iloc`
# and divided by zero for constant images.
pixels = np.asarray(test_x, dtype=float)
row_min = pixels.min(axis=1, keepdims=True)
row_span = pixels.max(axis=1, keepdims=True) - row_min
row_span[row_span == 0] = 1  # constant image: avoid 0/0, the row simply rescales to zeros
# Step 1: stretch each image to the 0..255 range.
rescaled = 255 * (pixels - row_min) / row_span
# Step 2: threshold each image at its own mean (below -> 0, otherwise -> 255).
row_mean = rescaled.mean(axis=1, keepdims=True)
test_x = pd.DataFrame(
    np.where(rescaled < row_mean, 0, 255),
    index=test_x.index,
    columns=test_x.columns,
)

In [178]:
# Per-pixel summary statistics of train_x at this point (after the thresholding cells above).
train_x.describe()

Unnamed: 0,1x13,1x14,1x15,2x6,2x7,2x8,2x9,2x10,2x11,2x12,...,28x15,28x16,28x17,28x18,28x19,28x20,28x21,28x22,28x23,28x24
count,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,...,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0
mean,0.00425,0.0085,0.00425,0.00425,0.02125,0.03825,0.06375,0.1105,0.17425,0.255,...,1.08375,0.952,0.78625,0.54825,0.33575,0.16575,0.07225,0.0425,0.0255,0.0085
std,1.041033,1.472231,1.041033,1.041033,2.327743,3.122891,4.031434,5.307142,6.663642,8.059842,...,16.588741,15.551776,14.137853,11.811245,9.246889,6.499191,4.291717,3.291789,2.549894,1.472231
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,...,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0


* Some columns (pixels) have values that don't change (always 0 or 255) throughout whole dataset ==> remove those columns in train and test

In [28]:
# Remove pixels that carry no information in the training set: columns that are
# always 255 (min == 255) or always 0 (max == 0). The same columns are removed
# from the test set so both frames keep an identical feature layout.
# The column extremes are computed once and everything is dropped in one call,
# instead of copying both frames once per dropped column.
col_min, col_max = train_x.min(), train_x.max()
dropped_255 = col_min.index[col_min == 255].tolist()
dropped_0 = col_max.index[col_max == 0].tolist()
constant_cols = dropped_255 + dropped_0
train_x = train_x.drop(columns=constant_cols)
test_x = test_x.drop(columns=constant_cols)

## Separate classifiers

**Gaussian Naive Bayes**:

* Using bayes' rule for conditional probabilities to predict the probability of belonging to each class.
* Naive assumption => features are conditionally independent and have a gaussian distribution
* Gave a very low accuracy => I think because it doesn't suit this type of data

In [29]:
# Keep only the first 15000 training samples (features and labels stay aligned).
train_x = train_x.head(15000)
train_y = train_y.head(15000)

In [30]:
# Fit Gaussian Naive Bayes on the binarised features, then report the number of
# misclassified test images and the test accuracy.
gnb = GaussianNB()
NB = gnb.fit(X=train_x, y=train_y)  # fit() returns the estimator itself
y_pred = NB.predict(test_x)
print("Number of mislabeled points out of a total %d points : %d"
      % (len(test_x), np.count_nonzero(test_y != y_pred)))
print("Accuracy:", metrics.accuracy_score(y_true=test_y, y_pred=y_pred))

Number of mislabeled points out of a total 10000 points : 4507
Accuracy: 0.5493


**Decision Tree**:

* What we took in class was ID3 implementation
* Scikit-learn uses a CART implementation, which chooses splits based on Gini impurity by default
* Some post-pruning is done
* It supports numerical target variables (regression) and does not compute rule sets
* CART constructs binary trees using the feature and threshold that yield the largest information gain at each node

In [31]:
# Train a CART decision tree on the binarised features.
# random_state is fixed (as for the LinearSVC and Bagging cells) because the
# tree shuffles candidate features at each split, so an unseeded fit can give
# a different tree, and a different accuracy, on every run.
DTclf = tree.DecisionTreeClassifier(random_state=0)
DTclf = DTclf.fit(train_x, train_y)

In [32]:
# Evaluate the decision tree on the test set: error count and accuracy.
y_pred = DTclf.predict(test_x)
print("Number of mislabeled points out of a total %d points : %d"
      % (len(test_x), np.count_nonzero(test_y != y_pred)))
print("Accuracy:", metrics.accuracy_score(y_true=test_y, y_pred=y_pred))

Number of mislabeled points out of a total 10000 points : 1591
Accuracy: 0.8409


**One-vs-rest SVC**:

* SVM works on principle of dividing the data into 2 classes by using a hyperplane that's midway between 2 other hyperplanes described by support vectors.
* It uses Lagrange multipliers to solve for the weights subject to the margin constraints
* My implementation of SVM from scratch is in the "SVM from scratch" notebook, and is based on the Pegasos algorithm, whose pseudocode is in the uploaded paper "PegasosMPB.pdf".
* In Scikit, LinearSVC implements “one-vs-the-rest” multi-class strategy, thus training n_class models. If there are only two classes, only one model is trained.
* Gives the highest accuracy of the three separate classifiers

In [33]:
# One-vs-rest linear SVM on the binarised features; the seed keeps the solver
# reproducible. The bare fit() call is the cell's last expression so the fitted
# estimator is displayed as the cell output.
SVMclf = LinearSVC(
    max_iter=1500,
    tol=1e-5,
    random_state=0,
)
SVMclf.fit(train_x, train_y)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1500,
          multi_class='ovr', penalty='l2', random_state=0, tol=1e-05,
          verbose=0)

In [34]:
# Evaluate the linear SVM on the test set: error count and accuracy.
y_pred = SVMclf.predict(test_x)
print("Number of mislabeled points out of a total %d points : %d"
      % (len(test_x), np.count_nonzero(test_y != y_pred)))
print("Accuracy:", metrics.accuracy_score(y_true=test_y, y_pred=y_pred))

Number of mislabeled points out of a total 10000 points : 1479
Accuracy: 0.8521


## Ensemble classifier

* Ensemble classifiers are based on the concept of combining a number of weak learners to get a strong learner.
* With `voting='soft'`, a voting classifier predicts the class label as the argmax of the sums of the predicted probabilities from each estimator.
* By default every estimator gets the same weight (equivalent to [1, 1, 1] here)

In [38]:
# Soft-voting ensemble of a linear-kernel SVC, Gaussian Naive Bayes and a
# decision tree. Soft voting averages class probabilities, so the SVC is built
# with probability=True.
vote = VotingClassifier(
    estimators=[
        ('svc', SVC(kernel='linear', probability=True)),
        ('gnb', GaussianNB()),
        ('dt', tree.DecisionTreeClassifier()),
    ],
    voting='soft',
)
# Alternative estimator left out of the ensemble:
# 'lsvc', LinearSVC()
# Fitting the standalone ensemble is disabled in this cell:
# vote = vote.fit(train_x, train_y)

In [36]:
# NOTE(review): evaluation of the standalone voting classifier is disabled here,
# and it needs `vote` to be fitted first. The output recorded under this cell
# presumably comes from an earlier run in which these lines were active;
# confirm before relying on those numbers.
# y_pred = vote.predict(test_x)
# print("Number of mislabeled points out of a total %d points : %d" % (test_x.shape[0], (test_y != y_pred).sum()))
# print("Accuracy:",metrics.accuracy_score(test_y, y_pred))

Number of mislabeled points out of a total 10000 points : 1159
Accuracy: 0.8841


Bagging, where subsets of the dataset are drawn with replacement to train each base estimator

In [39]:
# Bagging: 10 copies of the voting ensemble, each trained on a bootstrap sample.
# The base model is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases; the first
# positional argument works with both. BaggingClassifier clones the base model,
# so `vote` does not need to be fitted beforehand.
bagging = BaggingClassifier(vote, n_estimators=10, random_state=0)
bagging = bagging.fit(train_x, train_y)

In [40]:
# Evaluate the bagged voting ensemble on the test set: error count and accuracy.
y_pred = bagging.predict(test_x)
print("Number of mislabeled points out of a total %d points : %d"
      % (len(test_x), np.count_nonzero(test_y != y_pred)))
print("Accuracy:", metrics.accuracy_score(y_true=test_y, y_pred=y_pred))

Number of mislabeled points out of a total 10000 points : 937
Accuracy: 0.9063


Pasting, where subsets of the dataset are drawn without replacement to train each base estimator

In [41]:
# Pasting: like bagging, but samples are drawn without replacement
# (bootstrap=False). The base model is passed positionally because its keyword
# was renamed from `base_estimator` to `estimator` in newer scikit-learn
# releases; the first positional argument works with both.
# NOTE(review): max_samples is left at its default of 1.0, so each of the 10
# estimators appears to receive the full training set and they would differ
# only through their random seeds. Consider max_samples < 1.0 for real pasting.
pasting = BaggingClassifier(vote, n_estimators=10, random_state=0, bootstrap=False)
pasting = pasting.fit(train_x, train_y)

In [42]:
# Evaluate the pasted voting ensemble on the test set: error count and accuracy.
y_pred = pasting.predict(test_x)
print("Number of mislabeled points out of a total %d points : %d"
      % (len(test_x), np.count_nonzero(test_y != y_pred)))
print("Accuracy:", metrics.accuracy_score(y_true=test_y, y_pred=y_pred))

Number of mislabeled points out of a total 10000 points : 1086
Accuracy: 0.8914


# Trial 2 using standard scaled features

In [17]:
# Trial 2 starts again from the raw frames: first 15000 training rows, full test set.
train_first = train.head(15000)
train_x2, train_y2 = train_first.drop(columns='label'), train_first['label']
test_x2, test_y2 = test.drop(columns='label'), test['label']
del train_first  # temporary helper only; keep the notebook namespace unchanged

In [18]:
# Remove pixels that carry no information in the trial-2 training subset:
# columns that are always 255 (min == 255) or always 0 (max == 0). The same
# columns are removed from the test set so both frames keep the same layout.
# The column extremes are computed once and everything is dropped in one call,
# instead of copying both frames once per dropped column.
col_min, col_max = train_x2.min(), train_x2.max()
dropped_255 = col_min.index[col_min == 255].tolist()
dropped_0 = col_max.index[col_max == 0].tolist()
constant_cols = dropped_255 + dropped_0
train_x2 = train_x2.drop(columns=constant_cols)
test_x2 = test_x2.drop(columns=constant_cols)

In [19]:
# Standardise features using statistics learned from the training subset only,
# then apply the same transform to the test set.
scaler = preprocessing.StandardScaler()
scaled_train_x = scaler.fit_transform(train_x2)
scaled_test_x = scaler.transform(test_x2)

## Separate classifiers

In [20]:
# Fit Gaussian Naive Bayes on the standardised features, then report the number
# of misclassified test images and the test accuracy.
gnb = GaussianNB()
NB = gnb.fit(X=scaled_train_x, y=train_y2)  # fit() returns the estimator itself
y_pred = NB.predict(scaled_test_x)
print("Number of mislabeled points out of a total %d points : %d"
      % (len(scaled_test_x), np.count_nonzero(test_y2 != y_pred)))
print("Accuracy:", metrics.accuracy_score(y_true=test_y2, y_pred=y_pred))

Number of mislabeled points out of a total 10000 points : 4662
Accuracy: 0.5338


In [21]:
# Train a CART decision tree on the standardised features.
# random_state is fixed (as for the LinearSVC and Bagging cells) because the
# tree shuffles candidate features at each split, so an unseeded fit can give
# a different tree, and a different accuracy, on every run.
DTclf = tree.DecisionTreeClassifier(random_state=0)
DTclf = DTclf.fit(scaled_train_x, train_y2)

In [22]:
# Evaluate the decision tree on the standardised test set: error count and accuracy.
y_pred = DTclf.predict(scaled_test_x)
print("Number of mislabeled points out of a total %d points : %d"
      % (len(scaled_test_x), np.count_nonzero(test_y2 != y_pred)))
print("Accuracy:", metrics.accuracy_score(y_true=test_y2, y_pred=y_pred))

Number of mislabeled points out of a total 10000 points : 1700
Accuracy: 0.83


In [23]:
# One-vs-rest linear SVM on the standardised features; the seed keeps the solver
# reproducible. The bare fit() call is the cell's last expression so the fitted
# estimator is displayed as the cell output.
SVMclf = LinearSVC(
    max_iter=1500,
    tol=1e-5,
    random_state=0,
)
SVMclf.fit(scaled_train_x, train_y2)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1500,
          multi_class='ovr', penalty='l2', random_state=0, tol=1e-05,
          verbose=0)

In [24]:
# Evaluate the linear SVM on the standardised test set: error count and accuracy.
y_pred = SVMclf.predict(scaled_test_x)
print("Number of mislabeled points out of a total %d points : %d"
      % (len(scaled_test_x), np.count_nonzero(test_y2 != y_pred)))
print("Accuracy:", metrics.accuracy_score(y_true=test_y2, y_pred=y_pred))

Number of mislabeled points out of a total 10000 points : 1236
Accuracy: 0.8764


## Ensemble classifier

**Trained on only 15000 samples because of the long training time for all 60000**

In [7]:
# The ensemble experiments reuse the trial-2 data; these are aliases, not copies.
train_xE, train_yE = train_x2, train_y2
test_xE, test_yE = test_x2, test_y2

In [8]:
# Fit the scaler on the ensemble training data only, then standardise both
# splits with it (this rebinds scaled_train_x / scaled_test_x).
scalerE = preprocessing.StandardScaler()
scalerE.fit(train_xE)
scaled_train_x, scaled_test_x = (scalerE.transform(split) for split in (train_xE, test_xE))

**Voting classifier with Bagging or Pasting**

In [12]:
# Soft-voting ensemble used as the base model for bagging and pasting: a
# linear-kernel SVC (probability=True so it can supply class probabilities),
# Gaussian Naive Bayes and a decision tree.
voteEN = VotingClassifier(
    estimators=[
        ('svc', SVC(kernel='linear', probability=True)),
        ('gnb', GaussianNB()),
        ('dt', tree.DecisionTreeClassifier()),
    ],
    voting='soft',
)

**Bagging**

In [13]:
# Bagging: 10 copies of the voting ensemble, each trained on a bootstrap sample
# of the standardised training data.
# The base model is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases; the first
# positional argument works with both.
bagging = BaggingClassifier(voteEN, n_estimators=10, random_state=0)
bagging = bagging.fit(scaled_train_x, train_yE)

In [14]:
# Evaluate the bagged voting ensemble on the standardised test set.
y_pred = bagging.predict(scaled_test_x)
print("Number of mislabeled points out of a total %d points : %d"
      % (len(scaled_test_x), np.count_nonzero(test_yE != y_pred)))
print("Accuracy:", metrics.accuracy_score(y_true=test_yE, y_pred=y_pred))

Number of mislabeled points out of a total 10000 points : 935
Accuracy: 0.9065


**Pasting**

In [15]:
# Pasting: like bagging, but samples are drawn without replacement
# (bootstrap=False).
# Uses `voteEN`, the voting ensemble defined for this section, as the bagging
# cell does; the previous reference to `vote` depended on a variable from
# another part of the notebook. The base model is passed positionally because
# its keyword was renamed from `base_estimator` to `estimator` in newer
# scikit-learn releases.
# NOTE(review): max_samples is left at its default of 1.0, so each of the 10
# estimators appears to receive the full training set and they would differ
# only through their random seeds. Consider max_samples < 1.0 for real pasting.
pasting = BaggingClassifier(voteEN, n_estimators=10, random_state=0, bootstrap=False)
pasting = pasting.fit(scaled_train_x, train_yE)

In [16]:
# Evaluate the pasted voting ensemble on the standardised test set.
y_pred = pasting.predict(scaled_test_x)
print("Number of mislabeled points out of a total %d points : %d"
      % (len(scaled_test_x), np.count_nonzero(test_yE != y_pred)))
print("Accuracy:", metrics.accuracy_score(y_true=test_yE, y_pred=y_pred))

Number of mislabeled points out of a total 10000 points : 1141
Accuracy: 0.8859


**Voting classifier on its own**

In [9]:
# Standalone soft-voting ensemble (no bagging) fitted on the standardised data.
# Unlike the bagged variants, nothing seeds the members here, so the SVC's
# probability calibration and the decision tree get an explicit random_state to
# keep the reported accuracy reproducible across runs.
vote = VotingClassifier(
    estimators=[
        ('svc', SVC(kernel='linear', probability=True, random_state=0)),
        ('gnb', GaussianNB()),
        ('dt', tree.DecisionTreeClassifier(random_state=0)),
    ],
    voting='soft',
)
# Alternative estimator left out of the ensemble:
# 'lsvc', LinearSVC())
vote = vote.fit(scaled_train_x, train_yE)

In [10]:
# Evaluate the standalone voting ensemble on the standardised test set.
y_pred = vote.predict(scaled_test_x)
print("Number of mislabeled points out of a total %d points : %d"
      % (len(scaled_test_x), np.count_nonzero(test_yE != y_pred)))
print("Accuracy:", metrics.accuracy_score(y_true=test_yE, y_pred=y_pred))

Number of mislabeled points out of a total 10000 points : 1234
Accuracy: 0.8766


# Comparison

|                              | Naive Bayes | Decision Tree | SVM    | Voting + Bagging | Voting + Pasting |
|------------------------------|-------------|---------------|--------|------------------|------------------|
| Trial 2: Standard scaling    | 53.38%      | 83%           | 87.64% | 90.65%           | 88.59%           |
| Trial 1: Binary thresholding | 54.93%      | 84.09%        | 85.21% | 90.63%           | 89.14%           |