In [None]:
import numpy as np
import numpy.random as rnd
import matplotlib.pyplot as plt
import itertools
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve


def plot(image):
    """Display one image in a new figure using the "gray" colormap.

    An input whose first dimension is already 32 is drawn as given; anything
    else is first reshaped to a 32x32 array.
    """
    plt.figure()
    pixels = image if image.shape[0] == 32 else image.reshape((32, 32))
    plt.imshow(pixels, cmap="gray")
    plt.show()


def plot_confusion_matrix(
    cm, classes, normalize=False, title="Confusion matrix", cmap=plt.cm.Blues
):
    """
    Print a one-line header and draw the confusion matrix as an annotated heat map.

    Parameters
    ----------
    cm : 2-D numpy array
        Confusion matrix. Row ``i`` is drawn on the "True label" axis and
        column ``j`` on the "Predicted label" axis.
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool, default False
        When True, every row is divided by its sum before plotting and the
        cells are annotated with two decimals instead of integers. Rows that
        sum to zero are left at 0 rather than becoming NaN.
    title : str
        Title of the figure.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None
        The plot is left on a newly created pyplot figure.
    """
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class without any true sample has an all-zero row: keep it at 0
        # instead of dividing by zero and poisoning the colour threshold.
        cm = np.divide(
            cm.astype("float"),
            row_totals,
            out=np.zeros(cm.shape, dtype="float"),
            where=row_totals != 0,
        )
        print("Normalized confusion matrix")
    else:
        print("Confusion matrix, without normalization")

    plt.figure(figsize=(6.5, 6.5))
    plt.imshow(cm, interpolation="none", cmap=cmap)
    plt.title(title)
    plt.colorbar(fraction=0.046, pad=0.04)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, ha="right")
    plt.yticks(tick_marks, classes)

    fmt = ".2f" if normalize else "d"
    # Cells darker than half of the maximum get white text so it stays readable.
    thresh = cm.max() / 2.0
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(
            j,
            i,
            format(cm[i, j], fmt),
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black",
        )

    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    # Fit the layout only once the axis labels exist, so they are included.
    plt.tight_layout()


# Get data

In [None]:
import keras.datasets.cifar10 as cifar10

# Load the CIFAR-10 train/test split.
(Xtrain, ytrain), (Xtest, ytest) = cifar10.load_data()

# The labels come as 1-column matrices; flatten them to plain 1-D arrays.
ytrain, ytest = np.squeeze(ytrain), np.squeeze(ytest)

# Show the shapes of the images and labels of each split.
for images, labels in ((Xtrain, ytrain), (Xtest, ytest)):
    print(images.shape, labels.shape)


# Randomize train data and sort test data

In [None]:
# Seed NumPy's global generator so the shuffle below is repeatable on a fresh
# "Restart & Run All". Later cells that draw from the same global generator
# (e.g. scikit-learn estimators left at random_state=None) become repeatable too.
rnd.seed(42)

# Shuffle the training samples, keeping images and labels aligned.
idx = rnd.permutation(Xtrain.shape[0])
Xtrain = Xtrain[idx]
ytrain = ytrain[idx]

# Sort the test samples by label so per-class plots show contiguous blocks.
idx = np.argsort(ytest)
Xtest = Xtest[idx]
ytest = ytest[idx]


# Convert images to grayscale
We're converting the images to grayscale to reduce the computational cost of the classification (one channel instead of three). We expect the accuracy to be worse than with colored images, because some images would be easier to classify by color. For example, we would expect a model trained on colored images to use the blue surrounding a ship or a plane to better distinguish them from the other classes.

In [None]:
from skimage import color

# Grayscale copy of the training images, converted one image at a time.
XtrainG = np.empty((len(Xtrain), 32, 32))
for k, rgb_image in enumerate(Xtrain):
    XtrainG[k] = color.rgb2gray(rgb_image)

# Same conversion for the test images.
XtestG = np.empty((len(Xtest), 32, 32))
for k, rgb_image in enumerate(Xtest):
    XtestG[k] = color.rgb2gray(rgb_image)


# Convert images to vectors

In [None]:
# Flatten every image into one row of pixels. Letting NumPy infer the row
# length (-1) keeps the cell re-runnable: on already-flat arrays it is a no-op
# instead of a reshape error.
XtrainG = XtrainG.reshape((XtrainG.shape[0], -1)) * 1.0
XtestG = XtestG.reshape((XtestG.shape[0], -1)) * 1.0

print(XtrainG.shape, ytrain.shape)
print(XtestG.shape, ytest.shape)


# Normalization

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training images only, then apply that same transform
# to both splits.
ss = StandardScaler()
XtrainGS = ss.fit_transform(XtrainG)
XtestGS = ss.transform(XtestG)


# Binarize for ship (index 8)

In [None]:
# Binary targets: 1 where the label is 8 (ship), 0 everywhere else.
ytrainB, ytestB = ((labels == 8).astype(int) for labels in (ytrain, ytest))


#### Change names

In [None]:
# Short aliases: 1 = train, 2 = test; "s" = standardized features, "b" = binary labels.
X1, X1s, y1, y1b = XtrainG, XtrainGS, ytrain, ytrainB
X2, X2s, y2, y2b = XtestG, XtestGS, ytest, ytestB


# Binary classification

## SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier


### Not normalized data

In [None]:
# Default SGD classifier fitted on the raw (non-standardized) pixels.
sgdb = SGDClassifier()
sgdb.fit(X1, y1b)
y2e = sgdb.predict(X2)

print('Score:', sgdb.score(X2, y2b))
print("Number of errors:", np.count_nonzero(y2b != y2e))
print(classification_report(y2b, y2e))

# Predictions (dots) drawn over the true labels (line); the test set is sorted by class.
fig, ax = plt.subplots()
ax.plot(y2e, ".", alpha=0.5)
ax.plot(y2b)
plt.show()

print('Prediction')
cm = confusion_matrix(y2b, y2e)
print(cm)


#### Decision function

In [None]:
th = -1.5
y2d = sgdb.decision_function(X2)

# Decision score of every test sample, with the chosen cut-off in red.
fig, ax = plt.subplots()
ax.plot(y2d)
ax.axhline(th, color='r')
ax.set_title("Decision function")
plt.show()

print(f'Decision function with threshold = {th}')
cm = confusion_matrix(y2b, (y2d >= th).astype(int))
print(cm)

# ROC curve over all thresholds, plus the operating point reached with `th`.
fp, tp, t = roc_curve(y2b, y2d)
(tn_count, fp_count), (fn_count, tp_count) = cm
fig, ax = plt.subplots()
ax.plot(fp, tp)
ax.plot(fp_count / (tn_count + fp_count),
        tp_count / (fn_count + tp_count), 'ob')
ax.set_title('ROC curve')
ax.set_xlabel("FP-rate")
ax.set_ylabel("TP-rate")
ax.grid(True)
plt.show()


### Normalized data

In [None]:
# Default SGD classifier fitted on the standardized pixels.
sgdbn = SGDClassifier()
sgdbn.fit(X1s, y1b)
y2e = sgdbn.predict(X2s)

print('Score:', sgdbn.score(X2s, y2b))
print("Number of errors:", np.count_nonzero(y2b != y2e))
print(classification_report(y2b, y2e))

# Predictions (dots) drawn over the true labels (line); the test set is sorted by class.
fig, ax = plt.subplots()
ax.plot(y2e, ".", alpha=0.5)
ax.plot(y2b)
plt.show()

print('Prediction')
cm = confusion_matrix(y2b, y2e)
print(cm)


#### Decision function

In [None]:
th = -1.5
# `sgdbn` was fitted on the standardized features (X1s), so it has to be
# evaluated on the standardized test set (X2s), not on the raw pixels.
y2d = sgdbn.decision_function(X2s)

# Decision score of every test sample, with the chosen cut-off in red.
plt.figure()
plt.plot(y2d)
plt.axhline(th, color='r')
plt.title("Decision function")
plt.show()

print(f'Decision function with threshold = {th}')
cm = confusion_matrix(y2b, (y2d >= th)*1)
print(cm)

# ROC curve over all thresholds, plus the operating point reached with `th`
# (false-positive rate, true-positive rate read from the confusion matrix).
fp, tp, t = roc_curve(y2b, y2d)
plt.figure()
plt.plot(fp, tp)
plt.plot(cm[0][1] / (cm[0][0] + cm[0][1]),
         cm[1][1] / (cm[1][0] + cm[1][1]), 'ob')
plt.title('ROC curve')
plt.xlabel("FP-rate")
plt.ylabel("TP-rate")
plt.grid(True)
plt.show()


We can see that, without any normalization, the classifier doesn't predict any image as a ship. The score is still 0.9 because 90% of the test images are not ships, so always answering "not a ship" is right 9 times out of 10.
With normalization, the classifier is able to predict some of the images as ships, although it makes slightly more errors overall.

## RandomForestClassifier
TODO: explain why RandomForestClassifier was chosen as the second classifier.

In [None]:
from sklearn.ensemble import RandomForestClassifier


### Not normalized data

In [None]:
# Default random forest fitted on the raw (non-standardized) pixels.
rfcb = RandomForestClassifier()
rfcb.fit(X1, y1b)
y2e = rfcb.predict(X2)

print('Score:', rfcb.score(X2, y2b))
print("Number of errors:", np.count_nonzero(y2b != y2e))
print(classification_report(y2b, y2e))

# Predictions (dots) drawn over the true labels (line); the test set is sorted by class.
fig, ax = plt.subplots()
ax.plot(y2e, ".", alpha=0.5)
ax.plot(y2b)
plt.show()

print('Prediction')
cm = confusion_matrix(y2b, y2e)
print(cm)


#### Decision function

In [None]:
th = 0.25
# The forest has no decision_function; the second predict_proba column
# (presumably the probability of class 1, "ship") is used as the score instead.
y2d = rfcb.predict_proba(X2)[:, 1]

# Score of every test sample, with the chosen cut-off in red.
fig, ax = plt.subplots()
ax.plot(y2d)
ax.axhline(th, color='r')
ax.set_title("Decision function")
plt.show()

print(f'Decision function with threshold = {th}')
cm = confusion_matrix(y2b, (y2d >= th).astype(int))
print(cm)

# ROC curve over all thresholds, plus the operating point reached with `th`.
fp, tp, t = roc_curve(y2b, y2d)
(tn_count, fp_count), (fn_count, tp_count) = cm
fig, ax = plt.subplots()
ax.plot(fp, tp)
ax.plot(fp_count / (tn_count + fp_count),
        tp_count / (fn_count + tp_count), 'ob')
ax.set_title('ROC curve')
ax.set_xlabel("FP-rate")
ax.set_ylabel("TP-rate")
ax.grid(True)
plt.show()


### Normalized data

In [None]:
# Default random forest fitted on the standardized pixels.
rfcbn = RandomForestClassifier()
rfcbn.fit(X1s, y1b)
y2e = rfcbn.predict(X2s)

print('Score:', rfcbn.score(X2s, y2b))
print("Number of errors:", np.count_nonzero(y2b != y2e))
print(classification_report(y2b, y2e))

# Predictions (dots) drawn over the true labels (line); the test set is sorted by class.
fig, ax = plt.subplots()
ax.plot(y2e, ".", alpha=0.5)
ax.plot(y2b)
plt.show()

print('Prediction')
cm = confusion_matrix(y2b, y2e)
print(cm)


#### Decision function

In [None]:
th = 0.25
# The forest has no decision_function; the second predict_proba column
# (presumably the probability of class 1, "ship") is used as the score instead.
y2d = rfcbn.predict_proba(X2s)[:, 1]

# Score of every test sample, with the chosen cut-off in red.
fig, ax = plt.subplots()
ax.plot(y2d)
ax.axhline(th, color='r')
ax.set_title("Decision function")
plt.show()

print(f'Decision function with threshold = {th}')
cm = confusion_matrix(y2b, (y2d >= th).astype(int))
print(cm)

# ROC curve over all thresholds, plus the operating point reached with `th`.
fp, tp, t = roc_curve(y2b, y2d)
(tn_count, fp_count), (fn_count, tp_count) = cm
fig, ax = plt.subplots()
ax.plot(fp, tp)
ax.plot(fp_count / (tn_count + fp_count),
        tp_count / (fn_count + tp_count), 'ob')
ax.set_title('ROC curve')
ax.set_xlabel("FP-rate")
ax.set_ylabel("TP-rate")
ax.grid(True)
plt.show()


#### Compare normalization!

#### Compare classifiers!

# Multi-class classification

We decided to use the SGDClassifier because it takes significantly less time to train than the RandomForestClassifier.

### Not normalized data

In [None]:
# Ten-class SGD classifier fitted on the raw (non-standardized) pixels.
sgd = SGDClassifier()
sgd.fit(X1, y1)
y2e = sgd.predict(X2)

print('Score:', sgd.score(X2, y2))
print("Number of errors:", np.count_nonzero(y2 != y2e))
print(classification_report(y2, y2e))

# Predicted class (dots) drawn over the true class (staircase line).
fig, ax = plt.subplots()
ax.plot(y2e, ".", alpha=0.5)
ax.plot(y2)
plt.show()

print('Prediction')
cm = confusion_matrix(y2, y2e)

# Class names listed in label order (index 8 is "ship").
class_names = [
    "airplane", "automobile", "bird", "cat", "deer",
    "dog", "frog", "horse", "ship", "truck",
]
plot_confusion_matrix(cm, classes=class_names)


### Normalized data

In [None]:
# Ten-class SGD classifier fitted on the standardized pixels.
sgdn = SGDClassifier()
sgdn.fit(X1s, y1)
y2e = sgdn.predict(X2s)

print('Score:', sgdn.score(X2s, y2))
print("Number of errors:", np.count_nonzero(y2 != y2e))
print(classification_report(y2, y2e))

# Predicted class (dots) drawn over the true class (staircase line).
fig, ax = plt.subplots()
ax.plot(y2e, ".", alpha=0.5)
ax.plot(y2)
plt.show()

print('Prediction')
cm = confusion_matrix(y2, y2e)

# Class names listed in label order (index 8 is "ship").
class_names = [
    "airplane", "automobile", "bird", "cat", "deer",
    "dog", "frog", "horse", "ship", "truck",
]
plot_confusion_matrix(cm, classes=class_names)


We can see that in the multi-class classification, the classifier performed better with the data normalized

### Validation
We'll use only the training data to estimate the performance of the classifier and compare it with the score obtained above on the test data. We'll use the normalized data because it performed better.
We could use a K-fold strategy for this, but we decided to simply split the training data once and treat the two parts as if they were the provided train and test sets.

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Hold out a stratified 20% of the standardized training data as a stand-in test set.
X_train, X_test, y_train, y_test = train_test_split(
    X1s, y1, test_size=0.2, stratify=y1)

# Shuffle the new training part ...
idx = rnd.permutation(len(X_train))
X_train, y_train = X_train[idx], y_train[idx]

# ... and sort the held-out part by label, like the real test set.
idx = np.argsort(y_test)
X_test, y_test = X_test[idx], y_test[idx]

sgdn = SGDClassifier()
sgdn.fit(X_train, y_train)
y_test_e = sgdn.predict(X_test)

print('Score:', sgdn.score(X_test, y_test))
print("Number of errors:", np.count_nonzero(y_test != y_test_e))
print(classification_report(y_test, y_test_e))

# Predicted class (dots) drawn over the true class (staircase line).
fig, ax = plt.subplots()
ax.plot(y_test_e, ".", alpha=0.5)
ax.plot(y_test)
plt.show()

print('Prediction')
cm = confusion_matrix(y_test, y_test_e)

# Class names listed in label order (index 8 is "ship").
class_names = [
    "airplane", "automobile", "bird", "cat", "deer",
    "dog", "frog", "horse", "ship", "truck",
]
plot_confusion_matrix(cm, classes=class_names)


## PCA
The images are grayscale and already have a low resolution. We expect PCA to hurt the classification: the images carry little information to begin with, and reducing the dimensionality further should worsen the performance.

In [None]:
from sklearn.decomposition import PCA


### Not normalized data

In [None]:
# Sweep the number of principal components (50, 100, ..., 500) on the raw
# pixels and keep the PCA/classifier pair with the best score.
# NOTE(review): the sweep is scored on the test set (X2, y2), so the reported
# score is an optimistic estimate -- consider selecting on a validation split.
n_components = np.arange(50, 501, 50)
best_score = 0
best_n = 0
best_sgd = None
best_pca = None

for n in n_components:
    pca = PCA(n_components=n).fit(X1)
    X1p = pca.transform(X1)
    X2p = pca.transform(X2)

    sgd = SGDClassifier().fit(X1p, y1)

    # score() already predicts internally; no separate predict() is needed here.
    score = sgd.score(X2p, y2)

    if score > best_score:
        best_score = score
        best_n = n
        best_sgd = sgd
        best_pca = pca

print('Best n_components:', best_n)

# Re-use the winning pair as-is: refitting would give a different model and
# therefore a different score than the one that won the sweep.
sgd = best_sgd
pca = best_pca

X1p = pca.transform(X1)
X2p = pca.transform(X2)

y2e = sgd.predict(X2p)

print('Score:', sgd.score(X2p, y2))
print("Number of errors:", np.sum(y2 != y2e))
print(classification_report(y2, y2e))

# Predicted class (dots) drawn over the true class (staircase line).
plt.figure()
plt.plot(y2e, ".", alpha=0.5)
plt.plot(y2)
plt.show()

print('Prediction')
cm = confusion_matrix(y2, y2e)

plot_confusion_matrix(
    cm,
    classes=[
        "airplane",
        "automobile",
        "bird",
        "cat",
        "deer",
        "dog",
        "frog",
        "horse",
        "ship",
        "truck",
    ],
)


### Normalized data

In [None]:
# Sweep the number of principal components (50, 100, ..., 500) on the
# standardized pixels and keep the PCA/classifier pair with the best score.
# NOTE(review): the sweep is scored on the test set (X2s, y2), so the reported
# score is an optimistic estimate -- consider selecting on a validation split.
n_components = np.arange(50, 501, 50)
best_score = 0
best_n = 0
best_sgd = None
best_pca = None

for n in n_components:
    pca = PCA(n_components=n).fit(X1s)
    X1sp = pca.transform(X1s)
    X2sp = pca.transform(X2s)

    sgdn = SGDClassifier().fit(X1sp, y1)

    # Score the model fitted in THIS iteration. Scoring a model left over from
    # another cell would rank the candidates by an unrelated classifier.
    score = sgdn.score(X2sp, y2)

    if score > best_score:
        best_score = score
        best_n = n
        best_sgd = sgdn
        best_pca = pca

print('Best n_components:', best_n)

# Re-use the winning pair as-is: refitting would give a different model and
# therefore a different score than the one that won the sweep.
sgdn = best_sgd
pca = best_pca

X1sp = pca.transform(X1s)
X2sp = pca.transform(X2s)

y2e = sgdn.predict(X2sp)

print('Score:', sgdn.score(X2sp, y2))
print("Number of errors:", np.sum(y2 != y2e))
print(classification_report(y2, y2e))

# Predicted class (dots) drawn over the true class (staircase line).
plt.figure()
plt.plot(y2e, ".", alpha=0.5)
plt.plot(y2)
plt.show()

print('Prediction')
cm = confusion_matrix(y2, y2e)

plot_confusion_matrix(
    cm,
    classes=[
        "airplane",
        "automobile",
        "bird",
        "cat",
        "deer",
        "dog",
        "frog",
        "horse",
        "ship",
        "truck",
    ],
)


# Example preview

In [None]:
# Show the first training sample in its original and grayscale form, then its label.
for sample in (Xtrain[0], XtrainG[0]):
    plot(sample)
print(ytrain[0])
