# Import libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow.keras.datasets as DT
import numpy.random as rd


In [None]:
import tensorflow as tf


In [None]:
# Evaluate the version string of the imported TensorFlow package; as a bare
# expression at the end of a notebook cell its value is echoed as the output.
tf.__version__


## Import Data

In [None]:
# Load Fashion-MNIST through the Keras datasets module: the loader returns a
# (train, test) pair, each being an (images, labels) tuple.
(X1, y1), (X2, y2) = DT.fashion_mnist.load_data()

# Report shape and dtype of both image arrays.
splitSummary = (
    ("Training set:", X1.shape, X1.dtype),
    ("Test set:    ", X2.shape, X2.dtype),
)
for summaryRow in splitSummary:
    print(*summaryRow)


### View some images

In [None]:
# Positions (in the training set) of the samples of each of the 10 labels.
idxList = [np.argwhere(y1 == label).squeeze() for label in range(10)]

# Build a mosaic with one row per label and the first 20 samples of that label
# per row. Images are inverted (255 - pixel) and every tile / row is followed
# by a 3-pixel white spacer.
tileGap = np.ones((28, 3)) * 255
mosaicRows = []
for classIdx in idxList:
    rowParts = []
    for k in range(20):
        rowParts.append(255 - X1[classIdx[k]])
        rowParts.append(tileGap)
    rowStrip = np.hstack(rowParts)
    mosaicRows.append(rowStrip)
    mosaicRows.append(np.ones((3, rowStrip.shape[1])) * 255)
ImgT = np.vstack(mosaicRows)

plt.figure(figsize=(20, 10))
plt.imshow(ImgT, cmap="gray")
plt.axis("off")


## Shuffle training set and sort test set

In [None]:
# Draw one random ordering of the training rows and apply it to images and
# labels together, so every image keeps its own label.
idx = rd.permutation(len(X1))
X1, y1 = X1[idx], y1[idx]


In [None]:
# Reorder the test set by label value (images and labels with the same
# index array), then plot the sorted labels.
idx = np.argsort(y2)
X2, y2 = X2[idx], y2[idx]
plt.plot(y2)


## Images must be in vector format (make sure that data is in "float" format)

In [None]:
# Flatten every image into a single row vector and convert to float64.
# The number of rows is taken from each array (the training set used to be
# hard-coded to 60000 while the test set already used its own shape), and -1
# lets NumPy infer the row length from the image size.
X1 = X1.reshape((X1.shape[0], -1)).astype(np.float64)
X2 = X2.reshape((X2.shape[0], -1)).astype(np.float64)
print("Training set:", X1.shape, X1.dtype)
print("Test set:    ", X2.shape, X2.dtype)


## Train, predict and check confusion matrix - use SGDClassifier (try others)

In [None]:
from sklearn.linear_model import SGDClassifier

# Default-configured SGD classifier fitted on the flattened training images.
sgd = SGDClassifier()
sgd.fit(X1, y1)

# Predicted labels for the test images.
y2e = sgd.predict(X2)


In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true test labels (y2) against predictions (y2e),
# followed by the count of mismatching predictions.
confMat = confusion_matrix(y2, y2e)
print(confMat)
nWrong = np.sum(y2 != y2e)
print("Total number of erros %d (in %d)" % (nWrong, X2.shape[0]))


## Since the test set is ordered, one can also check errors visually

In [None]:
# Overlay the predicted labels (semi-transparent dots) on the true labels
# (line). y2 was sorted by label in an earlier cell, so it draws as a
# staircase and every dot lying off the staircase is a misclassified sample.
plt.figure(figsize=(15, 5))
plt.plot(y2e, ".", alpha=0.5)
plt.plot(y2)


## Possible problem: different means and variances of each of the 784 data dimensions

In [None]:
# Column-wise statistics of the training matrix: one mean and one standard
# deviation per input dimension.
m = X1.mean(axis=0)
s = X1.std(axis=0)

# Figure 1: means per dimension.
plt.figure(figsize=(15, 5))
plt.plot(m, ".-")
plt.title("Means")
plt.grid(True)

# Figure 2: standard deviations per dimension.
plt.figure(figsize=(15, 5))
plt.plot(s, ".-")
plt.grid(True)
plt.title("Standard Deviations")


## Pre-process data (0 mean and unit variance in each dimension)

In [None]:
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training data only; the same fitted transform
# is then applied to both the training and the test matrices.
sc = StandardScaler()
sc.fit(X1)
X1s, X2s = sc.transform(X1), sc.transform(X2)


In [None]:
# Same classifier as before, now trained and evaluated on the standardized
# data (X1s / X2s).
sgd = SGDClassifier()
sgd.fit(X1s, y1)
y2e = sgd.predict(X2s)

confMat = confusion_matrix(y2, y2e)
print(confMat)
nWrong = np.sum(y2 != y2e)
print("Total number of erros %d (in %d)" % (nWrong, X2.shape[0]))


## Pre-process data with PCA - use different values for total variance
## Repeat the process with the normalized data (StandardScaler) to check if it is better

In [None]:
from sklearn.decomposition import PCA

# PCA fitted on the (unscaled) training matrix with n_components=0.9 and
# whitening enabled; both sets are projected with the same fitted model.
pca = PCA(n_components=0.9, whiten=True)
pca.fit(X1)
X1p, X2p = pca.transform(X1), pca.transform(X2)

# The number of retained components is the column count of the projection.
print("Nº of Principal Components kept: %d" % X1p.shape[1])


In [None]:
# Multi-class SGD classifier trained and evaluated on the PCA projections.
sgd = SGDClassifier()
sgd.fit(X1p, y1)
y2e = sgd.predict(X2p)

confMat = confusion_matrix(y2, y2e)
print(confMat)
nWrong = np.sum(y2 != y2e)
print("Total number of erros %d (in %d)" % (nWrong, X2.shape[0]))


# Binary classification problem (positives are class 3 - dresses)

### Change labels

In [None]:
# Binary targets: 1 where the original label equals 3, 0 everywhere else.
y1b = (y1 == 3).astype(int)
y2b = (y2 == 3).astype(int)


### Train and test

In [None]:
# Binary SGD classifier (label 3 vs. the rest) on the PCA projections.
sgd = SGDClassifier()
sgd.fit(X1p, y1b)
y2e = sgd.predict(X2p)

confMat = confusion_matrix(y2b, y2e)
print(confMat)
nWrong = np.sum(y2b != y2e)
print("Total number of erros %d (in %d)" % (nWrong, y2b.shape[0]))


In [None]:
# Split the predictions by true class, then count the four outcomes.
predOnPos = y2e[y2b == 1]  # predictions for samples whose true label is 1
predOnNeg = y2e[y2b == 0]  # predictions for samples whose true label is 0
nTruePos = np.sum(predOnPos == 1)
nFalseNeg = np.sum(predOnPos == 0)
nFalsePos = np.sum(predOnNeg == 1)
nTrueNeg = np.sum(predOnNeg == 0)

r1 = nTruePos / (nTruePos + nFalseNeg)  # recall
p1 = nTruePos / (nTruePos + nFalsePos)  # precision
f1 = nFalsePos / (nTrueNeg + nFalsePos)  # fp-rate (not the F1 score)
print("Recall: %3f - Precision: %3f - FP-rate: %3f" % (r1, p1, f1))


In [None]:
from sklearn.metrics import classification_report

# Text report for the binary predictions, built first and then printed.
reportText = classification_report(y2b, y2e)
print(reportText)


# Model Calibration (changing decision threshold)

In [None]:
# Raw decision scores of the current `sgd` model for every test sample
# (when the cells run in order this is the binary label-3 classifier).
# Plotted in test-set order, which was sorted by label in an earlier cell.
r2e = sgd.decision_function(X2p)
plt.figure(figsize=(15, 5))
plt.plot(r2e)
plt.grid(True)


In [None]:
# Reproduce the classifier's default decision from the raw scores.
# scikit-learn documents that a score > 0 means the positive class is
# predicted, so the comparison is strict, matching the other threshold cells
# which also use a strict ">".
y2e0 = (r2e > 0) * 1  # default threshold
print(confusion_matrix(y2b, y2e0))
print("Total number of erros %d (in %d)" % (np.sum(y2b != y2e0), y2b.shape[0]))


## Change threshold - reduce the false negatives

In [None]:
# Lower decision threshold: every score above lim2 is labelled positive.
lim2 = -0.191
y2eB = (r2e > lim2).astype(int)

confMat = confusion_matrix(y2b, y2eB)
print(confMat)
nWrong = np.sum(y2b != y2eB)
print("Total number of erros %d (in %d)" % (nWrong, y2b.shape[0]))


In [None]:
# Outcome counts for the lim2 predictions, grouped by true class.
predOnPos = y2eB[y2b == 1]
predOnNeg = y2eB[y2b == 0]
nTruePos = np.sum(predOnPos == 1)
nFalseNeg = np.sum(predOnPos == 0)
nFalsePos = np.sum(predOnNeg == 1)
nTrueNeg = np.sum(predOnNeg == 0)

r2 = nTruePos / (nTruePos + nFalseNeg)  # recall
p2 = nTruePos / (nTruePos + nFalsePos)  # precision
f2 = nFalsePos / (nTrueNeg + nFalsePos)  # false-positive rate
print("Recall: %3f - Precision: %3f - FP-rate: %3f" % (r2, p2, f2))


In [None]:
# Third, even lower decision threshold: scores above lim3 are labelled positive.
lim3 = -1.75
y2eC = (r2e > lim3) * 1
print(confusion_matrix(y2b, y2eC))
# The sample count is read from y2b, like every other binary-task cell
# (it used to be read from the multi-class label array y2).
print("Total number of erros %d (in %d)" % (np.sum(y2b != y2eC), y2b.shape[0]))

# Outcome counts for the lim3 predictions, grouped by true class.
predOnPos = y2eC[y2b == 1]
predOnNeg = y2eC[y2b == 0]
nTruePos = np.sum(predOnPos == 1)
nFalseNeg = np.sum(predOnPos == 0)
nFalsePos = np.sum(predOnNeg == 1)
nTrueNeg = np.sum(predOnNeg == 0)

r3 = nTruePos / (nTruePos + nFalseNeg)  # recall
p3 = nTruePos / (nTruePos + nFalsePos)  # precision
f3 = nFalsePos / (nTrueNeg + nFalsePos)  # false-positive rate
print("Recall: %3f - Precision: %3f - FP-rate: %3f" % (r3, p3, f3))


# ROC curve + Precision-recall curve - 3 thresholds

In [None]:
import sklearn.metrics as skm


In [None]:
# ROC curve of the decision scores: false-positive rate on x, true-positive
# rate on y.
fp, tp, t = skm.roc_curve(y2b, r2e)

plt.figure(figsize=(7, 7))
plt.plot(fp, tp)

# Operating points (fp-rate, recall) of the three thresholds used above:
# red = default, blue = lim2, green = lim3.
for xPt, yPt, fmt in ((f1, r1, "or"), (f2, r2, "ob"), (f3, r3, "og")):
    plt.plot(xPt, yPt, fmt)

plt.axis("scaled")
plt.grid(True)


In [None]:
# Precision / recall pairs over all thresholds of the decision scores.
pre, rec, lim = skm.precision_recall_curve(y2b, r2e)

# Note the axes: precision is drawn on x and recall on y.
plt.figure(figsize=(7, 7))
plt.plot(pre, rec)

# Operating points (precision, recall) of the three thresholds:
# red = default, blue = lim2, green = lim3.
for xPt, yPt, fmt in ((p1, r1, "or"), (p2, r2, "ob"), (p3, r3, "og")):
    plt.plot(xPt, yPt, fmt)

plt.axis("scaled")
plt.grid(True)


## Precision, Recall and F-score for the thresholds used

In [None]:
plt.figure(figsize=(10, 4))

# F-score from precision and recall (harmonic mean), element-wise.
fsc = 2 * pre * rec / (pre + rec)

# Precision (blue), recall (red) and F-score (green) against the threshold.
# The last element of each curve is dropped ([:-1]) so that its length
# matches the threshold array `lim`.
for curve, lineFmt in ((pre, "b"), (rec, "r"), (fsc, "g")):
    plt.plot(lim, curve[:-1], lineFmt)

# Precision and recall measured at the three thresholds used above
# (0 = default, lim2, lim3), in the same colours as in the previous plots.
for thr, pVal, rVal, fmt in (
    (0, p1, r1, "or"),
    (lim2, p2, r2, "ob"),
    (lim3, p3, r3, "og"),
):
    plt.plot(thr, pVal, fmt)
    plt.plot(thr, rVal, fmt)

plt.axis([np.floor(lim.min()), np.ceil(lim.max()), 0, 1])
plt.grid(True)


In [None]:
# Smallest threshold returned by precision_recall_curve, rounded down; this is
# the value used as the lower x-axis bound in the plot above.
np.floor(lim.min())


## Model Comparison (try another classifier)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Second model for comparison: a default random forest on the same PCA
# projections and binary labels.
RandF = RandomForestClassifier()
RandF.fit(X1p, y1b)
y2f = RandF.predict(X2p)

confMat = confusion_matrix(y2b, y2f)
print(confMat)
nWrong = np.sum(y2b != y2f)
print("Total number of erros %d (in %d)" % (nWrong, y2b.shape[0]))


In [None]:
# Probability assigned to class 1 (second column of predict_proba),
# thresholded at 0.5.
classProba = RandF.predict_proba(X2p)
r2f = classProba[:, 1]
yB = (r2f > 0.5).astype(int)  # default threshold

confMat = confusion_matrix(y2b, yB)
print(confMat)
nWrong = np.sum(y2b != yB)
print("Total number of erros %d (in %d)" % (nWrong, y2b.shape[0]))


In [None]:
# ROC curves of both models on the same axes, plus their operating points.
sgdColor = [0.3, 0.6, 0.1]
rfColor = [0.9, 0.3, 0.2]

fp, tp, t = skm.roc_curve(y2b, r2e)  # SGDClassifier decision scores
fp2, tp2, t2 = skm.roc_curve(y2b, r2f)  # RandomForest class-1 probabilities

# Operating point of the random forest at its 0.5 threshold (predictions yB).
predOnPos = yB[y2b == 1]
predOnNeg = yB[y2b == 0]
nTruePos = np.sum(predOnPos == 1)
nFalseNeg = np.sum(predOnPos == 0)
nFalsePos = np.sum(predOnNeg == 1)
nTrueNeg = np.sum(predOnNeg == 0)
r4 = nTruePos / (nTruePos + nFalseNeg)  # recall
p4 = nTruePos / (nTruePos + nFalsePos)  # precision
f4 = nFalsePos / (nTrueNeg + nFalsePos)  # false-positive rate
# Report the random-forest metrics just computed (the SGD lim2 values
# r2/p2/f2 used to be printed here by mistake).
print("Recall: %3f - Precision: %3f - FP-rate: %3f" % (r4, p4, f4))

plt.figure(figsize=(7, 7))
plt.plot(fp, tp, color=sgdColor)
plt.text(0.2, 0.8, "SGDClassifier", fontsize=14, color=sgdColor)

# SGD operating points: red = default threshold, blue = lim2, green = lim3.
plt.plot(f1, r1, "or")
plt.plot(f2, r2, "ob")
plt.plot(f3, r3, "og")

plt.plot(fp2, tp2, color=rfColor)
plt.text(0.2, 0.85, "RandomForest (needs calibration)", fontsize=14, color=rfColor)

plt.plot(f4, r4, ".", ms=15, color=rfColor)
plt.axis("scaled")
plt.grid(True)
