In [None]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml("mnist_784", as_frame = False)

In [None]:
X, y = mnist.data, mnist.target
X
X[0].shape

In [None]:
import matplotlib.pyplot as plt
def plot_digit(image_data):
    """Draw one digit image on the current matplotlib axes.

    Args:
        image_data: array whose values can be reshaped to 28x28
            (e.g. one flattened 784-pixel row of ``X``).

    The image is rendered with the "binary" colormap and the axis
    decorations are switched off. Nothing is returned; call
    ``plt.show()`` afterwards to display the figure.
    """
    plt.imshow(image_data.reshape(28, 28), cmap="binary")
    plt.axis("off")

some_digit = X[0]
plot_digit(some_digit)
plt.show()

In [None]:
y[0]

In [None]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [None]:
plt.figure(figsize = (9, 9))
for idx, image_data in enumerate(X[:100]):
    plt.subplot(10,10, idx + 1)
    plot_digit(image_data)
plt.subplots_adjust(wspace = 0, hspace = 0)
plt.show()


#### TRAINING A BINARY CLASSIFIER

In [None]:
y_train_5 = (y_train == "5")
y_test_5 = (y_test == "5")

In [None]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state = 42)
sgd_clf.fit(X_train, y_train_5)

#### USING CROSS VALIDATION

In [None]:
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train_5, cv = 3, scoring = "accuracy", verbose = 1)

#### Using a dummy classifier

In [None]:
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier()
dummy_clf.fit(X_train, y_train_5)
print(any(dummy_clf.predict(X_train)))


#### Validating dummy classifier

In [None]:
cross_val_score(dummy_clf, X_train, y_train_5, cv = 3, scoring = "accuracy")

##### Counting how many entries are 5
This shows why the dummy classifier's accuracy is so high: most of the images in the dataset are not 5s, so always predicting "not 5" is right most of the time.

In [None]:
# y_train_5 is a boolean mask, so summing it counts the True entries
# (the images labelled "5"); dividing by its length gives their share.
count_of_5 = y_train_5.sum()
print(count_of_5 / len(y_train_5))

#### Implementing a confusion matrix


In [None]:
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv = 3)

In [None]:
# y_train_pred_int = y_train_pred.astype(int)
# y_train_pred

##### Each row in a confusion matrix represents an actual class, whereas each column represents a predicted class.

In [None]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_train_5, y_train_pred)
cm

#### Suppose we have a perfect classifier: its confusion matrix has nonzero values only on its main diagonal

In [None]:
y_train_perfect_predictions = y_train_5
confusion_matrix(y_train_5, y_train_perfect_predictions)


#### precision of classifier
precision = TP / (TP + FP), where TP = True positives & FP = false positives

#### recall / sensitivity / true positive rate
recall = TP /(TP + FN)


#### Implementing precison and recall 

In [None]:
from sklearn.metrics import precision_score, recall_score
print(f" precision score - {precision_score(y_train_5, y_train_pred)}")
print(f"recall score - {recall_score(y_train_5, y_train_pred)}")


When the classifier claims an image is a 5, it is correct only 83.7% of the time (precision), and it detects only 65.1% of the 5s (recall).

#### F1 Score
This is harmonic mean of precision and recall, giving much more weight to low values. F1 will only be high when both precision and recall are high.

In [None]:
from sklearn.metrics import f1_score
print(f"f1 score - {f1_score(y_train_5, y_train_pred)}")

#### Decision function
Looking at how SGDClassifier makes its classification decisions using a decision function, which will help in understanding the precision/recall trade-off.

In [None]:
y_scores = sgd_clf.decision_function([some_digit])
print(f"y score - {y_scores}")

In [None]:
threshold = 0
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred

In [None]:
threshold = 3000
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred

The above code illustrates the trade-off: raising the threshold tends to increase precision, but it decreases recall.

#### Which threshold to use?
First we use the cross_val_predict function to get the decision scores of all instances in the training set.

In [None]:
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv = 3,
                             method = "decision_function")

Now with these scores we use precision_recall_curve, which returns the precisions, the recalls and the thresholds that were used to calculate them.
Basically, it computes precision and recall at every possible threshold, so that we can then pick the threshold that suits us.

In [None]:
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

In [None]:
# Plot precision and recall as functions of the decision threshold.
plt.figure(figsize = (8, 4))
# precisions/recalls are sliced with [:-1] so their lengths match `thresholds`.
plt.plot(thresholds,precisions[:-1], "b--", label = "Precision",  linewidth = 2)
plt.plot(thresholds, recalls[:-1], "g-", label = "Recall", linewidth = 2)
plt.vlines(threshold, 0, 1.0, "k", "dotted", label = "threshold")
# `threshold` is the value set in an earlier cell (3000 when the cells are run
# in order); it is drawn as a vertical reference line.

idx = (thresholds >= threshold).argmax()
# On a boolean array, argmax returns the index of the first True, i.e. the
# first threshold that is >= `threshold`. `idx` is reused by the next plot.
plt.plot(thresholds[idx], precisions[idx],"bo")
plt.plot(thresholds[idx], recalls[idx], "go")
plt.axis([-50000, 50000, 0, 1])
plt.grid()
plt.xlabel("Threshold")
plt.legend(loc = "center right")
plt.show()

Plotting the precision/recall curve

In [None]:
import matplotlib.patches as patches

plt.figure(figsize = (6,5))
plt.plot(recalls, precisions, linewidth = 2, label = "Precision/Recall curve")

plt.plot([recalls[idx], recalls[idx]], [0, precisions[idx]], "k:")
plt.plot([0, recalls[idx]], [precisions[idx], precisions[idx]], "k:")
plt.plot(recalls[idx], precisions[idx], "ko", label = "Point at threshold 3000")
plt.gca().add_patch(patches.FancyArrowPatch(
    (0.79, 0.60), (0.61, 0.78), connectionstyle="arc3, rad=.2",
    arrowstyle="Simple, tail_width = 1.5, head_width = 8, head_length = 10",
    color="#444444"
))
plt.text(0.56, 0.62, "Higher\nthreshold", color = "#333333")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.axis([0,1,0,1])
plt.grid()
plt.legend(loc = "lower left")

plt.show()


Suppose we go for at least 90% precision

In [None]:
idx_for_90_precision = (precisions >= 0.90).argmax()
threshold_for_90_precision = thresholds[idx_for_90_precision]
threshold_for_90_precision

Instead of calling predict, we can also check their y_scores(distances) and compare them to threshold.

In [None]:
y_train_pred_90 = (y_scores >= threshold_for_90_precision)
print(f" Precison score for the above threshold - {precision_score(y_train_5, y_train_pred_90)}")
print(f"Recall score for the above threshold - {recall_score(y_train_5, y_train_pred_90)}")

#### TIP
If someone says, "Let's launch 99% precision", you should ask, "At what recall?"

#### Receiver Operating Characteristic(ROC) Curve
It plots True Positive Rate(recall) versus False Positive Rate(Fall-out)
FPR is the rate of labeling an instance as positive when it actually is negative, as the name implies False Positive.

In [None]:
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

In [None]:
# thresholds
print(f"Note that the thresholds are in decreasig order this time.\n{thresholds}")

In [None]:
idx_for_threshold_at_90 = (thresholds <= threshold_for_90_precision).argmax()
tpr_90, fpr_90 = tpr[idx_for_threshold_at_90], fpr[idx_for_threshold_at_90]

plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, linewidth=2, label="ROC Curve")
plt.plot([0, 1], [0,1], "k:", label="Random classifier's ROC Curve")

plt.plot(fpr_90, tpr_90, "ko", label="Threshold for 90% precision.")

plt.gca().add_patch(patches.FancyArrowPatch(
    (0.2,0.89), (0.07, 0.70),
    connectionstyle="arc3, rad=.4",
    arrowstyle="Simple, tail_width=1.5, head_width=8, head_length=10",
    color="#444444"
))
plt.text(0.12, 0.71, "Higher\nthreshold", color="#333333")
plt.xlabel("False positive rate (Fall-Out)")
plt.ylabel("True Positive Rate (Recall)")
plt.grid()
plt.axis([0,1,0,1])
plt.legend(loc="lower right", fontsize=13)


plt.show()

A good classifier stays as far away from the straight line as possible, toward the top-left corner.

#### AUC: AREA UNDER THE CURVE
Comparing classifiers using the area under the ROC curve.

In [None]:
from sklearn.metrics import roc_auc_score
print(f"AUC Score - {roc_auc_score(y_train_5, y_scores)}")

Creating a RandomForestClassifier to compare to SGDClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)

In [None]:
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, method="predict_proba", cv=3)

In [None]:
y_probas_forest[:2]
# y_train_5[:2]


89% probability is for positive class.
Also these are estimated probabilities. Among the images, the model classified with a probs between 50% & 60%, there are actually about 94% positive images.

In [None]:
idx_50_to_60 = (y_probas_forest[:, 1] > 0.50) & (y_probas_forest[:, 1] < 0.60)
print(f"{y_train_5[idx_50_to_60].sum() / idx_50_to_60.sum():.1%}")

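How do we identify which column of probabilities belongs to the positive class and which to the negative class? The columns of `predict_proba` follow the order of the fitted classifier's `classes_` attribute; for boolean labels that is `[False, True]`, so column 1 holds the probability of the positive class.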

In [None]:
# print(f"Sorted order of classifier - {forest_clf.classes_b
#}")

In [None]:
y_scores_forest = y_probas_forest[:, 1]
precisions_forest, recalls_forest,thresholds_forest = precision_recall_curve(
    y_train_5, y_scores_forest
)


In [None]:
plt.figure(figsize=(6,5))
plt.plot(recalls_forest, precisions_forest, "b-", linewidth=2, 
         label="Random Forest")
plt.plot(recalls, precisions,"--", linewidth=2, label="SGD")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.axis([0,1,0,1])
plt.grid()
plt.legend(loc="lower left")

plt.show()

In [None]:
y_train_pred_forest = y_probas_forest[:, 1] >= 0.5
y_train_pred_forest

In [None]:
print(f"F1 score for random forest classifier - {f1_score(y_train_5, y_train_pred_forest)}")

In [None]:
print(f"AUC Score for random forest classifier - {roc_auc_score(y_train_5, y_scores_forest)}")

In [None]:
print(f"Precision score for Random forest-{precision_score(y_train_5, y_train_pred_forest)}")
print(f"Recall scores for Random forest-{recall_score(y_train_5, y_train_pred_forest)}")

#### MULTICLASS CLASSIFICATION

In [None]:
from sklearn.svm import SVC
svm_clf = SVC(random_state=42)
svm_clf.fit(X_train[:2000], y_train[:2000])

In [None]:
svm_clf.predict([some_digit])

Scikit-learn actually trained 45 classifiers for this and used the One versus One strategy.

In [None]:
some_digit_scores = svm_clf.decision_function([some_digit])
some_digit_scores.round(2)


In [None]:
class_id = some_digit_scores.argmax()
class_id

When a classifier is trained, the target classes are stored in classes_ attribute.

In [None]:
svm_clf.classes_

In [None]:
svm_clf.classes_[class_id]

You can explicitly state which strategy to use.

In [None]:
from sklearn.multiclass import OneVsRestClassifier

ovr_clf = OneVsRestClassifier(SVC(random_state=42))
ovr_clf.fit(X_train[:2000], y_train[:2000])

In [None]:
ovr_clf.predict([some_digit])


In [None]:
print(f"The number of classifiers trained - {len(ovr_clf.estimators_)}")

Training an SGDClassifier is just as easy.

SGDClassifier is actually a linear model trained with stochastic gradient descent.

In [None]:
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digit])

This time Scikit-Learn used the OvR strategy under the hood. Since there are 10 classes, 10 binary classifiers were trained.

In [None]:
sgd_clf.decision_function([some_digit]).round()

The scores here represent the distance between the data point & the hyperplane. As you can see, it is not very confident about any of them.

In [None]:
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

Scaling the dataset increases accuracy, as it puts the features on a similar scale so that they all contribute equally. It also helps training converge faster and reduces the chances of getting stuck in a local minimum.

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype("float64"))
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")

#### Error Analysis

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred)
plt.show()

Now, the number 4444 above can mean two things-

1)There are simply fewer 5's, which makes the number dark.

2)The model actually identified very less 5's than it was shown, basically showing the poor performance on classifying 5.

For these two reasons, it is important to normalize this data.

In [None]:
#NORMALIZED MATRIX
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred, normalize="true", values_format=".0%")

Now, each diagonal cell of the matrix shows the recall for that class. 82% tells us that the model correctly identified only 82% of the 5's it was shown.

##### To make the error stand out more.
To do this, we assign a weight of 1 to the errors and 0 to the correct predictions. We do this by creating a boolean array that is True wherever an error occurred.

In [None]:
sample_weight = (y_train_pred != y_train)
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred, sample_weight=sample_weight, normalize="true", values_format=".0%")

The x% here shows that the out of all the errors the model made(on classifying a particular digit), x% of those total errors were made on the column number. 

In [None]:
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred, sample_weight=sample_weight, normalize="pred", values_format=".0%")

We can see most of the errors were on false 8's meaning that the classifier was not able to properly understand the digit 8.

We can solve this by collecting more data(adding the images that look like an 8 but are not), preprocessing them to make some patterns or writing some algorithm.

Analyzing individual errors can also be a good way. For ex, lets plot 3 and 5 in a confusion matrix style.

In [None]:
cl_a, cl_b = "3", "5"
# Split the training images into the four cells of a 2x2 confusion matrix for
# classes cl_a and cl_b: first letter = true class, second = predicted class.
X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]
X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]
X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]
X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]

size = 5   # each quadrant shows up to size x size images
pad = 0.2  # gap between quadrants, measured in image widths
plt.figure(figsize=(size, size))

# Each entry is (images, (column, row)). Columns are predicted labels
# (0 -> cl_a, 1 -> cl_b); rows are true labels with row 0 at the bottom
# (0 -> cl_b, 1 -> cl_a), matching the tick labels set below.
# The top-right quadrant must be X_ab (true cl_a, predicted cl_b).
quadrants = [(X_ba, (0, 0)), (X_bb, (1, 0)), (X_aa, (0, 1)), (X_ab, (1, 1))]
for quadrant_images, (label_col, label_row) in quadrants:
    # Dedicated loop names: the inner loop must not rebind the outer loop
    # variable, nor the notebook-level names `y` (MNIST targets) and `idx`
    # (index used by the precision/recall plots).
    for cell_idx, image_data in enumerate(quadrant_images[:size * size]):
        x_pos = cell_idx % size + (label_col * (size + pad))
        y_pos = cell_idx // size + (label_row * (size + pad))
        plt.imshow(image_data.reshape(28, 28), cmap="binary",
                   extent=(x_pos, x_pos + 1, y_pos, y_pos + 1))

plt.xticks([size / 2, size + pad + size / 2], [str(cl_a), str(cl_b)])
plt.yticks([size / 2, size + pad + size / 2], [str(cl_b), str(cl_a)])
# Dotted lines separating the four quadrants.
plt.plot([size + pad / 2, size + pad / 2 ], [0 , 2 * size + pad ], "k:")
plt.plot([0, 2 * size + pad], [size + pad / 2, size + pad / 2], "k:")
plt.axis([0, 2 * size + pad, 0, 2 * size + pad])
plt.xlabel("Predicted label")
plt.ylabel("True Label")

plt.show()


#### MULTILABEL CLASSIFICATION
This refers to outputting multiple labels for each instance, e.g. classifying whether a digit is large (7 or more) AND whether it is odd.

In [None]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

y_train_large = (y_train >= "7")
y_train_odd = (y_train.astype(np.int8) % 2 == 1)
y_multilabel = np.c_[y_train_large, y_train_odd]

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)


In [None]:
knn_clf.predict([some_digit])

To evaluate how the multilabel classifier performs, one approach is to calculate the F1 score for each label and compute the average.

In [None]:
y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)


In [None]:
# Macro-averaged F1 over the two labels. The keyword value uses single quotes:
# reusing the f-string's own double quotes inside the braces is a SyntaxError
# on Python versions before 3.12.
print(f"The f1 score is {round(f1_score(y_multilabel, y_train_knn_pred, average='macro'), 4)}")
print("By taking average as macro,we are giving equal weight to both the labels.")

In [None]:
# Support-weighted F1 over the two labels. The keyword value uses single
# quotes: reusing the f-string's own double quotes inside the braces is a
# SyntaxError on Python versions before 3.12.
print("Now taking, aveerage as weighted to give more importance to the labels which are more.")
print(f"The weighted f1 score is {round(f1_score(y_multilabel, y_train_knn_pred, average='weighted'), 4)}")

If we want to use a model that does not support multilabel classification, such as SVC, we can train one model per label (one SVC for the odd label and another for the large label), and then chain the outputs of both.

In [None]:
from sklearn.multioutput import ClassifierChain

chain_clf = ClassifierChain(SVC(), cv = 3, random_state=42)
chain_clf.fit(X_train[:2000], y_multilabel[:2000])


In [None]:
chain_clf.predict([some_digit])

#### Multioutput Classification
A generalization of Multilabel Classification, but here each label can have multiple classes(i.e., it can have more than two values.)

For ex - A system that removes noise from images. The output is a clean image(an array of pixels and as each pixel can have values between 0 and 255).

In [None]:
np.random.seed(42)
noise = np.random.randint(0, 100, (len(X_train), 784))
# X_train.shape
X_train_mod = X_train + noise
noise = np.random.randint(0, 100, (len(X_test), 784))
X_test_mod = X_test + noise
y_train_mod = X_train
y_test_mod = X_test


In [None]:
plt.subplot(121); plot_digit(X_test_mod[0])
plt.subplot(122); plot_digit(y_test_mod[0])

In [None]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod[0]])
plot_digit(clean_digit)
plt.show()

### Exercises

##### 1. An MNIST Classifier with over 97% accuracy

In [None]:
X_train.shape
# y_train.shape

In [None]:
knn_clf_mnist = KNeighborsClassifier()
knn_clf_mnist.fit(X_train, y_train)

In [None]:
initial_accuracy = knn_clf_mnist.score(X_test, y_test)

In [None]:
initial_accuracy

In [None]:
from sklearn.model_selection import GridSearchCV

# A single dict defines ONE grid, so every n_neighbors value is combined with
# every weights option (5 x 2 = 10 candidates). Putting the two keys in
# separate dicts would search them as two independent grids and never try
# e.g. n_neighbors=4 together with weights="distance".
param_grid = [
    {"n_neighbors": [3, 4, 5, 6, 7], "weights": ["uniform", "distance"]}
]

grid_search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid, cv = 5)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
print(f"The accuracy score is - {grid_search.best_score_}")
print("Exercise done. More than 97% accuracy achieved.")

In [None]:
grid_search.best_estimator_

In [None]:
alternate_knn = KNeighborsClassifier(weights="distance", n_neighbors=4)
alternate_knn.fit(X_train, y_train)
alternate_knn.score(X_test, y_test)

##### 2 Data Augmentation

In [None]:
# X_train[2][:28]
X_train_copy = X_train.copy()
y_train_copy = y_train.copy()

In [None]:
plot_digit(X_train[2])
plt.show()

In [None]:
digit = X_train_copy[0]
# Rotate the 28x28 image one pixel to the left: every column takes the value
# of its right-hand neighbour and the original column 0 wraps around to
# column 27. np.roll returns a new array, so `digit` itself is left untouched.
# (A hand-written loop that saves column 0 with a plain slice keeps only a
# *view*; the first assignment overwrites it, so the wrong column wraps.)
digit_matrix = np.roll(digit.reshape(28, 28), -1, axis=1)


In [None]:
plot_digit(digit)

In [None]:
plot_digit(digit_matrix)


In [None]:
print((digit_matrix == digit.reshape(28, 28)).sum())
np.array_equal(digit.reshape(28,28), digit_matrix)

In [None]:
from scipy.ndimage import shift
def shift_image(image, dx, dy):
    """Return a copy of a flattened 28x28 image shifted by (dx, dy) pixels.

    Args:
        image: array that can be reshaped to 28x28.
        dx: horizontal shift in pixels (positive moves right, negative left).
        dy: vertical shift in pixels (positive moves down, negative up).

    Returns:
        The shifted image flattened back to one dimension; pixels that
        move in from outside the frame are filled with 0.
    """
    grid = image.reshape(28, 28)
    # scipy expects the offsets in axis order: rows (dy) first, then columns (dx).
    moved = shift(grid, [dy, dx], mode="constant", cval=0)
    return moved.reshape(-1)

shifted_image_down = shift_image(digit, 0, 5)
plot_digit(shifted_image_down)


In [None]:
plt.figure(figsize=(12, 3))
plt.subplot(131)
plt.title("Original")
plt.imshow(digit.reshape(28, 28), interpolation="nearest", cmap="Greys")

plt.subplot(132)
plt.title("Shifted down")
plt.imshow(shift_image(digit, 0, 5).reshape(28,28), interpolation="nearest", cmap="Greys")

plt.subplot(133)
plt.title("Shifted left")
plt.imshow(shift_image(digit, -5, 0).reshape(28,28), interpolation="nearest", cmap="Greys")

plt.show()

We use Python lists for appending data because NumPy creates a whole new array in memory on every insert/append.

In [None]:
# Start from plain Python lists holding the original images and labels;
# list appends are cheap, and the arrays are built once at the end.
X_train_augmented = list(X_train)
y_train_augmented = list(y_train)

# Add four shifted copies of every training image (one pixel right, left,
# down and up); each copy keeps the label of the image it came from.
for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    X_train_augmented.extend(shift_image(image, dx, dy) for image in X_train)
    y_train_augmented.extend(y_train)

X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)


In [None]:
# Shuffle images and labels with ONE shared permutation so that every image
# keeps its own label. Shuffling the two arrays with separate calls would
# reorder them independently and break the image/label pairing.
shuffle_idx = np.random.permutation(len(X_train_augmented))
X_train_augmented = X_train_augmented[shuffle_idx]
y_train_augmented = y_train_augmented[shuffle_idx]

In [None]:
y_train_augmented.shape

In [None]:
knn_clf_best = KNeighborsClassifier()
grid_search_best = GridSearchCV(knn_clf_best, param_grid=param_grid, cv=5)
grid_search_best.fit(X_train_augmented, y_train_augmented)


In [None]:
author_knn = KNeighborsClassifier(n_neighbors=4, weights="distance")
author_knn.fit(X_train_augmented, y_train_augmented)
author_score = author_knn.score(X_test, y_test)
print(f"The score of author's knn is {author_score}.")

In [None]:
# GridSearchCV fits clones of the estimator it is given, so `knn_clf_best`
# itself is never fitted and cannot be scored. The refitted winner of the
# search is available as `best_estimator_` (refit=True is the default).
augmented_accuracy = grid_search_best.best_estimator_.score(X_test, y_test)
augmented_accuracy