In [1]:
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, clone
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
# Download MNIST (70,000 28x28 digit images) from OpenML.
# as_frame=True is requested explicitly because later cells index the data with
# pandas accessors (X.loc, .values); relying on the implicit default is fragile.
mnist = fetch_openml('mnist_784', version=1, as_frame=True)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [3]:
# Split the bunch into the feature table and the label column.
X, y = mnist["data"], mnist["target"]
X.shape # 70k images, 784 features (28x28 pixels)

(70000, 784)

In [None]:
# Take the row labelled 0 as the running example and display it as an image.
some_digit = X.loc[0]
some_digit_image = some_digit.values.reshape(28, 28) # turn the flat 784-value row into a 28x28 image
plt.imshow(some_digit_image, cmap="binary")
plt.axis("off")
plt.show()

In [None]:
# Label of the example digit, plus its Python type (the next cell converts labels from string to int)
print(y[0])
print(type(y[0]))

In [4]:
# Convert the labels to unsigned 8-bit integers so they can be compared with numbers (e.g. y == 5)
y = y.astype(np.uint8)

In [5]:
# Train/test split: the first 60,000 rows are used for training, the remaining rows for testing
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [None]:
# Binary "5 detector": the targets are True where the label is 5 and False otherwise.
y_train_5 = y_train == 5
y_test_5 = y_test == 5

# Linear classifier trained with stochastic gradient descent; the seed fixes its shuffling.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)

In [None]:
sgd_clf.predict([some_digit]) # predict the digit 5 (True is 5, False is not 5)

In [None]:
# 3-fold cross-validated accuracy of the binary 5-detector on the training set
score = cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")
print(score)

In [None]:
# Hand-rolled equivalent of cross_val_score: stratified 3-fold split, a fresh
# clone of the classifier per fold, accuracy printed for each held-out fold.
skfolds = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

for train_index, test_index in skfolds.split(X_train, y_train_5):
    clone_clf = clone(sgd_clf)
    # split() yields positional indices, so rows are selected with .iloc;
    # label-based .loc only works when the index happens to be 0..n-1.
    X_train_folds = X_train.iloc[train_index]
    y_train_folds = y_train_5.iloc[train_index]
    X_test_fold = X_train.iloc[test_index]
    y_test_fold = y_train_5.iloc[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = (y_pred == y_test_fold).sum()
    print(n_correct / len(y_pred))

In [None]:
# Dummy classifier
class Never5Classifier(BaseEstimator):
    """Baseline classifier that predicts "not 5" (False) for every sample."""

    def fit(self, X, y=None):
        """Nothing to learn; return self so the estimator can be chained."""
        return self

    def predict(self, X):
        """Return a 1-D boolean array of False, one entry per row of X."""
        return np.zeros(len(X), dtype=bool)
    
# Score the always-False baseline with the same 3-fold accuracy used for the SGD model
never_5_clf = Never5Classifier()

score = cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy")
print(score) # [0.91125 0.90855 0.90915]

# This means only about 10% of the images are 5s, so always guessing "not 5" is right about 90% of the time. Accuracy alone is therefore a poor metric for skewed classes.

In [None]:
# Confusion matrix built from out-of-fold predictions (rows = actual class, columns = predicted class)
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
# NOTE: here the name `confusion_matrix` is bound to a pandas crosstab (a DataFrame), not to a function
confusion_matrix = pd.crosstab(y_train_5, y_train_pred, rownames=['Actual'], colnames=['Predicted'])
print(confusion_matrix)
# True Negative: 53892, False Positive: 687
# False Negative: 1891, True Positive: 3530

# Metrics derived from the counts listed above (all values are ratios between 0 and 1):
# accuracy = (TN + TP) / (TN + FP + FN + TP) = (53892 + 3530) / (53892 + 687 + 1891 + 3530) = 0.957
# precision = TP / (TP + FP) = 3530 / (3530 + 687) = 0.84
# recall = TP / (TP + FN) = 3530 / (3530 + 1891) = 0.65
# f1 = 2 * (precision * recall) / (precision + recall) = 2 * (0.84 * 0.65) / (0.84 + 0.65) = 0.73

In [None]:
# Precision/recall trade-off
# The classifier's own threshold cannot be set directly, so take the raw
# decision_function score and compare it with a threshold of our choosing.
y_scores = sgd_clf.decision_function([some_digit])
print(y_scores)

threshold = 0 # default threshold
y_some_digit_pred = (y_scores > threshold) # True
print(y_some_digit_pred)

# Raise the threshold: the same score no longer counts as a positive
threshold = 8000

y_some_digit_pred = (y_scores > threshold) # False
print(y_some_digit_pred)


In [None]:
# Out-of-fold decision scores for every training image (this rebinds y_scores from the
# single-digit score above), then precision and recall for each candidate threshold
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    The last element of `precisions` and `recalls` is dropped before plotting
    (presumably because those arrays are one element longer than `thresholds`).
    Draws on the current matplotlib axes; returns nothing.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_format, curve_label in curves:
        plt.plot(thresholds, values[:-1], line_format, label=curve_label)
    plt.xlabel("Threshold")
    plt.legend(loc="center left")
    plt.ylim([0, 1])
    
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

In [None]:

# Index of the first point on the curve where precision reaches 90%.
# `precisions` has one more entry than `thresholds`, so clamp the index to stay in bounds.
idx_90_precision = min(np.argmax(precisions >= 0.90), len(thresholds) - 1)
threshold_90_precision = thresholds[idx_90_precision]
print('Threshold for 90 % precision is: ', threshold_90_precision)

# Predictions obtained by applying that threshold to the out-of-fold scores
y_train_pred_90 = (y_scores >= threshold_90_precision)

pre_score = precision_score(y_train_5, y_train_pred_90)
rec_score = recall_score(y_train_5, y_train_pred_90)

print(pre_score) # ~0.90: precision increased
print(rec_score) # ~0.48: recall decreased

In [None]:
# ROC curve: true positive rate (recall) plotted against false positive rate.
# The thresholds get their own name so the precision/recall `thresholds`
# computed earlier are not overwritten (which would break re-running that cell).
fpr, tpr, roc_thresholds = roc_curve(y_train_5, y_scores)

def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve on the current axes, with the diagonal as a reference line.

    fpr, tpr: false/true positive rates to plot against each other.
    label: optional legend label for the curve.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    # dashed black diagonal from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], 'k--')
    # clamp both axes to the unit square
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    
plot_roc_curve(fpr, tpr)
# Trade-off: the higher the recall (true positive rate), the more false positives the model produces

In [None]:
# ROC AUC score

# AUC score is the area under the ROC curve.
# The result is stored under its own name: assigning it to `roc_auc_score`
# would shadow the imported function and break every later call to it.
sgd_roc_auc = roc_auc_score(y_train_5, y_scores)
print(sgd_roc_auc) # 0.96
# A good model has a ROC AUC score close to 1; a purely random one scores about 0.5

In [None]:
# Random Forest Classifier model for comparison
forest_clf = RandomForestClassifier(random_state=42)
# Request out-of-fold class probabilities via predict_proba
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3, method="predict_proba")

y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)

# Overlay both ROC curves: SGD as a dotted blue line, the forest via the helper
plt.plot(fpr, tpr, "b:", label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="lower right")
plt.show()


In [None]:
# ROC AUC score of the random forest, for comparison with the SGD score above
# (requires `roc_auc_score` to still be the imported function, not a previously stored result)
roc_auc_score(y_train_5, y_scores_forest) # 0.99

In [None]:
# Multiclass classification

# Some classifiers handle more than two classes natively; strictly binary classifiers can still be
# used for multiclass problems through a one-vs-rest (OvR) or one-vs-one (OvO) strategy.
# NOTE(review): which estimators are natively multiclass varies — verify per estimator against the scikit-learn docs

# SVC scales poorly to large datasets, so this fit on the full training set can be slow
# (SGDClassifier is trained on the same task further below)

svm_clf = SVC()
svm_clf.fit(X_train, y_train) # y_train, not y_train_5 (so it's multiclass classification)

In [None]:
prediction = svm_clf.predict([some_digit]) # predict the digit 5
# decision_function returns the per-class scores for the image; argmax picks the highest-scoring class
# NOTE(review): scikit-learn documents SVC's multiclass handling as one-vs-one internally
# (45 pairwise classifiers for 10 classes), not 10 binary classifiers — confirm
digit_scores = svm_clf.decision_function([some_digit])
print(digit_scores)
print(np.argmax(digit_scores)) # 5
print(prediction)
len(svm_clf.classes_) # 10 classes

In [None]:
# Using the OvR strategy explicitly: wrap SVC in OneVsRestClassifier
ovr_clf = OneVsRestClassifier(SVC())

ovr_clf.fit(X_train, y_train)

prediction = ovr_clf.predict([some_digit])  
print(prediction) # 5

# number of fitted underlying estimators
len(ovr_clf.estimators_) # 10

In [None]:
# Training SGDClassifier on the full 10-class labels
# (this refits the same sgd_clf object that was previously trained as the binary 5-detector)
sgd_clf.fit(X_train, y_train)

prediction = sgd_clf.predict([some_digit])
print(prediction) # predicted - 3

In [None]:
# Per-class decision scores for the sample digit. A bare expression that is not the
# last line of a cell is never displayed, so the scores are printed explicitly.
print(sgd_clf.decision_function([some_digit])) # 3

# 3-fold cross-validated accuracy of the multiclass SGD classifier on raw pixels
score = cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")
print(score) # 
print(score.mean()) #

In [None]:
# To improve the accuracy, we can scale the inputs
scaler = StandardScaler()

# Cast to float64 before standardising, then re-run the same 3-fold accuracy check
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
score = cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")
print(score) # 
print(score.mean()) #

In [None]:
# Error Analysis
# Imported here under an alias: the notebook never imports scikit-learn's confusion_matrix,
# and the plain name `confusion_matrix` is bound to a pandas crosstab in an earlier cell.
from sklearn.metrics import confusion_matrix as sk_confusion_matrix

# Out-of-fold multiclass predictions on the scaled inputs
y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
conf_mx = sk_confusion_matrix(y_train, y_train_pred)

# Rows = actual class, columns = predicted class; brighter cells hold more samples
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()

In [None]:
# Divide each row by its total so cells become per-class rates rather than raw counts
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums

# Zero the diagonal (correct predictions) so only the errors remain visible
np.fill_diagonal(norm_conf_mx, 0) # fill the diagonal with 0
plt.matshow(norm_conf_mx, cmap=plt.cm.gray) # plot the matrix
plt.show()

In [None]:
# Draw the digit 3 and 5

def plot_digits(instances, images_per_row=10, **options):
    """Draw a grid of 28x28 digit images on the current matplotlib axes.

    instances: the images to draw, as a DataFrame, a 2-D array, or a sequence
        of 784-value rows (a single flat 784-value image is also accepted).
    images_per_row: maximum number of images per grid row.
    **options: extra keyword arguments forwarded to plt.imshow.

    Does nothing when `instances` is empty.
    """
    size = 28
    # Convert to a plain (n, 784) array first: iterating a DataFrame yields its
    # column labels rather than its rows, and ndarray rows have no `.values`.
    instances = np.asarray(instances).reshape(-1, size * size)
    if len(instances) == 0:
        return
    images_per_row = min(len(instances), images_per_row)
    images = [instance.reshape(size, size) for instance in instances]
    n_rows = (len(instances) - 1) // images_per_row + 1
    # A single blank image pads the last row to the full grid width.
    n_empty = n_rows * images_per_row - len(instances)
    images.append(np.zeros((size, size * n_empty)))
    row_images = [
        np.concatenate(images[row * images_per_row : (row + 1) * images_per_row], axis=1)
        for row in range(n_rows)
    ]
    image = np.concatenate(row_images, axis=0)
    plt.imshow(image, cmap = 'binary', **options)
    plt.axis("off")
    
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure as `<fig_id>.<fig_extension>`.

    tight_layout: when true, call plt.tight_layout() before saving.
    fig_extension: used both as the file suffix and as the savefig format.
    resolution: passed to savefig as dpi.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    path = os.path.join(file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Compare two classes: split their images by (actual class, predicted class)
cl_a, cl_b = 3, 5
X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]  # actual a, predicted a
X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]  # actual a, predicted b
X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]  # actual b, predicted a
X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]  # actual b, predicted b

# 2x2 panel showing up to 25 examples from each group
plt.figure(figsize=(8,8))
plt.subplot(221); plot_digits(X_aa[:25], images_per_row=5)
plt.subplot(222); plot_digits(X_ab[:25], images_per_row=5)
plt.subplot(223); plot_digits(X_ba[:25], images_per_row=5)
plt.subplot(224); plot_digits(X_bb[:25], images_per_row=5)
# save_fig is the helper defined in this notebook; matplotlib.pyplot has no save_fig attribute
save_fig("error_analysis_digits_plot")
plt.show()

In [None]:
# Multilabel classification: each image gets two boolean labels
y_train_large = (y_train >= 7) # is the digit large?
y_train_odd = (y_train % 2 == 1) # is the digit odd?

# Stack the two label vectors as columns: one row per image, [large, odd]
y_multilabel = np.c_[y_train_large, y_train_odd]

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)

# NOTE(review): other cells describe some_digit as a 5, so [False, True] reads as "not large, odd" — confirm
knn_clf.predict([some_digit]) # [False, True]: not large (< 7) and odd

In [None]:
# Evaluate the model with out-of-fold predictions
y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)

# Both scores are printed: a bare expression that is not the last line of a cell is not displayed.
# average="macro" averages the per-label F1 scores with equal weight
print(f1_score(y_multilabel, y_train_knn_pred, average="macro"))
# average="weighted" weights each label's F1 by its support (number of positive samples)
print(f1_score(y_multilabel, y_train_knn_pred, average="weighted"))

In [None]:
# Multioutput classification

def plot_digit(data):
    """Show a single flat 784-value digit as a 28x28 image on the current axes.

    data: the pixel values, as a NumPy array, a pandas Series, or any sequence.
    """
    # np.asarray lets pandas rows through as well; a Series has no reshape method.
    image = np.asarray(data).reshape(28, 28)
    plt.imshow(image, cmap = mpl.cm.binary,
               interpolation="nearest")
    plt.axis("off")

# Denoising task: the inputs are noisy images, the targets are the original images.
# A seeded generator keeps the noise (and therefore the figures) reproducible.
rng = np.random.default_rng(42)
noise = rng.integers(0, 100, (len(X_train), 784)) # generate random noise
X_train_mod = X_train + noise # add noise to the training set
noise = rng.integers(0, 100, (len(X_test), 784)) # generate random noise
X_test_mod = X_test + noise # add noise to the test set
y_train_mod = X_train # target is the original image
y_test_mod = X_test # target is the original image

# X_test_mod / y_test_mod are DataFrames, so rows are selected by position with
# .iloc (plain [0] would be a column lookup) and passed on as NumPy arrays.
some_index = 0
plt.subplot(121); plot_digit(X_test_mod.iloc[some_index].values)
plt.subplot(122); plot_digit(y_test_mod.iloc[some_index].values)
save_fig("noisy_digit_example_plot")
plt.show()

knn_clf.fit(X_train_mod, y_train_mod)
# Keep the query as a one-row DataFrame so it has the same layout as the training data
clean_digit = knn_clf.predict(X_test_mod.iloc[[some_index]]) # clean the image
plot_digit(clean_digit[0])

#### Extra material

1. Improve the KNN model to 97% accuracy (search for the best hyperparameters: `weights` and `n_neighbors`)
2. Create a shift function that moves an image by one pixel, and use it to augment the training set (4 shifted copies of each image: left, right, up, down)
3. Complete the Titanic project on Kaggle
4. Create a spam classifier (Apache SpamAssassin public datasets): https://homl.info/spamassassin

In [None]:
# Task 1 - Improve the KNN model to 97% accuracy

from sklearn.model_selection import RandomizedSearchCV

# Search space: 1..30 neighbours, with uniform or distance-based weighting
param_dist = {
    'n_neighbors': range(1, 31),
    'weights': ['uniform', 'distance']
}

knn_clf = KNeighborsClassifier()

# Randomized search: only n_iter=3 parameter combinations are sampled, each scored with 3-fold CV
rnd_search = RandomizedSearchCV(knn_clf, param_dist, n_iter=3, cv=3, scoring='accuracy', random_state=42, verbose=2, n_jobs=-1)
rnd_search.fit(X_train, y_train)

print(rnd_search.best_params_)
print(rnd_search.best_score_)

In [None]:
# Task 2 - Create shift-function to shift the image
# TODO: Run on Desktop

import numpy as np
from scipy.ndimage import shift

def shift_image(image, dx, dy):
    """Return a flat copy of a 28x28 image moved by (dx, dy) pixels.

    image: array holding 784 pixel values.
    dx, dy: horizontal and vertical offsets; pixels shifted in from outside are 0.
    """
    grid = image.reshape((28, 28))
    # scipy's shift takes the offsets in axis order: rows (dy) first, then columns (dx)
    offsets = [dy, dx]
    moved = shift(grid, offsets, cval=0, mode="constant")
    return moved.reshape([-1])

# Convert X_train and y_train to NumPy arrays
X_train_np = X_train.values
y_train_np = y_train.values

# One shifted copy per direction: right, left, down, up
shifts = ((1, 0), (-1, 0), (0, 1), (0, -1))
num_shifts = len(shifts)
n_original = len(X_train_np)

# Preallocate arrays for augmented data, keeping the source dtypes so the labels
# stay integers and the pixel array is not needlessly widened to float64
X_train_augmented = np.zeros((n_original * (num_shifts + 1), 784), dtype=X_train_np.dtype)
y_train_augmented = np.zeros(n_original * (num_shifts + 1), dtype=y_train_np.dtype)

# Copy original data
X_train_augmented[:n_original] = X_train_np
y_train_augmented[:n_original] = y_train_np

# Apply shifts: each direction fills the next block of n_original rows
shift_idx = n_original
for dx, dy in shifts:
    for i in range(n_original):
        X_train_augmented[shift_idx] = shift_image(X_train_np[i], dx, dy)
        y_train_augmented[shift_idx] = y_train_np[i]
        shift_idx += 1

# Shuffle the augmented dataset (seeded so re-runs give the same order)
shuffle_idx = np.random.default_rng(42).permutation(len(X_train_augmented))
X_train_augmented = X_train_augmented[shuffle_idx]
y_train_augmented = y_train_augmented[shuffle_idx]

In [None]:
# Train KNN (distance-weighted, 3 neighbours) on the augmented set and
# report the fraction of test images it labels correctly.
knn_clf = KNeighborsClassifier(weights='distance', n_neighbors=3)
knn_clf.fit(X_train_augmented, y_train_augmented)

y_pred = knn_clf.predict(X_test)
is_correct = (y_pred == y_test)
accuracy = np.mean(is_correct)

print(accuracy)

In [None]:
# 3. Titanic challenge -> Kaggle/Challenges/Titanic Disaster.ipynb

In [None]:
# 4. Create a spam classifier

# Folders holding the raw messages, relative to the notebook's working directory
easy_ham_folder = 'dataset/easy_ham'
spam_folder = 'dataset/spam'

# Sorted file names; names of 20 characters or fewer are skipped
# (presumably non-message files such as an index — TODO confirm against the dataset)
ham_filenames = [name for name in sorted(os.listdir(easy_ham_folder)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(spam_folder)) if len(name) > 20]

len(ham_filenames), len(spam_filenames)

In [None]:
import email
import email.policy

def load_email(filename, spam_path):
    """Parse one e-mail file into an email.message.EmailMessage.

    filename: name of the file inside the folder.
    spam_path: folder containing the file (despite the name, used for both ham and spam).
    """
    # Import the submodules explicitly: `import email` alone does not guarantee
    # that email.parser has been loaded.
    import email.parser
    import email.policy

    with open(os.path.join(spam_path, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [None]:
ham_emails = [load_email(name, easy_ham_folder) for name in ham_filenames]
spam_emails = [load_email(name, spam_folder) for name in spam_filenames]

In [None]:
print(ham_emails[1].get_content().strip())

In [None]:
def get_email_structure(email):
    """Describe a message's MIME layout as a string.

    A plain string is returned unchanged. A message whose payload is a list of
    parts gives "multipart(<part>, <part>, ...)", built recursively; any other
    message gives its content type.
    """
    if isinstance(email, str):
        return email
    parts = email.get_payload()
    if not isinstance(parts, list):
        return email.get_content_type()
    described_parts = ", ".join(get_email_structure(part) for part in parts)
    return "multipart({})".format(described_parts)

In [None]:
from collections import Counter

def structures_counter(emails):
    """Count how many of the given e-mails have each MIME structure.

    Returns a Counter keyed by the string from get_email_structure.
    """
    return Counter(get_email_structure(message) for message in emails)

In [None]:
structures_counter(ham_emails).most_common()

In [None]:
structures_counter(spam_emails).most_common()

In [None]:
for header, value in spam_emails[0].items():
    print(header,":",value)

In [None]:
spam_emails[0]["Subject"]

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# NOTE: from here on X, y, X_train, X_test, y_train and y_test refer to the e-mail data;
# the MNIST variables of the same names defined earlier in the notebook are overwritten.
X = np.array(ham_emails + spam_emails, dtype=object)
# Labels: 0 = ham, 1 = spam (in the same order as the concatenation above)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

# Hold out 20% of the messages for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
import re
from html import unescape

def html_to_plain_text(html):
    """Convert an HTML string to rough plain text using regular expressions.

    Steps, in order: drop the <head> section, replace each opening <a ...> tag
    with the word HYPERLINK, strip all remaining tags, collapse runs of
    blank lines into one newline, and finally decode HTML entities.
    """
    # Raw strings throughout: '\s' in a normal string literal is an invalid escape sequence.
    text = re.sub(r'<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub(r'<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub(r'<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

In [None]:
# Spam messages in the training set whose whole structure is a single text/html part
html_spam_emails = [email for email in X_train[y_train==1] if get_email_structure(email) == "text/html"]
# Pick one as a sample (index 7 assumes at least eight such messages exist)
sample_html_spam = html_spam_emails[7]
# Preview the first 1000 characters of its raw HTML
print(sample_html_spam.get_content().strip()[:1000], "...")

In [None]:
def email_to_text(email):
    """Extract the text of a message, preferring plain text over HTML.

    Walks the message parts and returns the content of the first text/plain
    part. If there is none, the last text/html part is converted with
    html_to_plain_text. Returns None when the message has no text part
    (or only an empty HTML one).
    """
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except Exception:
            # Best effort (e.g. encoding issues): fall back to the raw payload.
            # `Exception` rather than a bare except, so Ctrl-C still interrupts.
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        html = content
    if html:
        return html_to_plain_text(html)
    return None

In [None]:
print(email_to_text(sample_html_spam)[:100], "...")

In [None]:
# Optional dependency: set up a Porter stemmer if NLTK is installed.
# On ImportError `stemmer` is set to None, which later code checks before stemming.
try:
    import nltk

    stemmer = nltk.PorterStemmer()
    # Quick demonstration of what stemming does to related words
    for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
        print(word, "=>", stemmer.stem(word))
except ImportError:
    print("Error: stemming requires the NLTK module.")
    stemmer = None

In [None]:
# Optional dependency: set up a URL extractor if urlextract is installed.
# On ImportError `url_extractor` is set to None, which later code checks before replacing URLs.
try:
    import urlextract # may require an Internet connection to download root domain names
    
    url_extractor = urlextract.URLExtract()
    # Quick demonstration on a sentence containing two URLs
    print(url_extractor.find_urls("Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"))
except ImportError:
    print("Error: replacing URLs requires the urlextract module.")
    url_extractor = None

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    """Turn e-mail messages into per-message word counts.

    Each message is converted to text with email_to_text, optionally
    normalised, and counted into a collections.Counter. The boolean
    constructor flags switch the individual normalisation steps on or off.
    `strip_headers` is stored but not read by transform.
    URL replacement and stemming depend on the notebook-level `url_extractor`
    and `stemmer` objects and are skipped when those are None.
    """

    def __init__(self, strip_headers=True,
                 lower_case=True,
                 remove_punctuation=True,
                 replace_urls=True,
                 replace_numbers=True,
                 stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming

    def fit(self, X, y=None):
        """Stateless transformer: nothing to fit."""
        return self

    def _count_words(self, message):
        """Return the Counter of (normalised) words for a single message."""
        text = email_to_text(message) or ""
        if self.lower_case:
            text = text.lower()
        if self.replace_urls and url_extractor is not None:
            # Longest URLs first (presumably so a URL contained in a longer one
            # does not break the longer match).
            unique_urls = set(url_extractor.find_urls(text))
            for url in sorted(unique_urls, key=len, reverse=True):
                text = text.replace(url, " URL ")
        if self.replace_numbers:
            text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
        if self.remove_punctuation:
            text = re.sub(r'\W+', ' ', text, flags=re.M)
        counts = Counter(text.split())
        if self.stemming and stemmer is not None:
            # Merge the counts of words that share the same stem
            stemmed_counts = Counter()
            for word, occurrences in counts.items():
                stemmed_counts[stemmer.stem(word)] += occurrences
            counts = stemmed_counts
        return counts

    def transform(self, X, y=None):
        """Return an array holding one word-count Counter per message in X."""
        return np.array([self._count_words(message) for message in X])

In [None]:
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

In [None]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    """Convert word-count Counters into a sparse count matrix.

    fit() builds `vocabulary_`, mapping the `vocabulary_size` most common
    words to column indices starting at 1. transform() emits a CSR matrix
    with `vocabulary_size + 1` columns; words outside the vocabulary all
    land in column 0.
    """

    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size

    def fit(self, X, y=None):
        """Learn the vocabulary from an iterable of word-count mappings."""
        capped_totals = Counter()
        for word_count in X:
            for word, count in word_count.items():
                # Each message contributes at most 10 to a word's total
                capped_totals[word] = capped_totals[word] + min(count, 10)
        top_words = capped_totals.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {
            word: position
            for position, (word, _) in enumerate(top_words, start=1)
        }
        return self

    def transform(self, X, y=None):
        """Return the sparse (len(X), vocabulary_size + 1) count matrix."""
        row_indices, col_indices, values = [], [], []
        for row_index, word_count in enumerate(X):
            for word, count in word_count.items():
                row_indices.append(row_index)
                # Unknown words map to column 0
                col_indices.append(self.vocabulary_.get(word, 0))
                values.append(count)
        matrix_shape = (len(X), self.vocabulary_size + 1)
        return csr_matrix((values, (row_indices, col_indices)), shape=matrix_shape)

In [None]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

In [None]:
X_few_vectors.toarray()

In [None]:
vocab_transformer.vocabulary_

In [None]:
from sklearn.pipeline import Pipeline

# Full preprocessing: e-mail -> word counts -> sparse count vector (default settings for both steps)
preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

# Fit on the training messages only (this is where the vocabulary is learned) and transform them
X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 3-fold cross-validation of a logistic regression on the vectorised training e-mails
log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)
score.mean()

In [None]:
from sklearn.metrics import precision_score, recall_score

# Transform the test messages with the already-fitted pipeline (transform only, no refit)
X_test_transformed = preprocess_pipeline.transform(X_test)

# Train a fresh model on the whole training set, then evaluate once on the held-out test set
log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

# Metrics are reported as percentages; f1_score comes from the notebook's first import cell
print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))
print("F1 Score: {:.2f}%".format(100 * f1_score(y_test, y_pred)))