# COMP47590 Advanced Machine Learning
# Basic Machine Learning in Python - MNIST

## Imports

To build predictive models in Python we use a set of libraries that are imported here. **pandas** and **sklearn** are particularly important.

In [None]:
#import os
#import subprocess
import io
import random 

import pandas as pd # core data handling package
import numpy as np # core data handling package
import matplotlib # core plotting functioanlity
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns # nicer plotting functionlity
import pandas_profiling # Nice exploratory data analysis package
import missingno # For nice missing number analysis

import sklearn # For basic machine learning functionality
import sklearn.preprocessing
import sklearn.metrics
import sklearn.model_selection
import sklearn.tree
import sklearn.ensemble
import sklearn.svm
import sklearn.linear_model
import sklearn.neighbors
import sklearn.neural_network

## Data Prep

### Setup

Take only a sample of the dataset for fast testing

In [None]:
# Fraction of rows kept when the loaded training data is randomly subsampled (0.1 keeps 10%).
data_sampling_rate = 0.1

Set up the number of folds for all grid searches (should be 5 - 10)

In [None]:
# Number of cross-validation folds passed as `cv` to the grid searches in this notebook.
# NOTE(review): 2 is below the 5 - 10 range recommended in the accompanying text;
# presumably kept low so the notebook runs quickly - confirm before reporting results.
cv_folds = 2

Set up dictionaries to store simple model performance comparisons

In [None]:
# Result stores filled in as the notebook runs, each keyed by a model label:
#   - validation-set accuracy of each simple (untuned) model
#   - best cross-validated score found by each grid search
#   - best hyperparameter combination found by each grid search
model_valid_accuracy_comparisons = {}
model_accuracy_comparisons = {}
model_tuned_params_list = {}

### Load Data

Load the dataset and explore it.

In [None]:
# Location of the training CSV and the name of the column holding the digit label
file_name = '../Data/mnist_train.csv'
target_feature = "label"

# Ten digit classes; each integer label maps to its string form (used for plot titles)
num_classes = 10
classes = {digit: str(digit) for digit in range(num_classes)}

In [None]:
# Read the training CSV and immediately keep only a random fraction of its rows
# so that the rest of the notebook runs quickly. No random_state is given, so a
# different sample is drawn on every run.
dataset = pd.read_csv(file_name).sample(frac=data_sampling_rate)

# Preview the first few sampled rows
display(dataset.head())

### Explore Data

Examine the distribution of the target levels

In [None]:
# Count how many sampled rows carry each target label (shown as the cell output).
dataset[target_feature].value_counts()

In [None]:
# Bar chart of the number of sampled rows per target label
class_counts = dataset[target_feature].value_counts()
class_counts.plot.bar()
plt.show()

Display summary statistics for each feature.

In [None]:
# Summarise the sampled data, handling numeric and text columns separately.
# Each group is only described when it contains at least one column.
print("Summary Stats")

numeric_columns = dataset.select_dtypes(include=[np.number])
if numeric_columns.shape[1] > 0:
    display(numeric_columns.describe().transpose())

text_columns = dataset.select_dtypes(include=[object])
if text_columns.shape[1] > 0:
    display(text_columns.describe().transpose())

Examine presence of missing values

In [None]:
# Count null entries per column and list the columns with the most gaps first
print("Missing Values")
missing_per_column = dataset.isnull().sum()
print(missing_per_column.sort_values(ascending=False))

Plot a nice diagram showing missing values - especially useful for spotting values that are missing together across several features.

In [None]:
# Draw missingno's matrix plot for the sampled dataset, then render the figure.
missingno.matrix(dataset)
plt.show()

Generate a plot (histogram or bar plot) for each feature in the dataset. (Commented out for MNIST)

In [None]:
#for f in dataset.select_dtypes(include=['number']).columns:
#    print(f)
#    sns.histplot(dataset[f])
#    plt.title(f)
#    plt.show()

In [None]:
#for f in dataset.select_dtypes(include=['object']).columns:
#    print(f)
#    sns.barplot(dataset[f].value_counts())
#    plt.title(f)
#    plt.show()

We can do nice overall exploration using the **pandas_profiling** package. 

(Commented out as *SLOOOOOW* and not very useful for MNIST.)

In [None]:
#pandas_profiling.ProfileReport(dataset, minimal = True)

Display some of the instances in the dataset (only really useful for images).

In [None]:
# Show a row_images x col_images grid of randomly chosen digits from the sampled dataset.
pltsize = 4          # size in inches of each grid cell
row_images = 5
col_images = 5
plt.figure(figsize=(col_images*pltsize, row_images*pltsize))

for i in range(row_images * col_images):
    # randrange excludes its upper bound, so every draw is a valid row position.
    # (random.randint includes both endpoints and could return dataset.shape[0],
    # which is one past the last row and makes iloc raise IndexError.)
    i_rand = random.randrange(dataset.shape[0])
    plt.subplot(row_images, col_images, i+1)
    plt.axis('off')
    # Every column after the first is treated as a pixel and reshaped to 28x28.
    # vmax is 255 to match the 0-255 pixel range the normalisation step divides by.
    plt.imshow(dataset.iloc[i_rand, 1:].values.reshape(28, 28), cmap='gray', vmin=0, vmax=255)
    plt.title(str(classes[dataset[target_feature].iloc[i_rand]]))
plt.show()

### Partition Dataset

Isolate the descriptive features we are interested in

In [None]:
# Descriptive features are every column after the first; the target is the label column.
# NOTE(review): this assumes the label is stored in the first column - confirm for other datasets.
feature_columns = dataset.columns[1:]
X = dataset[feature_columns]
y = dataset[target_feature]

Split the data into a **training set** and **validation set**

In [None]:
# Shuffled, stratified 70/30 split into training and validation partitions.
# Stratifying on y keeps the class proportions the same in both partitions.
# No random_state is given, so the split differs from run to run.
X_train, X_valid, y_train, y_valid = sklearn.model_selection.train_test_split(
    X,
    y,
    train_size=0.7,
    stratify=y,
    shuffle=True,
)

### Preprocess Dataset

Normalise the data (important for some models but not used in this example.)

In [None]:
# Make the min max scalar object
#min_max_scaler = sklearn.preprocessing.MinMaxScaler((-1,1))
#min_max_scaler.fit(X_train)
#
## Train the scalar on the training dataset
#a = min_max_scaler.transform(X_train)
#
#X_train = pd.DataFrame(a, columns = min_max_scaler.feature_names_in_) 
#
## Also normalise other partitions
#a = min_max_scaler.transform(X_valid)
#X_valid = pd.DataFrame(a, columns = min_max_scaler.feature_names_in_) 

Normalise the data (using hardcoded approach based on domain knowledge)

In [None]:
# Map pixel values from [0, 255] onto [-1, 1]: divide by 255, double, subtract 1.
# Re-running this cell applies the mapping a second time, so run it once per split.
X_train = X_train.div(255).mul(2).sub(1)
X_valid = X_valid.div(255).mul(2).sub(1)

In [None]:
# For the training and then the validation features, show the shape followed by a preview
for partition in (X_train, X_valid):
    display(partition.shape)
    display(partition.head())

Check that we haven't messed up the dataset!

In [None]:
# Print descriptive statistics for each column of the normalised training features.
# The column-type guards are evaluated on X_train itself (the frame being described),
# not on the original dataset: a guard based on a different frame could let
# describe() run on a selection with no columns, which raises an error.
print("Summary Stats")

numeric_features = X_train.select_dtypes(include=[np.number])
if numeric_features.shape[1] > 0:
    display(numeric_features.describe().transpose())

text_features = X_train.select_dtypes(include=[object])
if text_features.shape[1] > 0:
    display(text_features.describe().transpose())

Because we are working with images, plot a few of the normalised training examples.

In [None]:
# Show a row_images x col_images grid of randomly chosen normalised training digits.
pltsize = 4          # size in inches of each grid cell
row_images = 5
col_images = 5
plt.figure(figsize=(col_images*pltsize, row_images*pltsize))

for i in range(row_images * col_images):
    # randrange excludes its upper bound, so every draw is a valid row position.
    # (random.randint includes both endpoints and could return X_train.shape[0],
    # which is one past the last row and makes iloc raise IndexError.)
    i_rand = random.randrange(X_train.shape[0])
    plt.subplot(row_images, col_images, i+1)
    plt.axis('off')
    # Features were rescaled to [-1, 1], so the colour limits match that range
    plt.imshow(X_train.iloc[i_rand].values.reshape(28, 28), cmap='gray', vmin=-1, vmax=1)
    # y_train is positionally aligned with X_train, so the same position gives the label
    plt.title(str(classes[y_train.iloc[i_rand]]))
plt.show()

## Building Simple Models

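Train a decision tree, setting the minimum number of samples needed to split a node (`min_samples_split`, given here as a fraction of the training set) to a sensible value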

In [None]:
# Build a decision tree that only splits nodes holding at least 5% of the
# training rows, and fit it to the training partition in one step.
my_tree = sklearn.tree.DecisionTreeClassifier(min_samples_split=0.05).fit(X_train, y_train)

Assess the performance of the decision tree on the **training set**

In [None]:
# Score the fitted tree on the same rows it was trained on
y_pred = my_tree.predict(X_train)

# Headline accuracy followed by the per-class classification report
accuracy = sklearn.metrics.accuracy_score(y_train, y_pred)
print(f"Accuracy: {accuracy}")
report = sklearn.metrics.classification_report(y_train, y_pred)
print(report)

# Confusion matrix of true vs. predicted labels, drawn with a blue colour map
print("Confusion Matrix")
sklearn.metrics.ConfusionMatrixDisplay.from_predictions(y_train, y_pred, cmap='Blues')
plt.show()

Assess the performance of the decision tree on the **validation set**

In [None]:
# Score the fitted tree on the held-out validation partition
y_pred = my_tree.predict(X_valid)

# Headline accuracy (also recorded for the model comparison) and the per-class report
accuracy = sklearn.metrics.accuracy_score(y_valid, y_pred)
model_valid_accuracy_comparisons["Better Tree"] = accuracy
print(f"Accuracy: {accuracy}")
report = sklearn.metrics.classification_report(y_valid, y_pred)
print(report)

# Confusion matrix of true vs. predicted labels, drawn with a blue colour map
print("Confusion Matrix")
sklearn.metrics.ConfusionMatrixDisplay.from_predictions(y_valid, y_pred, cmap='Blues')
plt.show()

## Evaluating Using Cross Validation

Use cross validation to perform an evaluation

In [None]:
# 10-fold cross validation of a fresh (unfitted) tree over the full sampled dataset.
# Note that my_tree is rebound here, so it no longer refers to the tree fitted earlier.
my_tree = sklearn.tree.DecisionTreeClassifier(min_samples_split=0.05)
cv_results = sklearn.model_selection.cross_validate(estimator=my_tree, X=X, y=y, cv=10)
print(cv_results)

## Choosing Parameters Using a Grid Search

A common way to tune models is to use a grid search through a large set of possible parameters. Here we try both split criteria (gini and entropy) and maximum depths from 3 to 48 in steps of 3, with the minimum number of samples per split fixed at 50.

In [None]:
# Set up the parameter grid to search: both split criteria, maximum depths
# 3, 6, ..., 48, and a single minimum split size of 50
param_grid = {
    'criterion': ['gini', "entropy"],
    'max_depth': list(range(3, 50, 3)),
    'min_samples_split': [50],
}

# Perform the cross-validated search over the full sampled dataset,
# keeping the training scores and using all available cores (n_jobs=-1)
my_tuned_tree = sklearn.model_selection.GridSearchCV(
    sklearn.tree.DecisionTreeClassifier(),
    param_grid,
    cv=cv_folds,
    verbose=2,
    return_train_score=True,
    n_jobs=-1,
)
my_tuned_tree.fit(X, y)

# Print details
print("Best parameters set found on development set:")
display(my_tuned_tree.best_params_)
model_tuned_params_list["Tuned Tree"] = my_tuned_tree.best_params_
display(my_tuned_tree.best_score_)
# Record the best cross-validated score as the other tuned models do, so the
# tuned tree appears in the tuned-model comparison
model_accuracy_comparisons["Tuned Tree"] = my_tuned_tree.best_score_
display(my_tuned_tree.cv_results_)

## Comparing Models

We can easily use the same patterns to train other types of models.

### Random Forests

Train and evaluate a simple model 

In [None]:
# Random forest of 300 trees; each split considers 3 candidate features and a
# node must hold at least 200 samples before it may be split
forest_settings = {
    'n_estimators': 300,
    'max_features': 3,
    'min_samples_split': 200,
}
my_model = sklearn.ensemble.RandomForestClassifier(**forest_settings)
my_model.fit(X_train, y_train)

In [None]:
# Score the fitted random forest on the held-out validation partition
y_pred = my_model.predict(X_valid)

# Headline accuracy (also recorded for the model comparison) and the per-class report
accuracy = sklearn.metrics.accuracy_score(y_valid, y_pred)
model_valid_accuracy_comparisons["Random Forest"] = accuracy
print(f"Accuracy: {accuracy}")
report = sklearn.metrics.classification_report(y_valid, y_pred)
print(report)

# Confusion matrix of true vs. predicted labels, drawn with a blue colour map
print("Confusion Matrix")
sklearn.metrics.ConfusionMatrixDisplay.from_predictions(y_valid, y_pred, cmap='Blues')
plt.show()

Choose parameters using a grid search

In [None]:
# Candidate settings: 100-500 trees in steps of 50 and 2, 4, 6 or 8 features
# per split, with the minimum split size held at 200
param_grid = [
    {
        'n_estimators': list(range(100, 501, 50)),
        'max_features': list(range(2, 10, 2)),
        'min_samples_split': [200],
    }
]

# Cross-validated search over the full sampled dataset, using all available cores
my_tuned_model = sklearn.model_selection.GridSearchCV(
    sklearn.ensemble.RandomForestClassifier(),
    param_grid,
    cv=cv_folds,
    verbose=2,
    n_jobs=-1,
)
my_tuned_model.fit(X, y)

# Report the winning combination and its score, recording both for later comparison
print("Best parameters set found on development set:")
print(my_tuned_model.best_params_)
model_tuned_params_list["Tuned Random Forest"] = my_tuned_model.best_params_
print(my_tuned_model.best_score_)
model_accuracy_comparisons["Tuned Random Forest"] = my_tuned_model.best_score_

### Bagging

Train and evaluate a simple model 

In [None]:
# Bag ten entropy-based decision trees, each restricted to leaves of at least 50 samples
base_tree = sklearn.tree.DecisionTreeClassifier(criterion="entropy", min_samples_leaf=50)
my_model = sklearn.ensemble.BaggingClassifier(estimator=base_tree, n_estimators=10)
my_model.fit(X_train, y_train)

In [None]:
# Score the fitted bagging ensemble on the held-out validation partition
y_pred = my_model.predict(X_valid)

# Headline accuracy (also recorded for the model comparison) and the per-class report
accuracy = sklearn.metrics.accuracy_score(y_valid, y_pred)
model_valid_accuracy_comparisons["Bagging"] = accuracy
print(f"Accuracy: {accuracy}")
report = sklearn.metrics.classification_report(y_valid, y_pred)
print(report)

# Confusion matrix of true vs. predicted labels, drawn with a blue colour map
print("Confusion Matrix")
sklearn.metrics.ConfusionMatrixDisplay.from_predictions(y_valid, y_pred, cmap='Blues')
plt.show()

Choose parameters using a grid search

In [None]:
# Candidate ensemble sizes: 50-500 base trees in steps of 50
param_grid = [
    {'n_estimators': list(range(50, 501, 50))}
]

# Base learner shared by every candidate: a shallow entropy tree (depth at most 6,
# leaves of at least 200 samples)
bagged_tree = sklearn.tree.DecisionTreeClassifier(
    criterion="entropy",
    max_depth=6,
    min_samples_leaf=200,
)

# Cross-validated search over the full sampled dataset, using all available cores
my_tuned_model = sklearn.model_selection.GridSearchCV(
    sklearn.ensemble.BaggingClassifier(estimator=bagged_tree),
    param_grid,
    cv=cv_folds,
    verbose=2,
    n_jobs=-1,
)
my_tuned_model.fit(X, y)

# Report the winning combination and its score, recording both for later comparison
print("Best parameters set found on development set:")
print(my_tuned_model.best_params_)
model_tuned_params_list["Tuned Bagging"] = my_tuned_model.best_params_
print(my_tuned_model.best_score_)
model_accuracy_comparisons["Tuned Bagging"] = my_tuned_model.best_score_

### Gradient Boosting

Train and evaluate a simple model 

In [None]:
# Fit a gradient boosting classifier, leaving every hyperparameter at its default
my_model = sklearn.ensemble.GradientBoostingClassifier()
my_model.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted gradient boosting model on the held-out validation partition
y_pred = my_model.predict(X_valid)

# Headline accuracy (also recorded for the model comparison) and the per-class report
accuracy = sklearn.metrics.accuracy_score(y_valid, y_pred)
model_valid_accuracy_comparisons["GradBoost"] = accuracy
print(f"Accuracy: {accuracy}")
report = sklearn.metrics.classification_report(y_valid, y_pred)
print(report)

# Confusion matrix of true vs. predicted labels, drawn with a blue colour map
print("Confusion Matrix")
sklearn.metrics.ConfusionMatrixDisplay.from_predictions(y_valid, y_pred, cmap='Blues')
plt.show()

Choose parameters using a grid search

In [None]:
# Candidate settings: 50-500 boosting stages in steps of 50, crossed with
# three learning rates
param_grid = [
    {
        'n_estimators': list(range(50, 501, 50)),
        'learning_rate': [0.001, 0.01, 0.1],
    }
]

# Cross-validated search over the full sampled dataset, using all available cores
my_tuned_model = sklearn.model_selection.GridSearchCV(
    sklearn.ensemble.GradientBoostingClassifier(),
    param_grid,
    cv=cv_folds,
    verbose=2,
    n_jobs=-1,
)
my_tuned_model.fit(X, y)

# Report the winning combination and its score, recording both for later comparison
print("Best parameters set found on development set:")
print(my_tuned_model.best_params_)
model_tuned_params_list["Tuned GradBoost"] = my_tuned_model.best_params_
print(my_tuned_model.best_score_)
model_accuracy_comparisons["Tuned GradBoost"] = my_tuned_model.best_score_

### Nearest Neighbour

Train and evaluate a simple model 

In [None]:
# Fit a k-nearest-neighbour classifier, leaving every hyperparameter at its default
my_model = sklearn.neighbors.KNeighborsClassifier().fit(X_train, y_train)

In [None]:
# Score the fitted nearest-neighbour model on the held-out validation partition
y_pred = my_model.predict(X_valid)

# Headline accuracy (also recorded for the model comparison) and the per-class report
accuracy = sklearn.metrics.accuracy_score(y_valid, y_pred)
model_valid_accuracy_comparisons["kNN"] = accuracy
print(f"Accuracy: {accuracy}")
report = sklearn.metrics.classification_report(y_valid, y_pred)
print(report)

# Confusion matrix of true vs. predicted labels, drawn with a blue colour map
print("Confusion Matrix")
sklearn.metrics.ConfusionMatrixDisplay.from_predictions(y_valid, y_pred, cmap='Blues')
plt.show()

Choose parameters using a grid search

In [None]:
# Candidate neighbourhood sizes: 1, 6, 11, ..., 46
param_grid = [
    {'n_neighbors': list(range(1, 50, 5))}
]

# Cross-validated search over the full sampled dataset, using all available cores
my_tuned_model = sklearn.model_selection.GridSearchCV(
    sklearn.neighbors.KNeighborsClassifier(),
    param_grid,
    cv=cv_folds,
    verbose=2,
    n_jobs=-1,
)
my_tuned_model.fit(X, y)

# Report the winning combination and its score, recording both for later comparison
print("Best parameters set found on development set:")
print(my_tuned_model.best_params_)
model_tuned_params_list["Tuned kNN"] = my_tuned_model.best_params_
print(my_tuned_model.best_score_)
model_accuracy_comparisons["Tuned kNN"] = my_tuned_model.best_score_

### Compare Results

In [None]:
# Validation-set accuracy recorded for each simple (untuned) model, keyed by model label.
display(model_valid_accuracy_comparisons)

In [None]:
# Horizontal bar chart of validation accuracy, one bar per simple model
model_names = list(model_valid_accuracy_comparisons)
accuracies = list(model_valid_accuracy_comparisons.values())
bar_positions = range(len(model_names))

plt.xlim(0, 1.0)  # accuracy lies between 0 and 1
_ = plt.barh(bar_positions, accuracies, align='center')
_ = plt.yticks(bar_positions, model_names)
plt.xlabel("Accuracy")
plt.show()

In [None]:
# Best cross-validated grid-search score recorded for each tuned model, keyed by model label.
display(model_accuracy_comparisons)

In [None]:
# Horizontal bar chart of the best grid-search score, one bar per tuned model
tuned_names = list(model_accuracy_comparisons)
tuned_scores = list(model_accuracy_comparisons.values())
tuned_positions = range(len(tuned_names))

plt.xlim(0, 1.0)  # accuracy lies between 0 and 1
plt.barh(tuned_positions, tuned_scores, align='center')
plt.yticks(tuned_positions, tuned_names)
plt.xlabel("Accuracy")
plt.show()

In [None]:
# Best hyperparameter combination found by each grid search, keyed by model label.
display(model_tuned_params_list)

## Test Best Model On Test Dataset

In [None]:
# Read the held-out test CSV in full (no subsampling is applied here) and preview it
test_filename = '../Data/mnist_test.csv'
test_dataset = pd.read_csv(filepath_or_buffer=test_filename)
test_dataset.head()

In [None]:
# Descriptive features are every column after the first; the labels are copied
# out of the target column into a plain NumPy array.
# NOTE(review): this assumes the label is stored in the first column - confirm.
test_feature_columns = test_dataset.columns[1:]
X_test = test_dataset[test_feature_columns]

test_labels = test_dataset[target_feature]
y_test = np.array(test_labels)

In [None]:
#a = min_max_scaler.transform(X_test)
#X_test = pd.DataFrame(a, columns = min_max_scaler.feature_names_in_) 

In [None]:
# Map test pixel values from [0, 255] onto [-1, 1]: divide by 255, double, subtract 1.
# Re-running this cell applies the mapping a second time, so run it once per load.
X_test = X_test.div(255).mul(2).sub(1)

In [None]:
# Show the shape of the normalised test features, then a preview of the first rows
for test_view in (X_test.shape, X_test.head()):
    display(test_view)

In [None]:
# Rebuild the gradient boosting model with the hyperparameters chosen by the grid search
my_model = sklearn.ensemble.GradientBoostingClassifier(**(model_tuned_params_list["Tuned GradBoost"]))

# Fit on the normalised training and validation partitions combined. The test set
# must stay unseen while fitting: a model fitted on X_test/y_test and then scored
# on the same rows reports an accuracy that says nothing about generalisation.
X_dev = pd.concat([X_train, X_valid])
y_dev = pd.concat([y_train, y_valid])
my_model = my_model.fit(X_dev, y_dev)

In [None]:
# Score the final model on the held-out test set
y_pred = my_model.predict(X_test)

# Headline accuracy followed by the per-class classification report
accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
report = sklearn.metrics.classification_report(y_test, y_pred)
print(report)

# Confusion matrix of true vs. predicted labels, drawn with a blue colour map
print("Confusion Matrix")
sklearn.metrics.ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap='Blues')
plt.show()