In [None]:
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn import datasets, metrics
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC

In [None]:
# Load the MNIST train and test splits through Keras, then unpack each split
# into its images (X_*) and labels (y_*).
mnist_train_split, mnist_test_split = tf.keras.datasets.mnist.load_data()
X_train_all, y_train_all = mnist_train_split
X_test_all, y_test_all = mnist_test_split

In [None]:
# --- Experiment configuration -------------------------------------------------
# When True, `log_time` writes a CSV report and a confusion-matrix PNG per run.
save_results: bool = False

# SVC hyper-parameter grids (each list is swept independently).
svm_kernel = ['linear', 'poly', 'sigmoid']
svm_gamma = [0.1, 0.5, 1.0, 'scale']
# NOTE(review): not referenced by any cell visible in this notebook.
svm_class_weights = 'balanced'

# Numbers of principal components to try; this is a list of ints, not a single int.
pca_num_components: list = [1, 3, 5, 10, 15]

# RandomForestClassifier hyper-parameter grids (None = no depth limit passed).
forest_depth = [5, 10, None]
forest_number_of_estimators = [1, 25, 50, 100]
forest_criterion = ['gini', 'entropy', 'log_loss']

In [None]:
# Preview training images 1..6 in the first six slots of a 4x3 subplot grid.
images = X_train_all

for position in range(1, 7):
    plt.subplot(4, 3, position)
    plt.imshow(images[position], cmap=plt.cm.gray_r, interpolation='nearest')

### Permute the indices randomly with `np.random.permutation`.

In [None]:
# Seed NumPy's global RNG before drawing the permutation so that the shuffle,
# and therefore every `X_train_all[:train_size]` subset taken later, is the
# same on each Restart & Run All.
SEED = 42
np.random.seed(SEED)

index_permutation = np.random.permutation(len(X_train_all))
index_permutation

In [None]:
# Reorder images and labels with the same permutation so each image keeps its label.
X_train_all, y_train_all = (
    X_train_all[index_permutation],
    y_train_all[index_permutation],
)

### Reset index

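NumPy arrays are always indexed by position, so after the permutation is applied the row indices are ascending (0 … n-1) again without an explicit reset.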

In [None]:
# Training subset sizes 1000, 2000, ..., 20000; each test subset is half as large.
train_sizes: list = list(range(1000, 20001, 1000))
test_sizes: list = [size // 2 for size in train_sizes]

In [None]:
# Divide both image sets by 255.
# NOTE: this cell reassigns the arrays from themselves, so running it a second
# time without reloading the data divides by 255 again.
X_train_all, X_test_all = X_train_all / 255, X_test_all / 255

In [None]:

def include_pca_preprocessing(data: np.ndarray, num_components: int) -> np.ndarray:
    """Standardize ``data`` and project it onto its leading principal components.

    A new ``StandardScaler`` and a new ``PCA`` are fitted on ``data`` itself on
    every call, so two different arrays passed through this function end up in
    two different, unrelated feature spaces. Use it for a single data set only;
    to transform held-out data, fit the scaler/PCA on the training data and
    apply their ``transform`` to the held-out data instead.

    Parameters
    ----------
    data : np.ndarray
        2-D array of samples (rows) by features (columns).
    num_components : int
        Passed to ``PCA`` as ``n_components``.

    Returns
    -------
    np.ndarray
        The result of ``PCA.fit_transform`` on the standardized data.
    """
    standard_data = StandardScaler().fit_transform(data)
    pca = PCA(n_components=num_components)
    return pca.fit_transform(standard_data)


In [None]:
def measure_time_fit(estimator, X, y, time_df: pd.DataFrame = None):
    """Fit ``estimator`` on ``(X, y)`` and report how long the fit took.

    Parameters
    ----------
    estimator
        Any object exposing ``fit(X, y)``.
    X, y
        Passed unchanged to ``estimator.fit``.
    time_df : pd.DataFrame, optional
        Currently unused; kept so existing calls that pass it keep working.

    Returns
    -------
    float
        Wall-clock duration of the ``fit`` call in seconds (also printed).
    """
    start_time = time.perf_counter()
    estimator.fit(X, y)
    compute_time = time.perf_counter() - start_time
    print(compute_time)
    # Return the measurement so callers can store it instead of parsing stdout.
    return compute_time


In [None]:

def log_time(estimator, X_train, y_train, X_test, y_test, train_size: int, mode: str):
    """Fit ``estimator``, predict on the test data, time both steps and optionally save results.

    Parameters
    ----------
    estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Passed to ``estimator.fit``.
    X_test, y_test
        ``X_test`` is passed to ``estimator.predict``; ``y_test`` is compared
        against the predictions.
    train_size : int
        Only used to build the output file names and the figure title.
    mode : str
        Experiment label used in the file names; shown in the figure title with
        underscores replaced by spaces and the first letter capitalized.

    Returns
    -------
    tuple
        ``(train_seconds, predict_seconds)`` measured with ``time.perf_counter``.

    Notes
    -----
    Files are written only when the module-level flag ``save_results`` is true:
    a classification report (plus both timings) to
    ``./results/csv/{train_size}_{mode}.csv`` and a confusion-matrix plot to
    ``./results/figures/{train_size}_{mode}.png``.
    """
    start_time = time.perf_counter()
    estimator.fit(X_train, y_train)
    compute_time_train = time.perf_counter() - start_time

    start_time = time.perf_counter()
    y_hat = estimator.predict(X_test)
    compute_time_predict = time.perf_counter() - start_time

    if save_results:
        # Create the output folders on demand so a fresh checkout does not fail
        # on the first write.
        csv_dir = Path('./results/csv')
        figure_dir = Path('./results/figures')
        csv_dir.mkdir(parents=True, exist_ok=True)
        figure_dir.mkdir(parents=True, exist_ok=True)

        report = classification_report(y_test, y_hat, output_dict=True, zero_division=0)
        df_report = pd.DataFrame(report).transpose()
        df_report['train time'] = compute_time_train
        df_report['predict time'] = compute_time_predict
        df_report.to_csv(csv_dir / f'{train_size}_{mode}.csv')

        # Named `matrix_display` so the imported `confusion_matrix` function is
        # not shadowed inside this function.
        matrix_display = metrics.ConfusionMatrixDisplay.from_predictions(y_test, y_hat)
        conf_mode = mode.replace('_', ' ').capitalize()
        matrix_display.figure_.suptitle(f'Confusion Matrix - {conf_mode} Trainsize {train_size}')
        matrix_display.figure_.savefig(figure_dir / f'{train_size}_{mode}.png')
        # This function is called once per model and train size; close the
        # figure so open figures do not pile up in memory.
        plt.close(matrix_display.figure_)

    return compute_time_train, compute_time_predict

In [None]:
# SVM sweeps: for every (train size, test size) pair, vary gamma, kernel and C
# one at a time, leaving the other SVC arguments at their defaults.
for train_size, test_size in zip(train_sizes, test_sizes):
    # Take the leading rows of each set and flatten every image to one row.
    X_train = X_train_all[:train_size]
    X_test = X_test_all[:test_size]
    y_train = y_train_all[:train_size]
    y_test = y_test_all[:test_size]

    X_train = X_train.reshape(len(X_train), -1)
    X_test = X_test.reshape(len(X_test), -1)

    for gam in svm_gamma:
        log_time(SVC(gamma=gam), X_train, y_train, X_test, y_test,
                 train_size, f'svm_gamma_{gam}')

    for kern in svm_kernel:
        log_time(SVC(kernel=kern), X_train, y_train, X_test, y_test,
                 train_size, f'svm_kernel_{kern}')

    for C in (1, 2, 3, 4):
        log_time(SVC(C=C), X_train, y_train, X_test, y_test,
                 train_size, f'svm_C_{C}')


In [None]:
# PCA + SVM sweep: for every (train size, test size) pair and every number of
# principal components, train one SVC per kernel on the PCA-reduced data.
for train_size, test_size in zip(train_sizes, test_sizes):

    # Slicing, flattening and standardization do not depend on the number of
    # components, so they are done once per size instead of once per PCA setting.
    y_train, y_test = y_train_all[:train_size], y_test_all[:test_size]

    X_train_flat = X_train_all[:train_size]
    X_test_flat = X_test_all[:test_size]
    X_train_flat = X_train_flat.reshape(len(X_train_flat), -1)
    X_test_flat = X_test_flat.reshape(len(X_test_flat), -1)

    # Fit the scaler on the training subset only and reuse it for the test
    # subset, so both are scaled with the same statistics.
    scaler = StandardScaler().fit(X_train_flat)
    X_train_std = scaler.transform(X_train_flat)
    X_test_std = scaler.transform(X_test_flat)

    for pca_num in pca_num_components:
        # Fit PCA on the training subset only; the test subset is projected
        # onto the same components. Fitting a second PCA on the test data would
        # give features in a different basis than the one the SVM was trained on.
        pca = PCA(n_components=pca_num)
        X_train = pca.fit_transform(X_train_std)
        X_test = pca.transform(X_test_std)

        for kern in svm_kernel:
            svm = SVC(kernel=kern)

            log_time(svm, X_train, y_train, X_test, y_test,
                     train_size, f'pca_{pca_num}_svm_kernel_{kern}')


In [None]:
# Random-forest sweeps: for every (train size, test size) pair, vary max depth,
# number of trees and split criterion one at a time.
for train_size, test_size in zip(train_sizes, test_sizes):
    # Take the leading rows of each set and flatten every image to one row.
    X_train = X_train_all[:train_size]
    X_test = X_test_all[:test_size]
    y_train = y_train_all[:train_size]
    y_test = y_test_all[:test_size]

    X_train = X_train.reshape(len(X_train), -1)
    X_test = X_test.reshape(len(X_test), -1)

    for depth in forest_depth:
        # A depth of None is written as "Any" in the output label.
        depth_tag = 'Any' if depth is None else str(depth)
        log_time(RandomForestClassifier(max_depth=depth),
                 X_train, y_train, X_test, y_test,
                 train_size, f'forest_depth_{depth_tag}')

    for num_est in forest_number_of_estimators:
        log_time(RandomForestClassifier(n_estimators=num_est),
                 X_train, y_train, X_test, y_test,
                 train_size, f'forest_num_estimators_{num_est}')

    for criterion in forest_criterion:
        log_time(RandomForestClassifier(criterion=criterion),
                 X_train, y_train, X_test, y_test,
                 train_size, f'forest_criterion_{criterion}')
        