In [1]:
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn import datasets, metrics
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC

In [2]:
# Load the feature table and the label table from the MNIST CSV exports;
# the first column of each file is used as the row index.
feature_df, label_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./mnist_features.csv', './mnist_labels.csv')
)

In [3]:
# Identifiers used to select a classifier and to name its result files.
SVM_KEY, RANDOM_FOREST_KEY = 'svm', 'random_forest'

In [4]:
# --- SVM hyperparameters (passed to SVC as gamma / class_weight) ---
svm_gamma, svm_class_weights = 'scale', 'balanced'

# --- Random forest hyperparameters (n_estimators / max_depth) ---
forest_number_of_estimators = 100
forest_depth = None

# --- Number of principal components kept when PCA preprocessing is on ---
pca_num_components: int = 9

In [5]:
# Training-set sizes to benchmark: 2500, 5000, ..., 55000 (22 values).
split_size = list(range(2500, 22 * 2500 + 1, 2500))

In [6]:
def get_classifier(classifier_name: str):
    """Build a fresh, unfitted classifier for the given key.

    Parameters
    ----------
    classifier_name : str
        Either ``SVM_KEY`` or ``RANDOM_FOREST_KEY``.

    Returns
    -------
    SVC or RandomForestClassifier
        A new estimator configured from the module-level hyperparameters
        (``svm_gamma`` / ``svm_class_weights`` for the SVM,
        ``forest_number_of_estimators`` / ``forest_depth`` for the forest).

    Raises
    ------
    ValueError
        If ``classifier_name`` is not one of the known keys.
    """
    if classifier_name == SVM_KEY:
        return SVC(gamma=svm_gamma, class_weight=svm_class_weights)
    if classifier_name == RANDOM_FOREST_KEY:
        return RandomForestClassifier(
            n_estimators=forest_number_of_estimators,
            max_depth=forest_depth,
        )
    # An unknown key used to fall through and return None, which only
    # surfaced later as an AttributeError when calling ``.fit``.
    raise ValueError(
        f'Unknown classifier {classifier_name!r}; '
        f'expected {SVM_KEY!r} or {RANDOM_FOREST_KEY!r}'
    )

In [7]:
def include_pca_preprocessing(data: pd.DataFrame, num_components: int) -> np.ndarray:
    """Standardise ``data`` and project it onto its leading principal components.

    Both the ``StandardScaler`` and the ``PCA`` are fitted on ``data`` itself,
    so the returned coordinates are only comparable with rows that were
    transformed in the same call.

    Parameters
    ----------
    data : pd.DataFrame
        Feature table, one sample per row.
    num_components : int
        Forwarded to ``PCA`` as ``n_components``.

    Returns
    -------
    np.ndarray
        The output of ``PCA.fit_transform`` on the standardised data (an
        array, not a DataFrame).
    """
    standardised = StandardScaler().fit_transform(data)
    return PCA(n_components=num_components).fit_transform(standardised)

In [8]:
def get_dataframe_path(classifier: str, split: int, is_pca: bool) -> str:
    """Return the CSV path for the classification report of one run.

    The file lives under ``./results/`` and is named after the classifier and
    the split value; runs with PCA get a ``_pca`` suffix before the extension.
    """
    pca_suffix = '_pca' if is_pca else ''
    return f'./results/{classifier}_num_features_{split}{pca_suffix}.csv'

In [9]:
def get_time_measure_path(classifier: str, is_pca: bool) -> str:
    """Return the CSV path for the runtime table of one classifier.

    The file lives under ``./results/``; a ``_pca`` suffix is inserted before
    the extension when PCA preprocessing was used.
    """
    parts = ['./results/', classifier, '_runtime']
    if is_pca:
        parts.append('_pca')
    parts.append('.csv')
    return ''.join(parts)

In [10]:
def get_train_test_set(split, is_pca):
    """Draw a random train/test split from the module-level MNIST frames.

    The rows of ``feature_df`` and ``label_df`` are shuffled together. The
    first ``split`` rows become the training set and the rows from ``split``
    up to ``round(split * 1.25)`` become the test set (about a quarter of the
    training size; fewer if the data runs out).

    Parameters
    ----------
    split : int
        Number of training samples.
    is_pca : bool
        When truthy, the features are standardised and projected onto
        ``pca_num_components`` principal components.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``. ``y_train`` is a flattened
        1-D array; ``y_test`` is left as the sliced label frame. With PCA
        enabled ``X_train`` / ``X_test`` are arrays instead of DataFrames.
    """
    # Shuffle the real index labels (not ``range(len(...))``) so the ``.loc``
    # lookups remain valid even when the CSV index is not exactly 0..n-1.
    shuffled_labels = np.random.permutation(feature_df.index.to_numpy())
    X = feature_df.loc[shuffled_labels].reset_index(drop=True)
    y = label_df.loc[shuffled_labels].reset_index(drop=True)

    end_index = round(split * 1.25)

    X_train, X_test = X.iloc[:split], X.iloc[split:end_index]
    y_train, y_test = y.iloc[:split], y.iloc[split:end_index]

    if is_pca:
        # Fit the scaler and the PCA basis on the training rows only and
        # reuse them for the test rows. Fitting a second PCA on the test rows
        # would put them in a different component space than the one the
        # model was trained in.
        scaler = StandardScaler()
        pca = PCA(n_components=pca_num_components)
        X_train = pca.fit_transform(scaler.fit_transform(X_train))
        X_test = pca.transform(scaler.transform(X_test))

    y_train = y_train.values.flatten()

    return X_train, y_train, X_test, y_test

In [11]:
# Scratch check: a frame created from index and columns alone reports NaN
# for a cell that has not been filled yet.
runtime_df = pd.DataFrame(
    index=split_size,
    columns=['training runtime', 'prediction runtime'],
)
print(runtime_df.loc[2500, 'training runtime'])

nan


In [12]:
def track_runtime(classifier: str, is_pca: bool = False):
    """Benchmark one classifier over every training-set size in ``split_size``.

    For each size a fresh split and a fresh model are created, ``fit`` and
    ``predict`` are timed with ``time.perf_counter``, and that run's
    classification report is written to the path from
    ``get_dataframe_path``. After the last run the collected timings are
    written to the path from ``get_time_measure_path``.

    Parameters
    ----------
    classifier : str
        Key understood by ``get_classifier``.
    is_pca : bool, default False
        Forwarded to ``get_train_test_set`` and used to pick the output paths.

    Returns
    -------
    None
        Results are written to CSV files; progress is printed per run.
    """
    runtime_df = pd.DataFrame(
        index=split_size,
        columns=['training runtime', 'prediction runtime', 'test size'],
    )

    for run_number, split in enumerate(split_size, start=1):
        print(f'Run ({run_number}/{len(split_size)})')

        X_train, y_train, X_test, y_test = get_train_test_set(split, is_pca)
        model = get_classifier(classifier)

        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        training_runtime = time.perf_counter() - start_time

        start_time = time.perf_counter()
        y_predict = model.predict(X_test)
        prediction_runtime = time.perf_counter() - start_time

        # Assign with a single .loc[row, column] call. The chained form
        # ``df.loc[row][column] = value`` writes into a temporary object and
        # leaves the frame untouched when pandas Copy-on-Write is active.
        runtime_df.loc[split, 'training runtime'] = training_runtime
        runtime_df.loc[split, 'prediction runtime'] = prediction_runtime
        runtime_df.loc[split, 'test size'] = len(X_test)

        report = classification_report(y_test, y_predict, output_dict=True)
        df_path = get_dataframe_path(classifier, split, is_pca)
        # Make sure the output directory exists so a long run is not lost to
        # a missing folder at write time.
        Path(df_path).parent.mkdir(parents=True, exist_ok=True)
        pd.DataFrame(report).transpose().to_csv(df_path)

    runtime_path = get_time_measure_path(classifier, is_pca)
    Path(runtime_path).parent.mkdir(parents=True, exist_ok=True)
    runtime_df.to_csv(runtime_path)
        


In [13]:
# SVM benchmark with PCA preprocessing disabled.
track_runtime(classifier=SVM_KEY, is_pca=False)

Run (1/22)
Run (2/22)
Run (3/22)
Run (4/22)
Run (5/22)
Run (6/22)
Run (7/22)
Run (8/22)
Run (9/22)
Run (10/22)
Run (11/22)
Run (12/22)
Run (13/22)
Run (14/22)
Run (15/22)
Run (16/22)
Run (17/22)
Run (18/22)
Run (19/22)
Run (20/22)
Run (21/22)
Run (22/22)


In [14]:
# SVM benchmark with PCA preprocessing enabled.
track_runtime(classifier=SVM_KEY, is_pca=True)

Run (1/22)
Run (2/22)
Run (3/22)
Run (4/22)
Run (5/22)
Run (6/22)
Run (7/22)
Run (8/22)
Run (9/22)
Run (10/22)
Run (11/22)
Run (12/22)
Run (13/22)
Run (14/22)
Run (15/22)
Run (16/22)
Run (17/22)
Run (18/22)
Run (19/22)
Run (20/22)
Run (21/22)
Run (22/22)


In [15]:
# Random forest benchmark with PCA preprocessing disabled.
track_runtime(classifier=RANDOM_FOREST_KEY, is_pca=False)

Run (1/22)
Run (2/22)
Run (3/22)
Run (4/22)
Run (5/22)
Run (6/22)
Run (7/22)
Run (8/22)
Run (9/22)
Run (10/22)
Run (11/22)
Run (12/22)
Run (13/22)
Run (14/22)
Run (15/22)
Run (16/22)
Run (17/22)
Run (18/22)
Run (19/22)
Run (20/22)
Run (21/22)
Run (22/22)


In [16]:
# Random forest benchmark with PCA preprocessing enabled.
track_runtime(classifier=RANDOM_FOREST_KEY, is_pca=True)

Run (1/22)
Run (2/22)
Run (3/22)
Run (4/22)
Run (5/22)
Run (6/22)
Run (7/22)
Run (8/22)
Run (9/22)
Run (10/22)
Run (11/22)
Run (12/22)
Run (13/22)
Run (14/22)
Run (15/22)
Run (16/22)
Run (17/22)
Run (18/22)
Run (19/22)
Run (20/22)
Run (21/22)
Run (22/22)
