In [1]:
import matplotlib.pyplot as plt
import numpy as np
import scikitplot as skplt
from sklearn.datasets import fetch_openml
from sklearn import metrics, svm, preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from timeit import default_timer as timer
from sklearn.preprocessing import StandardScaler, label_binarize

In [2]:
# Load both benchmark datasets from OpenML. Each one gets its own on-disk
# cache directory, so a re-run reuses the files already stored under data_home.
mnist = fetch_openml(name="mnist_784", cache=True, data_home="./mnist_784")
fmnist = fetch_openml(name="Fashion-MNIST", cache=True, data_home="./fmnist")



In [3]:
def count_results(y_test, y_predicted, y_predicted_proba):
    """Print classification metrics and draw precision-recall and ROC curves.

    Every step (each metric, each plot) is attempted independently. When a
    step raises, a one-line diagnostic is printed and the remaining steps
    still run, so one failing metric no longer hides all the others.

    Parameters
    ----------
    y_test
        True labels; first argument of every metric and plot call.
    y_predicted
        Predicted labels, used for accuracy, zero-one loss and per-class F1
        (``average=None``).
    y_predicted_proba
        Per-class scores, handed to the scikit-plot curve helpers and to
        ``metrics.hinge_loss``.

    Returns
    -------
    None
        Results are printed / plotted only.
    """

    def _safely(step_name, step):
        # Best effort, but never silent: report the failure and carry on.
        # Only ``Exception`` is caught so Ctrl-C still interrupts the cell.
        try:
            step()
        except Exception as exc:
            print('[count_results] ' + step_name + ' failed: ' + repr(exc))

    def _configure_figures():
        # rcParams only influence figures created afterwards, so they have
        # to be set before the plotting helpers are called.
        plt.rcParams['figure.figsize'] = [20, 10]
        plt.rcParams['font.size'] = 10

    print('Accuracy score:')
    _safely('accuracy score',
            lambda: print(metrics.accuracy_score(y_test, y_predicted)))
    print('Zero one loss score:')
    _safely('zero one loss',
            lambda: print(metrics.zero_one_loss(y_test, y_predicted)))
    print('F1 score:')
    _safely('F1 score',
            lambda: print(metrics.f1_score(y_test, y_predicted, average=None)))
    print('\n')

    _safely('figure configuration', _configure_figures)
    # The helpers are looked up by name inside the guarded call so that a
    # missing attribute is reported like any other failure. Their return
    # value (an Axes object) is not printed: the figure itself is the output.
    for plot_name in ('plot_precision_recall', 'plot_roc'):
        _safely(plot_name,
                lambda: getattr(skplt.metrics, plot_name)(y_test, y_predicted_proba))
    _safely('plt.show', lambda: plt.show())
    print('\n')

    print('Hinge loss score:')
    # NOTE(review): hinge_loss documents its second argument as decision
    # scores (decision_function output); probabilities are passed here, so
    # the printed value is presumably not a true hinge loss -- confirm.
    _safely('hinge loss',
            lambda: print(metrics.hinge_loss(y_test, y_predicted_proba)))

In [4]:
def train_model_for_different_times(dataset, Cs=(0.1, 0.4, 1), test_size=0.25,
                                    n_samples=35000, n_jobs=6, random_state=None):
    """Train and evaluate a one-vs-rest linear SVC once per regularisation value.

    For every ``C`` the first ``n_samples`` rows are split into train/test
    parts, standardised with statistics fitted on the training part only,
    fitted, and evaluated via ``count_results``. Wall-clock times for fit,
    ``predict`` and ``predict_proba`` are printed.

    Parameters
    ----------
    dataset
        Object exposing sliceable ``data`` and ``target`` attributes (in this
        notebook: the result of ``fetch_openml``).
    Cs
        Iterable of ``C`` values to try, in order.
    test_size
        Fraction of the samples held out for testing; forwarded to
        ``train_test_split``.
    n_samples
        Number of leading rows of the dataset to use.
    n_jobs
        Forwarded to ``OneVsRestClassifier``.
    random_state
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original behaviour of a fresh random split on every iteration; pass
        an integer to get the same split for every ``C``.

    Returns
    -------
    None
    """
    x_dataset = dataset.data[0:n_samples]
    y_dataset = dataset.target[0:n_samples]

    for C in Cs:

        print('----------------------------------------------------------------------------------------------------------')
        print('C value ' + str(C))
        print('Rozmiar testowy ' + str(test_size))
        print('Rozmiar treningowy ' + str(1 - test_size))
        x_train, x_test, y_train, y_test = train_test_split(
            x_dataset, y_dataset, test_size=test_size, random_state=random_state)

        # Scale only the per-iteration copies. x_dataset itself must stay
        # untouched, otherwise every later C would be split from data that
        # had already been rescaled by the previous iterations.
        scaler = StandardScaler().fit(x_train)
        x_train = scaler.transform(x_train)
        x_test = scaler.transform(x_test)

        print('Start fit.')
        start_fit = timer()
        linear_svc = OneVsRestClassifier(
            svm.SVC(kernel="linear", probability=True, C=C), n_jobs=n_jobs
        ).fit(x_train, y_train)
        end_fit = timer()
        print('Finished fit. Time: ' + str(end_fit - start_fit))

        print('Start predicted.')
        start_pred = timer()
        y_predicted = linear_svc.predict(x_test)
        end_pred = timer()
        print('Finished pred. Time: ' + str(end_pred - start_pred))

        print('Start predicted proba.')
        start_pred_proba = timer()
        y_predicted_proba = linear_svc.predict_proba(x_test)
        end_pred_proba = timer()
        print('Finished pred. Time: ' + str(end_pred_proba - start_pred_proba))

        count_results(y_test, y_predicted, y_predicted_proba)


In [None]:
# Sweep over C on both datasets: MNIST first, then Fashion-MNIST.
train_model_for_different_times(dataset=mnist)
train_model_for_different_times(dataset=fmnist)

----------------------------------------------------------------------------------------------------------
C value 0.1
Rozmiar testowy 0.25
Rozmiar treningowy 0.75
Start fit.


In [None]:
def train_model_for_different_training_sizes(dataset, C=0.7, train_sizes=(0.5, 0.75, 0.85),
                                             n_samples=35000, n_jobs=6, random_state=None):
    """Train and evaluate a one-vs-rest linear SVC once per training fraction.

    For every entry of ``train_sizes`` the first ``n_samples`` rows are split
    into train/test parts, standardised with statistics fitted on the
    training part only, fitted with a fixed ``C``, and evaluated via
    ``count_results``. Wall-clock times for fit, ``predict`` and
    ``predict_proba`` are printed.

    Parameters
    ----------
    dataset
        Object exposing sliceable ``data`` and ``target`` attributes (in this
        notebook: the result of ``fetch_openml``).
    C
        Regularisation value passed to ``svm.SVC``.
    train_sizes
        Iterable of training fractions to try, in order; the test fraction
        is the complement of each.
    n_samples
        Number of leading rows of the dataset to use.
    n_jobs
        Forwarded to ``OneVsRestClassifier``.
    random_state
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original behaviour of a fresh random split on every iteration.

    Returns
    -------
    None
    """
    x_dataset = dataset.data[0:n_samples]
    y_dataset = dataset.target[0:n_samples]

    for train_size in train_sizes:
        # Rounding strips binary floating-point noise from the complement
        # (1 - 0.85 evaluates to 0.15000000000000002), which would otherwise
        # be printed and handed to the splitter.
        test_size = round(1 - train_size, 10)
        print('----------------------------------------------------------------------------------------------------------')
        print('C value ' + str(C))
        print('Rozmiar testowy ' + str(test_size))
        print('Rozmiar treningowy ' + str(train_size))
        x_train, x_test, y_train, y_test = train_test_split(
            x_dataset, y_dataset, test_size=test_size, random_state=random_state)

        # Scale only the per-iteration copies. x_dataset itself must stay
        # untouched, otherwise every later training size would be split from
        # data that had already been rescaled by the previous iterations.
        scaler = StandardScaler().fit(x_train)
        x_train = scaler.transform(x_train)
        x_test = scaler.transform(x_test)

        print('Start fit.')
        start_fit = timer()
        linear_svc = OneVsRestClassifier(
            svm.SVC(kernel="linear", probability=True, C=C), n_jobs=n_jobs
        ).fit(x_train, y_train)
        end_fit = timer()
        print('Finished fit. Time: ' + str(end_fit - start_fit))

        print('Start predicted.')
        start_pred = timer()
        y_predicted = linear_svc.predict(x_test)
        end_pred = timer()
        print('Finished pred. Time: ' + str(end_pred - start_pred))

        print('Start predicted proba.')
        start_pred_proba = timer()
        y_predicted_proba = linear_svc.predict_proba(x_test)
        end_pred_proba = timer()
        print('Finished pred. Time: ' + str(end_pred_proba - start_pred_proba))

        count_results(y_test, y_predicted, y_predicted_proba)

In [None]:
# Sweep over training-set sizes on both datasets: MNIST first, then Fashion-MNIST.
train_model_for_different_training_sizes(dataset=mnist)
train_model_for_different_training_sizes(dataset=fmnist)