In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Models
from sklearn import svm 
from sklearn import tree
from sklearn import linear_model

In [13]:
#Import CSV and get labels and data
train_set = pd.read_csv('./train.csv')
label_set = train_set['label']
train_set = train_set.drop('label', axis = 1)

In [14]:
label_set.size

42000

In [15]:
def printIndex(sample):
    sample=sample.values.reshape((28,28))
    plt.imshow(sample,cmap='gray')
    plt.show()

In [23]:
data_set_sizes = [125, 625, 1250, 6250, 12500, 25000]
svm_model = svm.SVC(gamma='scale', decision_function_shape = 'ovo')
tree_model = tree.DecisionTreeClassifier()
lr_model = linear_model.LogisticRegression()
svm_scores = []
tree_scores = []
lr_scores = []

In [24]:
# Create data for multiple sizes
for size in data_set_sizes:
    print("Testing Models at Size:  ", size)
    # Get data by size and Split
    set_X = train_set.iloc[0:size]
    set_Y = label_set.iloc[0:size]
    X_train, X_test, y_train, y_test = train_test_split(set_X, set_Y, test_size=0.2, random_state=21)
    # Fit all models
    svm_model.fit(X_train, y_train)
    tree_model.fit(X_train, y_train)
    lr_model.fit(X_train, y_train)
    # Predict 
    svm_pred = svm_model.predict(X_test)
    tree_pred = tree_model.predict(X_test)
    lr_pred = lr_model.predict(X_test)
    # Save scores
    svm_scores.append(accuracy_score(y_test, svm_pred))
    tree_scores.append(accuracy_score(y_test, tree_pred))
    lr_scores.append(accuracy_score(y_test, lr_pred))
    print("SVM at size: ", size)
    print(classification_report(y_test, svm_pred))
    print("Accuracy: ", accuracy_score(y_test, svm_pred))
    print("Tree at size: ", size)
    print(classification_report(y_test, tree_pred))
    print("Accuracy: ", accuracy_score(y_test, tree_pred))
    print("Logistic Regression at size: ", size)
    print(classification_report(y_test, lr_pred))
    print("Accuracy: ", accuracy_score(y_test, lr_pred))

Testing Models at Size:   125
SVM at size:  125
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1
           9       0.08      1.00      0.15         2

   micro avg       0.08      0.08      0.08        25
   macro avg       0.01      0.10      0.01        25
weighted avg       0.01      0.08      0.01        25

Accuracy:  0.08
Tree at size:  125
              precision    recall  f1-score   support

           0       0.50      0.67      0.57         3
           1       0.75      0.75      0.75      

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


SVM at size:  625
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        17
           1       0.00      0.00      0.00        13
           2       0.10      1.00      0.19        13
           3       0.00      0.00      0.00        14
           4       0.00      0.00      0.00        12
           5       0.00      0.00      0.00        12
           6       0.00      0.00      0.00        13
           7       0.00      0.00      0.00         9
           8       0.00      0.00      0.00         7
           9       0.00      0.00      0.00        15

   micro avg       0.10      0.10      0.10       125
   macro avg       0.01      0.10      0.02       125
weighted avg       0.01      0.10      0.02       125

Accuracy:  0.104
Tree at size:  625
              precision    recall  f1-score   support

           0       1.00      0.65      0.79        17
           1       0.92      0.92      0.92        13
           2       0.50 



SVM at size:  1250
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        22
           1       1.00      0.05      0.10        20
           2       0.14      1.00      0.24        34
           3       0.00      0.00      0.00        26
           4       0.00      0.00      0.00        29
           5       0.00      0.00      0.00        18
           6       0.00      0.00      0.00        28
           7       0.00      0.00      0.00        30
           8       0.00      0.00      0.00        25
           9       0.00      0.00      0.00        18

   micro avg       0.14      0.14      0.14       250
   macro avg       0.11      0.11      0.03       250
weighted avg       0.10      0.14      0.04       250

Accuracy:  0.14
Tree at size:  1250
              precision    recall  f1-score   support

           0       0.83      0.68      0.75        22
           1       0.86      0.95      0.90        20
           2       0.65

  'precision', 'predicted', average, warn_for)


KeyboardInterrupt: 

In [None]:
# Test with full set

X_train, X_test, y_train, y_test = train_test_split(train_set, label_set, test_size=0.2, random_state=21)
# Fit all models
svm_model.fit(X_train, y_train)
tree_model.fit(X_train, y_train)
lr_model.fit(X_train, y_train)
# Predict 
svm_pred = svm_model.predict(X_test)
tree_pred = tree_model.predict(X_test)
lr_pred = lr_model.predict(X_test)
# Save scores
svm_scores.append(accuracy_score(svm_pred, y_test))
tree_scores.append(accuracy_score(tree_pred, y_test))
lr_scores.append(accuracy_score(lr_pred, y_test))
print("SVM at size: ", size)
print(classification_report(svm_pred, y_test))
print("Tree at size: ", size)
print(classification_report(tree_pred, y_test))
print("Logistic Regression at size: ", size)
print(classification_report(lr_pred, y_test))