In [2]:
import numpy as np
import pandas as pd
import pickle
import time
import matplotlib.pylab as plt

from sklearn.model_selection import train_test_split, ShuffleSplit, GridSearchCV,\
                                    StratifiedKFold, KFold,\
                                    StratifiedShuffleSplit, RepeatedStratifiedKFold, \
                                    cross_val_score, cross_validate
                                    
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import check_random_state
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.multiclass import OneVsRestClassifier
from sklearn.decomposition import PCA

from mne.decoding import cross_val_multiscore, LinearModel, \
                         GeneralizingEstimator, Scaler, Vectorizer

import cvxpy as cp

In [3]:
class arguments:
    """Minimal settings holder for this notebook.

    Each instance carries a single attribute, ``DATAPATH``: the directory
    prefix that is prepended to the CSV file names when the data is read.
    """

    def __init__(self):
        data_dir = '../data/'
        self.DATAPATH = data_dir

# Settings object that carries the data directory prefix.
args = arguments()

# Both CSV files sit directly under args.DATAPATH.
train_csv = args.DATAPATH + 'studentspen-train.csv'
test_csv = args.DATAPATH + 'studentsdigits-test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

# Feature matrix: the eight coordinate columns x3..y6; target: the 'Digit' column.
feature_cols = ['x3', 'y3', 'x4', 'y4', 'x5', 'y5', 'x6', 'y6']
X = train[feature_cols].values
Y = train['Digit'].values

# Show the shapes of the feature matrix and the target vector, one per line.
for arr in (X, Y):
    print(arr.shape)

(3747, 8)
(3747,)


# Create classifiers

In [4]:
# Seed handed to the CV splitters and the SVM estimators as `random_state`.
rand_state = 45
# Iteration cap handed to the SVM estimators as `max_iter` (one million).
max_iter = 10 ** 6

In [5]:
# Number of stratified shuffle splits per model family:
# 10 for the linear SVM, 6 for the polynomial one.
n_split_linear = 10
n_split_poly = 6

# Two splitters that share the seed and differ only in the number of splits.
cv1, cv2 = [
    StratifiedShuffleSplit(n_splits=k, random_state=rand_state)
    for k in (n_split_linear, n_split_poly)
]

In [6]:
# Label encoder instance.
# NOTE(review): `le` is not referenced by any other cell visible in this notebook -- confirm whether it is still needed.
le = LabelEncoder()

# Question 1 and 2

Question 1) Train a linear multi-class classification SVM with no kernel. Specify (i) your mapping function and (ii) your loss function. (20 points)

Question 2) Describe a method to estimate your performance using an empirical method. Compare this estimate with a well known theoretical bound. Explain why/if there is a difference. (5 points)

# Finalized classifiers
## Applying the finalized classifiers to the actual test data and saving the predictions

In [10]:
# `arguments`, `args`, `train` and `test` were already created by the data-loading
# cell near the top of the notebook.  Re-declaring the identical class here only
# shadowed the first definition, and re-reading the same two CSV files repeated
# disk I/O for no new information, so this cell reuses the existing frames.
feature_cols = ['x3', 'y3', 'x4', 'y4', 'x5', 'y5', 'x6', 'y6']

# Training features / labels and the unlabeled test features; the same eight
# coordinate columns are selected from both frames.
X_train = train[feature_cols].values
y_train = train['Digit'].values

X_test = test[feature_cols].values


print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

(3747, 8)
(3747,)
(3747, 8)


In [11]:
#---------------------------------------------------------------------------------------------------
# LinearSVC with squared hinge loss -- reported below as 'one vs rest'
#---------------------------------------------------------------------------------------------------
linear_svc = LinearSVC(
    loss='squared_hinge',
    C=8,
    class_weight='balanced',
    max_iter=max_iter,
    random_state=rand_state,
)
svm_final_1 = make_pipeline(StandardScaler(), linear_svc)

# Score on every split of cv1, then show the label, per-split scores and mean.
svm_final_1_scores = cross_val_score(svm_final_1, X, Y, cv=cv1, n_jobs=1)
for line in ('one vs rest', svm_final_1_scores, np.mean(svm_final_1_scores)):
    print(line)


#---------------------------------------------------------------------------------------------------
# SVC with a linear kernel, used directly -- reported below as 'one vs one'
#---------------------------------------------------------------------------------------------------
kernel_svc = SVC(
    kernel='linear',
    C=8,
    class_weight='balanced',
    max_iter=max_iter,
    random_state=rand_state,
)
svm_final_2 = make_pipeline(StandardScaler(), kernel_svc)
svm_final_2_scores = cross_val_score(svm_final_2, X, Y, cv=cv1, n_jobs=1)
for line in ('one vs one', svm_final_2_scores, np.mean(svm_final_2_scores)):
    print(line)

#----------------------------------
# The same SVC pipeline wrapped in OneVsRestClassifier -- reported as 'one vs rest'
#----------------------------------
svm_final_3 = OneVsRestClassifier(svm_final_2)
svm_final_3_scores = cross_val_score(svm_final_3, X, Y, cv=cv1, n_jobs=1)
for line in ('one vs rest', svm_final_3_scores, np.mean(svm_final_3_scores)):
    print(line)

one vs rest
[0.744      0.744      0.736      0.72       0.70933333 0.736
 0.73866667 0.712      0.736      0.73866667]
0.7314666666666667
one vs one
[0.86133333 0.89333333 0.86933333 0.872      0.86666667 0.89066667
 0.872      0.848      0.87733333 0.91466667]
0.8765333333333333
one vs rest
[0.71733333 0.70666667 0.70133333 0.71733333 0.69066667 0.67466667
 0.69066667 0.68       0.704      0.70133333]
0.6984


# Question 2 PAC Bound

In [12]:
def calc_perf(x1, x2):
    """Count the positions at which two label sequences agree.

    Parameters
    ----------
    x1, x2 : array-like
        Label sequences of equal length (e.g. true labels and predictions).
        Inputs are flattened before comparison.

    Returns
    -------
    corr : numpy.ndarray, shape (1,), float
        Number of positions where ``x1`` and ``x2`` are equal.
    perf : numpy.ndarray, shape (1,), float
        ``corr`` divided by the number of elements; NaN for empty input.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    x1 = np.ravel(x1)
    x2 = np.ravel(x2)
    if x1.shape != x2.shape:
        # Previously a longer x2 was silently truncated and a shorter one
        # raised IndexError part-way through the loop.
        raise ValueError(
            'x1 and x2 must have the same length, got %d and %d'
            % (x1.shape[0], x2.shape[0])
        )
    nn = x1.shape[0]
    # Keep the historical (1,)-shaped float return values so printed output
    # and downstream arithmetic are unchanged.
    corr = np.array([np.count_nonzero(x1 == x2)], dtype=float)
    perf = corr / nn if nn else np.full(1, np.nan)
    return corr, perf

def calc_err(x1, x2):
    """Count the positions at which two label sequences disagree.

    Parameters
    ----------
    x1, x2 : array-like
        Label sequences of equal length (e.g. true labels and predictions).
        Inputs are flattened before comparison.

    Returns
    -------
    incorr : numpy.ndarray, shape (1,), float
        Number of positions where ``x1`` and ``x2`` differ.
    incorr_prcng : numpy.ndarray, shape (1,), float
        ``incorr`` divided by the number of elements; NaN for empty input.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    x1 = np.ravel(x1)
    x2 = np.ravel(x2)
    if x1.shape != x2.shape:
        # Previously a longer x2 was silently truncated and a shorter one
        # raised IndexError part-way through the loop.
        raise ValueError(
            'x1 and x2 must have the same length, got %d and %d'
            % (x1.shape[0], x2.shape[0])
        )
    nn = x1.shape[0]
    # Keep the historical (1,)-shaped float return values so printed output
    # and downstream arithmetic are unchanged.
    incorr = np.array([np.count_nonzero(x1 != x2)], dtype=float)
    incorr_prcng = incorr / nn if nn else np.full(1, np.nan)
    return incorr, incorr_prcng

In [19]:
#---------------------------------------------------------------------------------------------------
# Training / validation error counts on one hold-out split, for One vs Rest and One vs One
#---------------------------------------------------------------------------------------------------
# Pick the last split of cv1 explicitly instead of relying on loop variables that
# leak out of a `for` loop.  cv1 was built with a fixed integer random_state, so
# every call to split() yields the same partitions and both classifiers are
# evaluated on the very same train/validation indices.
cv1_splits = list(cv1.split(X_train, y_train))
trg_train_index, trg_test_index = cv1_splits[-1]

for section_title, clf in (('one vs rest', svm_final_1), ('one vs one', svm_final_2)):
    print('train shape', trg_train_index.shape)
    print('test/validation shape', trg_test_index.shape)

    # Fit on the training part of the split only (this refits the pipeline in place).
    clf.fit(X_train[trg_train_index], y_train[trg_train_index])

    y_pred_train = clf.predict(X_train[trg_train_index])
    y_pred_test = clf.predict(X_train[trg_test_index])

    # Misclassification count and rate: training part first, validation part second.
    incorr1, incorr_prcng1 = calc_err(y_train[trg_train_index], y_pred_train)
    incorr2, incorr_prcng2 = calc_err(y_train[trg_test_index], y_pred_test)
    print('\n ' + section_title)
    print(incorr1, incorr_prcng1)
    print(incorr2, incorr_prcng2)
    print('\n')

    # Matching count and rate for the same two parts.
    corr1, perf1 = calc_perf(y_train[trg_train_index], y_pred_train)
    corr2, perf2 = calc_perf(y_train[trg_test_index], y_pred_test)
    print(corr1, perf1)
    print(corr2, perf2)
    print('\n')

train shape (3372,)
test/validation shape (375,)
train shape (3372,)
test/validation shape (375,)
train shape (3372,)
test/validation shape (375,)
train shape (3372,)
test/validation shape (375,)
train shape (3372,)
test/validation shape (375,)
train shape (3372,)
test/validation shape (375,)
train shape (3372,)
test/validation shape (375,)
train shape (3372,)
test/validation shape (375,)
train shape (3372,)
test/validation shape (375,)
train shape (3372,)
test/validation shape (375,)

 one vs rest
[905.] [0.26838671]
[98.] [0.26133333]


[2467.] [0.73161329]
[277.] [0.73866667]


train shape (3372,)
test/validation shape (375,)

 one vs one
[423.] [0.12544484]
[32.] [0.08533333]


[2949.] [0.87455516]
[343.] [0.91466667]




### One vs Rest
PAC#1 = $\frac{1}{3372}\,(905 + 17.52 - 18.36) = 0.268$ → accuracy ≈ 73.2%
### One vs One
PAC#1 = $\frac{1}{3372}\,(423 + 17.52 - 18.36) = 0.125$ → accuracy ≈ 87.5%

PAC#2 for both= max(0.00007, 59.91) = 59.9 -> 40.08 %

# Question 3
Question 3) Submit your predictions on this test set, one prediction per line, in the order given in studentsdigits-test.csv. (10 points)

In [34]:
#---------------------------------------------------------------------------------------------------
# Refit on the full training set and write the test-set predictions, one label per line
#---------------------------------------------------------------------------------------------------
SAVE_RESULT_ROOT = '../results/'


def save_predictions(preds, path):
    """Write each prediction to ``path`` as an integer, one per line.

    Parameters
    ----------
    preds : iterable
        Predicted labels; each is converted with ``int(float(label))``.
    path : str
        Destination text file; it is overwritten if it already exists.
    """
    # The context manager closes the file even if a write or conversion fails.
    with open(path, "w") as out_file:
        for label in preds:
            out_file.write(str(int(float(label))) + "\n")


#---------------------------------------------------------------------------------------------------
# One vs Rest
#---------------------------------------------------------------------------------------------------
svm_final_1.fit(X_train, y_train)
y_pred = svm_final_1.predict(X_test)

fn_str = SAVE_RESULT_ROOT + 'MaryZolfaghar_preds_Q3_OneVsRest.txt'
save_predictions(y_pred, fn_str)
print('done')

#---------------------------------------------------------------------------------------------------
# One Vs One
#---------------------------------------------------------------------------------------------------
svm_final_2.fit(X_train, y_train)
y_pred_ovo = svm_final_2.predict(X_test)

fn_str = SAVE_RESULT_ROOT + 'MaryZolfaghar_preds_Q3_OneVsOne.txt'
save_predictions(y_pred_ovo, fn_str)
print('done')

done
done


# Conclusions
1. It might be easier to use GridSearch, but faster to do it separately to have a general idea
1.1. So far, C=8 with kernel='linear' and n_splits=10 gave the best result; gamma did not affect the results.
1.2. I tried to double check that with 
2. A better validation strategy might be to run both nested and non-nested cross-validation and compare them.
3. To be able to plot the predictions, I might need to use `cross_val_predict`: https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_predict.html#sphx-glr-auto-examples-model-selection-plot-cv-predict-py

4. SVC performed much better than LinearSVC (mean CV accuracy ≈ 0.88 vs ≈ 0.73).
5. For transfer learning, I might be able to use `cross_val_predict`.