In [10]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sklearn
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import datasets
from sklearn import metrics
from sklearn import model_selection
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm
from sklearn.utils import shuffle

In [11]:
# Data loading: fetch the pre-vectorized 20 Newsgroups train and test splits.
trainData = datasets.fetch_20newsgroups_vectorized(subset='train')
testData = datasets.fetch_20newsgroups_vectorized(subset='test')

X_train, y_train = trainData['data'], trainData['target']
X_test, y_test = testData['data'], testData['target']

# Reduced training sample used by the cross-validation grid searches in the
# later cells: shuffle features and labels together with a fixed seed, then
# keep the first 999 rows.
X_train_short, y_train_short = shuffle(X_train, y_train, random_state=1)
X_train_short = X_train_short[:999]
y_train_short = y_train_short[:999]

print(X_train_short.shape)

(999, 130107)


In [12]:
## Linear kernel

# Baseline: SVC with a linear kernel and default hyperparameters, fitted on
# the full training set and scored on both the training and the test split.
linearDefault = svm.SVC(kernel='linear')
linearDefault.fit(X_train, y_train)

linearDefaultYhat_train = linearDefault.predict(X_train)
linearDefaultYhat_test = linearDefault.predict(X_test)

linearDefaultACS_train = sklearn.metrics.accuracy_score(y_train, linearDefaultYhat_train)
linearDefaultACS_test = sklearn.metrics.accuracy_score(y_test, linearDefaultYhat_test)

print('linearDefaultACS_train: ', linearDefaultACS_train)
print('linearDefaultACS_test: ', linearDefaultACS_test)


## Cross validation
# Only C is searched. SVC uses gamma for the 'rbf', 'poly' and 'sigmoid'
# kernels only, so sweeping gamma with a linear kernel would repeat the same
# 5-fold cross validation several times per C without changing any score.
C = np.logspace(-2, 10, 5)

combination = []   # hyperparameter sets that were scored successfully
cvResults = []     # mean 5-fold accuracy, aligned with `combination`

for c in C:
    model = svm.SVC(kernel='linear', C=c)
    try:
        foldScores = model_selection.cross_val_score(model,
                                                     X_train_short,
                                                     y_train_short,
                                                     cv=5,
                                                     scoring='accuracy')
    except ValueError:
        print("Combination [", [c], "] does not wokr!")
        continue

    meanAccuracy = np.mean(foldScores)
    if not np.isfinite(meanAccuracy):
        # Depending on the scikit-learn version, a failed fit may be scored
        # as NaN instead of raising. np.argmax would then select the NaN
        # entry, so such a combination is treated as failed.
        print("Combination [", [c], "] does not wokr!")
        continue

    combination.append([c])
    cvResults.append(meanAccuracy)

if not cvResults:
    raise ValueError('Cross validation failed for every hyperparameter combination.')

indexBest = np.argmax(cvResults)
print('Best set of hyperparameters is: ', combination[indexBest])


## Linear kernel best CV
# Refit on the full training set with the C selected on the reduced sample.
linearCV = svm.SVC(kernel='linear',
                   C=combination[indexBest][0])

linearCV.fit(X_train, y_train)

linearCVYhat_train = linearCV.predict(X_train)
linearCVYhat_test = linearCV.predict(X_test)

linearCVACS_train = sklearn.metrics.accuracy_score(y_train, linearCVYhat_train)
linearCVACS_test = sklearn.metrics.accuracy_score(y_test, linearCVYhat_test)

print('Best model with CV selected hyperparameters is: ')
print('linearCV_ACS_train: ', linearCVACS_train)
print('linearCV_ACS_test: ', linearCVACS_test)








linearDefaultACS_train:  0.961463673325084
linearDefaultACS_test:  0.7591609134360063
Best set of hyperparameters is:  [10.0, 1e-09]
Best model with CV selected hyperparameters is: 
linearCV_ACS_train:  0.9989393671557363
linearCV_ACS_test:  0.781465746149761


In [13]:
## Polynomial kernel

# Baseline: SVC with a polynomial kernel and default hyperparameters, fitted
# on the full training set and scored on both the training and the test split.
polyDefault = svm.SVC(kernel='poly')
polyDefault.fit(X_train, y_train)

polyDefaultYhat_train = polyDefault.predict(X_train)
polyDefaultYhat_test = polyDefault.predict(X_test)

polyDefaultACS_train = sklearn.metrics.accuracy_score(y_train, polyDefaultYhat_train)
polyDefaultACS_test = sklearn.metrics.accuracy_score(y_test, polyDefaultYhat_test)

print('polyDefaultACS_train: ', polyDefaultACS_train)
print('polyDefaultACS_test: ', polyDefaultACS_test)


## Cross validation
# Exhaustive grid over (degree, C, gamma), scored by mean 5-fold accuracy on
# the reduced training sample.
degree = [1, 2, 3, 4]
C = np.logspace(-2, 10, 5)
gamma = np.logspace(-9, 3, 5)

combination = []   # hyperparameter sets that were scored successfully
cvResults = []     # mean 5-fold accuracy, aligned with `combination`

# Same visiting order as nested loops: degree outermost, gamma innermost.
for d, c, g in ((d, c, g) for d in degree for c in C for g in gamma):
    model = svm.SVC(kernel='poly', degree=d, C=c, gamma=g)
    try:
        foldScores = model_selection.cross_val_score(model,
                                                     X_train_short,
                                                     y_train_short,
                                                     cv=5,
                                                     scoring='accuracy')
    except ValueError:
        print("Combination [", [d, c, g], "] does not wokr!")
        continue

    meanAccuracy = np.mean(foldScores)
    if not np.isfinite(meanAccuracy):
        # Depending on the scikit-learn version, a failed fit may be scored
        # as NaN instead of raising. np.argmax would then select the NaN
        # entry, so such a combination is treated as failed.
        print("Combination [", [d, c, g], "] does not wokr!")
        continue

    combination.append([d, c, g])
    cvResults.append(meanAccuracy)

if not cvResults:
    raise ValueError('Cross validation failed for every hyperparameter combination.')

indexBest = np.argmax(cvResults)
print('Best set of hyperparameters is: ', combination[indexBest])


## Polynomial kernel best CV
# Refit on the full training set with the hyperparameters selected above.
bestDegree, bestC, bestGamma = combination[indexBest]
polyCV = svm.SVC(kernel='poly',
                 degree=bestDegree,
                 C=bestC,
                 gamma=bestGamma)

polyCV.fit(X_train, y_train)

polyCVYhat_train = polyCV.predict(X_train)
polyCVYhat_test = polyCV.predict(X_test)

polyCVACS_train = sklearn.metrics.accuracy_score(y_train, polyCVYhat_train)
polyCVACS_test = sklearn.metrics.accuracy_score(y_test, polyCVYhat_test)

print('Best model with CV selected hyperparameters has: ')
print('polyCV_ACS_train: ', polyCVACS_train)
print('polyCV_ACS_test: ', polyCVACS_test)








polyDefaultACS_train:  0.9949619939897472
polyDefaultACS_test:  0.6990175252257037
Best set of hyperparameters is:  [1, 0.01, 1000.0]
Best model with CV selected hyperparameters has: 
polyCV_ACS_train:  0.9989393671557363
polyCV_ACS_test:  0.781465746149761


In [14]:


# Scratch cell: prints a fixed marker string and has no other effect.
print("test")

test


In [15]:
## RBF kernel

# Baseline: SVC with an RBF kernel and default hyperparameters, fitted on the
# full training set and scored on both the training and the test split.
rbfDefault = svm.SVC(kernel='rbf')
rbfDefault.fit(X_train, y_train)

rbfDefaultYhat_train = rbfDefault.predict(X_train)
rbfDefaultYhat_test = rbfDefault.predict(X_test)

rbfDefaultACS_train = sklearn.metrics.accuracy_score(y_train, rbfDefaultYhat_train)
rbfDefaultACS_test = sklearn.metrics.accuracy_score(y_test, rbfDefaultYhat_test)

print('rbfDefaultACS_train: ', rbfDefaultACS_train)
print('rbfDefaultACS_test: ', rbfDefaultACS_test)


## Cross validation
# Exhaustive grid over (C, gamma), scored by mean 5-fold accuracy on the
# reduced training sample.
C = np.logspace(-2, 10, 5)
gamma = np.logspace(-9, 3, 5)

combination = []   # hyperparameter sets that were scored successfully
cvResults = []     # mean 5-fold accuracy, aligned with `combination`

for c in C:
    for g in gamma:
        model = svm.SVC(kernel='rbf', C=c, gamma=g)
        try:
            foldScores = model_selection.cross_val_score(model,
                                                         X_train_short,
                                                         y_train_short,
                                                         cv=5,
                                                         scoring='accuracy')
        except ValueError:
            print("Combination [", [c, g], "] does not wokr!")
            continue

        meanAccuracy = np.mean(foldScores)
        if not np.isfinite(meanAccuracy):
            # Depending on the scikit-learn version, a failed fit may be
            # scored as NaN instead of raising. np.argmax would then select
            # the NaN entry, so such a combination is treated as failed.
            print("Combination [", [c, g], "] does not wokr!")
            continue

        combination.append([c, g])
        cvResults.append(meanAccuracy)

if not cvResults:
    raise ValueError('Cross validation failed for every hyperparameter combination.')

indexBest = np.argmax(cvResults)
print('Best set of hyperparameters is: ', combination[indexBest])


## RBF kernel best CV
# Refit on the full training set with the hyperparameters selected above.
bestC, bestGamma = combination[indexBest]
rbfCV = svm.SVC(kernel='rbf',
                C=bestC,
                gamma=bestGamma)

rbfCV.fit(X_train, y_train)

rbfCVYhat_train = rbfCV.predict(X_train)
rbfCVYhat_test = rbfCV.predict(X_test)

rbfCVACS_train = sklearn.metrics.accuracy_score(y_train, rbfCVYhat_train)
rbfCVACS_test = sklearn.metrics.accuracy_score(y_test, rbfCVYhat_test)

print('Best model with CV selected hyperparameters has: ')
print('rbfCV_ACS_train: ', rbfCVACS_train)
print('rbfCV_ACS_test: ', rbfCVACS_test)






rbfDefaultACS_train:  0.9865653173059926
rbfDefaultACS_test:  0.7446893255443441
Best set of hyperparameters is:  [10000.0, 0.001]
Best model with CV selected hyperparameters has: 
rbfCV_ACS_train:  0.9994696835778681
rbfCV_ACS_test:  0.7798725438130643


In [16]:
## Sigmoid kernel

# Baseline: SVC with a sigmoid kernel and default hyperparameters, fitted on
# the full training set and scored on both the training and the test split.
sigmoidDefault = svm.SVC(kernel='sigmoid')
sigmoidDefault.fit(X_train, y_train)

sigmoidDefaultYhat_train = sigmoidDefault.predict(X_train)
sigmoidDefaultYhat_test = sigmoidDefault.predict(X_test)

sigmoidDefaultACS_train = sklearn.metrics.accuracy_score(y_train, sigmoidDefaultYhat_train)
sigmoidDefaultACS_test = sklearn.metrics.accuracy_score(y_test, sigmoidDefaultYhat_test)

print('sigmoidDefaultACS_train: ', sigmoidDefaultACS_train)
print('sigmoidDefaultACS_test: ', sigmoidDefaultACS_test)


## Cross validation
# Exhaustive grid over (C, gamma), scored by mean 5-fold accuracy on the
# reduced training sample.
C = np.logspace(-2, 10, 5)
gamma = np.logspace(-9, 3, 5)

combination = []   # hyperparameter sets that were scored successfully
cvResults = []     # mean 5-fold accuracy, aligned with `combination`

for c in C:
    for g in gamma:
        model = svm.SVC(kernel='sigmoid', C=c, gamma=g)
        try:
            foldScores = model_selection.cross_val_score(model,
                                                         X_train_short,
                                                         y_train_short,
                                                         cv=5,
                                                         scoring='accuracy')
        except ValueError:
            print("Combination [", [c, g], "] does not wokr!")
            continue

        meanAccuracy = np.mean(foldScores)
        if not np.isfinite(meanAccuracy):
            # Depending on the scikit-learn version, a failed fit may be
            # scored as NaN instead of raising. np.argmax would then select
            # the NaN entry, so such a combination is treated as failed.
            print("Combination [", [c, g], "] does not wokr!")
            continue

        combination.append([c, g])
        cvResults.append(meanAccuracy)

if not cvResults:
    raise ValueError('Cross validation failed for every hyperparameter combination.')

indexBest = np.argmax(cvResults)
print('Best set of hyperparameters is: ', combination[indexBest])


## Sigmoid kernel best CV
# Refit on the full training set with the hyperparameters selected above.
bestC, bestGamma = combination[indexBest]
sigmoidCV = svm.SVC(kernel='sigmoid',
                    C=bestC,
                    gamma=bestGamma)

sigmoidCV.fit(X_train, y_train)

sigmoidCVYhat_train = sigmoidCV.predict(X_train)
sigmoidCVYhat_test = sigmoidCV.predict(X_test)

sigmoidCVACS_train = sklearn.metrics.accuracy_score(y_train, sigmoidCVYhat_train)
sigmoidCVACS_test = sklearn.metrics.accuracy_score(y_test, sigmoidCVYhat_test)

print('Best model with CV selected hyperparameters has: ')
print('sigmoidCV_ACS_train: ', sigmoidCVACS_train)
print('sigmoidCV_ACS_test: ', sigmoidCVACS_test)



sigmoidDefaultACS_train:  0.8934063991514938
sigmoidDefaultACS_test:  0.7126925119490175
Best set of hyperparameters is:  [10000.0, 0.001]
Best model with CV selected hyperparameters has: 
sigmoidCV_ACS_train:  0.9989393671557363
sigmoidCV_ACS_test:  0.781465746149761
