In [56]:
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn import svm
import time 

In [57]:
start=time.time()
import urllib.request
urllib.request.urlretrieve ("https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz", "a.tar.gz")
import tarfile
tar = tarfile.open("a.tar.gz")
tar.extractall()
tar.close()

# Loading the data from the dataset into the train_x and test_x

In [58]:
import numpy as np
import matplotlib.pyplot as plt
import pickle


def unpickle(file):
 '''Load byte data from file'''
 with open(file, 'rb') as f:
  data = pickle.load(f, encoding='latin-1')
  return data


def load_cifar10_data(data_dir):
 '''Return train_data, train_labels, test_data, test_labels
 The shape of data is 32 x 32 x3'''
 train_data = None
 train_labels = []

 for i in range(1, 6):
  data_dic = unpickle(data_dir + "/data_batch_{}".format(i))
  if i == 1:
   train_data = data_dic['data']
  else:
   train_data = np.vstack((train_data, data_dic['data']))
  train_labels += data_dic['labels']

 test_data_dic = unpickle(data_dir + "/test_batch")
 test_data = test_data_dic['data']
 test_labels = test_data_dic['labels']

 train_data = train_data.reshape((len(train_data), 3, 32, 32))
 train_data = np.rollaxis(train_data, 1, 4)
 train_labels = np.array(train_labels)

 test_data = test_data.reshape((len(test_data), 3, 32, 32))
 test_data = np.rollaxis(test_data, 1, 4)
 test_labels = np.array(test_labels)

 return train_data, train_labels, test_data, test_labels

data_dir = 'cifar-10-batches-py'

train_data, train_labels, test_data, test_labels = load_cifar10_data(data_dir)

print(train_data.shape)
print(train_labels.shape)

print(test_data.shape)
print(test_labels.shape)

(50000, 32, 32, 3)
(50000,)
(10000, 32, 32, 3)
(10000,)


In [None]:
print(train_data[0])

# Reshaping the data into a 2D numpy array of size 5000 X 3072

In [61]:
x_train = train_data.reshape(train_data.shape[0],-1)
x_test = test_data.reshape(test_data.shape[0], -1)

In [62]:
print(x_train.shape)
print(x_test.shape)

(50000, 3072)
(10000, 3072)


In [63]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train_scaled = sc.fit_transform(x_train)

In [64]:
x_test_scaled = sc.transform(x_test)

In [65]:
y_train = train_labels
y_test = test_labels

In [66]:
print(y_train.shape)
print(y_test.shape)

(50000,)
(10000,)


# Performing PCA

In [67]:
pca = PCA()
pca.fit_transform(x_train_scaled)

array([[-2.20557381e+01,  1.22849042e+01,  8.96135095e+00, ...,
         5.39089318e-03,  5.58180986e-04, -2.60774472e-03],
       [ 4.01354905e+00, -5.04915634e+00,  2.53958923e+01, ...,
        -1.20631185e-03, -1.55110611e-03,  4.72296446e-03],
       [ 2.11123034e+01, -4.76871967e+01, -1.25735508e+01, ...,
        -1.95641086e-03, -1.98150370e-03,  1.93502441e-03],
       ...,
       [-5.79011324e+00, -4.49244141e+01, -4.24725698e+00, ...,
         5.51117587e-03, -5.06192238e-03,  1.70515002e-03],
       [ 4.23917856e+01, -1.65511813e+01,  2.22660304e+01, ...,
        -1.11556537e-03, -1.43552607e-03,  6.52224660e-03],
       [ 1.30753871e+01, -3.19221852e+00, -1.73174907e+01, ...,
        -2.31232834e-03,  2.07327476e-03,  1.24354774e-03]])

# Calculating amount of Features to retain


In [68]:
# Calculating optimal k to have 0.95 variance

k = 0
total = sum(pca.explained_variance_)
current_sum = 0

while(current_sum / total < 0.99):
    current_sum += pca.explained_variance_[k]
    k += 1
k

662

In [71]:
pca_cifar = PCA(n_components=k,whiten=True)
x_train_pca_cifar = pca_cifar.fit_transform(x_train_scaled)
x_test_pca_cifar = pca_cifar.transform(x_test_scaled)

# Applying Logistic Regression for the Prediction model

In [73]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [74]:
lr.fit(x_train_pca_cifar, y_train)
y_pred_lr = lr.predict(x_test_pca_cifar)
logistic_regression_score = accuracy_score(y_test, y_pred_lr)
logistic_regression_score

0.4

# Applying Support Vector Machines for the Prediction Model 

In [None]:
svc = svm.SVC(C= 100, gamma=0.005)
svc.fit(x_train_pca_cifar,y_train)
y_pred_svm = svc.predict(x_test_pca_cifar)
svc_score = accuracy_score(y_test, y_pred_svm)
svc_score
print("time is :",(time.time()-start)/60," mins")

In [None]:
svc_score