<a href="https://colab.research.google.com/github/KungFuPanda22/CNN_Devanagiri/blob/master/Dataset_Visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries
These are all the libraries we need to import.

In [0]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import os
import sys
import tarfile
from six.moves import cPickle as pickle
import pandas as pd
import seaborn as sns
import time


In [0]:
from sklearn import decomposition
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler 


# Data Storage

This section reads the Devanagari dataset, stores it in a NumPy array, shuffles it, and then standardizes it.

In [0]:
# Extracting Devanagari data

num_classes = 18  # number of class sub-folders the archive is expected to contain
np.random.seed(133)  # seed NumPy's global RNG so later shuffles are repeatable

def maybe_extract(filename, force=False):
  """Extract a .tar.gz archive if needed and return its class folders.

  The archive is extracted into the current working directory. Extraction is
  skipped when the directory named after the archive (path without
  ``.tar.gz``) already exists, unless ``force`` is true.

  Args:
    filename: path of the ``.tar.gz`` archive.
    force: when true, extract even if the target directory already exists.

  Returns:
    Sorted list of paths of the sub-directories found directly under the
    extracted root directory.

  Raises:
    Exception: if the number of sub-directories differs from ``num_classes``.
  """
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
  if os.path.isdir(root) and not force:
    print('%s already present - Skipping extraction of %s.' % (root, filename))
  else:
    print('Extracting data for %s. This may take a while. Please wait.' % root)
    # Flush first so the message is visible before the slow extraction starts.
    sys.stdout.flush()
    # The context manager closes the archive even if extraction fails.
    # NOTE: extractall() trusts member paths; only use it on trusted archives.
    with tarfile.open(filename) as tar:
      tar.extractall()
  data_folders = [
    os.path.join(root, d) for d in sorted(os.listdir(root))
    if os.path.isdir(os.path.join(root, d))]
  if len(data_folders) != num_classes:
    raise Exception(
      'Expected %d folders, one per class. Found %d instead.' % (
        num_classes, len(data_folders)))
  print(data_folders)
  return data_folders
  
# Extract the training archive (if not already extracted) and collect the class folder paths.
train_folders = maybe_extract('Train.tar.gz')

In [0]:
# Image geometry shared by the loading helpers in this notebook.
image_size = 32  # Pixel width and height.
pixel_depth = 255.0  # Number of levels per pixel.

# Storing data in a NumPy array

def load_letter(folder, min_num_images):
  """Read every image file in `folder` into a single float32 array.

  Each image is converted to float and divided by `pixel_depth`. Files that
  raise IOError while being read are reported and skipped. An image whose
  shape is not (image_size, image_size) raises an Exception that is NOT
  caught here, so it aborts the whole load.

  Args:
    folder: directory containing the image files of one class.
    min_num_images: minimum number of successfully read images required.

  Returns:
    float32 array of shape (num_loaded, image_size, image_size).

  Raises:
    Exception: on an unexpected image shape, or when fewer than
      `min_num_images` images were read.
  """
  file_names = os.listdir(folder)
  expected_shape = (image_size, image_size)
  # Allocate for the best case; trimmed below to the number actually read.
  dataset = np.ndarray(shape=(len(file_names),) + expected_shape,
                       dtype=np.float32)
  print(folder)
  loaded = 0
  for name in file_names:
    path = os.path.join(folder, name)
    try:
      # NOTE(review): the second positional argument of plt.imread is `format`;
      # the 0 looks like a cv2.imread-style grayscale flag - confirm intent.
      pixels = plt.imread(path, 0).astype(float) / pixel_depth
      if pixels.shape != expected_shape:
        raise Exception('Unexpected image shape: %s' % str(pixels.shape))
      dataset[loaded, :, :] = pixels
      loaded += 1
    except IOError as e:
      print('Could not read:', path, ':', e, '- it\'s ok, skipping.')

  # Drop the unused tail left by skipped files.
  dataset = dataset[:loaded, :, :]
  if loaded < min_num_images:
    raise Exception('Many fewer images than expected: %d < %d' %
                    (loaded, min_num_images))

  print('Full dataset tensor:', dataset.shape)
  print('Mean:', np.mean(dataset))
  print('Standard deviation:', np.std(dataset))
  return dataset
        
def maybe_pickle(data_folders, min_num_images_per_class, force=False):
  """Pickle the image array of every class folder, reusing existing pickles.

  For each folder the target file is `<folder>.pickle`. When that file already
  exists and `force` is false, the folder is skipped. Otherwise the folder is
  loaded with `load_letter` and the array is dumped to the target file. A
  failure while writing is printed and the loop continues; the file name is
  still included in the returned list.

  Args:
    data_folders: iterable of class folder paths.
    min_num_images_per_class: forwarded to `load_letter` as its minimum count.
    force: when true, rebuild pickles even if they already exist.

  Returns:
    List of pickle file names, one per folder, in input order.
  """
  dataset_names = []
  for folder in data_folders:
    set_filename = folder + '.pickle'
    dataset_names.append(set_filename)

    already_there = os.path.exists(set_filename)
    if already_there and not force:
      print('%s already present - Skipping pickling.' % set_filename)
      continue

    print('Pickling %s.' % set_filename)
    dataset = load_letter(folder, min_num_images_per_class)
    try:
      with open(set_filename, 'wb') as f:
        pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
    except Exception as e:
      print('Unable to save data to', set_filename, ':', e)

  return dataset_names

# Pickle every class folder; loading a class fails if fewer than 1700 images could be read.
train_datasets = maybe_pickle(train_folders, 1700)

In [0]:
def make_arrays(nb_rows, img_size):
  """Allocate uninitialised storage for `nb_rows` square images and labels.

  Args:
    nb_rows: number of examples; a falsy value (0 or None) means "no arrays".
    img_size: width and height of each image in pixels.

  Returns:
    (dataset, labels): a float32 array of shape (nb_rows, img_size, img_size)
    and an int32 array of shape (nb_rows,), or (None, None) when `nb_rows`
    is falsy. The array contents are uninitialised.
  """
  if not nb_rows:
    return None, None
  images = np.empty((nb_rows, img_size, img_size), dtype=np.float32)
  targets = np.empty(nb_rows, dtype=np.int32)
  return images, targets

def merge_datasets(pickle_files, train_size, valid_size=0):
  """Combine per-class pickles into validation and training arrays.

  Every class contributes `valid_size // n` validation rows and
  `train_size // n` training rows, where `n` is the number of pickle files.
  The label of a class is its index in `pickle_files`. Each class array is
  shuffled in place before it is split.

  Note: the output arrays come from `make_arrays` and are uninitialised, so
  rows beyond `n * (size // n)` keep arbitrary values when a size is not a
  multiple of `n`.

  Args:
    pickle_files: list of pickle file paths, one per class.
    train_size: total number of training rows to allocate.
    valid_size: total number of validation rows to allocate (0 for none).

  Returns:
    (valid_dataset, valid_labels, train_dataset, train_labels); the two
    validation entries are None when `valid_size` is 0.

  Raises:
    Exception: any error while loading or copying a class is printed and
      re-raised.
  """
  num_classes = len(pickle_files)
  valid_dataset, valid_labels = make_arrays(valid_size, image_size)
  train_dataset, train_labels = make_arrays(train_size, image_size)
  vsize_per_class = valid_size // num_classes
  tsize_per_class = train_size // num_classes
  # Rows [vsize_per_class, end_l) of each shuffled class go to the training set.
  end_l = vsize_per_class + tsize_per_class

  for label, pickle_file in enumerate(pickle_files):
    # Each class owns one contiguous slice of the output arrays.
    start_v = label * vsize_per_class
    end_v = start_v + vsize_per_class
    start_t = label * tsize_per_class
    end_t = start_t + tsize_per_class
    try:
      with open(pickle_file, 'rb') as f:
        letter_set = pickle.load(f)
      # let's shuffle the letters to have random validation and training set
      np.random.shuffle(letter_set)
      if valid_dataset is not None:
        valid_dataset[start_v:end_v, :, :] = letter_set[:vsize_per_class, :, :]
        valid_labels[start_v:end_v] = label

      train_dataset[start_t:end_t, :, :] = letter_set[vsize_per_class:end_l, :, :]
      train_labels[start_t:end_t] = label
    except Exception as e:
      print('Unable to process data from', pickle_file, ':', e)
      raise

  return valid_dataset, valid_labels, train_dataset, train_labels

# Build one training set of 30600 rows; valid_size is 0, so both validation outputs are None.
valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(train_datasets, 30600,0)




In [0]:
# Randomize the dataset

def randomize(dataset, labels):
  """Shuffle images and labels with one shared random permutation.

  Args:
    dataset: 3-D array indexed as (example, row, column).
    labels: 1-D array with one label per example.

  Returns:
    (shuffled_dataset, shuffled_labels): copies reordered by the same
    permutation, drawn from NumPy's global RNG.
  """
  order = np.random.permutation(labels.shape[0])
  return dataset[order, :, :], labels[order]
# Shuffle the merged training set; images and labels are reordered with the same permutation.
train_dataset, train_labels = randomize(train_dataset, train_labels)

In [0]:
# Flatten every image into one feature row. -1 lets NumPy infer the row length
# from the image size instead of relying on a hard-coded 1024.
train_dataset = train_dataset.reshape(train_dataset.shape[0], -1)
# Standardize each pixel feature with a scaler fitted on the training data.
sc = StandardScaler()
train_dataset = sc.fit_transform(train_dataset)


In [0]:
# Sanity check: show the shape of the flattened, standardized training matrix.
train_dataset.shape

# Data Visualization - Principal Component Analysis

First, we apply Principal Component Analysis (PCA) to the dataset. Each example has 1024 features. In the first step, we use PCA to reduce the data to two components and draw a 2D plot. In the second step, we reduce the data to three components and draw a 3D plot.

In [0]:
# Linear dimensionality reduction: keep the first two principal components.
pca_2 = decomposition.PCA(n_components=2).fit(train_dataset)
pca_results_2 = pca_2.transform(train_dataset)

# Cumulative fraction of variance explained by the retained components.
np.cumsum(pca_2.explained_variance_ratio_)

In [0]:
# 2D PCA plot of the different classes (each color denotes a different class)

plt.figure(figsize=[15,15])
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in newer
# Matplotlib releases; one discrete color per class.
plt.scatter(pca_results_2[:, 0], pca_results_2[:, 1],
            c=train_labels, edgecolor='none', alpha=0.5,
            cmap=plt.get_cmap('nipy_spectral', num_classes), s = 7)
plt.title('PCA - first two components')
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.colorbar();

In [0]:
# Linear dimensionality reduction: keep the first three principal components.
pca_3 = decomposition.PCA(n_components=3).fit(train_dataset)
pca_results_3 = pca_3.transform(train_dataset)

# Cumulative fraction of variance explained by the retained components.
np.cumsum(pca_3.explained_variance_ratio_)

In [0]:
# 3D PCA plot of the different classes (each color denotes a different class)

fig = plt.figure(figsize = [15,15])
ax = fig.add_subplot(111, projection='3d')
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in newer
# Matplotlib releases; one discrete color per class.
ax.scatter(xs=pca_results_3[:, 0], ys= pca_results_3[:, 1], zs=pca_results_3[:, 2], zdir='z',
           s=7,cmap=plt.get_cmap('nipy_spectral', num_classes), c=train_labels, depthshade=True)
ax.set_title('PCA - first three components')
ax.set_xlabel('component 1')
ax.set_ylabel('component 2')
ax.set_zlabel('component 3');


# Data Visualization - t-distributed Stochastic Neighbor Embedding

Now we apply t-distributed stochastic neighbor embedding (t-SNE) to the dataset. Each example has 1024 features. In the first step, we use t-SNE to reduce the data to two components and draw a 2D plot. In the second step, we reduce the data to three components and draw a 3D plot.

In [0]:
# Embed the standardized data into two dimensions with t-SNE and time the run.
time_start = time.time()
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` - confirm against the installed version.
tsne_2 = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
# Fit the estimator created above (the original called an undefined name `tsne`).
tsne_results_2 = tsne_2.fit_transform(train_dataset)

print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [0]:
# 2D t-SNE plot of the different classes (each color denotes a different class)

plt.figure(figsize=[15,15])
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in newer
# Matplotlib releases; one discrete color per class.
plt.scatter(tsne_results_2[:, 0], tsne_results_2[:, 1],
            c=train_labels, edgecolor='none', alpha=0.5,
            cmap=plt.get_cmap('nipy_spectral', num_classes))
plt.title('t-SNE - two components')
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.colorbar();

In [0]:
# Embed the standardized data into three dimensions with t-SNE and time the run.
time_start = time.time()
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` - confirm against the installed version.
tsne_3 = TSNE(n_components=3, verbose=1, perplexity=40, n_iter=300)
# Fit the estimator created above (the original called an undefined name `tsne`).
tsne_results_3 = tsne_3.fit_transform(train_dataset)

print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [0]:
# 3D t-SNE plot of the different classes (each color denotes a different class)

fig = plt.figure(figsize = [15,15])
ax = fig.add_subplot(111, projection='3d')
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in newer
# Matplotlib releases; one discrete color per class.
ax.scatter(xs=tsne_results_3[:, 0], ys= tsne_results_3[:, 1], zs=tsne_results_3[:, 2], zdir='z',
           s=7,cmap=plt.get_cmap('nipy_spectral', num_classes), c=train_labels, depthshade=True)
ax.set_title('t-SNE - three components')
ax.set_xlabel('component 1')
ax.set_ylabel('component 2')
ax.set_zlabel('component 3');


# Data Visualization - PCA + t-SNE

Here, we first use PCA to reduce the data to 75 components, and then use t-SNE to reduce those 75 components to 2 and 3 components respectively.

In [0]:
# Linear dimensionality reduction: keep the first 75 principal components.
# The original assigned the model to `pca_74` but then used `pca_75`, and
# projected with the 2-component model `pca_2`; all three now use `pca_75`.
pca_75 = decomposition.PCA(n_components = 75)

pca_75.fit(train_dataset)

pca_results_75 = pca_75.transform(train_dataset)

# Cumulative fraction of variance explained by the retained components.
pca_75.explained_variance_ratio_.cumsum()

In [0]:
# Embed the 75-component PCA projection into two dimensions with t-SNE and time the run.
time_start = time.time()
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` - confirm against the installed version.
tsne_75_2 = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
# Fit the estimator created above (the original called an undefined name `tsne`).
tsne_results_75_2 = tsne_75_2.fit_transform(pca_results_75)

print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [0]:
# 2D PCA + t-SNE plot of the different classes (each color denotes a different class)

plt.figure(figsize=[15,15])
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in newer
# Matplotlib releases; one discrete color per class.
plt.scatter(tsne_results_75_2[:, 0], tsne_results_75_2[:, 1],
            c=train_labels, edgecolor='none', alpha=0.5,
            cmap=plt.get_cmap('nipy_spectral', num_classes))
plt.title('PCA (75 components) + t-SNE - two components')
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.colorbar();

In [0]:
# Embed the 75-component PCA projection into three dimensions with t-SNE and time the run.
time_start = time.time()
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` - confirm against the installed version.
tsne_75_3 = TSNE(n_components=3, verbose=1, perplexity=40, n_iter=300)
# Fit the estimator created above (the original called an undefined name `tsne`).
tsne_results_75_3 = tsne_75_3.fit_transform(pca_results_75)

print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [0]:
# 3D PCA + t-SNE plot of the different classes (each color denotes a different class)

fig = plt.figure(figsize = [15,15])
ax = fig.add_subplot(111, projection='3d')
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in newer
# Matplotlib releases; one discrete color per class.
ax.scatter(xs=tsne_results_75_3[:, 0], ys= tsne_results_75_3[:, 1], zs=tsne_results_75_3[:, 2], zdir='z',
           s=7,cmap=plt.get_cmap('nipy_spectral', num_classes), c=train_labels, depthshade=True)
ax.set_title('PCA (75 components) + t-SNE - three components')
ax.set_xlabel('component 1')
ax.set_ylabel('component 2')
ax.set_zlabel('component 3');