# COMS 4995_002 Deep Learning Assignment 2

The UNIs of our team

Member 1: Saahil Jain, sj2675

Member 2: Leon Stilwell, ls3223

In [10]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import glob
import sys
import tensorflow as tf
from tensorflow.contrib.data import Dataset

In [57]:
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
# NOTE(review): this call only touches NumPy; TensorFlow keeps its own random state —
# confirm a TF seed is set wherever TensorFlow randomness matters.
np.random.seed(1)

In [11]:
# helper functions
def get_img_array(path):
    """
    Read the image file at ``path`` and return its contents as a numpy array.

    :param path: location of the image file on disk
    :return: whatever ``scipy.misc.imread`` produces for that file
    """
    pixels = scipy.misc.imread(path)
    return pixels

def get_files(folder):
    """
    List the files belonging to a data folder, in sorted order.

    The glob pattern is built by appending ``'*/*'`` directly to ``folder``,
    so ``folder`` acts as a path prefix (e.g. ``'data/train'`` matches
    everything inside directories whose name starts with ``train``).

    :param folder: path prefix of the folder to scan
    :return: sorted list of matching paths (empty if nothing matches)
    """
    return sorted(glob.glob(folder + '*/*'))

def get_label(filepath, label2id):
    """
    Return the numeric id of the label embedded in a file name.

    File names are assumed to look like ``<index>_<label>.<ext>``,
    e.g. ``/path/to/file/999_frog.png``.

    :param filepath: path to a data file
    :param label2id: mapping of text labels to numeric ids
    :return: ``label2id[label]`` for the label found in the file name
    :raises IndexError: if the file name contains no underscore
    :raises SystemExit: (via ``sys.exit``) if the label is not in ``label2id``
    """
    # Imported locally because the notebook's import cell does not bring in os.
    import os

    # Let os.path strip the directory and the extension instead of splitting on
    # '/' and chopping a fixed 4 characters, so extensions of any length
    # (e.g. ".jpeg") and platform-specific separators are handled.
    stem, _ext = os.path.splitext(os.path.basename(filepath))
    label = stem.split('_')[1]
    if label in label2id:
        return label2id[label]
    sys.exit("Invalid label: " + label)

In [12]:
#functions to load data

def get_labels(folder, label2id):
    """
    Build the label vector for every file found in a data folder.

    Labels are parsed from the file names and appear in the same order in
    which ``get_files`` lists the files.

    :param folder: path to data folder
    :param label2id: mapping of text labels to numeric ids. (Eg: automobile -> 0)
    :return: numpy array holding one numeric label id per file
    """
    label_ids = [get_label(path, label2id) for path in get_files(folder)]
    return np.array(label_ids)

def one_hot(y, num_classes=10):
    """
    One-hot encode a vector of label indices.

    :param y: 1-D numpy array of integer label indices
    :param num_classes: number of rows (classes) in the encoding
    :return: array of shape (num_classes, len(y)); column j has a 1 in row
             y[j] and zeros elsewhere
    """
    n_samples = y.shape[0]
    encoded = np.zeros((num_classes, n_samples))
    # Pair each sample's column index with its label row and set that cell.
    encoded[y, np.arange(n_samples)] = 1
    return encoded

def get_label_mapping(label_file):
    """
    Read the label file and build both lookup directions.

    The file lists one label per line; a label's id is its zero-based line
    position. Surrounding whitespace is stripped from every line.

    :param label_file: path to the text file with the labels
    :return: tuple ``(id2label, label2id)`` - a list indexed by id and a dict
             keyed by label text
    """
    with open(label_file, 'r') as handle:
        id2label = [line.strip() for line in handle]
    label2id = {name: idx for idx, name in enumerate(id2label)}
    return id2label, label2id

def get_images(folder):
    """
    Load every image listed by ``get_files(folder)`` into one numpy array.

    Each image is flattened and divided by 255.0. No resizing is performed,
    so every image must already contain the same number of values.
    Progress is printed after every 10000 files.

    :param folder: path prefix handed to ``get_files``
    :return: 2-D array with one column per image file, columns in the
             (sorted) order returned by ``get_files``
    :raises ValueError: if no files are found for ``folder``
    """
    files = get_files(folder)
    if not files:
        # np.column_stack fails on an empty list with an opaque message;
        # raise the same exception type with a description of the real problem.
        raise ValueError("No image files found for folder: {}".format(folder))

    total = len(files)
    images = []
    for count, f in enumerate(files, start=1):
        if count % 10000 == 0:
            print("Loaded {}/{}".format(count, total))
        img_arr = get_img_array(f)
        images.append(img_arr.flatten() / 255.0)

    # Stack the flattened images side by side: one sample per column.
    return np.column_stack(images)

def get_train_data(data_root_path):
    """
    Load the training samples and their labels.

    ``'labels.txt'`` and ``'train'`` are appended to ``data_root_path`` by
    plain string concatenation, so the root path needs its trailing
    separator. The label-to-id mapping is printed as a side effect.

    :param data_root_path: root folder of the dataset
    :return: tuple ``(X, y)`` - the array from ``get_images`` and the label
             vector from ``get_labels`` for the training folder
    """
    label2id = get_label_mapping(data_root_path+'labels.txt')[1]
    print(label2id)
    train_dir = data_root_path + 'train'
    samples = get_images(train_dir)
    targets = get_labels(train_dir, label2id)
    return samples, targets

def save_predictions(filename, y):
    """
    Write the array ``y`` to ``filename`` in NumPy's .npy format.

    :param filename: destination passed to ``np.save``
    :param y: array of predictions to store
    """
    np.save(file=filename, arr=y)

In [67]:
# Root folder of the dataset. The helpers append 'labels.txt', 'train' and 'test'
# to it by plain string concatenation, so the trailing slash is required.
data_root_path = "cifar10-hw2/"
# X_train holds one sample per column (see get_images); y_train holds numeric label ids.
X_train, y_train = get_train_data(data_root_path)
# The test folder is loaded the same way, images only.
X_test = get_images(data_root_path + 'test')
print('Data loading done')

{'truck': 9, 'bird': 2, 'deer': 4, 'cat': 3, 'frog': 6, 'airplane': 0, 'horse': 7, 'ship': 8, 'automobile': 1, 'dog': 5}
Loaded 10000/50000
Loaded 20000/50000
Loaded 30000/50000
Loaded 40000/50000
Loaded 50000/50000
Loaded 10000/10000
Data loading done


In [68]:
# Reshape data as necessary
# ndarray.reshape returns a new array instead of modifying y_train in place, so the
# result has to be assigned back; -1 flattens to 1-D without hard-coding the sample count.
y_train = y_train.reshape(-1)
# get_images stacks one sample per column; swap the axes to get one sample per row.
# NOTE: re-running this cell swaps the axes back again.
X_train = np.swapaxes(X_train,1,0)
X_test = np.swapaxes(X_test,1,0)

# Convert to Numpy array
# Fix the dtypes explicitly: float32 for the pixel data, int32 for the label ids.
X_train = np.asarray(X_train,dtype = np.float32)
X_test = np.asarray(X_test,dtype = np.float32)
y_train = np.asarray(y_train,dtype = np.int32)

# Print shapes
print("X_train shape is: " + str(X_train.shape))
print("X_test shape is: " + str(X_test.shape))
print("y_train shape is: " + str(y_train.shape))

X_train shape is: (50000, 3072)
X_test shape is: (10000, 3072)
y_train shape is: (50000,)


In [69]:
# Hold out the last 10000 samples for validation; the first 40000 stay as training data.
# Both slices on each line are taken from the full array before either name is rebound.
X_train, X_validation = X_train[:40000], X_train[-10000:]
y_train, y_validation = y_train[:40000], y_train[-10000:]

# Report the resulting shapes
print(X_validation.shape)
print(y_validation.shape)
print(X_train.shape)

(10000, 3072)
(10000,)
(40000, 3072)
