In [2]:
import collections
import os
import random
from itertools import groupby

import numpy as np
from keras.preprocessing.image import ImageDataGenerator

Using TensorFlow backend.


In [3]:
# Load the training images and their label vectors from disk, echoing the
# shape of each array right after it has been read.
data = np.load('Data/Emotion/Train/final_data.npy')
print(np.shape(data))
labels = np.load('Data/Emotion/Train/final_labels_data.npy')
print(np.shape(labels))

(27947, 64, 64, 1)
(27947, 8)


In [7]:
# Find all the different labels
# For every label vector, build a string key from the indices that hold the
# maximum value. A single winner gives a one-digit key such as '3'; a tie
# gives a multi-digit key such as '01'.
# Iterating over `labels` itself (instead of a hard-coded row count) keeps
# this cell correct if the label file grows or shrinks.
res = []
for row in labels:
    winner = np.argwhere(row == np.amax(row)).flatten().tolist()
    res.append(''.join(str(w) for w in winner))

In [8]:
# Tally how often each winner key occurs.
# A key with several digits marks an image whose top score is shared by two
# or more labels (emotions), e.g. '01' -> neutral + happy.
counter = collections.Counter()
counter.update(res)

In [9]:
# Display the number of images found for each winner key
counter

Counter({'0': 9272,
         '01': 119,
         '012': 2,
         '013': 1,
         '014': 1,
         '015': 1,
         '02': 70,
         '023': 5,
         '024': 2,
         '026': 4,
         '03': 450,
         '034': 12,
         '035': 1,
         '036': 7,
         '037': 8,
         '04': 96,
         '045': 4,
         '05': 11,
         '057': 1,
         '06': 12,
         '07': 28,
         '1': 7394,
         '12': 58,
         '124': 3,
         '13': 10,
         '14': 25,
         '145': 4,
         '15': 3,
         '16': 2,
         '17': 2,
         '2': 3349,
         '23': 16,
         '234': 1,
         '236': 1,
         '24': 40,
         '245': 2,
         '246': 6,
         '247': 1,
         '25': 3,
         '256': 1,
         '26': 97,
         '3': 3361,
         '34': 32,
         '345': 3,
         '35': 13,
         '357': 1,
         '36': 26,
         '37': 9,
         '4': 2360,
         '45': 23,
         '457': 1,
         '46': 15,
         

In [10]:
# Keras generator used for image augmentation: random rotation, horizontal
# and vertical shifts, zoom and horizontal flipping.
data_generator = ImageDataGenerator(
    horizontal_flip=True,
    zoom_range=0.1,
    height_shift_range=0.1,
    width_shift_range=0.1,
    rotation_range=20,
)

In [174]:
# Augment the images and create a balanced dataset across all 8 emotions.
# Every single-label emotion is topped up with augmented copies until it has
# TARGET_PER_EMOTION images in total (originals + augmentations).
TARGET_PER_EMOTION = 10000

for emotion in ['0','1','2','3','4','5','6','7']:
    print(emotion)

    # Keras writes the augmented JPEGs into this directory and np.save puts
    # the arrays next to them; neither creates it, so make sure it exists.
    out_dir = 'aug' + emotion
    os.makedirs(out_dir, exist_ok=True)

    # Indices of the images whose only winning label is this emotion
    indices = [index for index, value in enumerate(res) if value == emotion]
    frequency = len(indices)
    # Never negative: an emotion that already reaches the target gets nothing
    number_of_augmentations = max(TARGET_PER_EMOTION - frequency, 0)

    if frequency and number_of_augmentations:
        # random.sample draws without replacement, so repeat the index pool
        # often enough (integer ceiling division) to allow that many draws.
        repeats = -(-number_of_augmentations // frequency)
        random_items = random.sample(repeats * indices, number_of_augmentations)
    else:
        # Nothing to augment from, or nothing to add
        random_items = []

    aug = []
    lab = []
    for item in random_items:
        batches = data_generator.flow(np.reshape(data[item], (1,64,64,1)), save_to_dir=out_dir, save_format='jpg', batch_size=1)
        # One batch of size 1 == one augmented image of shape (64, 64, 1)
        X_batch = next(batches)[0]
        aug.append(X_batch.astype('uint').tolist())
        lab.append(labels[item].tolist())

    # Keep the per-sample shape even when the lists are empty, so the saved
    # arrays can always be concatenated with `data` / `labels` later on.
    arr_aug = np.array(aug) if aug else np.empty((0, 64, 64, 1), dtype=data.dtype)
    arr_lab = np.array(lab) if lab else np.empty((0, labels.shape[1]), dtype=labels.dtype)

    np.save('./aug' + emotion + '/' + 'arr' + emotion, arr_aug)
    np.save('./aug' + emotion + '/' + 'lab' + emotion, arr_lab)

0
1
2
3
4
5
6
7


In [187]:
# Concatenate the arrays
# NOTE: this cell rebinds `data` and `labels`, so it is not idempotent:
# running it twice appends the augmented samples twice. Re-run the loading
# cell first if this cell has to be executed again.
aug_data_parts = [data]
aug_label_parts = [labels]
for i in range(8):
    aug_data_parts.append(np.load('aug{}/arr{}.npy'.format(i,i)))
    aug_label_parts.append(np.load('aug{}/lab{}.npy'.format(i,i)))

# A single concatenation per array avoids recopying the growing result on
# every loop iteration.
data = np.concatenate(aug_data_parts, axis=0)
labels = np.concatenate(aug_label_parts, axis=0)

In [189]:
# Shape of the image array after the augmented samples were appended
data.shape

(81241, 64, 64, 1)

In [190]:
# Shape of the label array after the augmented samples were appended
labels.shape

(81241, 8)

In [195]:
# Persist the augmented training set (images first, then labels) as .npy files
np.save(file='Data/Emotion/Train/final_data_aug.npy', arr=data)
np.save(file='Data/Emotion/Train/final_labels_data_aug.npy', arr=labels)