In [3]:
import os
import pandas as pd
import numpy as np
import random
import math
import statistics
import imblearn
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm
from PIL import Image
from shutil import copyfile, move
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, models, transforms
from torch.utils.data.sampler import SubsetRandomSampler
os.getcwd()

# Data Preprocessing Pipeline

For our data cleaning and preparation, we did the following:
1. Extract image data from AFF-Wild2 dataset
2. Transform data to fit our model
3. Create different variations of the dataset

## Convert AFF-Wild2 videos into images
As the original AFF-Wild2 dataset is a series of videos, the first step is to convert selected frames of each video into images. <br/> We selected the frames at which the emotion label changes, to reduce the similarity between images that share a label.

In [None]:
# Extract individual frames as images to create dataset
def _labelChangeFrames(lines):
    """Return the frames at which the annotated emotion label changes.

    Args:
        lines: iterable of annotation lines (e.g. an open text file).

    Returns:
        A list of ``{'frame': frame_index, 'label': label}`` dicts, one for the
        first labelled frame and one for every later frame whose label differs
        from the previously kept label.
    """
    changes = []
    lastValue = None
    frameIdx = 0
    for line in lines:
        line = line.strip()
        # Blank lines and lines starting with 'N' are not counted as frames.
        # NOTE(review): assumes the only 'N' line is the header row of class
        # names and that every other line describes one video frame, in order
        # - confirm against the annotation format.
        if not line or line[0] == 'N':
            continue
        # Lines starting with '-' (no usable label) are still video frames, so
        # they advance the frame index even though nothing is recorded for them.
        if line[0].isdigit():
            currentValue = int(line[0])
            if lastValue != currentValue:
                changes.append({'frame': frameIdx, 'label': currentValue})
                lastValue = currentValue
        frameIdx += 1
    return changes


def videoToImage(fileName):
    """Save one image per emotion-label change of an AFF-Wild2 training video.

    Reads the annotation file ``fileName`` from the Train_Set annotation folder,
    grabs the matching frames from the ``.mp4`` with the same base name and
    writes each one as a PNG into the image folder of its label.

    Args:
        fileName: annotation file name; its last four characters (the
            extension) are replaced by ``.mp4`` to locate the video.
    """
    label_data_dir = os.getcwd()+"\\datasets\\AFF Wild\\annotations\\Train_Set\\"
    vid_data_dir = os.getcwd()+"\\datasets\\AFF Wild\\videos\\Train_Set\\"
    img_data_dir = os.getcwd()+"\\datasets\\AFF Wild\\images\\"
    labels = ['neutral', 'anger', 'disgust', 'fear', 'happy', 'sadness', 'suprise']

    # Get labels from .txt files, keeping only the frames where the label changes
    with open(label_data_dir+fileName, "r") as file:
        imgLabels = _labelChangeFrames(file)

    videoName = fileName[:-4]
    cap = cv2.VideoCapture(vid_data_dir+videoName+".mp4")
    count = 0
    try:
        for x in imgLabels:
            # Ignore label ids that have no folder in `labels`
            if x['label'] >= len(labels):
                continue
            cap.set(cv2.CAP_PROP_POS_FRAMES, x['frame'])
            ret, frame = cap.read()
            if not ret:
                # Skipping here keeps every saved image paired with its own label
                continue
            # OpenCV decodes frames as BGR; convert so the PNG has correct colours
            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            img.save(img_data_dir+labels[x['label']]+'\\'+videoName+'-'+str(count)+'.png')
            count += 1
    finally:
        cap.release()

In [None]:
# Create necessary directories (safe to re-run: existing folders are left untouched)
labels = ['neutral', 'anger', 'disgust', 'fear', 'happy', 'sadness', 'suprise']
for label in labels:
    os.makedirs(os.getcwd()+"\\datasets\\AFF Wild\\images\\"+label, exist_ok=True)

# Loop through all annotation files; each one names the video to extract frames from
data_dir = os.getcwd() + "\\datasets\\AFF Wild\\annotations\\Train_Set\\"
files = os.listdir(data_dir)
for file in files:
    # videoToImage strips a 4-character extension, so skip anything that is not a .txt annotation
    if file.endswith(".txt"):
        videoToImage(file)

## Data Augmentation
Our data augmentation steps focus on transforming the data to fit our model and creating variations in the dataset for our experiments

### Creating custom transformers
This transformer uses the haarcascade classifier to identify facial features, and crop the image to only the facial features <br/>
This transformer also converts the image from RGB to a 3 channel grayscale image

In [None]:
class CustomTransform(object):
    """Crop an image to its detected face and return a resized 3-channel grayscale array.

    Args:
        output_size: an int (square output of that side length) or a tuple
            passed to ``cv2.resize`` as the target size.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size

    def __call__(self, img):
        """Transform one RGB image.

        Args:
            img: an RGB image that ``np.array`` can convert (e.g. a PIL image).

        Returns:
            A numpy array with three identical grayscale channels, resized to
            ``output_size``. It is cropped to the face only when exactly one
            face is detected; otherwise the full image is resized.
        """
        gray = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2GRAY)
        tripleGray = np.stack((gray,)*3, axis=-1)
        # The classifier is built per call rather than stored on the instance.
        # NOTE(review): presumably a stored cv2.CascadeClassifier would stop the
        # transform from being pickled for DataLoader workers - confirm before caching.
        faceCascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
        faces = faceCascade.detectMultiScale(
                tripleGray,
                scaleFactor=1.3,
                minNeighbors=3,
                minSize=(30, 30)
        )
        # Crop only on an unambiguous detection; the crop must be assigned back,
        # otherwise the slice is discarded and the full image is kept.
        if len(faces) == 1:
            x, y, w, h = faces[0]
            tripleGray = tripleGray[y:y+h, x:x+w]
        if isinstance(self.output_size, int):
            target_size = (self.output_size, self.output_size)
        else:
            target_size = self.output_size
        return cv2.resize(tripleGray, target_size)

## Dataset Creation
There are 3 variations of the balanced datasets created here, the undersampled dataset, oversampled dataset and progressive dataset <br/>
The balanced datasets are then transformed and converted into tensor files

## Undersampled dataset
Create a balanced dataset by sampling each label's images down to the size of the smallest label<br/>
Final tensor files are then stored together to form the dataset

### CK+

In [None]:
# Split CK+ into train/val per label, then undersample every label's training
# images down to the size of the smallest label.
datadir = os.getcwd()+"\\datasets\\CK+\\sorted_dataset\\"

labels = ['anger', 'disgust', 'fear', 'happy', 'sadness', 'suprise']
train_size =[]

# All splits live under one folder so the sampling step reads what the split step wrote
dataset_dir = datadir+"CK_undersampled_dataset\\"
for split in ['train', 'val', 'train_sampled']:
    for label in labels:
        os.makedirs(dataset_dir+split+"\\"+label, exist_ok=True)

for x in range(6):
    files = os.listdir(datadir+labels[x])
    # Create a 80/20 train/validation split
    train_size.append(math.floor(len(files)*0.8))
    valSample = set(np.random.choice(len(files), len(files)-train_size[x], replace=False).tolist())
    for count, file in enumerate(files):
        split = "val" if count in valSample else "train"
        copyfile(datadir+labels[x]+"\\"+file, dataset_dir+split+"\\"+labels[x]+"\\"+file)

# Obtain size of label with lowest data
minLabel = min(train_size)

for x in range(6):
    files = os.listdir(dataset_dir+"train\\"+labels[x])
    # Draw without replacement: undersampling should keep minLabel distinct images
    # per label instead of duplicating some and dropping others.
    trainSample = np.random.choice(len(files), size=min(minLabel, len(files)), replace=False)
    for idx in trainSample:
        file = files[idx]
        # The "0" prefix keeps the <copy index><file name> naming of the sampled folders
        copyfile(dataset_dir+"train\\"+labels[x]+"\\"+file, dataset_dir+"train_sampled\\"+labels[x]+"\\"+"0"+file)

In [None]:
# Transform our data
data_transforms = {
    'train_sampled': transforms.Compose([
        CustomTransform(224),
        transforms.ToTensor(),
        transforms.RandomHorizontalFlip(p=0.2),
        transforms.RandomVerticalFlip(p=0.2),
        transforms.Normalize([0.5], [0.5])
    ]),
    'val': transforms.Compose([
        CustomTransform(224),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5])
    ]),
}

# Must match the folder the CK+ split cell writes to (datasets\CK+\sorted_dataset\...)
data_dir = os.getcwd() + "\\datasets\\CK+\\sorted_dataset\\CK_undersampled_dataset\\"
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train_sampled', 'val']}

# Convert our augmented images and store as tensor files.
# The undersampled tensor folders are shared by every dataset in this notebook, so
# numbering continues after the files already present instead of restarting at 0
# (which would overwrite another dataset's tensors). This assumes the folders only
# hold consecutively numbered .pt files; delete them before regenerating from scratch.
tensor_root = os.getcwd() + "\\datasets\\sorted_dataset\\undersampled_dataset_tensors\\"
for split, target in [('val', 'val'), ('train_sampled', 'train')]:
    data_dir = tensor_root+target+"\\"
    os.makedirs(data_dir, exist_ok=True)
    count = len(os.listdir(data_dir))
    # Each item is whatever ImageFolder yields for one image (the transformed image with its class index)
    for tensor in tqdm(image_datasets[split]):
        torch.save(tensor, data_dir+str(count)+'.pt')
        count+=1

### FER2013

In [None]:
# Split FER2013 into train/val per label, then undersample every label's training
# images down to the size of the smallest label.
# `datadir` is set here explicitly; otherwise the loops below would silently use
# the directory left over from the previous dataset's cell.
datadir = os.getcwd()+"\\datasets\\FER2013\\"
labels = ['neutral', 'anger', 'disgust', 'fear', 'happy', 'sadness', 'suprise']
# FER2013 folder names, index-aligned with `labels`
FERlabels = ['neutral', 'angry', 'disgust', 'fear', 'happy', 'sad', 'surprise']
train_size=[]

dataset_dir = datadir+"FER_undersampled_dataset\\"
for split in ['train', 'val', 'train_sampled']:
    for label in labels:
        os.makedirs(dataset_dir+split+"\\"+label, exist_ok=True)

for x in range(7):
    source_dir = datadir+"train\\"+FERlabels[x]+"\\"
    files = os.listdir(source_dir)
    # Create a 80/20 train/validation split
    train_size.append(math.floor(len(files)*0.8))
    valSample = set(np.random.choice(len(files), len(files)-train_size[x], replace=False).tolist())

    for count, file in enumerate(files):
        split = "val" if count in valSample else "train"
        copyfile(source_dir+file, dataset_dir+split+"\\"+labels[x]+"\\"+file)

# Obtain size of label with lowest data
minLabel = min(train_size)

for x in range(7):
    files = os.listdir(dataset_dir+"train\\"+labels[x])
    # Draw without replacement: undersampling should keep minLabel distinct images
    # per label instead of duplicating some and dropping others.
    trainSample = np.random.choice(len(files), size=min(minLabel, len(files)), replace=False)
    for idx in trainSample:
        file = files[idx]
        # The "0" prefix keeps the <copy index><file name> naming of the sampled folders
        copyfile(dataset_dir+"train\\"+labels[x]+"\\"+file, dataset_dir+"train_sampled\\"+labels[x]+"\\"+"0"+file)

In [None]:
# Transform our data
data_transforms = {
    'train_sampled': transforms.Compose([
        CustomTransform(224),
        transforms.ToTensor(),
        transforms.RandomHorizontalFlip(p=0.2),
        transforms.RandomVerticalFlip(p=0.2),
        transforms.Normalize([0.5], [0.5])
    ]),
    'val': transforms.Compose([
        CustomTransform(224),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5])
    ]),
}

data_dir = os.getcwd() + "\\datasets\\FER2013\\FER_undersampled_dataset\\"
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train_sampled', 'val']}

# Convert our augmented images and store as tensor files.
# The undersampled tensor folders are shared by every dataset in this notebook, so
# numbering continues after the files already present instead of restarting at 0
# (which would overwrite another dataset's tensors). This assumes the folders only
# hold consecutively numbered .pt files; delete them before regenerating from scratch.
tensor_root = os.getcwd() + "\\datasets\\sorted_dataset\\undersampled_dataset_tensors\\"
for split, target in [('val', 'val'), ('train_sampled', 'train')]:
    data_dir = tensor_root+target+"\\"
    os.makedirs(data_dir, exist_ok=True)
    count = len(os.listdir(data_dir))
    # Each item is whatever ImageFolder yields for one image (the transformed image with its class index)
    for tensor in tqdm(image_datasets[split]):
        torch.save(tensor, data_dir+str(count)+'.pt')
        count+=1

### AffectNet

In [None]:
# Sort AffectNet images into folders of respective emotion labels
source_data_dir = os.getcwd() + "\\datasets\\AffectNet\\Automatically_Annotated_compressed\\"
target_data_dir = os.getcwd() + "\\datasets\\AffectNet_sorted\\"
image_root = source_data_dir+"\\Automatically_Annotated\\Automatically_Annotated_images\\"

# Value of the 'expression' column -> destination folder; rows with any other value are skipped
expression_folders = {
    0: 'neutral',
    1: 'happy',
    2: 'sadness',
    3: 'suprise',
    4: 'fear',
    5: 'disgust',
    6: 'anger',
}
# Make sure every destination exists so the copy does not fail on a fresh checkout
for folder in expression_folders.values():
    os.makedirs(target_data_dir+folder, exist_ok=True)

df = pd.read_csv(source_data_dir+"automatically_annotated.csv")
# total= lets tqdm show real progress, since iterrows() has no length
for index, row in tqdm(df.iterrows(), total=len(df)):
    folder = expression_folders.get(row['expression'])
    if folder is None:
        continue
    # subDirectory_filePath is "<sub directory>/<file name>"; only the file name is kept
    imageName = row['subDirectory_filePath'].split("/")
    copyfile(image_root+row['subDirectory_filePath'], target_data_dir+folder+"\\"+imageName[1])

In [None]:
# Build a capped train/val split of AffectNet: at most 890 images per label.
datadir = os.getcwd()+"\\datasets\\AffectNet_sorted\\"

labels = ['neutral', 'anger', 'disgust', 'fear', 'happy', 'sadness', 'suprise']
train_size=[]

dataset_dir = datadir+"AffectNet_undersampled_dataset\\"
for split in ['train', 'val']:
    for label in labels:
        os.makedirs(dataset_dir+split+"\\"+label, exist_ok=True)

for x in range(7):
    files = os.listdir(datadir+labels[x])
    # Hardcoded minimum size of label
    maxInput = min(len(files), 890)
    # Create a 80/20 train/validation split
    train_size.append(math.floor(maxInput*0.8))
    valSample = set(np.random.choice(len(files), maxInput-train_size[x], replace=False).tolist())

    trainCount = 0
    valCount = 0
    for count, file in enumerate(files):
        if valCount>=len(valSample) and trainCount>=train_size[x]:
            break
        if count in valSample:
            copyfile(datadir+labels[x]+"\\"+file, dataset_dir+"val\\"+labels[x]+"\\"+file)
            valCount+=1
        # elif, not if: a validation image must never also be copied into train
        elif trainCount<train_size[x]:
            # Train takes the first non-validation files in listing order until the quota is met
            copyfile(datadir+labels[x]+"\\"+file, dataset_dir+"train\\"+labels[x]+"\\"+file)
            trainCount+=1

In [None]:
# Transform our data
data_transforms = {
    'train': transforms.Compose([
        CustomTransform(224),
        transforms.ToTensor(),
        transforms.RandomHorizontalFlip(p=0.2),
        transforms.RandomVerticalFlip(p=0.2),
        transforms.Normalize([0.5], [0.5])
    ]),
    'val': transforms.Compose([
        CustomTransform(224),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5])
    ]),
}

data_dir = os.getcwd() + "\\datasets\\AffectNet_sorted\\AffectNet_undersampled_dataset\\"
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val']}

# Convert our augmented images and store as tensor files.
# AffectNet has no separate 'train_sampled' folder, so the training tensors come
# from the 'train' split (the only training key in image_datasets).
# The undersampled tensor folders are shared by every dataset in this notebook, so
# numbering continues after the files already present instead of restarting at 0
# (which would overwrite another dataset's tensors). This assumes the folders only
# hold consecutively numbered .pt files; delete them before regenerating from scratch.
tensor_root = os.getcwd() + "\\datasets\\sorted_dataset\\undersampled_dataset_tensors\\"
for split in ['val', 'train']:
    data_dir = tensor_root+split+"\\"
    os.makedirs(data_dir, exist_ok=True)
    count = len(os.listdir(data_dir))
    # Each item is whatever ImageFolder yields for one image (the transformed image with its class index)
    for tensor in tqdm(image_datasets[split]):
        torch.save(tensor, data_dir+str(count)+'.pt')
        count+=1

### Aff-Wild2

In [None]:
# Split the extracted Aff-Wild2 images into train/val/test per label, then
# undersample every label's training images down to the size of the smallest label.
datadir = os.getcwd()+"\\datasets\\AFF Wild\\"

# Defined here so the cell does not depend on whichever cell last assigned `labels`
labels = ['neutral', 'anger', 'disgust', 'fear', 'happy', 'sadness', 'suprise']
train_size=[]

dataset_dir = datadir+"Aff_undersampled_dataset\\"
for split in ['train', 'val', 'test', 'train_sampled']:
    for label in labels:
        os.makedirs(dataset_dir+split+"\\"+label, exist_ok=True)

for x in range(7):
    files = os.listdir(datadir+"images\\"+labels[x])
    # Create a 60/20/20 train/test/validation split
    train_size.append(math.floor(len(files)*0.6))
    valSample = set(np.random.choice(len(files), len(files)-train_size[x], replace=False).tolist())

    for count, file in enumerate(files):
        if count not in valSample:
            split = "train"
        elif count%2==0:
            # Held-out images are divided between val and test by index parity
            split = "val"
        else:
            split = "test"
        copyfile(datadir+"images\\"+labels[x]+"\\"+file, dataset_dir+split+"\\"+labels[x]+"\\"+file)

# Obtain size of label with lowest data
minLabel = min(train_size)

for x in range(7):
    # List the same train folder the images are copied from
    files = os.listdir(dataset_dir+"train\\"+labels[x])
    # Draw without replacement: undersampling should keep minLabel distinct images
    # per label instead of duplicating some and dropping others.
    trainSample = np.random.choice(len(files), size=min(minLabel, len(files)), replace=False)
    for idx in trainSample:
        file = files[idx]
        # The "0" prefix keeps the <copy index><file name> naming of the sampled folders
        copyfile(dataset_dir+"train\\"+labels[x]+"\\"+file, dataset_dir+"train_sampled\\"+labels[x]+"\\"+"0"+file)

In [None]:
# Transform our data
data_transforms = {
    'train_sampled': transforms.Compose([
        CustomTransform(224),
        transforms.ToTensor(),
        transforms.RandomHorizontalFlip(p=0.2),
        transforms.RandomVerticalFlip(p=0.2),
        transforms.Normalize([0.5], [0.5])
    ]),
    'val': transforms.Compose([
        CustomTransform(224),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5])
    ]),
    'test': transforms.Compose([
        CustomTransform(224),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5])
    ]),
}

data_dir = os.getcwd() + "\\datasets\\AFF Wild\\Aff_undersampled_dataset\\"
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train_sampled', 'val', 'test']}

# Convert our augmented images and store as tensor files.
# The undersampled tensor folders are shared by every dataset in this notebook, so
# numbering continues after the files already present instead of restarting at 0
# (which would overwrite another dataset's tensors). This assumes the folders only
# hold consecutively numbered .pt files; delete them before regenerating from scratch.
tensor_root = os.getcwd() + "\\datasets\\sorted_dataset\\undersampled_dataset_tensors\\"
for split, target in [('val', 'val'), ('train_sampled', 'train'), ('test', 'test')]:
    data_dir = tensor_root+target+"\\"
    os.makedirs(data_dir, exist_ok=True)
    count = len(os.listdir(data_dir))
    # Each item is whatever ImageFolder yields for one image (the transformed image with its class index)
    for tensor in tqdm(image_datasets[split]):
        torch.save(tensor, data_dir+str(count)+'.pt')
        count+=1

## Oversampled dataset/Progressive dataset
Create a balanced dataset by resampling each label's images to the size of the median label<br/>
Final tensor files are either stored together (oversampled dataset) or separately (progressive dataset)

### CK+

In [None]:
# Split CK+ into train/val per label, then resample every label's training
# images (with replacement) to the median training-set size.
datadir = os.getcwd()+"\\datasets\\CK+\\sorted_dataset\\"

labels = ['anger', 'disgust', 'fear', 'happy', 'sadness', 'suprise']
train_size =[]

# Create the output folders up front so the copies below work on a fresh checkout
dataset_dir = datadir+"CK_balanced_dataset\\"
for split in ['train', 'val', 'train_sampled']:
    for label in labels:
        os.makedirs(dataset_dir+split+"\\"+label, exist_ok=True)

for x in range(6):
    files = os.listdir(datadir+labels[x])
    # Create a 80/20 train/validation split
    train_size.append(math.floor(len(files)*0.8))
    valSample = set(np.random.choice(len(files), len(files)-train_size[x], replace=False).tolist())
    for count, file in enumerate(files):
        split = "val" if count in valSample else "train"
        copyfile(datadir+labels[x]+"\\"+file, dataset_dir+split+"\\"+labels[x]+"\\"+file)

# Obtain size of label with median amount data
median = math.ceil(statistics.median(train_size))

for x in range(6):
    files = os.listdir(dataset_dir+"train\\"+labels[x])
    # Sampling with replacement lets small labels grow to the median size
    trainSample = np.random.randint(low=0, high=train_size[x], size=median)
    # freq[k] = how many times the k-th listed file was drawn
    freq = np.bincount(trainSample, minlength=len(files))
    for count, file in enumerate(files):
        # Repeated draws are stored as separate copies named <copy index><file name>
        for i in range(freq[count]):
            copyfile(dataset_dir+"train\\"+labels[x]+"\\"+file, dataset_dir+"train_sampled\\"+labels[x]+"\\"+str(i)+file)

In [None]:
# Transform our data
data_transforms = {
    'train_sampled': transforms.Compose([
        CustomTransform(224),
        transforms.ToTensor(),
        transforms.RandomHorizontalFlip(p=0.2),
        transforms.RandomVerticalFlip(p=0.2),
        transforms.Normalize([0.5], [0.5])
    ]),
    'val': transforms.Compose([
        CustomTransform(224),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5])
    ]),
}

data_dir = os.getcwd() + "\\datasets\\CK+\\sorted_dataset\\CK_balanced_dataset\\"
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train_sampled', 'val']}

# Convert our augmented images and store as tensor files.
# The progressive folder belongs to this dataset alone and is numbered from 0.
# The combined (oversampled) folder is shared by every dataset in this notebook, so
# its numbering continues after the files already present instead of overwriting them.
# This assumes it only holds consecutively numbered .pt files; delete it before
# regenerating the combined dataset from scratch.
tensor_root = os.getcwd() + "\\datasets\\sorted_dataset\\"
for split, target in [('val', 'val'), ('train_sampled', 'train')]:
    progressive_data_dir = tensor_root+"CK_balanced_dataset_tensors\\"+target+"\\"
    oversampled_data_dir = tensor_root+"combined_balanced_dataset_tensors\\"+target+"\\"
    os.makedirs(progressive_data_dir, exist_ok=True)
    os.makedirs(oversampled_data_dir, exist_ok=True)
    offset = len(os.listdir(oversampled_data_dir))
    count = 0
    for tensor in tqdm(image_datasets[split]):
        torch.save(tensor, progressive_data_dir+str(count)+'.pt')
        torch.save(tensor, oversampled_data_dir+str(offset+count)+'.pt')
        count+=1

### FER2013
FER2013 has slightly different naming conventions, so slight adjustments need to be made

In [None]:
# Split FER2013 into train/val per label, then resample every label's training
# images (with replacement) to the median training-set size.
# `datadir` is set here explicitly; otherwise the loops below would silently use
# the directory left over from the previous dataset's cell.
datadir = os.getcwd()+"\\datasets\\FER2013\\"
labels = ['neutral', 'anger', 'disgust', 'fear', 'happy', 'sadness', 'suprise']
# FER2013 folder names, index-aligned with `labels`
FERlabels = ['neutral', 'angry', 'disgust', 'fear', 'happy', 'sad', 'surprise']
train_size=[]

dataset_dir = datadir+"FER_balanced_dataset\\"
for split in ['train', 'val', 'train_sampled']:
    for label in labels:
        os.makedirs(dataset_dir+split+"\\"+label, exist_ok=True)

for x in range(7):
    source_dir = datadir+"train\\"+FERlabels[x]+"\\"
    files = os.listdir(source_dir)
    # Create a 80/20 train/validation split
    train_size.append(math.floor(len(files)*0.8))
    valSample = set(np.random.choice(len(files), len(files)-train_size[x], replace=False).tolist())

    for count, file in enumerate(files):
        split = "val" if count in valSample else "train"
        copyfile(source_dir+file, dataset_dir+split+"\\"+labels[x]+"\\"+file)

# Obtain size of label with median amount data
median = math.ceil(statistics.median(train_size))

for x in range(7):
    files = os.listdir(dataset_dir+"train\\"+labels[x])
    # Sampling with replacement lets small labels grow to the median size
    trainSample = np.random.randint(low=0, high=train_size[x], size=median)
    # freq[k] = how many times the k-th listed file was drawn
    freq = np.bincount(trainSample, minlength=len(files))
    for count, file in enumerate(files):
        # Repeated draws are stored as separate copies named <copy index><file name>
        for i in range(freq[count]):
            copyfile(dataset_dir+"train\\"+labels[x]+"\\"+file, dataset_dir+"train_sampled\\"+labels[x]+"\\"+str(i)+file)

In [None]:
# Transform our data
data_transforms = {
    'train_sampled': transforms.Compose([
        CustomTransform(224),
        transforms.ToTensor(),
        transforms.RandomHorizontalFlip(p=0.2),
        transforms.RandomVerticalFlip(p=0.2),
        transforms.Normalize([0.5], [0.5])
    ]),
    'val': transforms.Compose([
        CustomTransform(224),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5])
    ]),
}

data_dir = os.getcwd() + "\\datasets\\FER2013\\FER_balanced_dataset\\"
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train_sampled', 'val']}

# Convert our augmented images and store as tensor files.
# The progressive folder belongs to this dataset alone and is numbered from 0.
# The combined (oversampled) folder is shared by every dataset in this notebook, so
# its numbering continues after the files already present instead of overwriting them.
# This assumes it only holds consecutively numbered .pt files; delete it before
# regenerating the combined dataset from scratch.
tensor_root = os.getcwd() + "\\datasets\\sorted_dataset\\"
for split, target in [('val', 'val'), ('train_sampled', 'train')]:
    progressive_data_dir = tensor_root+"FER_balanced_dataset_tensors\\"+target+"\\"
    oversampled_data_dir = tensor_root+"combined_balanced_dataset_tensors\\"+target+"\\"
    os.makedirs(progressive_data_dir, exist_ok=True)
    os.makedirs(oversampled_data_dir, exist_ok=True)
    offset = len(os.listdir(oversampled_data_dir))
    count = 0
    for tensor in tqdm(image_datasets[split]):
        torch.save(tensor, progressive_data_dir+str(count)+'.pt')
        torch.save(tensor, oversampled_data_dir+str(offset+count)+'.pt')
        count+=1

### AffectNet

In [None]:
# Build a capped train/val split of AffectNet: at most 20854 images per label.
datadir = os.getcwd()+"\\datasets\\AffectNet_sorted\\"

labels = ['neutral', 'anger', 'disgust', 'fear', 'happy', 'sadness', 'suprise']
train_size=[]

dataset_dir = datadir+"AffectNet_balanced_dataset\\"
for split in ['train', 'val']:
    for label in labels:
        os.makedirs(dataset_dir+split+"\\"+label, exist_ok=True)

for x in range(7):
    files = os.listdir(datadir+labels[x])
    # Hardcoded median size of label
    maxInput = min(len(files), 20854)
    # Create a 80/20 train/validation split
    train_size.append(math.floor(maxInput*0.8))
    valSample = set(np.random.choice(len(files), maxInput-train_size[x], replace=False).tolist())

    trainCount = 0
    valCount = 0
    for count, file in enumerate(files):
        if valCount>=len(valSample) and trainCount>=train_size[x]:
            break
        if count in valSample:
            copyfile(datadir+labels[x]+"\\"+file, dataset_dir+"val\\"+labels[x]+"\\"+file)
            valCount+=1
        # elif, not if: a validation image must never also be copied into train
        elif trainCount<train_size[x]:
            # Train takes the first non-validation files in listing order until the quota is met
            copyfile(datadir+labels[x]+"\\"+file, dataset_dir+"train\\"+labels[x]+"\\"+file)
            trainCount+=1

In [None]:
# Transform our data
data_transforms = {
    'train': transforms.Compose([
        CustomTransform(224),
        transforms.ToTensor(),
        transforms.RandomHorizontalFlip(p=0.2),
        transforms.RandomVerticalFlip(p=0.2),
        transforms.Normalize([0.5], [0.5])
    ]),
    'val': transforms.Compose([
        CustomTransform(224),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5])
    ]),
}

data_dir = os.getcwd() + "\\datasets\\AffectNet_sorted\\AffectNet_balanced_dataset\\"
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val']}

# Convert our augmented images and store as tensor files.
# AffectNet has no separate 'train_sampled' folder, so the training tensors come
# from the 'train' split (the only training key in image_datasets).
# The progressive folder belongs to this dataset alone and is numbered from 0.
# The combined (oversampled) folder is shared by every dataset in this notebook, so
# its numbering continues after the files already present instead of overwriting them.
# This assumes it only holds consecutively numbered .pt files; delete it before
# regenerating the combined dataset from scratch.
tensor_root = os.getcwd() + "\\datasets\\sorted_dataset\\"
for split in ['val', 'train']:
    progressive_data_dir = tensor_root+"AffectNet_balanced_dataset_tensors\\"+split+"\\"
    oversampled_data_dir = tensor_root+"combined_balanced_dataset_tensors\\"+split+"\\"
    os.makedirs(progressive_data_dir, exist_ok=True)
    os.makedirs(oversampled_data_dir, exist_ok=True)
    offset = len(os.listdir(oversampled_data_dir))
    count = 0
    for tensor in tqdm(image_datasets[split]):
        torch.save(tensor, progressive_data_dir+str(count)+'.pt')
        torch.save(tensor, oversampled_data_dir+str(offset+count)+'.pt')
        count+=1

### AFF-Wild2

In [None]:
# Split the extracted AFF-Wild2 images into train/val/test per label, then resample
# every label's training images (with replacement) to the median training-set size.
datadir = os.getcwd()+"\\datasets\\AFF Wild\\"

# Defined here so the cell does not depend on whichever cell last assigned `labels`
labels = ['neutral', 'anger', 'disgust', 'fear', 'happy', 'sadness', 'suprise']
train_size=[]

# Create the output folders up front so the copies below work on a fresh checkout
dataset_dir = datadir+"AFF_balanced_dataset\\"
for split in ['train', 'val', 'test', 'train_sampled']:
    for label in labels:
        os.makedirs(dataset_dir+split+"\\"+label, exist_ok=True)

for x in range(7):
    files = os.listdir(datadir+"images\\"+labels[x])
    # Create a 60/20/20 train/test/validation split
    train_size.append(math.floor(len(files)*0.6))
    valSample = set(np.random.choice(len(files), len(files)-train_size[x], replace=False).tolist())

    for count, file in enumerate(files):
        if count not in valSample:
            split = "train"
        elif count%2==0:
            # Held-out images are divided between val and test by index parity
            split = "val"
        else:
            split = "test"
        copyfile(datadir+"images\\"+labels[x]+"\\"+file, dataset_dir+split+"\\"+labels[x]+"\\"+file)

# Obtain size of label with median amount data
median = math.ceil(statistics.median(train_size))
for x in range(7):
    files = os.listdir(dataset_dir+"train\\"+labels[x])
    # Sampling with replacement lets small labels grow to the median size
    trainSample = np.random.randint(low=0, high=train_size[x], size=median)
    # freq[k] = how many times the k-th listed file was drawn
    freq = np.bincount(trainSample, minlength=len(files))
    for count, file in enumerate(files):
        # Repeated draws are stored as separate copies named <copy index><file name>
        for i in range(freq[count]):
            copyfile(dataset_dir+"train\\"+labels[x]+"\\"+file, dataset_dir+"train_sampled\\"+labels[x]+"\\"+str(i)+file)

In [None]:
# Transform our data
data_transforms = {
    'train_sampled': transforms.Compose([
        CustomTransform(224),
        transforms.ToTensor(),
        transforms.RandomHorizontalFlip(p=0.2),
        transforms.RandomVerticalFlip(p=0.2),
        transforms.Normalize([0.5], [0.5])
    ]),
    'val': transforms.Compose([
        CustomTransform(224),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5])
    ]),
    'test': transforms.Compose([
        CustomTransform(224),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5])
    ]),
}

data_dir = os.getcwd() + "\\datasets\\AFF Wild\\AFF_balanced_dataset\\"
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train_sampled', 'val', 'test']}

# Convert our augmented images and store as tensor files.
# The progressive folder belongs to this dataset alone and is numbered from 0.
# The combined (oversampled) folder is shared by every dataset in this notebook, so
# its numbering continues after the files already present instead of overwriting them.
# This assumes it only holds consecutively numbered .pt files; delete it before
# regenerating the combined dataset from scratch.
tensor_root = os.getcwd() + "\\datasets\\sorted_dataset\\"
for split, target in [('val', 'val'), ('train_sampled', 'train'), ('test', 'test')]:
    progressive_data_dir = tensor_root+"AFF_balanced_dataset_tensors\\"+target+"\\"
    oversampled_data_dir = tensor_root+"combined_balanced_dataset_tensors\\"+target+"\\"
    os.makedirs(progressive_data_dir, exist_ok=True)
    os.makedirs(oversampled_data_dir, exist_ok=True)
    offset = len(os.listdir(oversampled_data_dir))
    count = 0
    for tensor in tqdm(image_datasets[split]):
        torch.save(tensor, progressive_data_dir+str(count)+'.pt')
        torch.save(tensor, oversampled_data_dir+str(offset+count)+'.pt')
        count+=1