In [19]:
import numpy as np
import pandas as pd
import os
import math
from scipy.io import savemat
import shutil
from PIL import Image

In [20]:
# Seed NumPy's legacy global RNG so that the np.random.permutation draws made
# by get_sample_ids are repeatable when the notebook is run top to bottom.
np.random.seed(101)

In [21]:
# Root folder of the merged ("Fusion") dataset and its two split sub-folders.
base_path_structured = "/Users/benhoskings/Documents/Datasets/Fusion"
train_path_structured, test_path_structured = (
    os.path.join(base_path_structured, split_dir)
    for split_dir in ("train_set", "test_set")
)

In [22]:
# Class names for each source dataset, listed in label order: position i holds
# the name used for integer label i when the images are copied further down.
emotions_affect_net = pd.Series(["Neutral", "Happy", "Sad", "Surprise", "Fear", "Disgust", "Anger", "Contempt"])
emotions_aff_wild = pd.Series(["Neutral","Anger","Disgust","Fear","Happy","Sad","Surprise","Other"])

# Make one output folder per emotion (union of both label sets) in each split.
# os.makedirs(..., exist_ok=True) also creates a missing base/split folder on a
# fresh machine (os.mkdir would raise FileNotFoundError) and is safe to re-run.
# The loop variable is named `split` so the builtin `set` is not shadowed for
# the remainder of the kernel session.
all_emotions = pd.unique(pd.concat([emotions_affect_net, emotions_aff_wild]))
for split in ["train_set", "test_set"]:
    for em in all_emotions:
        os.makedirs(os.path.join(base_path_structured, split, em), exist_ok=True)
        

In [23]:
def get_sample_ids(emotions, counts, max_size=None, seed=None):
    """Draw an equally sized random sample of row positions for every class.

    The rows being sampled are expected to be grouped by class, with class
    ``i`` occupying ``counts[i]`` consecutive rows; the flat positions returned
    are computed from that layout.

    Args:
        emotions: Iterable of class names, in label order.
        counts: Number of available rows per class, aligned with ``emotions``.
        max_size: Optional cap on the number of samples per class. The size
            actually used is the smaller of this cap and the smallest class
            count; a falsy value (``None`` or ``0``) means "no cap".
        seed: Optional seed. When given, a private ``RandomState`` is used so
            the draw is reproducible regardless of the global NumPy RNG state.
            When ``None`` the global ``np.random`` stream is used.

    Returns:
        A tuple ``(ids1, ids2, counts)``:
        ``ids1`` has shape ``(max_size, n_classes)`` and holds the sampled
        within-class positions, one column per class;
        ``ids2`` is a flat array of the same positions shifted by each class's
        starting row, suitable for positional indexing of the grouped table;
        ``counts`` is the per-class count argument, passed back unchanged.
        (Previously the third item was read from a notebook global named
        ``class_count``, which failed with NameError unless that global
        happened to exist.)
    """
    label_count = dict(zip(emotions, counts))

    smallest_class = min(label_count.values())
    max_size = min(max_size, smallest_class) if max_size else smallest_class

    # Honour `seed` without disturbing the global stream used when it is None.
    rng = np.random if seed is None else np.random.RandomState(seed)

    ids1 = np.empty((max_size, 0), np.int32)
    ids2 = np.empty((0, 1), np.int32)

    for idx, emotion in enumerate(emotions):
        file_count = label_count[emotion]
        em_ids = rng.permutation(np.arange(file_count))[:max_size]
        # First row of this class = total size of all classes before it.
        start_idx = sum(counts[:idx])
        ids1 = np.append(ids1, np.expand_dims(em_ids, axis=1), axis=1)
        # np.append without an axis flattens, so ids2 ends up one-dimensional.
        ids2 = np.append(ids2, start_idx + em_ids)

    return ids1, ids2, counts

def num_string(num):
    """Format a number as a zero-padded string of at least five digits.

    Args:
        num: Integer or float counter value; any fractional part is truncated
            with ``int()`` before formatting.

    Returns:
        The integer part of ``num`` left-padded with zeros to five characters,
        e.g. ``7 -> "00007"``. Values with more than five digits are returned
        in full rather than truncated.

    Uses the ``:05d`` format spec instead of slicing by ``math.log10(num)``,
    which raised ValueError for negative input and returned a too-short string
    for values between 0 and 0.1.
    """
    return f"{int(num):05d}"

def is_corrupted(file_path):
    """Return True when ``file_path`` cannot be opened and verified as an image.

    Args:
        file_path: Path of the file to check.

    Returns:
        ``False`` if ``Image.open`` succeeds and ``img.verify()`` raises
        nothing; ``True`` if either step raises an ``Exception`` (missing
        file, unreadable or broken image data, ...).

    Only ``Exception`` is caught, so ``KeyboardInterrupt`` and ``SystemExit``
    still propagate; interrupting a long scan is no longer swallowed and
    reported as a corrupt file.
    """
    try:
        with Image.open(file_path) as img:
            img.verify()  # verify that it is, in fact an image
    except Exception:
        return True
    return False

# Aff Wild Processing

In [24]:
# Locations inside the raw Aff-Wild-V2 download: annotation files and images.
base_path_raw = "/Users/benhoskings/Documents/Datasets/Aff-Wild-V2/Provided"
label_path = os.path.join(base_path_raw, "Third ABAW Annotations/MTL_Challenge")
image_path_raw = os.path.join(base_path_raw, "Images")

# Load the annotations, keep rows with a non-negative expression label and
# order them by expression so each class forms one contiguous run of rows.
train_labels = (
    pd.read_csv(os.path.join(label_path, "train_set.txt"), index_col=0)
    .loc[lambda frame: frame["expression"] >= 0]
    .sort_values(by=["expression"])
)

# Drop rows whose image fails the is_corrupted check, then keep only the first
# row for any repeated index entry.
corrupt = np.array(
    [
        is_corrupted(os.path.join(image_path_raw, "batch1", rel_path))
        for rel_path in train_labels.index
    ]
)
train_labels = train_labels.loc[np.logical_not(corrupt)]
train_labels = train_labels.loc[~train_labels.index.duplicated(keep="first")]

# Rows per expression class, rearranged into ascending label order.
aff_wild_class_count = train_labels.value_counts(subset=["expression"])
class_labels = np.array([key[0] for key in aff_wild_class_count.index], dtype=np.uint16)
class_count = np.array(aff_wild_class_count, dtype=np.int64)[np.argsort(class_labels)]

In [25]:
# Balanced draw: the same number of row positions from every expression class,
# followed by a per-class count of the selected rows as a sanity check.
id1, id2, sample_count = get_sample_ids(emotions=emotions_aff_wild, counts=class_count)
train_subset = train_labels.iloc[id2]
print(train_subset.value_counts(subset=["expression"]))

expression
0             2053
1             2053
2             2053
3             2053
4             2053
5             2053
6             2053
7             2053
Name: count, dtype: int64


In [26]:
# Running number of images already written per emotion; used to number files.
class_counts = np.zeros((len(emotions_aff_wild), 1))

# Copy each sampled image to <train folder>/<emotion>/AW-<zero-padded index>.png.
# NOTE(review): the file is copied byte-for-byte but always named .png —
# confirm the source images are PNG or that downstream readers ignore the extension.
for im_path, expression in train_subset["expression"].items():
    emotion_idx = int(expression)
    emotion = emotions_aff_wild[emotion_idx]
    file_number = num_string(class_counts[emotion_idx].item())
    sample_path = os.path.join(train_path_structured, emotion, "AW-" + file_number)
    shutil.copy(os.path.join(image_path_raw, "batch1", im_path), f"{sample_path}.png")
    class_counts[emotion_idx] += 1
    

# AffectNet Processing

In [27]:
# Probe the raw AffectNet folder: for every candidate sample id, record whether
# its image file and its expression annotation file exist on disk.
base_path_raw = "/Users/benhoskings/Documents/Datasets/AffectNet/Data/train_set-1"
has_image = np.fromiter(
    (os.path.isfile(f"{base_path_raw}/images/{idx}.jpg") for idx in range(414796)),
    dtype=bool,
)
has_emotion = np.fromiter(
    (os.path.isfile(f"{base_path_raw}/annotations/{idx}_exp.npy") for idx in range(414796)),
    dtype=bool,
)

In [28]:
# One row per candidate sample id, flagged with the two existence checks;
# only ids that have both an image and an expression annotation are kept.
train_labels_affect_net = pd.DataFrame(
    {"has_image": has_image, "has_emotion": has_emotion},
    index=range(414796),
)
train_labels_affect_net = train_labels_affect_net.loc[
    train_labels_affect_net["has_image"] & train_labels_affect_net["has_emotion"]
]

In [29]:
# Full image path for every sample id that passed the existence filter.
image_paths = [f"{base_path_raw}/images/{idx}.jpg" for idx in train_labels_affect_net.index]
# One expression label per remaining id, loaded from its <id>_exp.npy file and
# stored as uint8.
affect_net_emotions = np.array([np.load(f"{base_path_raw}/annotations/{idx}_exp.npy") for idx in train_labels_affect_net.index], dtype=np.uint8)

In [30]:
# Attach the image path and the expression label to each row as new columns.
train_labels_affect_net["image_path"] = image_paths
train_labels_affect_net["emotion"] = affect_net_emotions

In [31]:
# Key the table by image path; the copy loop at the end of the notebook
# iterates over this index and uses each entry as the source file.
train_labels_affect_net = train_labels_affect_net.set_index("image_path")

In [32]:
# Order rows by emotion label so each class forms one contiguous run, then keep
# only the first row for any repeated image path.
train_labels_affect_net = (
    train_labels_affect_net
    .sort_values(by=["emotion"])
    .loc[lambda frame: ~frame.index.duplicated(keep="first")]
)

In [33]:
# Rows per emotion class, rearranged into ascending label order.
affect_net_class_count = train_labels_affect_net.value_counts(subset=["emotion"])
class_labels = np.array(list(affect_net_class_count.index)).flatten()
class_count = np.array(affect_net_class_count)[np.argsort(class_labels)]

In [34]:
# Balanced draw: the same number of row positions from every emotion class,
# followed by a per-class count of the selected rows as a sanity check.
id1, id2, sample_count = get_sample_ids(emotions_affect_net, class_count)
train_subset = train_labels_affect_net.iloc[id2]
print(train_subset.value_counts(subset=["emotion"]))

emotion
0          3750
1          3750
2          3750
3          3750
4          3750
5          3750
6          3750
7          3750
Name: count, dtype: int64


In [35]:
# Running number of images already written per emotion; used to number files.
class_counts = np.zeros((len(emotions_affect_net), 1))

# Copy each sampled image to <train folder>/<emotion>/AN-<zero-padded index>.png.
# NOTE(review): the sources are .jpg paths and shutil.copy does not convert
# them, so the resulting ".png" files still contain JPEG data.
for im_path, label in train_subset["emotion"].items():
    emotion_idx = int(label)
    emotion = emotions_affect_net[emotion_idx]
    file_number = num_string(class_counts[emotion_idx].item())
    sample_path = os.path.join(train_path_structured, emotion, "AN-" + file_number)
    shutil.copy(im_path, f"{sample_path}.png")
    class_counts[emotion_idx] += 1