# Visual Model Preprocessing 

In [2]:
import pandas as pd
import os
from sklearn.utils import resample

In [3]:
images = pd.read_csv("data_csv/fer.csv")

Below we print the number of instances for each class 

In [4]:
# Get the count of each label
label_counts = images['emotion'].value_counts()

# Display the count of each label
print(label_counts)

emotion
3    8989
6    6198
4    6077
2    5121
0    4953
5    4002
1     547
Name: count, dtype: int64


The structure of our data is ouputted below 

In [5]:
images.shape
images.columns

Index(['emotion', 'pixels', 'Usage'], dtype='object')

Step 1: Identify the class with the highest frequency (majority class) and the classes with lower frequencies (minority classes)

In [6]:
label_counts = images['emotion'].value_counts()
majority_class = label_counts.idxmax()
minority_classes = label_counts[label_counts.index != majority_class].index.tolist()

print(majority_class)
print(minority_classes)

3
[6, 4, 2, 0, 5, 1]


Step 2: Calculate the count of the majority class

In [7]:
majority_count = label_counts[majority_class]
print(majority_count)

8989


Step 3: Upsample the minority classes

In [8]:
balanced_df = pd.DataFrame(columns=images.columns)
for label in minority_classes:
    # Select rows with the current minority class
    minority_df = images[images['emotion'] == label]
    
    # Upsample the minority class to match the count of the majority class
    minority_df_upsampled = resample(
        minority_df,
        replace=True,  # sample with replacement to duplicate samples
        n_samples=majority_count,  # match majority class count
        random_state=42  # for reproducibility
    )
    
    # Concatenate the upsampled minority class and the majority class
    balanced_df = pd.concat([balanced_df, minority_df_upsampled])

# Include the majority class without resampling
balanced_df = pd.concat([balanced_df, images[images['emotion'] == majority_class]])

# Step 4: Shuffle the DataFrame to mix the classes
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

Below we check whether we succesfully upsampled minority classes 

In [9]:
# Get the count of each label
label_counts = balanced_df['emotion'].value_counts()
# Display the count of each label
print(label_counts)

emotion
2    8989
3    8989
4    8989
0    8989
5    8989
6    8989
1    8989
Name: count, dtype: int64


We now divide the data into training, validation, and testing datasets 

In [10]:
# Calculate the sizes for each subset
total_size = len(images)
first_subset_size = int(total_size * 0.2)
second_subset_size = int(total_size * 0.7)

# Split the DataFrame into three non-overlapping subsets
test_df = images.iloc[:first_subset_size]
train_df = images.iloc[first_subset_size:first_subset_size + second_subset_size]
val_df = images.iloc[first_subset_size + second_subset_size:]

print(test_df.shape)
print(train_df.shape)
print(val_df.shape)


(7177, 3)
(25120, 3)
(3590, 3)


Save the data for model assembly

In [11]:
train_df.to_csv('data_csv/train.csv')
test_df.to_csv('data_csv/test.csv')
val_df.to_csv('data_csv/val.csv')