# Face Emotion Recognition

Dataset: [tukey/human_face_emotions_roboflow](https://huggingface.co/datasets/tukey/human_face_emotions_roboflow/viewer/default/train?p=1&views%5B%5D=train) on Hugging Face (train split).

In [2]:
import pandas as pd
import io
from PIL import Image

# Location of the train split's single parquet shard on the Hugging Face hub.
TRAIN_PARQUET_URI = "hf://datasets/tukey/human_face_emotions_roboflow/data/train-00000-of-00001.parquet"

# Read the whole shard into one DataFrame.
df = pd.read_parquet(TRAIN_PARQUET_URI)

  from .autonotebook import tqdm as notebook_tqdm


# Data Overview & Cleaning

In [3]:
# Standardize column names (strip whitespace, lower-case, replace spaces with underscores)
df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]

# Check for missing values
print("Missing values per column:")
print(df.isna().sum())

# The counts above report missing values per column (all zero in the recorded run).
# Duplicate rows are not checked in this cell.

# Print out summary information.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
print("\nDataframe Info:")
df.info()

# Print the first few rows to inspect the data
print("\nFirst 5 rows of the dataset:")
print(df.head())

Missing values per column:
image    0
qa       0
dtype: int64

Dataframe Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9400 entries, 0 to 9399
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   9400 non-null   object
 1   qa      9400 non-null   object
dtypes: object(2)
memory usage: 147.0+ KB
None

First 5 rows of the dataset:
                                               image  \
0  {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...   
1  {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...   
2  {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...   
3  {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...   
4  {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...   

                                                  qa  
0  [{'question': 'How does the person feel in the...  
1  [{'question': 'How does the person feel in the...  
2  [{'question': 'How does the person feel in the...  
3  [{'question': 'How does the person

In [4]:
# Count null entries per column. The heading and the counts are emitted by one
# print call, separated by a newline.
print("\nMissing values per column:", df.isnull().sum(), sep="\n")


Missing values per column:
image    0
qa       0
dtype: int64


In [5]:
import json
import numpy as np

def extract_emotion(qa_entry):
    """Return the ``"answer"`` value of the first QA record in ``qa_entry``.

    Accepted inputs are a list or tuple of records, a NumPy array of records,
    or a JSON string that decodes to a list of records. A record is any object
    exposing a dict-style ``get`` method.

    Parameters
    ----------
    qa_entry : str | list | tuple | numpy.ndarray
        One cell of the ``qa`` column.

    Returns
    -------
    object or None
        The value stored under ``"answer"`` in the first record. ``None`` is
        returned when the string is not valid JSON, when the decoded data is
        not a non-empty list/tuple, when the first record has no ``get``
        method, or when the record has no ``"answer"`` key. In the first three
        cases a diagnostic line is printed instead of raising.
    """
    if isinstance(qa_entry, str):
        qa_entry = qa_entry.strip()
        try:
            qa_data = json.loads(qa_entry)
        except (ValueError, RecursionError) as e:
            # json.JSONDecodeError is a ValueError; RecursionError covers
            # pathologically nested input. Only the decode step is guarded so
            # unrelated errors are not hidden.
            print("Error parsing qa entry:", qa_entry, "\nError:", e)
            return None
    else:
        qa_data = qa_entry

    # Convert a NumPy array to a plain list so it goes through the same
    # list/tuple check below.
    if isinstance(qa_data, np.ndarray):
        qa_data = qa_data.tolist()

    if not isinstance(qa_data, (list, tuple)) or len(qa_data) == 0:
        print("Unexpected qa_data structure:", qa_data, "with type", type(qa_data))
        return None

    first_record = qa_data[0]
    lookup = getattr(first_record, "get", None)
    if not callable(lookup):
        # Report a malformed record for what it is rather than as a parse error.
        print("Unexpected qa record:", first_record, "with type", type(first_record))
        return None

    return lookup("answer")

In [6]:
# Build the label column by running extract_emotion over every entry of 'qa'.
emotion_labels = df["qa"].apply(extract_emotion)
df["emotion"] = emotion_labels

# Show the QA payload next to the extracted label for the first rows.
print(df[["qa", "emotion"]].head())

                                                  qa  emotion
0  [{'question': 'How does the person feel in the...      sad
1  [{'question': 'How does the person feel in the...    anger
2  [{'question': 'How does the person feel in the...  neutral
3  [{'question': 'How does the person feel in the...     fear
4  [{'question': 'How does the person feel in the...  content


In [7]:
# Report the distinct emotion labels and how often each occurs.
# Skipped entirely when the 'emotion' column is absent.
if 'emotion' in df.columns:
    emotion_column = df['emotion']

    print("\nUnique emotion labels:", emotion_column.unique(), sep="\n")
    print("\nDistribution of emotion labels:", emotion_column.value_counts(), sep="\n")


Unique emotion labels:
['sad' 'anger' 'neutral' 'fear' 'content' 'happy' 'disgust' 'surprise']

Distribution of emotion labels:
emotion
surprise    1238
neutral     1225
sad         1184
fear        1181
anger       1175
disgust     1165
content     1144
happy       1088
Name: count, dtype: int64


In [8]:
import matplotlib.pyplot as plt

# Plot how many rows carry each emotion label.
# The frame has no 'score' column (its columns are image, qa and the derived
# emotion label), so a guard on 'score' is never true and nothing would be drawn.
# The label counts are the distribution this data can actually show.
if 'emotion' in df.columns:
    emotion_counts = df['emotion'].value_counts()

    fig, ax = plt.subplots()
    ax.bar(emotion_counts.index.astype(str), emotion_counts.to_numpy(), edgecolor='k')
    ax.set_xlabel("Emotion")
    ax.set_ylabel("Number of images")
    ax.set_title("Distribution of Emotion Labels")
    ax.tick_params(axis='x', rotation=45)  # keep the category names from overlapping
    fig.tight_layout()
    plt.show()

In [10]:
# Write the DataFrame to a parquet file in the working directory.
CLEANED_PARQUET_PATH = "cleaned_human_face_emotions.parquet"
df.to_parquet(CLEANED_PARQUET_PATH)
# CSV alternative, left disabled:
# df.to_csv("cleaned_human_face_emotions.csv", index=False)