In [21]:
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import numpy as np
import pandas as pd
import cv2
import os

# 1. Get Train Labels CSV

In [5]:
# Load the complete label table and report its (rows, columns) shape.
df = pd.read_csv('AllLabels.csv')
print(df.shape)
# Keep the preview as the cell's last expression; a bare expression placed
# mid-cell is evaluated and silently discarded by the notebook.
df.head()

(8925, 5)


In [6]:
# Load labels from the CSV
def load_labels(csv_path):
    """Read a label CSV and map each clip ID to its four label values.

    Args:
        csv_path: Path of a CSV file with the columns ``ClipID``, ``Boredom``,
            ``Engagement``, ``Confusion`` and ``Frustration``.

    Returns:
        dict: ``{clip_id: [boredom, engagement, confusion, frustration]}``,
        where ``clip_id`` is a string with any trailing ``.avi`` / ``.mp4``
        extension removed.

    Raises:
        KeyError: If one of the expected columns is missing.
    """
    # Force ClipID to str: when the IDs carry no extension pandas would parse
    # them as integers, and string operations on them would then fail.
    df = pd.read_csv(csv_path, dtype={'ClipID': str})
    # Remove the extension only when it is the suffix of the ID.
    clip_ids = df['ClipID'].str.replace(r'\.(?:avi|mp4)$', '', regex=True)
    label_rows = df[['Boredom', 'Engagement', 'Confusion', 'Frustration']].values.tolist()
    return dict(zip(clip_ids, label_rows))

In [7]:
# Process the dataset
def process_dataset(frames_root_folder, output_file="video_folders.txt"):
    """Write the names of all second-level entries under a root to a text file.

    Every path matching ``<frames_root_folder>/*/*`` contributes its final
    path component as one line of ``output_file``.

    Args:
        frames_root_folder: Root directory to scan (``str`` or ``Path``).
        output_file: Text file to (over)write with one name per line.
    """
    frames_root_folder = Path(frames_root_folder)
    video_folders = list(frames_root_folder.glob("*/*"))
    with open(output_file, "w") as f:
        for folder in video_folders:
            # Path.name is separator-agnostic; splitting on "\\" only works on
            # Windows and would write the whole path on POSIX systems.
            f.write(folder.name + "\n")

In [9]:
# Directory that is scanned for per-clip folders; their names go to video_folders.txt.
frames_root = "output_frames/Train"
process_dataset(frames_root_folder=frames_root)

In [10]:
# Load the text file with ClipIDs (one per line), skipping blank lines so an
# empty string never ends up in the lookup list.
with open("video_folders.txt", "r") as f:
    clip_ids = [line.strip() for line in f if line.strip()]
# Strip only a trailing .avi/.mp4 extension. The explicit anchored regex avoids
# relying on the default of `regex`, which differs between pandas versions
# (with regex=True a bare '.avi' pattern treats '.' as a wildcard).
# astype(str) keeps the cell working when ClipID has no extension and was
# therefore parsed as a number.
df['ClipID'] = df['ClipID'].astype(str).str.replace(r'\.(?:avi|mp4)$', '', regex=True)
# Keep only the rows whose clip ID appears in the list read above.
filtered_df = df[df['ClipID'].isin(clip_ids)]
filtered_df.shape

(5481, 5)

In [None]:
# Persist the filtered labels to TrainLabels.csv without writing the index.
filtered_df.to_csv(path_or_buf="TrainLabels.csv", index=False)

# 2. Get Frames per Video

In [23]:
def extract_frames_from_video(video_path, output_folder, max_frames=None):
    """Save the leading frames of a video as numbered JPEG files.

    Frames are written in read order as ``frame_0000.jpg``, ``frame_0001.jpg``,
    and so on, inside ``output_folder``.

    Args:
        video_path: Path of the video; passed to OpenCV as ``str(video_path)``.
        output_folder: Destination directory (``str`` or ``Path``). It is
            created, including parents, when it does not exist.
        max_frames: Upper bound on the number of frames read. ``None`` reads
            until the video ends; ``0`` reads nothing.

    Returns:
        int: Number of frames for which ``cv2.imwrite`` reported success.
    """
    output_folder = Path(output_folder)
    output_folder.mkdir(parents=True, exist_ok=True)

    cap = cv2.VideoCapture(str(video_path))
    frame_count = 0
    saved = 0
    try:
        # Compare against None explicitly so max_frames=0 is honoured as a
        # limit; checking the limit before reading also avoids decoding one
        # frame that would never be written.
        while max_frames is None or frame_count < max_frames:
            success, frame = cap.read()
            if not success:
                break
            frame_filename = output_folder / f"frame_{frame_count:04d}.jpg"
            if cv2.imwrite(str(frame_filename), frame):
                saved += 1
            frame_count += 1
    finally:
        # Always release the capture handle, even if reading or writing raises.
        cap.release()
    return saved

def get_all_video_paths(dataset_dir, subset="Train"):
    """Collect every ``.avi`` / ``.mp4`` file found two directory levels below a subset.

    The layout walked is ``<dataset_dir>/<subset>/<dir>/<dir>/<file>``; entries
    that are not directories at either intermediate level are skipped, and the
    extension check is case-insensitive.

    Args:
        dataset_dir: Root directory of the dataset.
        subset: Name of the sub-directory of ``dataset_dir`` to scan.

    Returns:
        list[Path]: Matching video files, in directory-iteration order.
    """
    video_suffixes = (".avi", ".mp4")
    subset_root = Path(dataset_dir) / subset
    return [
        candidate
        for outer_dir in subset_root.iterdir()
        if outer_dir.is_dir()
        for inner_dir in outer_dir.iterdir()
        if inner_dir.is_dir()
        for candidate in inner_dir.glob("*.*")
        if candidate.suffix.lower() in video_suffixes
    ]

def getFramesPerVideo(dataset_dir, subset="Train",max_frames_per_video=None, output_base="output_frames"):
    """Extract frames from every video of a subset into a mirrored folder tree.

    Each video's frames are written under ``output_base`` at the video's
    parent-directory path taken relative to ``dataset_dir``.

    Args:
        dataset_dir: Root directory of the dataset.
        subset: Sub-directory of ``dataset_dir`` whose videos are processed.
        max_frames_per_video: Forwarded to ``extract_frames_from_video`` as
            ``max_frames``.
        output_base: Root directory for the extracted frames.
    """
    video_paths = get_all_video_paths(dataset_dir, subset=subset)
    print(f"Procesando {len(video_paths)} videos del conjunto {subset}...")

    destination_root = Path(output_base)
    progress = tqdm(video_paths, desc="Extrayendo frames")
    for source_video in progress:
        # Reproduce the clip's directory layout below the output root.
        mirrored_dir = destination_root / source_video.relative_to(dataset_dir).parent
        mirrored_dir.mkdir(parents=True, exist_ok=True)
        extract_frames_from_video(source_video, mirrored_dir, max_frames=max_frames_per_video)

In [None]:
subset = "Train"  # Switch to "Test" or "Validation" as needed
getFramesPerVideo(
    "../Datasets/DaiSee/DAiSEE/DataSet/",  # Root path of the DAiSEE dataset
    subset,
    # max_videos=6000,            # How many videos to process (disabled)
    max_frames_per_video=75,      # Maximum frames per video (None = all)
    output_base="output_frames",  # Folder where the extracted frames are saved
)

Procesando 5481 videos del conjunto Train...


Extrayendo frames:   8%|▊         | 465/5481 [06:03<49:12,  1.70it/s]  

# 3. Balance Data

In [4]:
# Reload the saved training labels, then print a preview followed by the shape.
df_train = pd.read_csv(filepath_or_buffer="TrainLabels.csv")
print(df_train.head(), df_train.shape, sep="\n")

       ClipID  Boredom  Engagement  Confusion  Frustration
0  1100011002        0           2          0            0
1  1100011003        0           2          0            0
2  1100011004        0           3          0            0
3  1100011005        0           3          0            0
4  1100011006        0           3          0            0
(5481, 5)


In [7]:
# Print the per-level frequency table of each of the four label columns.
emotion_columns = ('Boredom', 'Engagement', 'Confusion', 'Frustration')
print(*(df_train[column].value_counts() for column in emotion_columns))

Boredom
0    2488
1    1763
2    1074
3     156
Name: count, dtype: int64 Engagement
2    2649
3    2584
1     214
0      34
Name: count, dtype: int64 Confusion
0    3691
1    1287
2     436
3      67
Name: count, dtype: int64 Frustration
0    4285
1     959
2     194
3      43
Name: count, dtype: int64


In [6]:
# Show the training-label frame as the cell's output.
df_train

Unnamed: 0,ClipID,Boredom,Engagement,Confusion,Frustration
0,1100011002,0,2,0,0
1,1100011003,0,2,0,0
2,1100011004,0,3,0,0
3,1100011005,0,3,0,0
4,1100011006,0,3,0,0
...,...,...,...,...,...
5476,5221290275,1,2,1,1
5477,5221290279,0,3,0,0
5478,5221290280,1,1,0,1
5479,5221290282,0,3,1,0


In [8]:
import pandas as pd

def balance_per_emotion(df, emotion, samples_per_class=34, levels=range(4), random_state=42):
    """Down-sample a frame so each level of one label column has equal size.

    For every value in ``levels`` the rows where ``df[emotion]`` equals that
    value are selected. Levels with at least ``samples_per_class`` rows are
    randomly sampled down to exactly that many; smaller levels are kept whole
    and a message is printed.

    Args:
        df: Source DataFrame containing the column ``emotion``.
        emotion: Name of the label column to balance on.
        samples_per_class: Target number of rows per level.
        levels: Iterable of level values to include (default ``0..3``).
        random_state: Seed passed to ``DataFrame.sample`` for reproducibility.

    Returns:
        pandas.DataFrame: The per-level pieces concatenated in ``levels``
        order, with a fresh ``0..n-1`` index.
    """
    balanced = []
    for level in levels:
        subset = df[df[emotion] == level]
        if len(subset) >= samples_per_class:
            balanced.append(subset.sample(samples_per_class, random_state=random_state))
        else:
            print(f"No hay suficientes muestras para {emotion} nivel {level}. Solo {len(subset)} disponibles.")
            # Keep whatever is available, even if it is fewer than requested.
            balanced.append(subset)
    return pd.concat(balanced).reset_index(drop=True)

# Balance each emotion separately, producing one frame per label column.
boredom_df, engagement_df, confusion_df, frustration_df = (
    balance_per_emotion(df_train, target)
    for target in ('Boredom', 'Engagement', 'Confusion', 'Frustration')
)

# Four datasets now exist; report the shape of each.
for caption, balanced_frame in (
    ("Boredom shape:", boredom_df),
    ("Engagement shape:", engagement_df),
    ("Confusion shape:", confusion_df),
    ("Frustration shape:", frustration_df),
):
    print(caption, balanced_frame.shape)

# Optional: save each one separately
# boredom_df.to_csv('boredom_balanced.csv', index=False)
# engagement_df.to_csv('engagement_balanced.csv', index=False)
# confusion_df.to_csv('confusion_balanced.csv', index=False)
# frustration_df.to_csv('frustration_balanced.csv', index=False)

Boredom shape: (136, 5)
Engagement shape: (136, 5)
Confusion shape: (136, 5)
Frustration shape: (136, 5)


In [18]:
# Show the (rows, columns) shape of the frame balanced on 'Boredom'.
boredom_df.shape

(136, 5)

In [17]:
# Show how each label column is distributed inside the frame balanced on 'Boredom'.
tuple(
    boredom_df[label].value_counts()
    for label in ('Boredom', 'Engagement', 'Confusion', 'Frustration')
)

(Boredom
 0    34
 1    34
 2    34
 3    34
 Name: count, dtype: int64,
 Engagement
 2    65
 3    57
 1    13
 0     1
 Name: count, dtype: int64,
 Confusion
 0    82
 1    35
 2    17
 3     2
 Name: count, dtype: int64,
 Frustration
 0    99
 1    27
 2     7
 3     3
 Name: count, dtype: int64)