In [1]:
import os
import sys
from tqdm import tqdm
import xml.etree.ElementTree as ET

# Put the `src` directory (two levels above the working directory) on the
# import path so the `tools` package imported below can be found.
src_path = os.path.abspath(os.path.join('../../', 'src'))
already_importable = any(entry == src_path for entry in sys.path)
if not already_importable:
    sys.path += [src_path]

from tools.data_processor import DataProcessor
from tools.frame_processors import SupervisionVertexProcessorWithLandmarkFrontalization
from tools.frame_preprocessors import TextureFrontalizationPreprocessor, FaceExtractionPreprocessor




In [2]:
def _data_path(*parts):
    """Return the absolute form of ``../../data/<parts...>``, resolved against the working directory."""
    return os.path.abspath(os.path.join('..', '..', 'data', *parts))


# Input videos, the per-question cuts made from them, and the final outputs.
SILESIAN_DATA_PATH = _data_path('raw', 'silesian_deception')
MIDWAY_DATA_PATH = _data_path('raw', 'silesian_deception_cut')
PROCESSED_DATA_PATH = _data_path('processed', 'silesian_deception')
REFERENCE_POINTS_PATH = _data_path('reference_points', 'key_points_xyz.npy')

# Model files live inside the source tree rather than under the data directory.
MODELS_PATH = os.path.join(src_path, 'models', 'frontalization_models')

### For this dataset we first need to cut the videos into separate questions based on Truth / Deception

In [3]:
# One flag per question, taken from
# https://www.researchgate.net/publication/301461665_Silesian_Deception_Database_Presentation_and_Analysis
# NOTE(review): presumably 1 marks a deceptive answer and 0 a truthful one — confirm against the paper.
QUESTIONS_DECEPTION = [
    0, 0, 1, 1, 1,
    1, 1, 1, 0, 1,
]

In [4]:
# Filled by the next cell as mappings[subfolder][person_id][question_number] = (start, end).
mappings = dict()

In [5]:
def _read_question_intervals(eaf_path):
    """Parse one ELAN ``.eaf`` file and return ``{question_number: (start, end)}``.

    Only TIER elements directly under the root whose DEFAULT_LOCALE is "pl" and
    whose LINGUISTIC_TYPE_REF and TIER_ID are both "Question" are read.
    Annotations are numbered from 1 in document order; ``start`` and ``end`` are
    the integer TIME_VALUEs of the two referenced time slots (the cutting cell
    divides them by 1000, so presumably milliseconds).

    An annotation that references an unknown time slot, or a slot without a
    TIME_VALUE, is reported and left out; its number is not reused, so the
    remaining questions keep their position-based numbers.

    Raises:
        OSError: if the file cannot be opened.
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
    """
    tree = ET.parse(eaf_path)
    slot_times = {
        slot.get("TIME_SLOT_ID"): slot.get("TIME_VALUE")
        for slot in tree.findall(".//TIME_SLOT")
    }
    intervals = {}
    for tier in tree.findall("TIER"):
        is_question_tier = (
            tier.get("DEFAULT_LOCALE") == "pl"
            and tier.get("LINGUISTIC_TYPE_REF") == "Question"
            and tier.get("TIER_ID") == "Question"
        )
        if not is_question_tier:
            continue
        for i, annotation in enumerate(tier.findall(".//ANNOTATION/ALIGNABLE_ANNOTATION"), start=1):
            start = slot_times.get(annotation.get("TIME_SLOT_REF1"))
            end = slot_times.get(annotation.get("TIME_SLOT_REF2"))
            if start is None or end is None:
                # int(None) would abort the whole cell; drop just this question instead.
                print(f"Skipping question {i} in {eaf_path}: time slot has no TIME_VALUE")
                continue
            intervals[i] = (int(start), int(end))
    return intervals


# For every recording, read the question intervals from the .eaf annotation
# file that sits next to the video and shares its name.
for subfolder in ["poli1Video", "poli2Video", "poli3Video"]:
    mappings[subfolder] = {}
    SUBFOLDER_DIR = os.path.join(SILESIAN_DATA_PATH, subfolder)
    for file in os.listdir(SUBFOLDER_DIR):
        # endswith, not a substring test: "x.avi.bak" must not count as a video.
        if not file.endswith(".avi"):
            continue
        person_id = file.split(".")[0]
        eaf_path = os.path.join(SUBFOLDER_DIR, f"{person_id}.eaf")
        try:
            mappings[subfolder][person_id] = _read_question_intervals(eaf_path)
        except (OSError, ET.ParseError) as e:
            # One missing or corrupt annotation file should not abort the batch;
            # the recording keeps an empty mapping.
            mappings[subfolder][person_id] = {}
            print(f"Could not read annotations for {file}: {e}")
        

In [6]:
from moviepy import VideoFileClip


# Cut every recording into one clip per question, using the intervals collected
# in `mappings`. Clips are written to MIDWAY_DATA_PATH/<subfolder>/<person>_<question>.avi.
for subfolder in ["poli1Video", "poli2Video", "poli3Video"]:
    SUBFOLDER_DIR = os.path.join(SILESIAN_DATA_PATH, subfolder)
    OUTPUT_DIR = os.path.join(MIDWAY_DATA_PATH, subfolder)
    # makedirs also creates MIDWAY_DATA_PATH itself; os.mkdir fails when the parent is missing.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    for file in os.listdir(SUBFOLDER_DIR):
        # endswith, not a substring test: "x.avi.bak" must not count as a video.
        if not file.endswith(".avi"):
            continue
        person_id = file.split(".")[0]

        # Resume per question rather than per recording, so a run interrupted
        # half-way through a recording still produces the remaining clips.
        pending_questions = [
            q_id
            for q_id in range(1, 11)
            if not os.path.exists(os.path.join(OUTPUT_DIR, f"{person_id}_{q_id}.avi"))
        ]
        if not pending_questions:  # skip for existing
            continue

        # Open the source video once per recording instead of once per question.
        with VideoFileClip(os.path.join(SUBFOLDER_DIR, file)) as video:
            for q_id in pending_questions:
                try:
                    # Stored interval values are divided by 1000 before cutting.
                    start, end = mappings[subfolder][person_id][q_id]
                    new = video.subclipped(start / 1000, end / 1000)
                    new.write_videofile(
                        os.path.join(OUTPUT_DIR, f"{person_id}_{q_id}.avi"),
                        codec="libx264",
                        preset="ultrafast",
                        bitrate="5000k",
                        audio=False,
                        logger=None
                    )
                except Exception as e:
                    # Best effort: report the failed question and go on with the next one.
                    print(f"Error processing {file}: {e}")

### Data processor

In [7]:
# Frame preprocessors handed to the DataProcessor, in this order:
# face extraction to 200x200 without skipping bad frames, then texture
# frontalization with symmetry calculation enabled.
_frame_preprocessors = [
    FaceExtractionPreprocessor(output_size=(200, 200), skip_bad_frames=False),
    TextureFrontalizationPreprocessor(do_calculate_symmetry=True, models_path=MODELS_PATH),
]

# do_make_face_mesh=False: output pure landmarks.
_frame_processor = SupervisionVertexProcessorWithLandmarkFrontalization(
    do_make_face_mesh=False,
    reference_points_path=REFERENCE_POINTS_PATH,
)

dp = DataProcessor(
    frame_preprocessors=_frame_preprocessors,
    frame_processor=_frame_processor,
)

### Process Silesian data

In [8]:
# os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

In [9]:
# Create the output directory first: on a fresh run it does not exist yet and
# os.listdir would raise FileNotFoundError.
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

# Names (text before the first dot) of everything already in the output
# directory, so finished clips can be skipped when the notebook is re-run.
already_processed_data = {entry.split(".")[0] for entry in os.listdir(PROCESSED_DATA_PATH)}

In [10]:
# Every (subfolder, filename) pair found in the per-question cut directories.
files_to_process = [
    (subfolder, file)
    for subfolder in ("poli1Video", "poli2Video", "poli3Video")
    for file in os.listdir(os.path.join(MIDWAY_DATA_PATH, subfolder))
]

In [11]:
# Run the data processor on every cut clip, skipping clips whose output name is
# already present in the processed-data directory.
for subfolder, file in tqdm(files_to_process, desc="Processing files", total=len(files_to_process), leave=False):
    file_path = os.path.join(MIDWAY_DATA_PATH, subfolder, file)
    # Take the stem outside the f-string: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    clip_stem = file.split(".")[0]
    out_file = f"{subfolder}_{clip_stem}"
    if out_file in already_processed_data:
        continue
    try:
        dp.process_data(file_path, os.path.join(PROCESSED_DATA_PATH, out_file))
    except Exception as e:
        # Best effort: report the failure and move on to the next clip.
        print(f"Could not process {file_path} due to {e}")

                                                                        