In [6]:
import os
import re
import tempfile

import cv2
import face_recognition
import ffmpeg
import librosa
import nltk
import numpy as np
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.cluster import DBSCAN
from tqdm import tqdm

# Fetch the NLTK resources this notebook asks for (no-op when already up to date)
for _nltk_resource in ('punkt', 'vader_lexicon'):
    nltk.download(_nltk_resource)

class DeceptionPreprocessor:
    """Build a per-clip feature table from a deception-detection dataset folder.

    The dataset root is expected to contain ``Clips/{Deceptive,Truthful}`` with
    ``.mp4`` files and ``Transcription/{Deceptive,Truthful}`` with matching
    ``.txt`` transcripts. For every clip that has a transcript the pipeline
    records a face-cluster based subject id, the class label, VADER sentiment
    scores of the transcript and the mean MFCC vector of the audio track.
    """

    # Number of MFCC coefficients per clip. 20 is librosa's default for
    # ``librosa.feature.mfcc``; it is pinned here so the all-zero fallback
    # returned on failure has the same length as a successful feature vector.
    N_MFCC = 20

    def __init__(self, base_path):
        """Resolve all dataset sub-paths below ``base_path`` and create the sentiment analyzer.

        Args:
            base_path: Root directory of the dataset.
        """
        self.base_path = base_path
        self.annotation_file = os.path.join(base_path, "Annotation", "All_Gestures_Deceptive and Truthful.csv")
        self.video_dirs = {
            "deceptive": os.path.join(base_path, "Clips", "Deceptive"),
            "truthful": os.path.join(base_path, "Clips", "Truthful")
        }
        self.transcript_dirs = {
            "deceptive": os.path.join(base_path, "Transcription", "Deceptive"),
            "truthful": os.path.join(base_path, "Transcription", "Truthful")
        }
        self.face_embeddings = {}
        # Maps video file name -> "subject_<cluster id>"; filled by preprocess().
        # Keyed by bare file name, so names must be unique across label folders.
        self.cluster_labels = {}
        self.sia = SentimentIntensityAnalyzer()

    @staticmethod
    def _list_videos(video_dir):
        """Return the ``.mp4`` file names in ``video_dir`` in sorted order.

        Sorting makes cluster numbering and output row order reproducible,
        since ``os.listdir`` gives no ordering guarantee.
        """
        return sorted(name for name in os.listdir(video_dir) if name.endswith('.mp4'))

    def preprocess(self, output_path="preprocessed_data.pkl"):
        """Run the full pipeline and pickle the resulting DataFrame.

        Steps: (1) take one face embedding per clip, (2) cluster the embeddings
        into subjects, (3) extract text and audio features for every clip that
        has a transcript, (4) save the table.

        Args:
            output_path: Destination of the pickled DataFrame. Defaults to
                ``preprocessed_data.pkl`` in the current working directory.
        """
        all_embeddings = []
        video_files = []

        print("🔍 Extracting facial embeddings for clustering...")
        for label, video_dir in self.video_dirs.items():
            for file in tqdm(self._list_videos(video_dir)):
                video_path = os.path.join(video_dir, file)
                embedding = self.extract_face_embedding(video_path)
                if embedding is not None:
                    all_embeddings.append(embedding)
                    video_files.append(file)

        print("📦 Clustering faces into subjects...")
        cluster_ids = self.cluster_faces(all_embeddings)
        for f, cid in zip(video_files, cluster_ids):
            self.cluster_labels[f] = f"subject_{cid}"

        print("✅ Clustering done. Extracting features and saving...")
        data = []
        for label, video_dir in self.video_dirs.items():
            transcript_dir = self.transcript_dirs[label]
            for file in self._list_videos(video_dir):
                video_path = os.path.join(video_dir, file)
                # splitext only swaps the extension; str.replace would also
                # rewrite a ".mp4" occurring in the middle of the name.
                transcript_name = os.path.splitext(file)[0] + '.txt'
                transcript_path = os.path.join(transcript_dir, transcript_name)
                if not os.path.exists(transcript_path):
                    continue

                # Clips in which no face was found never got a cluster label.
                subject_id = self.cluster_labels.get(file, "unknown")
                text_features = self.extract_text_features(transcript_path)
                audio_features = self.extract_audio_features(video_path)

                data.append({
                    "subject_id": subject_id,
                    "label": label,
                    "text_features": text_features,
                    "audio_features": audio_features
                })

        df = pd.DataFrame(data)
        df.to_pickle(output_path)
        print(f"✅ Preprocessing complete. Saved to {output_path}")

    def extract_face_embedding(self, video_path):
        """Return the face encoding of the first frame in which a face is detected.

        Frames are read sequentially until ``face_recognition`` finds at least
        one face; the first encoding of that frame is returned.

        Args:
            video_path: Path to the video file.

        Returns:
            The encoding vector, or ``None`` when no frame contains a
            detectable face or an error occurred (the error is printed).
        """
        vidcap = None
        try:
            vidcap = cv2.VideoCapture(video_path)
            success, frame = vidcap.read()
            while success:
                # OpenCV decodes to BGR; face_recognition expects RGB.
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                encodings = face_recognition.face_encodings(rgb)
                if encodings:
                    return encodings[0]
                success, frame = vidcap.read()
        except Exception as e:
            print(f"⚠️ Error processing {video_path}: {e}")
        finally:
            # Always free the decoder handle, including on the early return.
            if vidcap is not None:
                vidcap.release()
        return None

    def cluster_faces(self, embeddings):
        """Group face embeddings into subjects with DBSCAN.

        ``min_samples=1`` means every embedding is assigned to a cluster (no
        noise label), so a face seen only once becomes its own subject.

        Args:
            embeddings: Sequence of equal-length embedding vectors.

        Returns:
            Array of integer cluster ids, one per embedding; an empty integer
            array when ``embeddings`` is empty.
        """
        # DBSCAN cannot fit zero samples; nothing to cluster in that case.
        if len(embeddings) == 0:
            return np.array([], dtype=int)
        X = np.array(embeddings)
        clustering = DBSCAN(metric='euclidean', eps=0.6, min_samples=1).fit(X)
        return clustering.labels_

    def extract_text_features(self, transcript_path):
        """Return the VADER polarity scores of a transcript file.

        The file is read as UTF-8 with undecodable bytes replaced, so a badly
        encoded transcript does not abort the run.

        Args:
            transcript_path: Path to the ``.txt`` transcript.

        Returns:
            The dict produced by ``SentimentIntensityAnalyzer.polarity_scores``.
        """
        with open(transcript_path, 'r', encoding='utf-8', errors='replace') as f:
            text = f.read()
        return self.sia.polarity_scores(text)

    def extract_audio_features(self, video_path):
        """Return the time-averaged MFCC vector of a video's audio track.

        The audio is transcoded by ffmpeg to 16 kHz mono 16-bit PCM in a
        temporary directory, loaded at its native rate, and reduced to the
        per-coefficient mean over time.

        Args:
            video_path: Path to the video file.

        Returns:
            1-D array of length ``N_MFCC``. On any failure the error is printed
            and an all-zero vector of the same length is returned.
        """
        try:
            # A private temp directory avoids leaving temp_audio.wav behind in
            # the working directory; it is removed when the block exits.
            with tempfile.TemporaryDirectory() as tmp_dir:
                audio_path = os.path.join(tmp_dir, "temp_audio.wav")
                (
                    ffmpeg
                    .input(video_path)
                    .output(audio_path, acodec='pcm_s16le', ac=1, ar='16000')
                    .overwrite_output()
                    .run(quiet=True)
                )
                y, sr = librosa.load(audio_path, sr=None)
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=self.N_MFCC)
            return np.mean(mfcc, axis=1)
        except Exception as e:
            print(f"⚠️ Audio extraction failed: {e}")
            return np.zeros(self.N_MFCC)

# === Usage ===
# Dataset root, relative to the notebook's working directory. The preprocessor
# looks for "Annotation", "Clips" and "Transcription" sub-folders beneath it.
base_path = "Real-life_Deception_Detection_2016"
processor = DeceptionPreprocessor(base_path)
# Runs face clustering and feature extraction, then writes
# preprocessed_data.pkl into the current working directory.
processor.preprocess()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\phars\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\phars\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


🔍 Extracting facial embeddings for clustering...


  0%|          | 0/61 [00:00<?, ?it/s]

100%|██████████| 61/61 [01:56<00:00,  1.91s/it]
100%|██████████| 60/60 [01:59<00:00,  1.99s/it]


📦 Clustering faces into subjects...
✅ Clustering done. Extracting features and saving...
✅ Preprocessing complete. Saved to preprocessed_data.pkl


In [None]:

import subprocess

# Probe for the OpenFace command-line tool by invoking it with -help.
try:
    result = subprocess.run(
        ["FeatureExtraction", "-help"],
        capture_output=True,
        text=True,
    )
except FileNotFoundError:
    # The executable is not on PATH at all.
    print("❌ OpenFace FeatureExtraction binary not found. Please check your installation.")
else:
    # Either a clean exit or an OpenFace banner on stdout counts as "working".
    openface_ok = result.returncode == 0 or "OpenFace" in result.stdout
    print(
        "✅ OpenFace is installed and accessible via CLI."
        if openface_ok
        else "⚠️ FeatureExtraction exists but failed to run properly."
    )


❌ OpenFace FeatureExtraction binary not found. Please check your installation.


In [1]:
# Run this in a Colab cell (takes ~8–10 minutes)
# Linux-only setup: apt-get, git, bash and make are assumed to be available.
# The recorded output of this cell shows it being run from a Windows working
# directory, where these commands were not found and nothing was installed.
!sudo apt-get install -y libboost-all-dev cmake libopenblas-dev liblapack-dev libgtk2.0-dev
!git clone https://github.com/TadasBaltrusaitis/OpenFace.git
# %cd changes the notebook's working directory for every following cell.
%cd OpenFace
!bash download_models.sh
# NOTE(review): each `!` line runs in its own subshell, so a `cd` inside one
# does not carry over to the next. `!cd .. && make` therefore appears to run
# make in the parent of OpenFace rather than in the repository root.
# TODO confirm the intended build steps against the OpenFace Unix installation guide.
!cd lib && make
!cd .. && make


Command not found


[WinError 2] The system cannot find the file specified: 'OpenFace'
d:\Coding\Deception Detection


'git' is not recognized as an internal or external command,
operable program or batch file.
  bkms = self.shell.db.get('bookmarks', {})


^C


The system cannot find the path specified.
'make' is not recognized as an internal or external command,
operable program or batch file.
