In [None]:
import os
import cv2
import numpy as np
import pandas as pd
from mtcnn import MTCNN
from tqdm import tqdm

# Single MTCNN detector instance, reused for every frame of every video
detector = MTCNN()

# Input locations (videos + their metadata) and per-class output directories
video_folder = '../input/deepfake-detection-challenge/train_sample_videos/'
metadata_path = '../input/deepfake-detection-challenge/train_sample_videos/metadata.json'
output_real_folder = './extracted_faces/real/'
output_fake_folder = './extracted_faces/fake/'

# Make sure both output directories are present before any face is written
for _out_dir in (output_real_folder, output_fake_folder):
    os.makedirs(_out_dir, exist_ok=True)

# Metadata is transposed after loading so that iterating rows yields one
# entry per index value (used below as the video file name)
train_sample_metadata = pd.read_json(metadata_path).T

# Function to detect and save faces from video frames
def extract_faces(video_path, output_folder, label, video_name):
    """Detect faces in every frame of a video and save each one as a JPEG.

    Each frame is read with OpenCV, converted to RGB and passed to the
    module-level ``detector``. Every detected face is cropped, resized to
    224x224 and written to ``output_folder`` as
    ``{label}_{video_name}_{frame_count}_{i}.jpg``, where ``frame_count`` is
    the zero-based frame index and ``i`` the index of the face within that
    frame.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory the face crops are written to.
        label: Label string used as the file-name prefix.
        video_name: Video identifier embedded in the file name.

    Returns:
        None. The function is used for its side effect of writing images.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        frame_count = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # The detector is fed RGB; OpenCV decodes frames as BGR.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            faces = detector.detect_faces(frame_rgb)
            frame_h, frame_w = frame.shape[:2]

            for i, face in enumerate(faces):
                x, y, width, height = (int(v) for v in face['box'])

                # Detector boxes are not guaranteed to lie inside the frame.
                # A negative origin would be interpreted as a wrap-around
                # slice index and yield an empty or wrong crop, and
                # cv2.resize raises on an empty image, so clamp to the frame
                # and skip boxes that end up with no area.
                x1, y1 = max(x, 0), max(y, 0)
                x2, y2 = min(x + width, frame_w), min(y + height, frame_h)
                if x2 <= x1 or y2 <= y1:
                    continue

                # Crop straight from the BGR frame: imwrite expects BGR, so
                # this avoids converting the crop back from RGB.
                face_img = cv2.resize(frame[y1:y2, x1:x2], (224, 224))

                # Save the face image
                face_filename = f"{label}_{video_name}_{frame_count}_{i}.jpg"
                face_filepath = os.path.join(output_folder, face_filename)
                cv2.imwrite(face_filepath, face_img)

            frame_count += 1
    finally:
        # Release the capture even if detection or writing raises.
        cap.release()

# Walk the metadata table and send each video's faces to the folder matching
# its label: 'REAL' goes to the real folder, anything else to the fake folder.
for video_name, row in tqdm(train_sample_metadata.iterrows(), total=len(train_sample_metadata)):
    label = row['label']
    if label == 'REAL':
        output_folder = output_real_folder
    else:
        output_folder = output_fake_folder
    video_path = os.path.join(video_folder, video_name)
    extract_faces(video_path, output_folder, label, video_name)
