In [6]:
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
%matplotlib inline

In [7]:
# Feature extraction functions
def extract_frame_features(frame):
    """Build the per-frame feature vector used by the video pipeline.

    Args:
        frame: Colour image array; it is converted with
            ``cv2.COLOR_BGR2GRAY``, so BGR channel order is expected.

    Returns:
        1-D NumPy array laid out as:
        32 normalized grayscale-histogram bins, the variance of the
        Laplacian (a sharpness/blur score), the per-channel means, then
        the per-channel standard deviations.
    """
    # Colour statistics are computed on the original (colour) frame.
    channel_means, channel_stds = cv2.meanStdDev(frame)
    colour_stats = np.concatenate([channel_means, channel_stds]).flatten()

    # Everything else works on the grayscale version of the frame.
    grayscale = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # 32-bin intensity histogram over [0, 256), normalized in place.
    intensity_hist = cv2.calcHist([grayscale], [0], None, [32], [0, 256])
    intensity_hist = cv2.normalize(intensity_hist, intensity_hist).flatten()

    # Variance of the Laplacian response: low values indicate a blurry frame.
    sharpness = cv2.Laplacian(grayscale, cv2.CV_64F).var()

    return np.hstack([intensity_hist, [sharpness], colour_stats])

def extract_video_features(video_path, max_frames=30):
    """Extract one averaged feature vector from a video.

    Frames are sampled at a roughly even stride across the video (at most
    ``max_frames`` of them), resized to 224x224, passed through
    ``extract_frame_features`` and averaged element-wise.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        max_frames: Upper bound on the number of frames sampled.

    Returns:
        1-D NumPy array of length 39 (32 histogram bins + 1 blur score +
        6 colour statistics). If no frame could be read (missing, corrupt
        or empty video) a zero vector of the same length is returned, so
        every video yields a row of identical shape.
    """
    # Length of the vector produced by extract_frame_features.
    feature_dim = 32 + 1 + 6

    cap = cv2.VideoCapture(video_path)
    features = []
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Stride between sampled frames; at least 1 so short videos (or a
        # reported frame count of 0) still yield samples.
        step = max(1, total_frames // max_frames)
        frame_count = 0
        # Stop decoding as soon as enough frames were collected; reading the
        # rest of the video would not change the result.
        while len(features) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % step == 0:
                frame = cv2.resize(frame, (224, 224))
                features.append(extract_frame_features(frame))
            frame_count += 1
    finally:
        # Always free the capture handle, even if feature extraction raised.
        cap.release()

    if not features:
        # Same shape as the averaged vector below, so downstream tabular
        # code can treat unreadable videos like any other row.
        return np.zeros(feature_dim)
    return np.mean(np.array(features), axis=0)

def process_directory(directory_path, label, max_videos=None):
    """Extract features for every ``.mp4`` file in a directory.

    Args:
        directory_path: Directory whose entries are scanned (not recursive).
        label: Value attached to every video found in this directory.
        max_videos: If not ``None``, only the first ``max_videos`` files
            (in sorted name order) are processed.

    Returns:
        Tuple ``(features_list, labels)``: one result of
        ``extract_video_features`` per processed file and a list of the
        same length filled with ``label``.
    """
    # os.listdir returns names in arbitrary order; sort so the processing
    # order (and the subset kept by max_videos) is the same on every run.
    video_files = sorted(
        f for f in os.listdir(directory_path) if f.endswith('.mp4')
    )
    if max_videos is not None:
        video_files = video_files[:max_videos]

    features_list = []
    labels = []
    for video_file in tqdm(video_files, desc=f'Processing {label} videos'):
        video_path = os.path.join(directory_path, video_file)
        features_list.append(extract_video_features(video_path))
        labels.append(label)
    return features_list, labels

In [8]:
# Input folders: real videos get label 0, fake videos get label 1.
real_dir = 'real-vid'
fake_dir = 'fake-vid'

real_features, real_labels = process_directory(real_dir, 0)
fake_features, fake_labels = process_directory(fake_dir, 1)

# Stack both classes, real rows first.
all_features = real_features + fake_features
all_labels = real_labels + fake_labels

# Column names follow the layout of the per-frame feature vector:
# 32 histogram bins, blur score, per-channel means, per-channel stds.
feature_names = [
    *(f'hist_{i}' for i in range(32)),
    'blurness',
    *(f'mean_{c}' for c in 'bgr'),
    *(f'std_{c}' for c in 'bgr'),
]

# One row per video, with the class label as the last column.
df = pd.DataFrame(all_features, columns=feature_names).assign(label=all_labels)

df.to_csv('data.csv', index=False)
print("Data saved")

Processing 0 videos: 100%|██████████| 1000/1000 [17:22<00:00,  1.04s/it]
Processing 1 videos: 100%|██████████| 1051/1051 [16:34<00:00,  1.06it/s]


Data saved
