*Data Loading*

In [2]:
import json

# Load metadata from JSON file
def load_metadata(metadata_path):
    with open(metadata_path, 'r') as f:
        metadata = json.load(f)g
    return metadata

# Example usage
metadata_path = '/Users/lakshya/Desktop/Projects/VeriFace/Video Dataset/metadata.json'
metadata = load_metadata(metadata_path)
print(metadata)  # Inspect metadata structure

{'aagfhgtpmv.mp4': {'label': 'FAKE', 'split': 'train', 'original': 'vudstovrck.mp4'}, 'aapnvogymq.mp4': {'label': 'FAKE', 'split': 'train', 'original': 'jdubbvfswz.mp4'}, 'abarnvbtwb.mp4': {'label': 'REAL', 'split': 'train', 'original': None}, 'abofeumbvv.mp4': {'label': 'FAKE', 'split': 'train', 'original': 'atvmxvwyns.mp4'}, 'abqwwspghj.mp4': {'label': 'FAKE', 'split': 'train', 'original': 'qzimuostzz.mp4'}, 'acifjvzvpm.mp4': {'label': 'FAKE', 'split': 'train', 'original': 'kbvibjhfzo.mp4'}, 'acqfdwsrhi.mp4': {'label': 'FAKE', 'split': 'train', 'original': 'ccfoszqabv.mp4'}, 'acxnxvbsxk.mp4': {'label': 'FAKE', 'split': 'train', 'original': 'fjlyaizcwc.mp4'}, 'acxwigylke.mp4': {'label': 'FAKE', 'split': 'train', 'original': 'ffcwhpnpuw.mp4'}, 'aczrgyricp.mp4': {'label': 'FAKE', 'split': 'train', 'original': 'slwkmefgde.mp4'}, 'adhsbajydo.mp4': {'label': 'FAKE', 'split': 'train', 'original': 'fysyrqfguw.mp4'}, 'adohikbdaz.mp4': {'label': 'FAKE', 'split': 'train', 'original': 'qjlhemtkx

In [None]:
import os
import numpy as np
import cv2

def load_video(video_path, max_frames=100, target_size=(224, 224)):
    cap = cv2.VideoCapture(video_path)
    frames = []
    
    while cap.isOpened() and len(frames) < max_frames:
        ret, frame = cap.read()
        if not ret:
            break
        
        frame = cv2.resize(frame, target_size)
        frame = frame / 255.0
        frames.append(frame)
    
    cap.release()
    return np.array(frames)

def process_videos_with_metadata(video_dir, metadata, max_frames=100, target_size=(224, 224)):
    videos = []
    labels = []
    additional_features = []

    for video_file in os.listdir(video_dir):
        video_path = os.path.join(video_dir, video_file)
        
        # Extract metadata for the video
        video_id = os.path.splitext(video_file)[0]
        video_metadata = metadata.get(video_id, {})
        
        frames = load_video(video_path, max_frames, target_size)
        videos.append(frames)
        labels.append(video_metadata.get('label', 0))  # Default to 0 if no label
        additional_features.append(video_metadata.get('features', {}))  # Adjust based on your metadata structure
    
    return np.array(videos), np.array(labels), additional_features

# Example usage
video_dir = '/Users/lakshya/Desktop/Projects/VeriFace/Video Dataset/train_data'
metadata = load_metadata(metadata_path)
videos, labels, additional_features = process_videos_with_metadata(video_dir, metadata)

*Data Extraction*
1. Video Extraction

In [None]:
import cv2
import os

def extract_frames(video_path, output_dir, frame_rate=1):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    cap = cv2.VideoCapture(video_path)
    count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if count % frame_rate == 0:
            cv2.imwrite(os.path.join(output_dir, f"frame_{count}.jpg"), frame)
        count += 1
    cap.release()

def process_videos(video_dir, output_base_dir, frame_rate=1):
    if not os.path.exists(output_base_dir):
        os.makedirs(output_base_dir)
    
    # Get all video files in the specified directory
    video_files = [f for f in os.listdir(video_dir) if f.endswith(('.mp4', '.avi', '.mov'))]
    
    # Limit to 400 videos
    video_files = video_files[:400]
    
    for video_file in video_files:
        video_path = os.path.join(video_dir, video_file)
        output_dir = os.path.join(output_base_dir, os.path.splitext(video_file)[0])
        extract_frames(video_path, output_dir, frame_rate)
        print(f"Processed {video_file}")


process_videos('/Users/lakshya/Desktop/Projects/VeriFace/Video Dataset/train_data', '/Users/lakshya/Desktop/Projects/VeriFace/Extracted Video')

2. Audio Extraction

In [None]:
from pydub import AudioSegment
import os

def extract_audio(video_path, output_audio_path):
    audio = AudioSegment.from_file(video_path, format="mp4")
    audio.export(output_audio_path, format="wav")

def process_audios(video_dir, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Get all video files in the specified directory
    video_files = [f for f in os.listdir(video_dir) if f.endswith(('.mp4', '.avi', '.mov'))]
    
    # Limit to 400 videos
    video_files = video_files[:400]
    
    for video_file in video_files:
        video_path = os.path.join(video_dir, video_file)
        output_audio_path = os.path.join(output_dir, os.path.splitext(video_file)[0] + '.wav')
        extract_audio(video_path, output_audio_path)
        print(f"Extracted audio from {video_file}")


process_audios('/Users/lakshya/Desktop/Projects/VeriFace/Video Dataset/train_data', '/Users/lakshya/Desktop/Projects/VeriFace/Extracted Audio')

*Preprocessing*
1. Video Preprocessing: Resizing, Normalizing, Augmentation

In [None]:
import tensorflow as tf

def preprocess_video_frame(frame, target_size=(224, 224)):
    frame = tf.image.resize(frame, target_size)
    frame = tf.cast(frame, tf.float32) / 255.0  # Normalize
    return frame


2. Audio Preprocessing: Resampling, Noise reduction

In [None]:
import librosa

def preprocess_audio(audio_path, target_sr=16000):
    y, sr = librosa.load(audio_path, sr=target_sr)
    y = librosa.effects.trim(y)[0]  # Trim silence
    return y

*Model Creation*
1. Video Models: ResNet, VGG-16, C3D, TCN

In [None]:
from tensorflow.keras.applications import ResNet50, VGG16

def create_resnet_model(input_shape=(224, 224, 3)):
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)
    base_model.trainable = False
    model = tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax')  # Modify for your task
    ])
    return model

video_model = create_resnet_model()
video_model.summary()

In [None]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras import layers, models

def create_vgg16_model(input_shape=(224, 224, 3), num_classes=10):
    base_model = VGG16(weights='imagenet', include_top=False, input_shape=input_shape)
    base_model.trainable = False  

    model = models.Sequential([
        base_model,
        layers.GlobalAveragePooling2D(),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax')  
    ])

    return model

# Example usage:
vgg16_model = create_vgg16_model()
vgg16_model.summary()

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

def create_c3d_model(input_shape=(16, 112, 112, 3), num_classes=10):
    model = models.Sequential()

    model.add(layers.Conv3D(64, kernel_size=(3, 3, 3), activation='relu', input_shape=input_shape))
    model.add(layers.MaxPooling3D(pool_size=(1, 2, 2)))

    model.add(layers.Conv3D(128, kernel_size=(3, 3, 3), activation='relu'))
    model.add(layers.MaxPooling3D(pool_size=(2, 2, 2)))

    model.add(layers.Conv3D(256, kernel_size=(3, 3, 3), activation='relu'))
    model.add(layers.Conv3D(256, kernel_size=(3, 3, 3), activation='relu'))
    model.add(layers.MaxPooling3D(pool_size=(2, 2, 2)))

    model.add(layers.Conv3D(512, kernel_size=(3, 3, 3), activation='relu'))
    model.add(layers.Conv3D(512, kernel_size=(3, 3, 3), activation='relu'))
    model.add(layers.MaxPooling3D(pool_size=(2, 2, 2)))

    model.add(layers.Conv3D(512, kernel_size=(3, 3, 3), activation='relu'))
    model.add(layers.Conv3D(512, kernel_size=(3, 3, 3), activation='relu'))
    model.add(layers.MaxPooling3D(pool_size=(2, 2, 2)))

    model.add(layers.Flatten())
    model.add(layers.Dense(4096, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(4096, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(num_classes, activation='softmax'))

    return model

# Example usage:
c3d_model = create_c3d_model()
c3d_model.summary()

In [None]:
from tensorflow.keras import layers, models

def create_tcn_model(input_shape=(128, 3), num_classes=10):
    model = models.Sequential()

    # TCN block 1
    model.add(layers.Conv1D(filters=64, kernel_size=3, padding='causal', dilation_rate=1, activation='relu', input_shape=input_shape))
    model.add(layers.Conv1D(filters=64, kernel_size=3, padding='causal', dilation_rate=2, activation='relu'))
    model.add(layers.MaxPooling1D(pool_size=2))

    # TCN block 2
    model.add(layers.Conv1D(filters=128, kernel_size=3, padding='causal', dilation_rate=4, activation='relu'))
    model.add(layers.Conv1D(filters=128, kernel_size=3, padding='causal', dilation_rate=8, activation='relu'))
    model.add(layers.MaxPooling1D(pool_size=2))

    model.add(layers.Flatten())
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(num_classes, activation='softmax'))

    return model

# Example usage:
tcn_model = create_tcn_model(input_shape=(128, 3))  # Modify the input shape based on your data
tcn_model.summary()

2. Audio Models: Wav2Vec, CRNN, VGGish, WaveNet

In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch
import librosa

# Load pre-trained Wav2Vec2.0 model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

# Load and preprocess audio
def extract_wav2vec2_features(audio_path):
    # Load audio file with librosa
    y, sr = librosa.load(audio_path, sr=16000)  # Wav2Vec2 requires 16kHz sample rate
    
    # Preprocess the audio to match the input format for Wav2Vec2.0
    input_values = processor(y, return_tensors="pt", sampling_rate=sr).input_values
    
    # Extract features (output from the last hidden layer)
    with torch.no_grad():
        features = model(input_values).last_hidden_state
    
    return features


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
import librosa
import numpy as np

# Example function to extract Mel-spectrogram features from audio
def extract_mel_spectrogram(audio_path, n_mels=128):
    y, sr = librosa.load(audio_path, sr=None)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    return mel_spec_db

# CRNN model
def create_crnn_model(input_shape=(128, 128, 1), num_classes=10):
    model = models.Sequential()
    
    # CNN layers
    model.add(layers.Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
    model.add(layers.MaxPooling2D(pool_size=(2, 2)))
    
    model.add(layers.Conv2D(64, kernel_size=(3, 3), activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(2, 2)))
    
    model.add(layers.Conv2D(128, kernel_size=(3, 3), activation='relu'))
    model.add(layers.MaxPooling2D(pool_size=(2, 2)))

    # RNN layers (GRU or LSTM)
    model.add(layers.Reshape(target_shape=(-1, 128)))
    model.add(layers.GRU(128, return_sequences=False))
    
    # Dense layers
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(num_classes, activation='softmax'))
    
    return model

# Example usage
crnn_model = create_crnn_model(input_shape=(128, 128, 1))  # input shape should match the Mel-spectrogram shape
crnn_model.summary()

In [None]:
from tensorflow.keras import layers, models

def residual_block(x, dilation_rate):
    conv = layers.Conv1D(filters=64, kernel_size=2, padding='causal', dilation_rate=dilation_rate)(x)
    conv = layers.Activation('relu')(conv)
    conv = layers.Conv1D(filters=64, kernel_size=2, padding='causal')(conv)
    
    # Residual connection
    x = layers.add([x, conv])
    return x

def create_wavenet_model(input_shape=(16000, 1), num_classes=10):
    inputs = layers.Input(shape=input_shape)
    
    x = layers.Conv1D(filters=64, kernel_size=2, padding='causal')(inputs)
    
    # Stack of residual blocks with increasing dilation rates
    for dilation_rate in [1, 2, 4, 8, 16]:
        x = residual_block(x, dilation_rate)
    
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    
    model = models.Model(inputs, outputs)
    return model

# Example usage
wavenet_model = create_wavenet_model()
wavenet_model.summary()

In [None]:
from pydub import AudioSegment
import os

def extract_audio(video_path, output_audio_path):
    audio = AudioSegment.from_file(video_path, format="mp4")
    audio.export(output_audio_path, format="wav")

def process_audios(video_dir, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Get all video files in the specified directory
    video_files = [f for f in os.listdir(video_dir) if f.endswith(('.mp4', '.avi', '.mov'))]
    
    # Limit to 400 videos
    video_files = video_files[:400]
    
    for video_file in video_files:
        video_path = os.path.join(video_dir, video_file)
        output_audio_path = os.path.join(output_dir, os.path.splitext(video_file)[0] + '.wav')
        extract_audio(video_path, output_audio_path)
        print(f"Extracted audio from {video_file}")

# Example usage
process_audios('input_videos', 'audio_files')


*Feature Extraction*
1. Video 

In [None]:
def extract_vgg16_features(video_frames):
    vgg16_model = create_vgg16_model()
    vgg16_features = vgg16_model.predict(video_frames)
    return vgg16_features

def extract_resnet50_features(video_frames):
    resnet_model = create_resnet_model()
    resnet_features = resnet_model.predict(video_frames)
    return resnet_features

def extract_c3d_features(video_frames):
    c3d_model = create_c3d_model()
    c3d_features = c3d_model.predict(video_frames)
    return c3d_features

def extract_tcn_features(mel_spectrogram):
    tcn_model = create_tcn_model()
    tcn_features = tcn_model.predict(mel_spectrogram)
    return tcn_features

2. Audio

In [None]:
def extract_crnn_features(mel_spectrogram):
    crnn_model = create_crnn_model()
    crnn_features = crnn_model.predict(mel_spectrogram)
    return crnn_features

def extract_wavenet_features(audio_data):
    wavenet_model = create_wavenet_model()
    wavenet_features = wavenet_model.predict(audio_data)
    return wavenet_features

def extract_vggish_features(audio_data):
    vggish_model = hub.load("https://tfhub.dev/google/vggish/1")
    vggish_features = vggish_model(audio_data)
    return vggish_features

def extract_wav2vec_features(audio_data):
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
    input_values = processor(audio_data, return_tensors="pt", sampling_rate=16000).input_values
    with torch.no_grad():
        wav2vec_features = model(input_values).last_hidden_state
    return wav2vec_features


*Feature Fusion*
1. Video 


In [None]:
from tensorflow.keras.layers import concatenate

def fuse_video_features(vgg16_features, resnet50_features, c3d_features, tcn_features):
    fused_features = concatenate([vgg16_features, resnet50_features, c3d_features, tcn_features], axis=-1)
    return fused_features

2. Audio

In [None]:
from tensorflow.keras.layers import concatenate

def fuse_audio_features(crnn_features, wavenet_features, vggish_features, wav2vec_features):
    fused_features = concatenate([crnn_features, wavenet_features, vggish_features, wav2vec_features], axis=-1)
    return fused_features

*Final Model*

In [None]:
def create_final_model(video_shape, audio_shape, num_classes=10):
    # Create video and audio feature extraction models
    vgg16_model = create_vgg16_model()
    resnet50_model = create_resnet_model()
    c3d_model = create_c3d_model()
    

    crnn_model = create_crnn_model()
    wavenet_model = create_wavenet_model()
    vggish_model = hub.load("https://tfhub.dev/google/vggish/1")
    
    # Define input layers
    video_input = layers.Input(shape=video_shape)
    audio_input = layers.Input(shape=audio_shape)
    
    # Extract features
    vgg16_features = vgg16_model(video_input)
    resnet50_features = resnet50_model(video_input)
    c3d_features = c3d_model(video_input)
    
    crnn_features = crnn_model(audio_input)
    wavenet_features = wavenet_model(audio_input)
    vggish_features = vggish_model(audio_input)
    
    # Fuse video features
    fused_video_features = fuse_video_features(vgg16_features, resnet50_features, c3d_features)
    
    # Fuse audio features
    fused_audio_features = fuse_audio_features(crnn_features, wavenet_features, vggish_features)
    
    # Combine fused features
    combined_features = concatenate([fused_video_features, fused_audio_features], axis=-1)
    
    # Final classification model
    x = layers.Dense(256, activation='relu')(combined_features)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    
    model = models.Model(inputs=[video_input, audio_input], outputs=outputs)
    return model