In [41]:
import json
import os
import pandas as pd
import numpy as np
import cv2
import tensorflow_io as tfio

# read the metadata.json file
with open('metadata.json') as f:
    metadata = json.load(f)

# create a dataframe with the metadata
df = pd.DataFrame(metadata).T

# remove the split column
df = df.drop(columns=['split'])

# create 1 and 0 labels from the label column
df['label'] = df.label.apply(lambda x: 1 if x == 'FAKE' else 0)

labels = df.label.values
labels = tf.reshape(labels, (-1, 1))

df.head()

Unnamed: 0,label,original
owxbbpjpch.mp4,1,wynotylpnm.mp4
vpmyeepbep.mp4,0,
fzvpbrzssi.mp4,0,
htorvhbcae.mp4,1,wclvkepakb.mp4
fckxaqjbxk.mp4,1,vpmyeepbep.mp4


In [33]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras.layers import Dense, Activation, Input, Input, Conv1D, Conv2D, MaxPooling1D
from tensorflow.python.keras.layers import MaxPooling2D, Dense, Dropout, Activation, Flatten, InputLayer
from tensorflow.python.keras.layers import concatenate, Reshape, Lambda, Add, Multiply, Average, Subtract
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

# print Tensorflow and CUDA information
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print("Num CPUs Available: ", len(tf.config.experimental.list_physical_devices('CPU')))
print(f"Tensorflow version: {tf.__version__}")
print(f"Keras version: {keras.__version__}")

if tf.test.gpu_device_name():
    gpu_devices = tf.config.list_physical_devices('GPU')
    details = tf.config.experimental.get_device_details(gpu_devices[0])
    name = details.get('device_name', 'Unknown GPU')
    
    print(f"Using {name}")
else:
    print("No GPU found")

Num GPUs Available:  1
Num CPUs Available:  1
Tensorflow version: 2.4.1
Keras version: 2.4.0
Using Tesla V100-SXM2-32GB


2023-03-09 11:24:55.019467: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-03-09 11:24:55.020659: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:16:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s
2023-03-09 11:24:55.020722: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-03-09 11:24:55.020744: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10
2023-03-09 11:24:55.020805: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.10
2023-03-09 11:24:55.020820: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcufft.so.10

In [34]:
# print the number of real and fake videos
print('Number of real videos: ', len(df[df.label == 'REAL']))
print('Number of fake videos: ', len(df[df.label == 'FAKE']))

Number of real videos:  0
Number of fake videos:  0


In [35]:
path = 'data/'

# get a list of all mp4 files
filenames = [f for f in os.listdir(path) if f.endswith('.mp4')]

# remove the .mp4 extension from the filenames
filenames = [f[:-4] for f in filenames]

In [36]:
PROCESS_AUDIO = False

if PROCESS_AUDIO:
    import librosa
    import moviepy.editor as mp
    from multiprocessing import Pool

    def save_audio_files():
        for i, file in enumerate(filenames):
            clip = mp.VideoFileClip(path + file + '.mp4')
            clip.audio.write_audiofile(path + file + '.wav', verbose=False, logger=None)

            # print the progress as a percentage
            print(f'{i / len(filenames) * 100:.2f}%', end='\r')
        print('Audio files saved')


    def save_audio_feature(file):
        y, sr = librosa.load(path + file + '.wav')

        frame_length = y.shape[0] // 300
        
        # discard trailing frames
        y = y[:(frame_length * 300) - 1]

        # this removes about half a frame of audio
        # in the model this lost time will be shared between the frames
        # this results in the error in each frame being very small

        # calculate the mfcc with 32 features
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=32, n_fft=frame_length, hop_length=frame_length)

        # save the mfcc
        np.save(path + file + '.npy', mfcc)

    def save_audio_feature_parallel():
        with Pool(16) as p:
            p.map(save_audio_feature, filenames)

        print('Audio features saved')

    save_audio_files()
    save_audio_feature_parallel()

# print the shape of a sample audio file
print(np.load(path + filenames[0] + '.npy').shape)

(32, 300)


In [95]:
# create a tensorflow dataset
dataset = tf.data.Dataset.list_files(path + '*.mp4')

def get_video_asarray(file):    
    frames = []
    cap = cv2.VideoCapture(file.numpy().decode("utf-8"))
    for i in range(0, 30):
        ret, frame = cap.read()
        frame = preprocess_frame(frame)
        frames.append(frame)
    cap.release()
    return np.stack(frames, axis=0)

def preprocess_frame(frame):
    # video is a numpy array of shape (height, width, 3)
    # convert the video to grayscale
    # frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # resize the video to 128x128
    frame = cv2.resize(frame, (80, 45))
    
    frame = np.transpose(frame, (1, 0, 2))

    # normalize the video
    frame = frame / 255

    return frame

def get_audio_asarray(file):
    file = file.numpy().decode("utf-8")
    
    return np.load(file[:-4] + '.npy').T

def preprocess_input(file):
    # load the video
    video = tf.py_function(get_video_asarray, [file], Tout=[tf.float32])
    audio = tf.py_function(get_audio_asarray, [file], Tout=[tf.float32])
    
    audio = tf.expand_dims(audio, axis=-1)

    return video, audio

# Use map to apply the preprocess_input function to each file in the dataset
dataset = dataset.map(preprocess_input)

# Split the dataset into two separate datasets for video and audio
video_dataset = dataset.map(lambda x, y: x)
audio_dataset = dataset.map(lambda x, y: y)

zipped_dataset = tf.data.Dataset.zip((video_dataset, audio_dataset))

In [82]:
%%time
# preprocess the video and audio for the first video
for video, audio in zipped_dataset.take(1):
    print(f'Video shape: {video.shape}')
    print(f'Audio shape: {audio.shape}')

Video shape: (1, 30, 80, 45, 3)
Audio shape: (1, 300, 32, 1)
CPU times: user 815 ms, sys: 5.01 ms, total: 820 ms
Wall time: 970 ms


In [96]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv3D, Conv2D, Conv1D, Flatten, Dense, concatenate, MaxPooling3D, MaxPooling2D

# Define the text input shape
video_input = Input(shape=(30, 80, 45, 3), name='video_input')

# Define the image input shape
audio_input = Input(shape=(300, 32, 1), name='audio_input')

# 3D convolutional layers for video input
x = Conv3D(16, kernel_size=(5,4,3), activation='relu')(video_input)
x = MaxPooling3D(pool_size=(2,3,3))(x)
x = Conv3D(32, kernel_size=(3,4,3), activation='relu')(x)
x = MaxPooling3D(pool_size=(2,4,3))(x)
x = Conv3D(64, kernel_size=(3,4,3), activation='relu')(x)
x = Flatten()(x)

# 2D convolutional layers for audio input
y = Conv2D(16, kernel_size=(3,3), activation='relu')(audio_input)
y = MaxPooling2D(pool_size=(2,2))(y)
y = Conv2D(32, kernel_size=(3,3), activation='relu')(y)
y = MaxPooling2D(pool_size=(2,2))(y)
y = Conv2D(64, kernel_size=(3,3), activation='relu')(y)
y = Flatten()(y)

# 1D convolutional layers for both inputs
z = Conv1D(16, kernel_size=4, activation='relu')(audio_input)
z = Conv1D(32, kernel_size=8, activation='relu')(z)
z = Conv1D(64, kernel_size=12, activation='relu')(z)
z = Flatten()(z)

# Concatenate the outputs from all three convolutional layers
merged = concatenate([x, y, z])

# Dense layers for classification
merged = Dense(128, activation='relu')(merged)
merged = Dense(64, activation='relu')(merged)
output = Dense(1, activation='sigmoid')(merged)

# Create the model
model = Model(inputs=[video_input, audio_input], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
video_input (InputLayer)        [(None, 30, 80, 45,  0                                            
__________________________________________________________________________________________________
audio_input (InputLayer)        [(None, 300, 32, 1)] 0                                            
__________________________________________________________________________________________________
conv3d_44 (Conv3D)              (None, 26, 77, 43, 1 2896        video_input[0][0]                
__________________________________________________________________________________________________
conv2d_21 (Conv2D)              (None, 298, 30, 16)  160         audio_input[0][0]                
____________________________________________________________________________________________

In [None]:
labels_t = tf.data.Dataset.from_tensor_slices(labels)
dataset_with_labels = tf.data.Dataset.zip((zipped_dataset, labels_t))

model.fit(dataset_with_labels, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
 221/1334 [===>..........................] - ETA: 11:58 - loss: 0.3902 - accuracy: 0.8895