In [1]:
import json
import os
import pandas as pd
import numpy as np
import cv2

# load the per-video metadata from disk
with open('metadata.json') as metadata_file:
    metadata = json.load(metadata_file)

# build the dataframe: transpose so each video becomes a row,
# then discard the split column
df = pd.DataFrame(metadata).T.drop(columns=['split'])

df.head()

Unnamed: 0,label,original
owxbbpjpch.mp4,FAKE,wynotylpnm.mp4
vpmyeepbep.mp4,REAL,
fzvpbrzssi.mp4,REAL,
htorvhbcae.mp4,FAKE,wclvkepakb.mp4
fckxaqjbxk.mp4,FAKE,vpmyeepbep.mp4


In [2]:
# report how many videos carry each label
is_real = df.label == 'REAL'
is_fake = df.label == 'FAKE'
print('Number of real videos: ', int(is_real.sum()))
print('Number of fake videos: ', int(is_fake.sum()))

Number of real videos:  86
Number of fake videos:  1248


In [3]:
path = 'data/'

# base names (the 4-character '.mp4' suffix sliced off) of every mp4 file
# found directly inside the data directory
filenames = [name[:-4] for name in os.listdir(path) if name.endswith('.mp4')]

In [4]:
# Switch for the audio preprocessing block below: nothing inside the `if`
# (imports, function definitions, the extraction call) runs when this is False.
PROCESS_AUDIO = True

if PROCESS_AUDIO:
    # These names only exist when the flag is set; code outside this block
    # that uses `librosa`, `mp` or `Pool` depends on this branch having run.
    import librosa
    import moviepy.editor as mp
    from multiprocessing import Pool

    def save_audio_file():
        """Extract the audio track of every video in ``filenames`` to a WAV file.

        For each base name in the module-level ``filenames`` list, opens
        ``path + name + '.mp4'`` with moviepy and writes its audio track to
        ``path + name + '.wav'``. Progress is printed as a percentage on a
        single, continuously overwritten line.

        Videos that have no audio track are skipped and listed at the end
        instead of aborting the whole run with an ``AttributeError``.
        """
        total = len(filenames)
        without_audio = []

        for i, file in enumerate(filenames, start=1):
            clip = mp.VideoFileClip(path + file + '.mp4')
            try:
                if clip.audio is None:
                    # nothing to export for this video
                    without_audio.append(file)
                else:
                    clip.audio.write_audiofile(path + file + '.wav', verbose=False, logger=None)
            finally:
                # always close the clip so its readers are released, even when
                # the export fails or the cell is interrupted
                clip.close()

            # print the progress as a percentage (counts finished files, so the
            # last iteration shows 100.00%)
            print(f'{i / total * 100:.2f}%', end='\r')

        print('Audio files saved')
        if without_audio:
            print(f'Skipped {len(without_audio)} video(s) without an audio track: {without_audio}')


    def save_audio_feature(file, n_frames=300):
        """Compute MFCC features for one extracted WAV file and save them as ``.npy``.

        Args:
            file: base name (no extension) of the audio file; the audio is read
                from ``path + file + '.wav'`` and the features are written to
                ``path + file + '.npy'``.
            n_frames: number of analysis frames the audio is divided into.
                Defaults to 300, the value that was previously hard-coded.

        Raises:
            ValueError: if the audio has fewer than ``n_frames`` samples, in
                which case no non-empty frame can be formed.
        """
        y, sr = librosa.load(path + file + '.wav')

        frame_length = y.shape[0] // n_frames
        if frame_length == 0:
            raise ValueError(
                f'{file}.wav has only {y.shape[0]} samples; '
                f'at least {n_frames} are needed to form {n_frames} frames'
            )

        # discard trailing samples that do not fill a whole frame, plus one more
        # sample: with librosa's default centred framing the frame count is
        # 1 + len(y) // hop_length, so this length is expected to give exactly
        # n_frames columns
        y = y[:(frame_length * n_frames) - 1]

        # in the model the lost time will be shared between the frames,
        # so the error in each individual frame is very small

        # calculate the mfcc with 32 features, using non-overlapping windows
        # (window size equals hop size)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=32, n_fft=frame_length, hop_length=frame_length)

        # save the mfcc
        np.save(path + file + '.npy', mfcc)

    def save_audio_feature_parallel():
        """Apply ``save_audio_feature`` to every entry of ``filenames`` via a process pool."""
        # Pool() with no argument lets multiprocessing pick the worker count
        with Pool() as workers:
            workers.map(save_audio_feature, filenames)
        print('Audio features saved')

    # Only the WAV extraction step is executed here; the MFCC feature step
    # on the next line is commented out and therefore does not run.
    save_audio_file()
    # save_audio_feature_parallel()

# print the shape of a sample audio file
# print(np.load(path + filenames[0] + '.npy').shape)

13.94%

KeyboardInterrupt: 

In [None]:
import tensorflow as tf

# directory that holds the video files
path = 'data/'

# dataset whose elements are the paths of the files matching data/*.mp4
video_glob = path + '*.mp4'
dataset = tf.data.Dataset.list_files(video_glob)

def preprocess_video(video):
    """Placeholder for video preprocessing: hands ``video`` back untouched."""
    unchanged = video
    return unchanged

def preprocess_audio(audio):
    """Placeholder for audio preprocessing: hands ``audio`` back untouched."""
    unchanged = audio
    return unchanged

def preprocess_input(file, n_frames=300):
    """Load the visual frames of one video and split its audio into matching chunks.

    Args:
        file: path of the video file to read.
        n_frames: number of video frames to read and number of audio chunks to
            produce. Defaults to 300 (all videos are expected to be 300 frames
            long).

    Returns:
        A ``(visual_frames, audio_samples)`` tuple. ``visual_frames`` is a
        numpy array stacking the ``n_frames`` decoded frames along the first
        axis. ``audio_samples`` is the result of ``librosa.util.frame`` over
        the audio signal using non-overlapping windows, one window per video
        frame (expected shape ``(samples_per_frame, n_frames)`` with librosa's
        default framing axis).

    Raises:
        ValueError: if the video cannot be opened, contains fewer than
            ``n_frames`` readable frames, or its audio has fewer than
            ``n_frames`` samples.
    """
    # librosa is otherwise only imported inside the PROCESS_AUDIO branch, so
    # import it here to keep this function usable when that branch is skipped
    import librosa

    # load the video and pull out the first n_frames frames
    video = cv2.VideoCapture(file)
    try:
        if not video.isOpened():
            raise ValueError(f'Could not open video: {file}')

        visual_frames = []
        for i in range(n_frames):
            ret, frame = video.read()
            if not ret:
                # read() signals failure through `ret`; appending the returned
                # frame anyway would put None into the result
                raise ValueError(f'{file}: only {i} of {n_frames} frames could be read')
            visual_frames.append(frame)
    finally:
        # release the capture handle whether or not reading succeeded
        video.release()

    # librosa.load returns (samples, sampling_rate); only the samples are framed
    # NOTE(review): decoding audio straight from an .mp4 depends on the audio
    # backend installed alongside librosa - confirm, or load the extracted .wav
    audio, _ = librosa.load(file)

    # break the audio into n_frames equally sized, non-overlapping chunks
    samples_per_frame = len(audio) // n_frames
    if samples_per_frame == 0:
        raise ValueError(f'{file}: audio has only {len(audio)} samples, need at least {n_frames}')

    # drop the trailing samples that do not fill a whole chunk
    audio = audio[:samples_per_frame * n_frames]
    audio_samples = librosa.util.frame(
        audio, frame_length=samples_per_frame, hop_length=samples_per_frame
    )

    # stack into one array so callers can inspect `.shape`
    return np.stack(visual_frames), audio_samples



        

# preprocess the video and audio for the first video
for video_path in dataset.take(1):
    # dataset elements are byte-string tensors; decode to a regular str path.
    # A separate name is used for the result so the loop variable is not shadowed.
    frames, audio = preprocess_input(video_path.numpy().decode('utf-8'))

    # np.shape works for both arrays and (nested) lists, whereas `.shape`
    # raises AttributeError on a plain Python list of frames
    print(f'Video shape: {np.shape(frames)}')
    print(f'Audio shape: {np.shape(audio)}')

In [None]:
import tensorflow as tf

# Example inputs: two text samples paired with two images
# NOTE(review): image1 / image2 are not defined in the visible part of the
# notebook - confirm where they are supposed to come from
text_data = [
    "This is an example text.",
    "Another text sample.",
]
image_data = [image1, image2]  # List of images

# Slice the (text, image) pair element-wise into a dataset
paired_inputs = (text_data, image_data)
dataset = tf.data.Dataset.from_tensor_slices(paired_inputs)

# Import necessary modules
# (Embedding and LSTM are used by the text branch below and must be imported too)
from tensorflow.keras.layers import (
    Input,
    Embedding,
    LSTM,
    Conv2D,
    MaxPooling2D,
    Flatten,
    Dense,
    Concatenate,
)
from tensorflow.keras.models import Model

# NOTE(review): MAX_TEXT_LENGTH, IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS,
# VOCABULARY_SIZE, EMBEDDING_SIZE, LSTM_UNITS, DENSE_UNITS and NUM_CLASSES are
# not defined in the visible part of the notebook - they have to be set in a
# configuration cell before this cell can run.

# Define the text input shape
text_input = Input(shape=(MAX_TEXT_LENGTH,), name='text_input')

# Define the image input shape
image_input = Input(shape=(IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS), name='image_input')

# Process the text input: embedding -> LSTM -> dense projection
text_embedding = Embedding(input_dim=VOCABULARY_SIZE, output_dim=EMBEDDING_SIZE)(text_input)
text_lstm = LSTM(units=LSTM_UNITS)(text_embedding)
text_dense = Dense(units=DENSE_UNITS)(text_lstm)

# Process the image input: two conv/pool stages -> flatten -> dense projection
image_conv1 = Conv2D(filters=32, kernel_size=(3, 3), activation='relu')(image_input)
image_pool1 = MaxPooling2D(pool_size=(2, 2))(image_conv1)
image_conv2 = Conv2D(filters=64, kernel_size=(3, 3), activation='relu')(image_pool1)
image_pool2 = MaxPooling2D(pool_size=(2, 2))(image_conv2)
image_flatten = Flatten()(image_pool2)
image_dense = Dense(units=DENSE_UNITS)(image_flatten)

# Combine the text and image representations and classify
combined = Concatenate()([text_dense, image_dense])
output = Dense(units=NUM_CLASSES, activation='softmax')(combined)

# Create the model
model = Model(inputs=[text_input, image_input], outputs=output)

# Pair up the elements of two datasets into (element1, element2) tuples.
# NOTE(review): dataset1 / dataset2 are not defined in the visible part of the
# notebook - confirm which datasets are meant here.
zipped_dataset = tf.data.Dataset.zip((dataset1, dataset2))

def concatenate_strings(string1, string2):
    """Return ``string1`` followed by ``string2``, joined with the ``+`` operator."""
    joined = string1 + string2
    return joined

# map unpacks each (x, y) pair into the two positional arguments,
# so the function can be passed directly
concatenated_dataset = zipped_dataset.map(concatenate_strings)
