## Set up

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import tensorflow as tf

import os
import glob
import cv2

## Reading the split data from the FaceForensics++ GitHub repository (dataset/splits)

In [2]:
import json 

# Read the test split definition. The context manager closes the file handle as
# soon as the JSON is parsed (the original left it open for the kernel's lifetime).
with open('FaceForensics-master/dataset/splits/test.json') as test:
    test_videos = json.load(test)

# Preview the first three entries of the split.
for pair in test_videos[:3]:
    print(pair)

['953', '974']
['012', '026']
['078', '955']


In [3]:
# Read the training split definition. The context manager closes the file handle as
# soon as the JSON is parsed (the original left it open for the kernel's lifetime).
with open('FaceForensics-master/dataset/splits/train.json') as train:
    train_videos = json.load(train)

# Preview the first three entries of the split.
for pair in train_videos[:3]:
    print(pair)

['071', '054']
['087', '081']
['881', '856']


In [4]:
# Read the validation split definition. The context manager closes the file handle as
# soon as the JSON is parsed (the original left it open for the kernel's lifetime).
with open('FaceForensics-master/dataset/splits/val.json') as val:
    val_videos = json.load(val)

# Preview the first three entries of the split.
for pair in val_videos[:3]:
    print(pair)

['720', '672']
['939', '115']
['284', '263']


## Performing checks that there are no overlaps in the videos for any split category

The main idea is that if a given video appears in a train set, it must not appear in any way in the validation or test set (this includes facial region or the background: for example a training original video must not be used as a source or target in any validation or test video).

The first check obtains a list of all distinct videos in the pairs from the .json files and finds that there is no overlap between any two categories. The second check counts the number of distinct pairs ((a, b) = (b, a)) and verifies that they match the originally provided pairs exactly.
The last check verifies that there is no intersection in pairs between any two categories.

In [5]:
def _distinct_videos_and_pairs(pairs):
    '''Collect the distinct video ids and the distinct unordered pairs of one split.

    pairs is the list loaded from a split .json file; every entry is expected to be
    a two-element list of video ids (strings).

    Returns a tuple (videos, unique_pairs):
      videos       - every id that occurs in any pair, without repeats, in the order
                     in which it was first seen
      unique_pairs - the entries of pairs in their original order, skipping any entry
                     whose two ids were already seen together in either orientation,
                     i.e. (a, b) is treated as equal to (b, a)
    '''
    videos, unique_pairs = [], []
    # sets mirror the two lists so membership tests do not rescan the lists each time
    seen_videos, seen_pairs = set(), set()

    for el in pairs:
        for video in (el[0], el[1]):
            if video not in seen_videos:
                seen_videos.add(video)
                videos.append(video)

        # a frozenset ignores orientation, so [a, b] and [b, a] share one key
        key = frozenset((el[0], el[1]))
        if key not in seen_pairs:
            seen_pairs.add(key)
            unique_pairs.append(el)

    return videos, unique_pairs


# Each line prints: number of distinct videos, number of pairs in the .json file,
# number of distinct unordered pairs, and whether de-duplication left the list unchanged.
distinct_train, distinct_train_pairs = _distinct_videos_and_pairs(train_videos)
print("Training:", len(distinct_train), len(train_videos), len(distinct_train_pairs),
      train_videos == distinct_train_pairs)

distinct_val, distinct_val_pairs = _distinct_videos_and_pairs(val_videos)
print("Validation:", len(distinct_val), len(val_videos), len(distinct_val_pairs),
      val_videos == distinct_val_pairs)

distinct_test, distinct_test_pairs = _distinct_videos_and_pairs(test_videos)
print("Testing:", len(distinct_test), len(test_videos), len(distinct_test_pairs),
      test_videos == distinct_test_pairs)

Training: 720 360 360 True
Validation: 140 70 70 True
Testing: 140 70 70 True


In [6]:
# Convert the per-split lists of distinct videos to sets so that overlaps
# can be computed with set intersection.
train_set = set(distinct_train)
val_set = set(distinct_val)
test_set = set(distinct_test)

# Print the size of each pairwise overlap in the order:
# test/train, test/validation, train/validation.
print(*(len(first & second) for first, second in (
    (test_set, train_set),
    (test_set, val_set),
    (train_set, val_set),
)))

0 0 0


In [7]:
# For every two splits, keep the pairs of the first split that also occur in the
# second one, in either orientation ([a, b] or [b, a]).
train_test_intersept = [
    pair for pair in distinct_train_pairs
    if pair in distinct_test_pairs or [pair[1], pair[0]] in distinct_test_pairs
]

train_val_intersept = [
    pair for pair in distinct_train_pairs
    if pair in distinct_val_pairs or [pair[1], pair[0]] in distinct_val_pairs
]

val_test_intersept = [
    pair for pair in distinct_val_pairs
    if pair in distinct_test_pairs or [pair[1], pair[0]] in distinct_test_pairs
]

# Number of shared pairs: train/test, train/validation, validation/test.
print(len(train_test_intersept), len(train_val_intersept), len(val_test_intersept))

0 0 0


## Moving data

In [12]:
def assign_path(method, folder):
    '''Build the destination path prefix for the images of one video folder.

    The split (train, test or validation) is looked up in the distinct video
    lists computed earlier (distinct_test, distinct_train, distinct_val).

    Parameters:
        method - 'Original' for authentic videos, otherwise the name of the
                 manipulation method
        folder - name of the video folder: a single video id for 'Original',
                 otherwise two 3-character ids joined by one separator character
                 (e.g. '000_003')

    Returns:
        A string 'forensics_split/<split>/<class>/<method>_<folder>_' to which
        the image file name is appended by copy_and_move.

    Raises:
        ValueError - if the video(s) cannot be assigned to exactly one split.
                     (Previously this case failed later with an UnboundLocalError.)
    '''

    if method == 'Original':
        videos = [folder]
        class_category = 'authentic/'
    else:
        # both videos involved in a manipulated clip must belong to the same split
        videos = [folder[0:3], folder[4:7]]
        class_category = 'fake/'

    if all(video in distinct_test for video in videos):
        split_category = 'test/'
    elif all(video in distinct_train for video in videos):
        split_category = 'train/'
    elif all(video in distinct_val for video in videos):
        split_category = 'validation/'
    elif method == 'Original':
        raise ValueError('Video folder ' + repr(folder) + ' does not belong to any split')
    else:
        raise ValueError('There is a mismatch in folders for the fake class: '
                         + repr(folder) + ' (' + method + ')')

    additional_name = method + '_' + folder + '_'

    new_path = 'forensics_split/' + split_category + class_category + additional_name

    return new_path

In [16]:
def copy_and_move(methods, n):
    '''Loops through all cropped images and writes their copies to the required new
    locations (split by category (train, test or validation) and class (fake or
    authentic)).

    Parameters:
        methods - list of method names; 'Original' is read from
                  'original_sequences/youtube/c0/', every other name from
                  'manipulated_sequences/<method>/c0/'
        n       - only the first n video folders (in sorted order) of each method
                  are processed

    Raises:
        ValueError - propagated from assign_path when a folder fits no split
        OSError    - if an image could not be written to its destination

    Returns None.
    '''

    for method in methods:
        if method == 'Original':
            path = 'original_sequences/youtube/c0/'
        else:
            path = 'manipulated_sequences/' + method + '/c0/'

        print("Starting to copy", method, "images")
        images_root = path + method + '_images'
        # ignore the macOS metadata entry that may sit next to the video folders
        folders = sorted(set(os.listdir(images_root)) - {'.DS_Store'})

        # only look at the first n videos for a given method
        for folder in folders[0:n]:
            # output progress: print folders whose 2nd and 3rd characters are '0'
            if folder[1:3] == '00':
                print(folder)

            full_path = images_root + '/' + folder
            images_list = os.listdir(full_path)
            if not images_list:
                continue

            # the destination prefix depends only on (method, folder), so compute it once
            new_path = assign_path(method, folder)
            # cv2.imwrite does not create missing directories; it just returns False
            os.makedirs(os.path.dirname(new_path), exist_ok=True)

            for el in images_list:
                source = full_path + '/' + el
                img = cv2.imread(source)
                if img is None:
                    # cv2.imread returns None for anything it cannot decode
                    print("Skipping unreadable file", source)
                    continue

                # save image in a new place
                if not cv2.imwrite(new_path + el, img):
                    raise OSError("Could not write image " + new_path + el)

        print("Copying", method, "images is complete\n")

    return None

In [14]:
# Copy the authentic images; n=1000 equals the total number of distinct videos
# across the three splits, so no folder is left out.
copy_and_move(methods=['Original'], n=1000)

Starting to copy Original images
000
100
200
300
400
500
600
700
800
900
Copying Original images is complete



In [17]:
# Copy the images of every manipulation method, again taking up to 1000 folders each.
copy_and_move(
    methods=['Deepfakes', 'Face2Face', 'FaceSwap', 'NeuralTextures'],
    n=1000,
)

Starting to copy Deepfakes images
000_003
100_077
200_189
300_304
400_476
500_592
600_505
700_813
800_840
900_926
Copying Deepfakes images is complete

Starting to copy Face2Face images
000_003
100_077
200_189
300_304
400_476
500_592
600_505
700_813
800_840
900_926
Copying Face2Face images is complete

Starting to copy FaceSwap images
000_003
100_077
200_189
300_304
400_476
500_592
600_505
700_813
800_840
900_926
Copying FaceSwap images is complete

Starting to copy NeuralTextures images
000_003
100_077
200_189
300_304
400_476
500_592
600_505
700_813
800_840
900_926
Copying NeuralTextures images is complete



## Getting the total counts for each class and split category

Here I wanted to verify that the numbers match those quoted in the FaceForensics++ paper (page 12, appendix). Since we only took every 15th frame, our numbers equate to those in the paper divided by 15. Note, this is only approximate because a given video rarely contained a number of frames divisible by 15, hence our cropped images represent a slightly larger proportion of the video (more than a fifteenth). 

My numbers are usually around 1000 images more than the paper's equivalent numbers, so an error is very unlikely.

In [24]:
# Count the images written for every method in every split directory.
for method in ['Original', 'Deepfakes', 'Face2Face', 'FaceSwap', 'NeuralTextures']:
    # authentic images live in their own class folder; all manipulations share 'fake/'
    method_group = 'authentic/' if method == 'Original' else 'fake/'
    print(method)

    for split_group in ['train', 'validation', 'test']:
        path = 'forensics_split/' + split_group + '/' + method_group
        # keep only the files whose name begins with the current method name
        data = [name for name in os.listdir(path) if name.startswith(method)]
        print(split_group + ':', len(data))

    print('\n')

Original
train: 24789
validation: 4629
test: 4981


Deepfakes
train: 24775
validation: 4629
test: 4986


Face2Face
train: 24785
validation: 4629
test: 4987


FaceSwap
train: 19790
validation: 3702
test: 4049


NeuralTextures
train: 19789
validation: 3702
test: 4051


