**Before running this notebook**, manually delete all temporary folders (abandoned, troublesome, low_quality).

---

This notebook generates the clip info used by the later clipping step.

In [1]:
import sys

if '../Utils/' not in sys.path:
    sys.path.append('../Utils/')
# The wildcard import stays ahead of the explicit imports so that they win any name clash.
from FileOperation import *

import heapq
import json
import statistics

import cv2

# Root directory of the data; every input and output path in this notebook is built from it.
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
rootDir = '/home/kyr/GazeForensicsData/'

In [2]:
def gen_clip_len_list(
        len_per_vid,
        vid_category_dict,
        max_splits=None
    ):
    '''
    Count how many clips of `len_per_vid` frames every video yields, then make
    fake_train and real_train contain the same total number of clips.

    Parameters
        len_per_vid: frame count for each video clip, must be positive
        vid_category_dict: dict of lists of video categories, contains 4 lists of cropped video paths,
            should be like {'fake_train': [...], 'fake_test': [...], 'real_train': [...], 'real_test': [...]}
            A dict returned by an earlier call is accepted as well: its 'len_per_vid' entry is
            ignored and every video is measured again, so the calling cell can simply be re-run
        max_splits: maximum number of splits per video, only applied to categories whose key contains 'train',
            e.g. if max_splits = 5, len_per_vid=10, actual video length is 65,
                it will be split into 5 clips of length 10 instead of 6 clips
            None means no limit
    Returns
        A new dict (the input dict and its lists are left untouched) mapping every category to a list of
        {'path': ..., 'split_num': ..., 'vid_len': ...} entries, plus a 'len_per_vid' entry
    Raises
        ValueError: if len_per_vid is not positive or max_splits is negative
        KeyError: if 'fake_train' or 'real_train' is missing from vid_category_dict
    '''
    if len_per_vid <= 0:
        raise ValueError('len_per_vid must be positive, got {!r}'.format(len_per_vid))
    if max_splits is not None and max_splits < 0:
        raise ValueError('max_splits must be None or non-negative, got {!r}'.format(max_splits))

    # Get split number for each video
    result = {}
    for vid_category_key, vid_entries in vid_category_dict.items():
        if vid_category_key == 'len_per_vid':
            # Left over from a previous call; it is written again at the end.
            continue
        vid_infos = []
        for vid_entry in vid_entries:
            # Entries are plain paths on the first call and info dicts on a re-run.
            vid_path = vid_entry['path'] if isinstance(vid_entry, dict) else vid_entry
            vid = cv2.VideoCapture(vid_path)
            try:
                vid_len = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
            finally:
                vid.release()
            # Clamp so that a non-positive frame count (presumably a file that could
            # not be opened - TODO confirm) contributes zero clips instead of a negative number.
            split_num = max(int(vid_len // len_per_vid), 0)
            if max_splits is not None and 'train' in vid_category_key:
                split_num = min(split_num, max_splits)
            vid_infos.append({'path': vid_path, 'split_num': split_num, 'vid_len': vid_len})
        result[vid_category_key] = vid_infos

    # Get split number for each category
    split_num_dict = {
        vid_category_key: sum(vid['split_num'] for vid in vid_infos)
        for vid_category_key, vid_infos in result.items()
    }
    print('split_num_dict before tweaking:', split_num_dict)

    # Make fake_train and real_train have the same number of splits by reducing the category with more splits
    # Cut one split from the video with the most splits until two categories have the same number of splits
    if split_num_dict['fake_train'] > split_num_dict['real_train']:
        more_split_category, less_split_category = 'fake_train', 'real_train'
    else:
        more_split_category, less_split_category = 'real_train', 'fake_train'
    surplus = split_num_dict[more_split_category] - split_num_dict[less_split_category]
    if surplus > 0:
        more_split_vids = result[more_split_category]
        # Max-heap keyed on (-split_num, index): the top is always the video with the most
        # splits, and ties go to the earliest video in the list.
        heap = [(-vid['split_num'], idx) for idx, vid in enumerate(more_split_vids)]
        heapq.heapify(heap)
        for _ in range(surplus):
            neg_split_num, idx = heap[0]
            more_split_vids[idx]['split_num'] -= 1
            heapq.heapreplace(heap, (neg_split_num + 1, idx))
        split_num_dict[more_split_category] -= surplus
    print('split_num_dict after tweaking:', split_num_dict)

    result['len_per_vid'] = len_per_vid
    return result

In [3]:
def display_statistic_info(myData, tab_num=0):
    '''
    Print min, max, mean (rounded to 2 decimals) and median (truncated to int) of myData,
    one per line, each line prefixed with `tab_num` tab characters.
    '''
    indent = '\t' * tab_num
    # Each statistic is computed right before it is printed.
    summaries = (
        ('min:', lambda: min(myData)),
        ('max:', lambda: max(myData)),
        ('mean:', lambda: round(sum(myData) / len(myData), 2)),
        ('median:', lambda: int(statistics.median(myData))),
    )
    for label, compute in summaries:
        print(indent + label, compute())



def display_category_dict_info(vid_category_dict):
    '''
    Print the vid_len and split_num statistics of the four categories
    (fake_train, fake_test, real_train, real_test), in that order.
    '''
    sections = (
        ('Fake Train:\n\tvid_len:', 'fake_train'),
        ('Fake Test:\n\tvid_len:', 'fake_test'),
        ('Real Train:\n\tvid_len:', 'real_train'),
        ('Real Test:\n\tvid_len:', 'real_test'),
    )
    for header, category_key in sections:
        print(header)
        display_statistic_info(
            [vid['vid_len'] for vid in vid_category_dict[category_key]],
            tab_num=2
        )
        print('\tsplit_num:')
        display_statistic_info(
            [vid['split_num'] for vid in vid_category_dict[category_key]],
            tab_num=2
        )

**CDF**

In [4]:
# Build the four CDF video lists.
# Test videos come from List_of_testing_videos.txt, where each line is '<label> <relative path>':
# label '0' goes to fake_test and label '1' to real_test. Every other .mp4 under CDF_crop/
# becomes a training video, fake if its path contains 'synthesis' and real if it contains 'real'.
CDF_dir = rootDir + 'cropped_videos/CDF_crop/'
CDF_vid_category_dict = {'fake_train': None, 'fake_test': None, 'real_train': None, 'real_test': None}
with open(CDF_dir + 'List_of_testing_videos.txt', 'r') as f:
    CDF_test_list = f.readlines()
CDF_vid_category_dict['fake_test'] = [CDF_dir + vid[2:].replace('\n', '') for vid in CDF_test_list if vid[0] == '0']
CDF_vid_category_dict['real_test'] = [CDF_dir + vid[2:].replace('\n', '') for vid in CDF_test_list if vid[0] == '1']
all_CDF_videos = [CDF_dir + vid for vid in fileWalk(CDF_dir) if vid[-4:] == '.mp4']
# Sets make the train/test exclusion a constant-time lookup per video instead of a list scan.
CDF_fake_test_set = set(CDF_vid_category_dict['fake_test'])
CDF_real_test_set = set(CDF_vid_category_dict['real_test'])
# NOTE(review): the substring checks run on the absolute path, so rootDir itself must not
# contain 'synthesis' or 'real'.
CDF_vid_category_dict['fake_train'] = [
    vid for vid in all_CDF_videos
    if vid not in CDF_fake_test_set and 'synthesis' in vid
]
CDF_vid_category_dict['real_train'] = [
    vid for vid in all_CDF_videos
    if vid not in CDF_real_test_set and 'real' in vid
]

In [5]:
# Split every CDF video into clips of 14 frames, without a per-video cap on the number of clips.
CDF_vid_category_dict = gen_clip_len_list(
    len_per_vid=14,
    vid_category_dict=CDF_vid_category_dict,
    max_splits=None
)

split_num_dict before tweaking: {'fake_train': 139528, 'fake_test': 8977, 'real_train': 20268, 'real_test': 4940}
split_num_dict after tweaking: {'fake_train': 20268, 'fake_test': 8977, 'real_train': 20268, 'real_test': 4940}


In [6]:
# Print min / max / mean / median of vid_len and split_num for each CDF category.
display_category_dict_info(CDF_vid_category_dict)

Fake Train:
	vid_len:
		min: 161
		max: 740
		mean: 375.28
		median: 342
	split_num:
		min: 3
		max: 4
		mean: 3.82
		median: 4
Fake Test:
	vid_len:
		min: 105
		max: 639
		mean: 376.2
		median: 365
	split_num:
		min: 7
		max: 45
		mean: 26.4
		median: 26
Real Train:
	vid_len:
		min: 161
		max: 740
		mean: 405.51
		median: 428
	split_num:
		min: 11
		max: 52
		mean: 28.51
		median: 30
Real Test:
	vid_len:
		min: 126
		max: 611
		mean: 394.99
		median: 410
	split_num:
		min: 9
		max: 43
		mean: 27.75
		median: 29


In [7]:
# Display the generated CDF clip info (one entry per video, so the output is long).
CDF_vid_category_dict

{'fake_train': [{'path': '/home/kyr/GazeForensicsData/cropped_videos/CDF_crop/Celeb-synthesis/id47_id45_0004.mp4',
   'split_num': 3,
   'vid_len': 314},
  {'path': '/home/kyr/GazeForensicsData/cropped_videos/CDF_crop/Celeb-synthesis/id3_id4_0004.mp4',
   'split_num': 3,
   'vid_len': 499},
  {'path': '/home/kyr/GazeForensicsData/cropped_videos/CDF_crop/Celeb-synthesis/id13_id7_0008.mp4',
   'split_num': 3,
   'vid_len': 459},
  {'path': '/home/kyr/GazeForensicsData/cropped_videos/CDF_crop/Celeb-synthesis/id1_id9_0004.mp4',
   'split_num': 3,
   'vid_len': 409},
  {'path': '/home/kyr/GazeForensicsData/cropped_videos/CDF_crop/Celeb-synthesis/id39_id47_0001.mp4',
   'split_num': 3,
   'vid_len': 315},
  {'path': '/home/kyr/GazeForensicsData/cropped_videos/CDF_crop/Celeb-synthesis/id4_id30_0005.mp4',
   'split_num': 3,
   'vid_len': 467},
  {'path': '/home/kyr/GazeForensicsData/cropped_videos/CDF_crop/Celeb-synthesis/id30_id29_0001.mp4',
   'split_num': 3,
   'vid_len': 451},
  {'path': '

In [8]:
# Save the CDF clip info as JSON.
# NOTE: the clip_info/ directory must already exist; open() does not create it.
with open(rootDir + 'clip_info/CDF_vid_category_dict.json', 'w') as f:
    json.dump(CDF_vid_category_dict, f)

**FF++**

In [9]:
# Build the four FF++ video lists.
# test.json / train.json hold nested lists of video ids, which are flattened here.
# A real video belongs to a split if its file name without '.mp4' is one of the split's ids;
# a fake video if the part of its file name before the first '_' is.
FF_dir = rootDir + 'cropped_videos/FF++_crop/'
FF_vid_category_dict = {'fake_train': [], 'fake_test': [], 'real_train': None, 'real_test': None}
with open(FF_dir + 'test.json', 'r') as f:
    FF_test_list = json.load(f)
    FF_test_list = [item for sublist in FF_test_list for item in sublist]
with open(FF_dir + 'train.json', 'r') as f:
    FF_train_list = json.load(f)
    FF_train_list = [item for sublist in FF_train_list for item in sublist]
# Sets make every id lookup constant-time instead of a scan through the whole id list.
FF_test_ids = set(FF_test_list)
FF_train_ids = set(FF_train_list)
manipulation_types = ['Deepfakes', 'Face2Face', 'FaceSwap', 'NeuralTextures']
# Each directory is listed once and the listing is shared by the train and the test filter.
temp_vids_list = [FF_dir + 'real/' + i for i in ls(FF_dir + 'real/') if i[-4:] == '.mp4']
FF_vid_category_dict['real_train'] = [vid for vid in temp_vids_list if vid.split('/')[-1][:-4] in FF_train_ids]
FF_vid_category_dict['real_test'] = [vid for vid in temp_vids_list if vid.split('/')[-1][:-4] in FF_test_ids]
for manipulation_type in manipulation_types:
    temp_vids_list = [
        FF_dir + 'fake/' + manipulation_type + '/' + i
        for i in ls(FF_dir + 'fake/' + manipulation_type + '/')
        if i[-4:] == '.mp4'
    ]
    FF_vid_category_dict['fake_train'].extend(
        [vid for vid in temp_vids_list if vid.split('/')[-1].split('_')[0] in FF_train_ids]
    )
    FF_vid_category_dict['fake_test'].extend(
        [vid for vid in temp_vids_list if vid.split('/')[-1].split('_')[0] in FF_test_ids]
    )

In [10]:
# First pass: split every FF++ video into clips of 14 frames, without a per-video cap.
FF_vid_category_dict = gen_clip_len_list(
    len_per_vid=14,
    vid_category_dict=FF_vid_category_dict,
    max_splits=None
)

split_num_dict before tweaking: {'fake_train': 92624, 'fake_test': 18810, 'real_train': 25854, 'real_test': 5207}
split_num_dict after tweaking: {'fake_train': 25854, 'fake_test': 18810, 'real_train': 25854, 'real_test': 5207}


In [11]:
# Print min / max / mean / median of vid_len and split_num for each FF++ category (uncapped run).
display_category_dict_info(FF_vid_category_dict)

Fake Train:
	vid_len:
		min: 164
		max: 1496
		mean: 457.03
		median: 409
	split_num:
		min: 8
		max: 9
		mean: 8.98
		median: 9
Fake Test:
	vid_len:
		min: 278
		max: 1814
		mean: 476.42
		median: 437
	split_num:
		min: 19
		max: 129
		mean: 33.59
		median: 31
Real Train:
	vid_len:
		min: 292
		max: 1496
		mean: 509.34
		median: 459
	split_num:
		min: 20
		max: 106
		mean: 35.91
		median: 32
Real Test:
	vid_len:
		min: 301
		max: 1814
		mean: 526.73
		median: 460
	split_num:
		min: 21
		max: 129
		mean: 37.19
		median: 32


---

The FF++ dataset contains some very long videos that produce too many splits.

So `max_splits` is set to the mean of real_train's split_num (35.91, rounded up to 36).

---

In [12]:
# Rebuild the four FF++ video lists from scratch for the capped run.
# test.json / train.json hold nested lists of video ids, which are flattened here.
# A real video belongs to a split if its file name without '.mp4' is one of the split's ids;
# a fake video if the part of its file name before the first '_' is.
FF_dir = rootDir + 'cropped_videos/FF++_crop/'
FF_vid_category_dict = {'fake_train': [], 'fake_test': [], 'real_train': None, 'real_test': None}
with open(FF_dir + 'test.json', 'r') as f:
    FF_test_list = json.load(f)
    FF_test_list = [item for sublist in FF_test_list for item in sublist]
with open(FF_dir + 'train.json', 'r') as f:
    FF_train_list = json.load(f)
    FF_train_list = [item for sublist in FF_train_list for item in sublist]
# Sets make every id lookup constant-time instead of a scan through the whole id list.
FF_test_ids = set(FF_test_list)
FF_train_ids = set(FF_train_list)
manipulation_types = ['Deepfakes', 'Face2Face', 'FaceSwap', 'NeuralTextures']
# Each directory is listed once and the listing is shared by the train and the test filter.
temp_vids_list = [FF_dir + 'real/' + i for i in ls(FF_dir + 'real/') if i[-4:] == '.mp4']
FF_vid_category_dict['real_train'] = [vid for vid in temp_vids_list if vid.split('/')[-1][:-4] in FF_train_ids]
FF_vid_category_dict['real_test'] = [vid for vid in temp_vids_list if vid.split('/')[-1][:-4] in FF_test_ids]
for manipulation_type in manipulation_types:
    temp_vids_list = [
        FF_dir + 'fake/' + manipulation_type + '/' + i
        for i in ls(FF_dir + 'fake/' + manipulation_type + '/')
        if i[-4:] == '.mp4'
    ]
    FF_vid_category_dict['fake_train'].extend(
        [vid for vid in temp_vids_list if vid.split('/')[-1].split('_')[0] in FF_train_ids]
    )
    FF_vid_category_dict['fake_test'].extend(
        [vid for vid in temp_vids_list if vid.split('/')[-1].split('_')[0] in FF_test_ids]
    )

In [13]:
# Second pass with a per-video cap, which gen_clip_len_list applies to the train categories only.
# 36 is the mean split_num of real_train in the uncapped run (35.91), rounded up.
FF_vid_category_dict = gen_clip_len_list(
    len_per_vid=14,
    vid_category_dict=FF_vid_category_dict,
    max_splits=36
)

split_num_dict before tweaking: {'fake_train': 83593, 'fake_test': 18810, 'real_train': 21990, 'real_test': 5207}
split_num_dict after tweaking: {'fake_train': 21990, 'fake_test': 18810, 'real_train': 21990, 'real_test': 5207}


In [14]:
# Print min / max / mean / median of vid_len and split_num for each FF++ category (capped run).
display_category_dict_info(FF_vid_category_dict)

Fake Train:
	vid_len:
		min: 164
		max: 1496
		mean: 457.03
		median: 409
	split_num:
		min: 7
		max: 8
		mean: 7.64
		median: 8
Fake Test:
	vid_len:
		min: 278
		max: 1814
		mean: 476.42
		median: 437
	split_num:
		min: 19
		max: 129
		mean: 33.59
		median: 31
Real Train:
	vid_len:
		min: 292
		max: 1496
		mean: 509.34
		median: 459
	split_num:
		min: 20
		max: 36
		mean: 30.54
		median: 32
Real Test:
	vid_len:
		min: 301
		max: 1814
		mean: 526.73
		median: 460
	split_num:
		min: 21
		max: 129
		mean: 37.19
		median: 32


In [15]:
# Display the generated FF++ clip info (one entry per video, so the output is long).
FF_vid_category_dict

{'fake_train': [{'path': '/home/kyr/GazeForensicsData/cropped_videos/FF++_crop/fake/Deepfakes/001_870.mp4',
   'split_num': 7,
   'vid_len': 460},
  {'path': '/home/kyr/GazeForensicsData/cropped_videos/FF++_crop/fake/Deepfakes/002_006.mp4',
   'split_num': 7,
   'vid_len': 693},
  {'path': '/home/kyr/GazeForensicsData/cropped_videos/FF++_crop/fake/Deepfakes/005_010.mp4',
   'split_num': 7,
   'vid_len': 385},
  {'path': '/home/kyr/GazeForensicsData/cropped_videos/FF++_crop/fake/Deepfakes/006_002.mp4',
   'split_num': 7,
   'vid_len': 310},
  {'path': '/home/kyr/GazeForensicsData/cropped_videos/FF++_crop/fake/Deepfakes/007_132.mp4',
   'split_num': 7,
   'vid_len': 505},
  {'path': '/home/kyr/GazeForensicsData/cropped_videos/FF++_crop/fake/Deepfakes/008_990.mp4',
   'split_num': 7,
   'vid_len': 637},
  {'path': '/home/kyr/GazeForensicsData/cropped_videos/FF++_crop/fake/Deepfakes/009_027.mp4',
   'split_num': 7,
   'vid_len': 588},
  {'path': '/home/kyr/GazeForensicsData/cropped_videos/

In [16]:
# Save the FF++ clip info as JSON.
# NOTE: the clip_info/ directory must already exist; open() does not create it.
with open(rootDir + 'clip_info/FF++_vid_category_dict.json', 'w') as f:
    json.dump(FF_vid_category_dict, f)

**WDF**

In [17]:
# Build the four WDF video lists: WDF_crop/ has one sub-directory per category,
# and every .mp4 found under it belongs to that category.
temp_dir = rootDir + 'cropped_videos/WDF_crop/'
WDF_vid_category_dict = {
    category: [
        temp_dir + category + '/' + i
        for i in fileWalk(temp_dir + category + '/')
        if i[-4:] == '.mp4'
    ]
    for category in ('fake_train', 'fake_test', 'real_train', 'real_test')
}

In [18]:
# First pass: split every WDF video into clips of 14 frames, without a per-video cap.
WDF_vid_category_dict = gen_clip_len_list(
    len_per_vid=14,
    vid_category_dict=WDF_vid_category_dict,
    max_splits=None
)

split_num_dict before tweaking: {'fake_train': 43739, 'fake_test': 7452, 'real_train': 25675, 'real_test': 3999}
split_num_dict after tweaking: {'fake_train': 25675, 'fake_test': 7452, 'real_train': 25675, 'real_test': 3999}


In [19]:
# Print min / max / mean / median of vid_len and split_num for each WDF category (uncapped run).
display_category_dict_info(WDF_vid_category_dict)

Fake Train:
	vid_len:
		min: 51
		max: 29008
		mean: 204.12
		median: 125
	split_num:
		min: 3
		max: 12
		mean: 8.28
		median: 8
Fake Test:
	vid_len:
		min: 52
		max: 3418
		mean: 260.98
		median: 144
	split_num:
		min: 3
		max: 244
		mean: 18.18
		median: 10
Real Train:
	vid_len:
		min: 51
		max: 3572
		mean: 112.02
		median: 85
	split_num:
		min: 3
		max: 255
		mean: 7.53
		median: 6
Real Test:
	vid_len:
		min: 51
		max: 3655
		mean: 148.13
		median: 86
	split_num:
		min: 3
		max: 261
		mean: 10.1
		median: 6


---

The WildDeepfake dataset contains some very long videos that produce too many splits.

So `max_splits` is set to the mean of real_train's split_num (7.53, rounded up to 8).

---

In [20]:
# Rebuild the four WDF video lists for the capped run: WDF_crop/ has one sub-directory
# per category, and every .mp4 found under it belongs to that category.
temp_dir = rootDir + 'cropped_videos/WDF_crop/'
WDF_vid_category_dict = {
    category: [
        temp_dir + category + '/' + i
        for i in fileWalk(temp_dir + category + '/')
        if i[-4:] == '.mp4'
    ]
    for category in ('fake_train', 'fake_test', 'real_train', 'real_test')
}

In [21]:
# Second pass with a per-video cap, which gen_clip_len_list applies to the train categories only.
# 8 is the mean split_num of real_train in the uncapped run (7.53), rounded up.
WDF_vid_category_dict = gen_clip_len_list(
    len_per_vid=14,
    vid_category_dict=WDF_vid_category_dict,
    max_splits=8
)

split_num_dict before tweaking: {'fake_train': 21426, 'fake_test': 7452, 'real_train': 19736, 'real_test': 3999}
split_num_dict after tweaking: {'fake_train': 19736, 'fake_test': 7452, 'real_train': 19736, 'real_test': 3999}


In [22]:
# Print min / max / mean / median of vid_len and split_num for each WDF category (capped run).
display_category_dict_info(WDF_vid_category_dict)

Fake Train:
	vid_len:
		min: 51
		max: 29008
		mean: 204.12
		median: 125
	split_num:
		min: 3
		max: 8
		mean: 6.37
		median: 7
Fake Test:
	vid_len:
		min: 52
		max: 3418
		mean: 260.98
		median: 144
	split_num:
		min: 3
		max: 244
		mean: 18.18
		median: 10
Real Train:
	vid_len:
		min: 51
		max: 3572
		mean: 112.02
		median: 85
	split_num:
		min: 3
		max: 8
		mean: 5.79
		median: 6
Real Test:
	vid_len:
		min: 51
		max: 3655
		mean: 148.13
		median: 86
	split_num:
		min: 3
		max: 261
		mean: 10.1
		median: 6


In [29]:
# Display the fake_test entries. The key is named directly instead of being picked by its
# position among the dict keys, which only worked because 'fake_test' was inserted second.
WDF_vid_category_dict['fake_test']

[{'path': '/home/kyr/GazeForensicsData/cropped_videos/WDF_crop/fake_test/81_24.mp4',
  'split_num': 11,
  'vid_len': 157},
 {'path': '/home/kyr/GazeForensicsData/cropped_videos/WDF_crop/fake_test/4_432.mp4',
  'split_num': 4,
  'vid_len': 58},
 {'path': '/home/kyr/GazeForensicsData/cropped_videos/WDF_crop/fake_test/79_184.mp4',
  'split_num': 12,
  'vid_len': 178},
 {'path': '/home/kyr/GazeForensicsData/cropped_videos/WDF_crop/fake_test/62_170.mp4',
  'split_num': 15,
  'vid_len': 215},
 {'path': '/home/kyr/GazeForensicsData/cropped_videos/WDF_crop/fake_test/44_40.mp4',
  'split_num': 25,
  'vid_len': 362},
 {'path': '/home/kyr/GazeForensicsData/cropped_videos/WDF_crop/fake_test/33_0.mp4',
  'split_num': 9,
  'vid_len': 134},
 {'path': '/home/kyr/GazeForensicsData/cropped_videos/WDF_crop/fake_test/35_2.mp4',
  'split_num': 51,
  'vid_len': 719},
 {'path': '/home/kyr/GazeForensicsData/cropped_videos/WDF_crop/fake_test/52_23.mp4',
  'split_num': 20,
  'vid_len': 289},
 {'path': '/home/ky

In [24]:
# Save the WDF clip info as JSON.
# NOTE: the clip_info/ directory must already exist; open() does not create it.
with open(rootDir + 'clip_info/WDF_vid_category_dict.json', 'w') as f:
    json.dump(WDF_vid_category_dict, f)