In [2]:
import os
import json
import pandas as pd
import glob

## FF++

In [None]:
dataset_root_path = "/mnt/sdc/maisie/"
compression_level = "c23"
output_file_path  = "/mnt/sdb/maisie/SelfBlendedImages/data/FF++_trainwotest"

ff_dict = {
    "Deepfakes": "FF-DF",
    "Face2Face": "FF-F2F",
    "FaceSwap": "FF-FS",
    "Real": "FF-real",
    "NeuralTextures": "FF-NT",
    "FaceShifter": "FF-FH",
}

# Load the JSON files for data split
dataset_path = os.path.join(dataset_root_path, "FaceForensics++")

# Load the JSON files for data split
with open(
        file=os.path.join(
            os.path.join(
                dataset_path,
                "train.json",
            )),
        mode="r",
) as f:
    train_json = json.load(f)
with open(
        file=os.path.join(
            os.path.join(dataset_path,"val.json")),
        mode="r",
) as f:
    val_json = json.load(f)
with open(
        file=os.path.join(
            os.path.join(dataset_path,"test.json")),
        mode="r",
) as f:
    test_json = json.load(f)

# Create a dictionary for searching the data split
video_to_mode = dict()
for d1, d2 in train_json:
    video_to_mode[d1] = "train"
    video_to_mode[d2] = "train"
    video_to_mode[d1 + "_" + d2] = "train"
    video_to_mode[d2 + "_" + d1] = "train"
for d1, d2 in val_json:
    video_to_mode[d1] = 'val'
    video_to_mode[d2] = 'val'
    video_to_mode[d1+'_'+d2] = 'val'
    video_to_mode[d2+'_'+d1] = 'val'
for d1, d2 in test_json:
    video_to_mode[d1] = 'test'
    video_to_mode[d2] = 'test'
    video_to_mode[d1+'_'+d2] = 'test'
    video_to_mode[d2+'_'+d1] = 'test'

# Comment following to get whole FF++dataset to train
# for d1, d2 in val_json:
#     video_to_mode[d1] = 'val'
#     video_to_mode[d2] = 'val'
#     video_to_mode[d1+'_'+d2] = 'val'
#     video_to_mode[d2+'_'+d1] = 'val'
# for d1, d2 in test_json:
#     video_to_mode[d1] = 'test'
#     video_to_mode[d2] = 'test'
#     video_to_mode[d1+'_'+d2] = 'test'
#     video_to_mode[d2+'_'+d1] = 'test'

dataset_dict = {}

# FaceForensics++ real dataset
label = "Real"
dataset_dict["FaceForensics++"] = {}
dataset_dict["FaceForensics++"]["FF-real"] = {}

# Iterate over all compression levels: c23, c40, raw
dataset_dict["FaceForensics++"]["FF-real"]["train"] = {}
dataset_dict["FaceForensics++"]["FF-real"]["test"] = {}
dataset_dict["FaceForensics++"]["FF-real"]["val"] = {}
dataset_dict["FaceForensics++"]["FF-real"]["train"][compression_level] = {}
dataset_dict["FaceForensics++"]["FF-real"]["test"][compression_level] = {}
dataset_dict["FaceForensics++"]["FF-real"]["val"][compression_level] = {}

# Iterate over all videos
for video_path in os.scandir(
        os.path.join(
            dataset_path,
            "original_sequences",
            "youtube",
            compression_level,
            "rawframes",
        )):
    if video_path.is_dir() and 'ipynb' not in video_path.name:
        video_name = video_path.name
        mode = video_to_mode[video_name]
        frame_paths = [
            os.path.join(video_path, frame.name)
            for frame in os.scandir(video_path) \
                if os.path.isfile(os.path.join(video_path, frame.name).replace('rawframes', 'landmarks').replace('png', 'npy')) and\
                os.path.isfile(os.path.join(video_path, frame.name).replace('rawframes', 'retina').replace('png', 'npy'))
        ]

        dataset_dict["FaceForensics++"]["FF-real"][mode][compression_level][
            video_name] = {
                "label": ff_dict[label],
                "frames": frame_paths
            }


# FaceForensics++ fake datasets
for label_dir in os.scandir(os.path.join(dataset_path,
                                         "manipulated_sequences")):
    if "youtube" in label_dir.name or "FaceShifter" in label_dir.name:
        continue
    label = label_dir.name
    print(label)
    dataset_dict["FaceForensics++"][ff_dict[label]] = {}
    dataset_dict["FaceForensics++"][ff_dict[label]]["train"] = {}
    dataset_dict["FaceForensics++"][ff_dict[label]]["test"] = {}
    dataset_dict["FaceForensics++"][ff_dict[label]]["val"] = {}

    dataset_dict["FaceForensics++"][
        ff_dict[label]]["train"][compression_level] = {}
    dataset_dict["FaceForensics++"][
        ff_dict[label]]["test"][compression_level] = {}
    dataset_dict["FaceForensics++"][
        ff_dict[label]]["val"][compression_level] = {}

    # Iterate over all videos
    for video_path in os.scandir(
            os.path.join(
                dataset_path,
                "manipulated_sequences",
                label,
                compression_level,
                "rawframes",
            )):
        if video_path.is_dir() and 'ipynb' not in video_path.name:
            video_name = video_path.name
            mode = video_to_mode[video_name]
            frame_paths = [
                os.path.join(video_path, frame.name)
                for frame in os.scandir(video_path) \
                    if os.path.isfile(os.path.join(video_path, frame.name).replace('rawframes', 'landmarks').replace('png', 'npy')) and\
                    os.path.isfile(os.path.join(video_path, frame.name).replace('rawframes', 'retina').replace('png', 'npy'))
            ]

            dataset_dict["FaceForensics++"][ff_dict[label]][mode][compression_level][
                video_name] = {
                    "label": ff_dict[label],
                    "frames": frame_paths
                }




for label, value in dataset_dict["FaceForensics++"].items():
    if label != "FF-real":
        with open(os.path.join(output_file_path, f"{label}.json"),
                    "w") as f:
            data = {
                label: {
                    "FF-real": dataset_dict["FaceForensics++"]["FF-real"],
                    label: value,
                }
            }
            json.dump(data, f)
            print(f"Finish writing {label}.json")

with open(os.path.join(output_file_path, "FaceForensics++.json"), "w") as f:
    json.dump(dataset_dict, f)


## unseen manipulation train

In [4]:
dataset_root_path = "/mnt/sdc/maisie/"
compression_level = "c23"
output_file_path  = "/mnt/sdb/maisie/SelfBlendedImages/data/FF++_trainwotest"

ff_dict = {
    "Deepfakes": "FF-DF",
    "Face2Face": "FF-F2F",
    "FaceSwap": "FF-FS",
    "Real": "FF-real",
    "NeuralTextures": "FF-NT",
    "FaceShifter": "FF-FH",
}

# Load the JSON files for data split
dataset_path = os.path.join(dataset_root_path, "FaceForensics++")

# Load the JSON files for data split
with open(
        file=os.path.join(
            os.path.join(
                dataset_path,
                "train.json",
            )),
        mode="r",
) as f:
    train_json = json.load(f)
with open(
        file=os.path.join(
            os.path.join(dataset_path,"val.json")),
        mode="r",
) as f:
    val_json = json.load(f)
with open(
        file=os.path.join(
            os.path.join(dataset_path,"test.json")),
        mode="r",
) as f:
    test_json = json.load(f)

# Create a dictionary for searching the data split
video_to_mode = dict()
for d1, d2 in train_json:
    video_to_mode[d1] = "train"
    video_to_mode[d2] = "train"
    video_to_mode[d1 + "_" + d2] = "train"
    video_to_mode[d2 + "_" + d1] = "train"
for d1, d2 in val_json:
    video_to_mode[d1] = 'train'
    video_to_mode[d2] = 'train'
    video_to_mode[d1+'_'+d2] = 'train'
    video_to_mode[d2+'_'+d1] = 'train'
for d1, d2 in test_json:
    video_to_mode[d1] = 'test'
    video_to_mode[d2] = 'test'
    video_to_mode[d1+'_'+d2] = 'test'
    video_to_mode[d2+'_'+d1] = 'test'

# Comment following to get whole FF++dataset to train
# for d1, d2 in val_json:
#     video_to_mode[d1] = 'val'
#     video_to_mode[d2] = 'val'
#     video_to_mode[d1+'_'+d2] = 'val'
#     video_to_mode[d2+'_'+d1] = 'val'
# for d1, d2 in test_json:
#     video_to_mode[d1] = 'test'
#     video_to_mode[d2] = 'test'
#     video_to_mode[d1+'_'+d2] = 'test'
#     video_to_mode[d2+'_'+d1] = 'test'


ignore_list = ["Deepfakes","Face2Face", "FaceSwap", "NeuralTextures"]
for ignore in ignore_list:
    # Iterate over all videos
    dataset_dict = {}

    # FaceForensics++ real dataset
    label = "Real"
    dataset_dict["FaceForensics++"] = {}
    dataset_dict["FaceForensics++"]["FF-real"] = {}

    # Iterate over all compression levels: c23, c40, raw
    dataset_dict["FaceForensics++"]["FF-real"]["train"] = {}
    dataset_dict["FaceForensics++"]["FF-real"]["test"] = {}
    dataset_dict["FaceForensics++"]["FF-real"]["val"] = {}
    dataset_dict["FaceForensics++"]["FF-real"]["train"][compression_level] = {}
    dataset_dict["FaceForensics++"]["FF-real"]["test"][compression_level] = {}
    dataset_dict["FaceForensics++"]["FF-real"]["val"][compression_level] = {}
    for video_path in os.scandir(
            os.path.join(
                dataset_path,
                "original_sequences",
                "youtube",
                compression_level,
                "rawframes",
            )):
        if video_path.is_dir() and 'ipynb' not in video_path.name:
            video_name = video_path.name
            mode = video_to_mode[video_name]
            frame_paths = [
                os.path.join(video_path, frame.name)
                for frame in os.scandir(video_path) \
                    if os.path.isfile(os.path.join(video_path, frame.name).replace('rawframes', 'landmarks').replace('png', 'npy')) and\
                    os.path.isfile(os.path.join(video_path, frame.name).replace('rawframes', 'retina').replace('png', 'npy'))
            ]

            dataset_dict["FaceForensics++"]["FF-real"][mode][compression_level][
                video_name] = {
                    "label": ff_dict[label],
                    "frames": frame_paths
                }


    # FaceForensics++ fake datasets
    for label_dir in os.scandir(os.path.join(dataset_path,
                                            "manipulated_sequences")):
        if "youtube" in label_dir.name or "FaceShifter" in label_dir.name or ignore in label_dir.name:
            continue
        label = label_dir.name
        print(label)
        dataset_dict["FaceForensics++"][ff_dict[label]] = {}
        dataset_dict["FaceForensics++"][ff_dict[label]]["train"] = {}
        dataset_dict["FaceForensics++"][ff_dict[label]]["test"] = {}
        dataset_dict["FaceForensics++"][ff_dict[label]]["val"] = {}

        dataset_dict["FaceForensics++"][
            ff_dict[label]]["train"][compression_level] = {}
        dataset_dict["FaceForensics++"][
            ff_dict[label]]["test"][compression_level] = {}
        dataset_dict["FaceForensics++"][
            ff_dict[label]]["val"][compression_level] = {}

        # Iterate over all videos
        for video_path in os.scandir(
                os.path.join(
                    dataset_path,
                    "manipulated_sequences",
                    label,
                    compression_level,
                    "rawframes",
                )):
            if video_path.is_dir() and 'ipynb' not in video_path.name:
                video_name = video_path.name
                mode = video_to_mode[video_name]
                frame_paths = [
                    os.path.join(video_path, frame.name)
                    for frame in os.scandir(video_path) \
                        if os.path.isfile(os.path.join(video_path, frame.name).replace('rawframes', 'landmarks').replace('png', 'npy')) and\
                        os.path.isfile(os.path.join(video_path, frame.name).replace('rawframes', 'retina').replace('png', 'npy'))
                ]

                dataset_dict["FaceForensics++"][ff_dict[label]][mode][compression_level][
                    video_name] = {
                        "label": ff_dict[label],
                        "frames": frame_paths
                    }




    # for label, value in dataset_dict["FaceForensics++"].items():
    #     if label != "FF-real":
    #         with open(os.path.join(output_file_path, f"{label}.json"),
    #                     "w") as f:
    #             data = {
    #                 label: {
    #                     "FF-real": dataset_dict["FaceForensics++"]["FF-real"],
    #                     label: value,
    #                 }
    #             }
    #             json.dump(data, f)
    #             print(f"Finish writing {label}.json")

    with open(os.path.join(output_file_path, f"FaceForensics++_without_{ignore}.json"), "w") as f:
        json.dump(dataset_dict, f)
    print(f"Finish writing FaceForensics++_without_{ignore}.json")


NeuralTextures
Face2Face
FaceSwap
Finish writing FaceForensics++_without_Deepfakes.json
NeuralTextures
Deepfakes
FaceSwap
Finish writing FaceForensics++_without_Face2Face.json
NeuralTextures
Face2Face
Deepfakes
Finish writing FaceForensics++_without_FaceSwap.json
Face2Face
Deepfakes
FaceSwap
Finish writing FaceForensics++_without_NeuralTextures.json


## unseen manipulation test

In [3]:

dataset_root_path = "/mnt/sdc/maisie/"
compression_level = "c23"
output_file_path  = "/mnt/sdb/maisie/SelfBlendedImages/data"

ff_dict = {
    "Deepfakes": "FF-DF",
    "Face2Face": "FF-F2F",
    "FaceSwap": "FF-FS",
    "Real": "FF-real",
    "NeuralTextures": "FF-NT",
    "FaceShifter": "FF-FH",
}

# Load the JSON files for data split
dataset_path = os.path.join(dataset_root_path, "FaceForensics++")

# Load the JSON files for data split
with open(
        file=os.path.join(
            os.path.join(
                dataset_path,
                "train.json",
            )),
        mode="r",
) as f:
    train_json = json.load(f)
with open(
        file=os.path.join(
            os.path.join(dataset_path,"val.json")),
        mode="r",
) as f:
    val_json = json.load(f)
with open(
        file=os.path.join(
            os.path.join(dataset_path,"test.json")),
        mode="r",
) as f:
    test_json = json.load(f)
video_to_mode = dict()
for d1, d2 in train_json:
    video_to_mode[d1] = "train"
    video_to_mode[d2] = "train"
    video_to_mode[d1 + "_" + d2] = "train"
    video_to_mode[d2 + "_" + d1] = "train"
for d1, d2 in val_json:
    video_to_mode[d1] = 'train'
    video_to_mode[d2] = 'train'
    video_to_mode[d1+'_'+d2] = 'train'
    video_to_mode[d2+'_'+d1] = 'train'
for d1, d2 in test_json:
    video_to_mode[d1] = 'test'
    video_to_mode[d2] = 'test'
    video_to_mode[d1+'_'+d2] = 'test'
    video_to_mode[d2+'_'+d1] = 'test'



labels = ['Deepfakes', 'FaceSwap', 'Face2Face', 'FaceShifter', 'NeuralTextures']
# FaceForensics++ fake datasets
for label in labels:
    
    print(label)
    dataset_dict = {}

    # FaceForensics++ real dataset
    dataset_dict["FaceForensics++"] = {}
    dataset_dict["FaceForensics++"]["FF-real"] = {}

    # Iterate over all compression levels: c23, c40, raw
    dataset_dict["FaceForensics++"]["FF-real"]["train"] = {}
    dataset_dict["FaceForensics++"]["FF-real"]["test"] = {}
    dataset_dict["FaceForensics++"]["FF-real"]["val"] = {}
    dataset_dict["FaceForensics++"]["FF-real"]["train"][compression_level] = {}
    dataset_dict["FaceForensics++"]["FF-real"]["test"][compression_level] = {}
    dataset_dict["FaceForensics++"]["FF-real"]["val"][compression_level] = {}

    # Iterate over all videos
    for video_path in os.scandir(
            os.path.join(
                dataset_path,
                "original_sequences",
                "youtube",
                compression_level,
                "rawframes_test",
            )):
        if video_path.is_dir() and 'ipynb' not in video_path.name:
            video_name = video_path.name
            mode = video_to_mode[video_name]

            frame_paths = [
                os.path.join(video_path, frame.name)
                for frame in os.scandir(video_path) if '_' in frame.name
            ]
            dataset_dict["FaceForensics++"]["FF-real"][mode][compression_level][video_name] = {
                    "label": 'FF-real',
                    "frames": frame_paths
                }


    dataset_dict["FaceForensics++"][ff_dict[label]] = {}
    dataset_dict["FaceForensics++"][ff_dict[label]]["train"] = {}
    dataset_dict["FaceForensics++"][ff_dict[label]]["test"] = {}
    dataset_dict["FaceForensics++"][ff_dict[label]]["val"] = {}

    dataset_dict["FaceForensics++"][
        ff_dict[label]]["train"][compression_level] = {}
    dataset_dict["FaceForensics++"][
        ff_dict[label]]["test"][compression_level] = {}
    dataset_dict["FaceForensics++"][
        ff_dict[label]]["val"][compression_level] = {}

    # Iterate over all videos
    for video_path in os.scandir(
            os.path.join(
                dataset_path,
                "manipulated_sequences",
                label,
                compression_level,
                "rawframes_test",
            )):
        
        if video_path.is_dir() and 'ipynb' not in video_path.name:
            video_name = video_path.name
            mode = video_to_mode[video_name]
            frame_paths = [
                os.path.join(video_path, frame.name)
                for frame in os.scandir(video_path) if '_' in frame.name
            ]

            dataset_dict["FaceForensics++"][ff_dict[label]][mode][compression_level][
                video_name] = {
                    "label": ff_dict[label],
                    "frames": frame_paths 
                }




    for label, value in dataset_dict["FaceForensics++"].items():
        if label != "FF-real":
            with open(os.path.join(output_file_path, f"{label}.json"),
                        "w") as f:
                data = {
                    label: {
                        "FF-real": dataset_dict["FaceForensics++"]["FF-real"],
                        label: value,
                    }
                }
                json.dump(data, f)
                print(f"Finish writing {label}.json")

# with open(os.path.join(output_file_path, "FF-FH.json"), "w") as f:
#     json.dump(dataset_dict, f)


Deepfakes
Finish writing FF-DF.json
FaceSwap
Finish writing FF-FS.json
Face2Face
Finish writing FF-F2F.json
FaceShifter
Finish writing FF-FH.json
NeuralTextures
Finish writing FF-NT.json


## Robustness val

In [4]:
dataset_root_path = "/mnt/sdc/maisie/"
compression_level = "c23"
output_file_path  = "/mnt/sdb/maisie/SelfBlendedImages/data/FF++_trainwotest"

ff_dict = {
    "Deepfakes": "FF-DF",
    "Face2Face": "FF-F2F",
    "FaceSwap": "FF-FS",
    "Real": "FF-real",
    "NeuralTextures": "FF-NT",
    "FaceShifter": "FF-FH",
}

# Load the JSON files for data split
dataset_path = os.path.join(dataset_root_path, "FaceForensics++")

# Load the JSON files for data split
with open(
        file=os.path.join(
            os.path.join(
                dataset_path,
                "train.json",
            )),
        mode="r",
) as f:
    train_json = json.load(f)
with open(
        file=os.path.join(
            os.path.join(dataset_path,"val.json")),
        mode="r",
) as f:
    val_json = json.load(f)
with open(
        file=os.path.join(
            os.path.join(dataset_path,"test.json")),
        mode="r",
) as f:
    test_json = json.load(f)

# Create a dictionary for searching the data split
video_to_mode = dict()
for d1, d2 in train_json:
    video_to_mode[d1] = "train"
    video_to_mode[d2] = "train"
    video_to_mode[d1 + "_" + d2] = "train"
    video_to_mode[d2 + "_" + d1] = "train"
for d1, d2 in val_json:
    video_to_mode[d1] = 'val'
    video_to_mode[d2] = 'val'
    video_to_mode[d1+'_'+d2] = 'val'
    video_to_mode[d2+'_'+d1] = 'val'
for d1, d2 in test_json:
    video_to_mode[d1] = 'test'
    video_to_mode[d2] = 'test'
    video_to_mode[d1+'_'+d2] = 'test'
    video_to_mode[d2+'_'+d1] = 'test'

# Comment following to get whole FF++dataset to train
# for d1, d2 in val_json:
#     video_to_mode[d1] = 'val'
#     video_to_mode[d2] = 'val'
#     video_to_mode[d1+'_'+d2] = 'val'
#     video_to_mode[d2+'_'+d1] = 'val'
# for d1, d2 in test_json:
#     video_to_mode[d1] = 'test'
#     video_to_mode[d2] = 'test'
#     video_to_mode[d1+'_'+d2] = 'test'
#     video_to_mode[d2+'_'+d1] = 'test'

dataset_dict = {}

# FaceForensics++ real dataset
label = "Real"
dataset_dict["FaceForensics++"] = {}
dataset_dict["FaceForensics++"]["FF-real"] = {}

# Iterate over all compression levels: c23, c40, raw
dataset_dict["FaceForensics++"]["FF-real"]["train"] = {}
dataset_dict["FaceForensics++"]["FF-real"]["test"] = {}
dataset_dict["FaceForensics++"]["FF-real"]["val"] = {}
dataset_dict["FaceForensics++"]["FF-real"]["train"][compression_level] = {}
dataset_dict["FaceForensics++"]["FF-real"]["test"][compression_level] = {}
dataset_dict["FaceForensics++"]["FF-real"]["val"][compression_level] = {}

# Iterate over all videos
for video_path in os.scandir(
        os.path.join(
            dataset_path,
            "original_sequences",
            "youtube",
            compression_level,
            "rawframes_test",
        )):
    if video_path.is_dir() and 'ipynb' not in video_path.name:
        video_name = video_path.name
        mode = video_to_mode[video_name]
        frame_paths = [
            os.path.join(video_path, frame.name)
            for frame in os.scandir(video_path)
        ]

        dataset_dict["FaceForensics++"]["FF-real"][mode][compression_level][
            video_name] = {
                "label": ff_dict[label],
                "frames": frame_paths
            }


# FaceForensics++ fake datasets
for label_dir in os.scandir(os.path.join(dataset_path,
                                         "manipulated_sequences")):
    if "youtube" in label_dir.name or "FaceShifter" in label_dir.name:
        continue
    label = label_dir.name
    print(label)
    dataset_dict["FaceForensics++"][ff_dict[label]] = {}
    dataset_dict["FaceForensics++"][ff_dict[label]]["train"] = {}
    dataset_dict["FaceForensics++"][ff_dict[label]]["test"] = {}
    dataset_dict["FaceForensics++"][ff_dict[label]]["val"] = {}

    dataset_dict["FaceForensics++"][
        ff_dict[label]]["train"][compression_level] = {}
    dataset_dict["FaceForensics++"][
        ff_dict[label]]["test"][compression_level] = {}
    dataset_dict["FaceForensics++"][
        ff_dict[label]]["val"][compression_level] = {}

    # Iterate over all videos
    for video_path in os.scandir(
            os.path.join(
                dataset_path,
                "manipulated_sequences",
                label,
                compression_level,
                "rawframes_test",
            )):
        if video_path.is_dir() and 'ipynb' not in video_path.name:
            video_name = video_path.name
            mode = video_to_mode[video_name]
            frame_paths = [
                os.path.join(video_path, frame.name)
                for frame in os.scandir(video_path)
            ]

            dataset_dict["FaceForensics++"][ff_dict[label]][mode][compression_level][
                video_name] = {
                    "label": ff_dict[label],
                    "frames": frame_paths
                }




for label, value in dataset_dict["FaceForensics++"].items():
    if label != "FF-real":
        with open(os.path.join(output_file_path, f"{label}.json"),
                    "w") as f:
            data = {
                label: {
                    "FF-real": dataset_dict["FaceForensics++"]["FF-real"],
                    label: value,
                }
            }
            json.dump(data, f)
            print(f"Finish writing {label}.json")

with open(os.path.join(output_file_path, "FaceForensics++.json"), "w") as f:
    json.dump(dataset_dict, f)


NeuralTextures
Face2Face
Deepfakes
FaceSwap
Finish writing FF-NT.json
Finish writing FF-F2F.json
Finish writing FF-DF.json
Finish writing FF-FS.json


## Robustness test

In [8]:
dataset_root_path = "/mnt/sdc/maisie/"
compression_level = "c23"
output_file_path  = "/mnt/sdb/maisie/SelfBlendedImages/data/FF++_robustness"

ff_dict = {
    "Deepfakes": "FF-DF",
    "Face2Face": "FF-F2F",
    "FaceSwap": "FF-FS",
    "Real": "FF-real",
    "NeuralTextures": "FF-NT",
    "FaceShifter": "FF-FH",
}

# Load the JSON files for data split
dataset_path = os.path.join(dataset_root_path, "FaceForensics++")

# Load the JSON files for data split
with open(
        file=os.path.join(
            os.path.join(
                dataset_path,
                "train.json",
            )),
        mode="r",
) as f:
    train_json = json.load(f)
with open(
        file=os.path.join(
            os.path.join(dataset_path,"val.json")),
        mode="r",
) as f:
    val_json = json.load(f)
with open(
        file=os.path.join(
            os.path.join(dataset_path,"test.json")),
        mode="r",
) as f:
    test_json = json.load(f)

# Create a dictionary for searching the data split
video_to_mode = dict()
for d1, d2 in train_json:
    video_to_mode[d1] = "train"
    video_to_mode[d2] = "train"
    video_to_mode[d1 + "_" + d2] = "train"
    video_to_mode[d2 + "_" + d1] = "train"
for d1, d2 in val_json:
    video_to_mode[d1] = 'val'
    video_to_mode[d2] = 'val'
    video_to_mode[d1+'_'+d2] = 'val'
    video_to_mode[d2+'_'+d1] = 'val'
for d1, d2 in test_json:
    video_to_mode[d1] = 'test'
    video_to_mode[d2] = 'test'
    video_to_mode[d1+'_'+d2] = 'test'
    video_to_mode[d2+'_'+d1] = 'test'

# Iterate over all videos
for type in ['CS', 'CC', 'BW', 'GNC', 'GB', 'JPEG']:
    for level in range(1 , 6):

        dataset_dict = {}

        # FaceForensics++ real dataset
        label = "Real"
        dataset_dict["FaceForensics++"] = {}
        dataset_dict["FaceForensics++"]["FF-real"] = {}

        # Iterate over all compression levels: c23, c40, raw
        dataset_dict["FaceForensics++"]["FF-real"]["train"] = {}
        dataset_dict["FaceForensics++"]["FF-real"]["test"] = {}
        dataset_dict["FaceForensics++"]["FF-real"]["val"] = {}
        dataset_dict["FaceForensics++"]["FF-real"]["train"][compression_level] = {}
        dataset_dict["FaceForensics++"]["FF-real"]["test"][compression_level] = {}
        dataset_dict["FaceForensics++"]["FF-real"]["val"][compression_level] = {}


        for video_path in os.scandir(
                os.path.join(
                    dataset_path,
                    "original_sequences",
                    "youtube",
                    f"{type}_{level}",
                    "rawframes_test",
                )):
            if video_path.is_dir() and 'ipynb' not in video_path.name:
                video_name = video_path.name
                mode = video_to_mode[video_name]
                frame_paths = [
                    os.path.join(video_path, frame.name)
                    for frame in os.scandir(video_path)
                ]

                dataset_dict["FaceForensics++"]["FF-real"][mode][compression_level][
                    video_name] = {
                        "label": ff_dict[label],
                        "frames": frame_paths
                    }


        # FaceForensics++ fake datasets
        for label_dir in os.scandir(os.path.join(dataset_path,
                                                "manipulated_sequences")):
            if "youtube" in label_dir.name or "FaceShifter" in label_dir.name:
                continue
            label = label_dir.name
            print(label)
            dataset_dict["FaceForensics++"][ff_dict[label]] = {}
            dataset_dict["FaceForensics++"][ff_dict[label]]["train"] = {}
            dataset_dict["FaceForensics++"][ff_dict[label]]["test"] = {}
            dataset_dict["FaceForensics++"][ff_dict[label]]["val"] = {}

            dataset_dict["FaceForensics++"][
                ff_dict[label]]["train"][compression_level] = {}
            dataset_dict["FaceForensics++"][
                ff_dict[label]]["test"][compression_level] = {}
            dataset_dict["FaceForensics++"][
                ff_dict[label]]["val"][compression_level] = {}

            # Iterate over all videos
            for video_path in os.scandir(
                    os.path.join(
                        dataset_path,
                        "manipulated_sequences",
                        label,
                        f"{type}_{level}",
                        "rawframes_test",
                    )):
                if video_path.is_dir() and 'ipynb' not in video_path.name:
                    video_name = video_path.name
                    mode = video_to_mode[video_name]
                    frame_paths = [
                        os.path.join(video_path, frame.name)
                        for frame in os.scandir(video_path)
                    ]

                    dataset_dict["FaceForensics++"][ff_dict[label]][mode][compression_level][
                        video_name] = {
                            "label": ff_dict[label],
                            "frames": frame_paths
                        }




        # for label, value in dataset_dict["FaceForensics++"].items():
        #     if label != "FF-real":
        #         with open(os.path.join(output_file_path, f"{label}.json"),
        #                     "w") as f:
        #             data = {
        #                 label: {
        #                     "FF-real": dataset_dict["FaceForensics++"]["FF-real"],
        #                     label: value,
        #                 }
        #             }
        #             json.dump(data, f)
        #             print(f"Finish writing {label}.json")
        os.makedirs(output_file_path , exist_ok=True)
        with open(os.path.join(output_file_path, f"FaceForensics++_{type}_{level}.json"), "w") as f:
            json.dump(dataset_dict, f)
            print(f"Finish writing FaceForensics++_{type}_{level}.json")


NeuralTextures
Face2Face
Deepfakes
FaceSwap
Finish writing FaceForensics++_CS_1.json
NeuralTextures
Face2Face
Deepfakes
FaceSwap
Finish writing FaceForensics++_CS_2.json
NeuralTextures
Face2Face
Deepfakes
FaceSwap
Finish writing FaceForensics++_CS_3.json
NeuralTextures
Face2Face
Deepfakes
FaceSwap
Finish writing FaceForensics++_CS_4.json
NeuralTextures
Face2Face
Deepfakes
FaceSwap
Finish writing FaceForensics++_CS_5.json
NeuralTextures
Face2Face
Deepfakes
FaceSwap
Finish writing FaceForensics++_CC_1.json
NeuralTextures
Face2Face
Deepfakes
FaceSwap
Finish writing FaceForensics++_CC_2.json
NeuralTextures
Face2Face
Deepfakes
FaceSwap
Finish writing FaceForensics++_CC_3.json
NeuralTextures
Face2Face
Deepfakes
FaceSwap
Finish writing FaceForensics++_CC_4.json
NeuralTextures
Face2Face
Deepfakes
FaceSwap
Finish writing FaceForensics++_CC_5.json
NeuralTextures
Face2Face
Deepfakes
FaceSwap
Finish writing FaceForensics++_BW_1.json
NeuralTextures
Face2Face
Deepfakes
FaceSwap
Finish writing FaceFo

## DFDC

In [13]:
dataset_root_path = '/mnt/sdb/maisie/Dataset_Deepfake/'
dataset_path = os.path.join(dataset_root_path, "DFDC/test")
output_file_path = "/mnt/sdb/maisie/SelfBlendedImages/data"

dataset_dict = {}
dataset_dict['DFDC'] = {
    "DFDC_Real": {
        "train": {},
        "test": {},
        "val": {}
    },
    "DFDC_Fake": {
        "train": {},
        "test": {},
        "val": {}
    },
}

df = pd.read_csv(
    os.path.join(dataset_path,"labels.csv"))
labels = ["DFDC_Real", "DFDC_Fake"]

for index, row in df.iterrows():
    vidname = row["filename"].split(".mp4")[0]
    label = labels[row["label"]]
    assert label in [
        "DFDC_Real",
        "DFDC_Fake",
    ], "Invalid label: {}".format(label)
    frame_paths = glob.glob(
        os.path.join(dataset_path,"rawframes",vidname, "*png"))
    # print(os.path.join(dataset_path,"rawframes",vidname, "*png"))
    # frame_paths = [frames_path for frames_path in frame_paths if os.path.isfile(frames_path.replace('rawframes', 'retina').replace('png', 'npy'))]

    dataset_dict["DFDC"][label]["test"][vidname] = {
        "label": label,
        "frames": frame_paths,
        }

with open(os.path.join(output_file_path, "DFDC.json"), "w") as f:
    json.dump(dataset_dict, f)

## CelebDF

In [7]:

dataset_root_path = '/mnt/sdc/maisie/'
dataset_path = os.path.join(dataset_root_path, "Celeb-DF-v2")
output_file_path = "/mnt/sdb/maisie/SelfBlendedImages/data"

dataset_dict = {}
dataset_dict["Celeb-DF-v2"] = {}

for folder in os.scandir(dataset_path):
    if not os.path.isdir(folder):
        continue
    if folder.name in ["Celeb-real", "YouTube-real"]:
        label = "CelebDFv2_real"
    else:
        label = "CelebDFv2_fake"
    assert label in [
        "CelebDFv2_real",
        "CelebDFv2_fake",
    ], "Invalid label: {}".format(label)
    dataset_dict["Celeb-DF-v2"][label] = {}
    dataset_dict["Celeb-DF-v2"][label]["train"] = {}
    dataset_dict["Celeb-DF-v2"][label]["val"] = {}
    dataset_dict["Celeb-DF-v2"][label]["test"] = {}


with open(os.path.join(dataset_root_path, "Celeb-DF-v2","List_of_testing_videos.txt"), "r") as f:
    lines = f.readlines()
for line in lines:
    if "real" in line:
        label = "CelebDFv2_real"
    elif "synthesis" in line:
        label = "CelebDFv2_fake"
    else:
        raise ValueError(f"wrong in processing vidname Celeb-DF-v2: {line}")

    vidname = line.split("\n")[0].split("/")[-1].split(".mp4")[0]
    frame_paths = glob.glob(
        os.path.join(
            dataset_root_path,
            "Celeb-DF-v2",
            line.split(" ")[1].split("/")[0],
            "rawframes",
            vidname,
            "*png",
        ))

    # frame_paths = [
    #     frame_paths[i] for i in range(len(frame_paths))
    #     if os.path.isfile(frame_paths[i].replace(
    #         "/rawframes/", "/retina/").replace(".png", ".npy"))
    # ]

    dataset_dict["Celeb-DF-v2"][label]["test"][vidname] = {
        "label": label,
        "frames": frame_paths,
    }


with open(os.path.join(output_file_path, "Celeb-DF-v2.json"), "w") as f:
    json.dump(dataset_dict, f)

## Faceshifter

In [13]:

dataset_root_path = "/mnt/sdc/maisie/"
compression_level = "c23"
output_file_path  = "/mnt/sdb/maisie/SelfBlendedImages/data"

ff_dict = {
    "Real": "FF-real",
    "FaceShifter": "FF-FH",
}

# Load the JSON files for data split
dataset_path = os.path.join(dataset_root_path, "FaceForensics++")

# Load the JSON files for data split
with open(
        file=os.path.join(
            os.path.join(
                dataset_path,
                "train.json",
            )),
        mode="r",
) as f:
    train_json = json.load(f)
with open(
        file=os.path.join(
            os.path.join(dataset_path,"val.json")),
        mode="r",
) as f:
    val_json = json.load(f)
with open(
        file=os.path.join(
            os.path.join(dataset_path,"test.json")),
        mode="r",
) as f:
    test_json = json.load(f)
video_to_mode = dict()
for d1, d2 in train_json:
    video_to_mode[d1] = "train"
    video_to_mode[d2] = "train"
    video_to_mode[d1 + "_" + d2] = "train"
    video_to_mode[d2 + "_" + d1] = "train"
for d1, d2 in val_json:
    video_to_mode[d1] = 'train'
    video_to_mode[d2] = 'train'
    video_to_mode[d1+'_'+d2] = 'train'
    video_to_mode[d2+'_'+d1] = 'train'
for d1, d2 in test_json:
    video_to_mode[d1] = 'test'
    video_to_mode[d2] = 'test'
    video_to_mode[d1+'_'+d2] = 'test'
    video_to_mode[d2+'_'+d1] = 'test'

dataset_dict = {}

# FaceForensics++ real dataset
label = "Real"
dataset_dict["FaceForensics++"] = {}
dataset_dict["FaceForensics++"]["FF-real"] = {}

# Iterate over all compression levels: c23, c40, raw
dataset_dict["FaceForensics++"]["FF-real"]["train"] = {}
dataset_dict["FaceForensics++"]["FF-real"]["test"] = {}
dataset_dict["FaceForensics++"]["FF-real"]["val"] = {}
dataset_dict["FaceForensics++"]["FF-real"]["train"][compression_level] = {}
dataset_dict["FaceForensics++"]["FF-real"]["test"][compression_level] = {}
dataset_dict["FaceForensics++"]["FF-real"]["val"][compression_level] = {}

# Iterate over all videos
for video_path in os.scandir(
        os.path.join(
            dataset_path,
            "original_sequences",
            "youtube",
            compression_level,
            "rawframes",
        )):
    if video_path.is_dir() and 'ipynb' not in video_path.name:
        video_name = video_path.name
        mode = video_to_mode[video_name]
        print(video_path)
        frame_paths = [
            os.path.join(video_path, frame.name)
            for frame in os.scandir(video_path) if '_' in frame.name
        ]
        dataset_dict["FaceForensics++"]["FF-real"][mode][compression_level][video_name] = {
                "label": ff_dict[label],
                "frames": frame_paths
            }


# FaceForensics++ fake datasets
label = "FaceShifter"
print(label)
dataset_dict["FaceForensics++"][ff_dict[label]] = {}
dataset_dict["FaceForensics++"][ff_dict[label]]["train"] = {}
dataset_dict["FaceForensics++"][ff_dict[label]]["test"] = {}
dataset_dict["FaceForensics++"][ff_dict[label]]["val"] = {}

dataset_dict["FaceForensics++"][
    ff_dict[label]]["train"][compression_level] = {}
dataset_dict["FaceForensics++"][
    ff_dict[label]]["test"][compression_level] = {}
dataset_dict["FaceForensics++"][
    ff_dict[label]]["val"][compression_level] = {}

# Iterate over all videos
for video_path in os.scandir(
        os.path.join(
            dataset_path,
            "manipulated_sequences",
            label,
            compression_level,
            "rawframes",
        )):
    
    if video_path.is_dir() and 'ipynb' not in video_path.name:
        video_name = video_path.name
        mode = video_to_mode[video_name]
        frame_paths = [
            os.path.join(video_path, frame.name)
            for frame in os.scandir(video_path) if '_' in frame.name
        ]

        dataset_dict["FaceForensics++"][ff_dict[label]][mode][compression_level][
            video_name] = {
                "label": ff_dict[label],
                "frames": frame_paths 
            }




for label, value in dataset_dict["FaceForensics++"].items():
    if label != "FF-real":
        with open(os.path.join(output_file_path, f"{label}.json"),
                    "w") as f:
            data = {
                label: {
                    "FF-real": dataset_dict["FaceForensics++"]["FF-real"],
                    label: value,
                }
            }
            json.dump(data, f)
            print(f"Finish writing {label}.json")

# with open(os.path.join(output_file_path, "FF-FH.json"), "w") as f:
#     json.dump(dataset_dict, f)


<DirEntry '186'>
<DirEntry '615'>
<DirEntry '835'>
<DirEntry '877'>
<DirEntry '320'>
<DirEntry '177'>
<DirEntry '565'>
<DirEntry '371'>
<DirEntry '237'>
<DirEntry '187'>
<DirEntry '953'>
<DirEntry '951'>
<DirEntry '242'>
<DirEntry '227'>
<DirEntry '697'>
<DirEntry '873'>
<DirEntry '909'>
<DirEntry '131'>
<DirEntry '588'>
<DirEntry '272'>
<DirEntry '681'>
<DirEntry '997'>
<DirEntry '502'>
<DirEntry '344'>
<DirEntry '184'>
<DirEntry '767'>
<DirEntry '826'>
<DirEntry '560'>
<DirEntry '964'>
<DirEntry '917'>
<DirEntry '661'>
<DirEntry '488'>
<DirEntry '600'>
<DirEntry '908'>
<DirEntry '024'>
<DirEntry '967'>
<DirEntry '165'>
<DirEntry '927'>
<DirEntry '480'>
<DirEntry '378'>
<DirEntry '241'>
<DirEntry '162'>
<DirEntry '037'>
<DirEntry '106'>
<DirEntry '431'>
<DirEntry '257'>
<DirEntry '297'>
<DirEntry '285'>
<DirEntry '611'>
<DirEntry '812'>
<DirEntry '238'>
<DirEntry '987'>
<DirEntry '146'>
<DirEntry '687'>
<DirEntry '122'>
<DirEntry '149'>
<DirEntry '789'>
<DirEntry '228'>
<DirEntry '056