# Simple Features

In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import tensorflow as tf
from tensorflow.keras import Model
#from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, GlobalAveragePooling2D
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import BatchNormalization
import os
import glob
import cv2
from io import BytesIO
from PIL import Image
from numpy import expand_dims
from tensorflow import keras
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.image import array_to_img
from tensorflow.keras.callbacks import *
import warnings
import logging
from IPython.display import clear_output
from collections import Counter
import pickle
import json
import sys
sys.path.insert(1, '/home/jupyter/DeepFake-2019-20/augmentations')
sys.path.insert(1, '/home/jupyter/DeepFake-2019-20/hyperparameters')
sys.path.insert(1, '/home/jupyter/DeepFake-2019-20/visualisations')
import VisualisationTools as plotting
import hyper_utils as hp
import math

import radialProfile
from scipy.interpolate import griddata
import time

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Level 100 is above logging.CRITICAL (50), so the root logger emits nothing.
logger = logging.getLogger()
logger.setLevel(100)
# Object from the project-local VisualisationTools module; it is placed in the
# callback list inside train_model (presumably a live learning-curve plot — confirm).
plot_losses = plotting.PlotLearning()
# Relative paths used later in the notebook are resolved from this directory.
os.chdir('/home/jupyter/DeepFake-2019-20')

print("Tensorflow version:", tf.__version__)

Tensorflow version: 2.1.0


In [2]:
def build_model(dropout, lr_rate, vector_length = 300):
    """Build and compile a small fully-connected classifier.

    The network is three ReLU dense layers (1024, 256 and 128 units), one
    dropout layer and a 2-unit softmax output, compiled with an Adam optimiser.

    Args:
        dropout: dropout rate applied just before the output layer.
        lr_rate: learning rate passed to the Adam optimiser.
        vector_length: length of the 1D input feature vector.

    Returns:
        The compiled tf.keras Model, named 'simple_model'.
    """

    # A one-element tuple: `(vector_length)` without the trailing comma is
    # just an int, not a shape tuple.
    input_shape = (vector_length,)
    inputs = tf.keras.Input(shape=input_shape)
    t_dense1 = Dense(1024, activation='relu')(inputs)
    t_dense2 = Dense(256, activation='relu')(t_dense1)
    t_dense3 = Dense(128, activation='relu')(t_dense2)
    t_do = Dropout(dropout)(t_dense3)
    predictions = Dense(2, activation= 'softmax')(t_do)
    model = Model(inputs=inputs, outputs=predictions, name = 'simple_model')

    opt = tf.keras.optimizers.Adam(learning_rate= lr_rate, decay=1e-6)

    # NOTE(review): 'categorical_crossentropy' is the conventional loss for a
    # softmax output; kept as-is so saved models stay comparable — confirm.
    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    # summary() prints the table itself and returns None, so wrapping it in
    # print() would add a stray "None" line to the output.
    model.summary()

    return model

In [3]:
def augment_data(directory, batch):
    '''Creates the training and validation image generators.

    Both generators share one ImageDataGenerator that only centres and
    std-normalises each sample; no other transformation is configured.
    Images are read from `directory + '/train'` and
    `directory + '/validation'` at a target size of 224x224.

    Inputs:
    directory: folder containing the 'train' and 'validation' sub-folders
    batch: batch size used by both generators

    Returns train_data, val_data'''

    normaliser = tf.keras.preprocessing.image.ImageDataGenerator(
        samplewise_center=True,
        samplewise_std_normalization=True,
    )

    # Build the two flows in a fixed order: training first, validation second
    flows = [
        normaliser.flow_from_directory(directory + split,
                                       target_size=(224,224),
                                       batch_size = batch)
        for split in ('/train', '/validation')
    ]

    return flows[0], flows[1]

In [4]:
def rgb2gray(rgb):
    """Convert an image to gray-scale.

    The first three entries of the last axis are combined with the weights
    0.2989, 0.5870 and 0.1140; any further channel is ignored.
    """
    channel_weights = [0.2989, 0.5870, 0.1140]
    colour_channels = rgb[..., :3]
    return np.dot(colour_channels, channel_weights)

In [5]:
def featurevectorise(img, N = 300):
    """Turn an image batch into a 1D spectral feature vector.

    Steps: gray-scale the input, keep the central third of the first image
    in the batch, take the shifted 2D Fourier transform, compute
    20*log(|F|), azimuthally average it into a 1D profile, resample that
    profile to N points with cubic interpolation, and divide by its first
    value.

    img is indexed as (batch, height, width, channels) and only element 0 of
    the batch is used. N is the length of the returned feature vector.
    """
    eps = 1e-8
    gray = rgb2gray(img)

    # Margins of one third on every side leave the central crop
    margin_h = int(gray.shape[1]/3)
    margin_w = int(gray.shape[2]/3)
    centre = gray[0, margin_h:-margin_h, margin_w:-margin_w]

    # eps is added to the shifted spectrum before taking the magnitude
    spectrum = np.fft.fftshift(np.fft.fft2(centre)) + eps
    log_magnitude = 20*np.log(np.abs(spectrum))

    # Azimuthally averaged 1D profile of the log-magnitude spectrum
    profile = radialProfile.azimuthalAverage(log_magnitude)

    known_positions = np.linspace(0, N, num=profile.size)  # where the profile is sampled
    wanted_positions = np.linspace(0, N, num=N)            # where the feature is evaluated
    feature = griddata(known_positions, profile, wanted_positions, method='cubic')

    return feature / feature[0]

In [6]:
def preprocess_data(data, number_of_images = 1000, N = 300):
    """Turn the first `number_of_images` batches of `data` into feature vectors.

    Args:
        data: iterable yielding (image_batch, label) pairs; element 0 of each
            pair is passed to featurevectorise and element 1 is stored as the
            label row (two values per row).
        number_of_images: number of rows to fill. With 0, nothing is read
            from `data` and empty arrays are returned.
        N: length of each feature vector.

    Returns:
        (X, Y): arrays of shape (number_of_images, N) and
        (number_of_images, 2). If `data` runs out early, the remaining rows
        stay zero.
    """
    X = np.zeros([number_of_images, N])
    Y = np.zeros([number_of_images, 2])

    start_time = time.time()
    # range() comes first so zip stops as soon as enough rows are filled,
    # without pulling (and discarding) an extra batch from `data`.
    for row, image in zip(range(number_of_images), data):
        X[row,:] = featurevectorise(image[0], N)
        Y[row,:] = image[1]
    print("--- %s seconds ---" % (time.time() - start_time))

    return X, Y

In [7]:
def calculate_class_weights(train_data):
    '''Derives per-class weights that compensate for class imbalance.

    Every class gets the weight (size of the largest class) / (size of this
    class), so the most frequent class has weight 1.0 and balanced data
    yields equal weights.

    Input:
    train_data: object exposing `class_indices` (printed for reference) and
    `classes` (one label per sample), e.g. the generator returned by
    flow_from_directory

    Returns a dictionary mapping each class label to its weight'''

    # Show the label mapping so the caller can check which index is which class
    print('Ensure class weights function corresponds to these class indices:',
          train_data.class_indices)

    samples_per_class = Counter(train_data.classes)
    largest_class = float(max(samples_per_class.values()))

    class_weights = {}
    for label, count in samples_per_class.items():
        class_weights[label] = largest_class / count

    return class_weights

In [None]:
def train_model(model, X_train, Y_train, X_test, Y_test, class_weights, epochs = 50, batch_size = 64,
                config_name = None):
    '''Trains a provided model on pre-computed feature vectors and saves it.

    Arguments:
    1. model: a compiled model, e.g. from build_model
    2. X_train, Y_train: training feature vectors and their labels
    3. X_test, Y_test: validation feature vectors and their labels
    4. class_weights: dictionary of class weights passed to model.fit
    5. epochs: number of epochs
    6. batch_size: batch size used by model.fit
    7. config_name: name used for the output files; when None, the
    notebook-level variable `config_file` is used instead

    Files written under '../all_faces_bucket/trained_models':
    - weights/<name>/highest_val_acc.hdf5: checkpoint with the best val_accuracy
    - weights/<name>/last_epoch.hdf5: checkpoint written after every epoch
    - weights/<name>/lastepoch.hdf5: weights after training finishes
    - training_accuracies/<name>.csv: per-epoch log (appended to)
    - saved_models/<name>.h5: the full model after training

    Returns the History object produced by model.fit.
    '''

    # Fall back to the notebook-level name so existing calls keep working
    name = config_file if config_name is None else config_name

    models_dir = '../all_faces_bucket/trained_models'
    weights_dir = '{}/weights/{}'.format(models_dir, name)
    accuracies_dir = models_dir + '/training_accuracies'
    saved_models_dir = models_dir + '/saved_models'

    # Create every output folder up front; exist_ok makes re-runs harmless
    for folder in (weights_dir, accuracies_dir, saved_models_dir):
        os.makedirs(folder, exist_ok=True)

    # Keep the checkpoint with the highest validation accuracy...
    checkpoint = ModelCheckpoint(weights_dir + '/highest_val_acc.hdf5', monitor='val_accuracy',
                                verbose=1, save_best_only=True, mode='max')
    # ...and also overwrite a checkpoint after every epoch
    last_epoch_checkpoint = ModelCheckpoint(weights_dir + '/last_epoch.hdf5', monitor='val_accuracy',
                                verbose=1, save_best_only=False, mode='max')

    # Callback to save training accuracies after each epoch
    csv_logger = CSVLogger('{}/{}.csv'.format(accuracies_dir, name),
                           separator=',', append=True)

    # plot_losses is the notebook-level plotting callback
    callbacks = [plot_losses, checkpoint, last_epoch_checkpoint, csv_logger]

    # The callbacks must be handed to fit(); previously the list was built
    # but never used, so no checkpoint or CSV log was ever written.
    # max_queue_size / workers / use_multiprocessing were dropped because they
    # only apply to generator or Sequence input, not to in-memory arrays.
    history = model.fit(X_train, Y_train,
              batch_size=batch_size,
              epochs=epochs,
              class_weight = class_weights,
              verbose=1,
              validation_data=(X_test, Y_test),
              callbacks=callbacks)

    model.save_weights(weights_dir + '/lastepoch.hdf5')
    model.save('{}/{}.h5'.format(saved_models_dir, name))

    return history

### Build model, pre-process data

In [None]:
# Arguments are: dropout rate 0.5, learning rate 0.002, feature-vector length 300.
model = build_model(0.5, 0.002, 300)
# Batch size 1: preprocess_data stores one feature vector per yielded batch,
# and featurevectorise only looks at the first image of a batch.
BATCH = 1
train_data, val_data = augment_data('../ff-alldata/home/jupyter/forensics_split', BATCH)

In [None]:
# Extract 300-long feature vectors for 113928 batches from the training generator.
# NOTE(review): 113928 is presumably the number of training images; reading it
# from the generator would avoid the hard-coded count — confirm.
X_train, Y_train = preprocess_data(train_data, 113928, 300)

In [None]:
# Extract 300-long feature vectors for 21291 batches from the validation generator.
# NOTE(review): the test-set cell further down reports 23054 images in this
# same validation directory — confirm that 21291 is still the intended count.
X_test, Y_test = preprocess_data(val_data, 21291, 300)

### Train the model

In [None]:
# Training settings passed to train_model in the next cell.
epochs = 50
batch_size = 32
class_weights = calculate_class_weights(train_data)
# Read by train_model as a global: names the weights folder, CSV log and saved model.
config_file = 'simplefeatures300alldata'

In [None]:
# Train on the pre-computed feature vectors; output files are named after `config_file`.
train_model(model, X_train, Y_train, X_test, Y_test, class_weights, epochs, batch_size)

## Test set predictions
The cells below are taken from the testing notebook. If you are already familiar with that code, skip to the final cell.

In [8]:
# Reload the full model from the path train_model saves to for this configuration.
# This rebinds `model`, replacing the in-memory model built earlier.
af_dir = '../all_faces_bucket/'
configname = 'simplefeatures300alldata'
model = tf.keras.models.load_model(af_dir + 'trained_models/saved_models/' + configname + '.h5')

In [9]:
# Same samplewise normalisation as augment_data, but with shuffle = False —
# presumably so the yielded order matches `generator.filenames` (confirm).
datagen = tf.keras.preprocessing.image.ImageDataGenerator(samplewise_std_normalization=True, samplewise_center=True)
generator = datagen.flow_from_directory('../ff-alldata/home/jupyter/forensics_split/validation', target_size=(224, 224),
                                            shuffle = False, batch_size=1)
filenames = generator.filenames
nb_samples = len(filenames)
generator.reset() # NOTE(review): presumably rewinds the generator to its first batch before it is consumed — confirm

Found 23054 images belonging to 2 classes.


In [10]:
# 23054 is the image count reported for this generator; `nb_samples` holds
# len(filenames) and could replace the literal — TODO confirm they always agree.
X_Test, Y_Test = preprocess_data(generator,23054,300)

--- 212.59035062789917 seconds ---


In [11]:
# NOTE(review): X_Test is an in-memory array, so `steps` and `workers` look
# like leftovers from generator-based prediction; `steps = nb_samples`
# presumably makes Keras predict one sample per step — confirm and consider
# passing batch_size instead.
multidim_predictions = model.predict(X_Test, steps = nb_samples, verbose=1, workers=8)



In [12]:
def get_image_predictions(arr, soft=True):
    '''Reduces per-class model outputs to one prediction per image.

    arr: one row of class scores per image.
    soft: when true, a list holding the second score of every row
    (probability of the fake class) is returned; otherwise the index of the
    largest score in each row is returned as an array of 0's and 1's.'''

    # Hard labels: position of the highest score in each row
    if not soft:
        return np.argmax(arr, axis=1)

    # Soft predictions: the fake class is the second entry of each row
    fake_probabilities = []
    for row in arr:
        fake_probabilities.append(row[1])
    return fake_probabilities

In [13]:
# Soft predictions: probability of the second (fake) class for every image.
# build_dataframe reads this `predictions` variable as a global.
predictions = get_image_predictions(multidim_predictions, soft=True)
# Print a few filenames next to their predictions as a spot check.
print(filenames[10:13])
print(predictions[10:13])

['authentic/Original_000_0150.png', 'authentic/Original_000_0165.png', 'authentic/Original_000_0180.png']
[0.5722804, 0.24326812, 0.15801273]


In [14]:
def build_dataframe(filenames, probabilities = None):
    """Build the per-image results table from generator filenames.

    Filenames are expected to look like
    '<class folder>/<method>_<video id(s)>_<image number>.<ext>', e.g.
    'authentic/Original_000_0150.png'. The video part may contain a second
    id (e.g. '000_003'), so the image number is taken after the LAST
    underscore rather than at a fixed offset.

    Args:
        filenames: relative image paths, one per prediction.
        probabilities: per-image probability of the fake class. When None,
            the notebook-level `predictions` variable is used, which keeps
            existing `build_dataframe(filenames)` calls working.

    Returns:
        DataFrame with columns 'method', 'video', 'image', 'test/train',
        'true label', 'probability', 'predicted label' and 'acc'. The last
        two hold the placeholder '-' until convert_predictions fills them.
    """
    if probabilities is None:
        probabilities = predictions

    methods, video_numbers, image_numbers, true_labels = [], [], [], []
    for el in filenames:
        first_underscore = el.find('_')
        last_underscore = el.rfind('_')
        # method name sits between the folder separator and the first underscore
        methods.append(el[el.find('/')+1: first_underscore])
        # everything between the first and last underscore (one or two video ids)
        video_numbers.append(el[first_underscore+1: last_underscore])
        # image number sits between the last underscore and the extension
        image_numbers.append(el[last_underscore+1: el.rfind('.')])
        # folders starting with 'a' (authentic) are class 0, everything else is class 1
        true_labels.append(0 if el[0] == 'a' else 1)

    count = len(filenames)
    columns = ['method', 'video', 'image', 'test/train', 'true label',
               'probability', 'predicted label', 'acc']
    df = pd.DataFrame({'method': methods,
                       'video': video_numbers,
                       'image': image_numbers,
                       'test/train': ['test']*count,
                       'true label': true_labels,
                       'probability': probabilities,
                       'predicted label': ['-']*count,
                       'acc': ['-']*count},
                      index = range(count), columns = columns)

    return df

In [15]:
# One row per image; display rows 10-12 as a spot check.
data = build_dataframe(filenames)
display(data[10:13])

Unnamed: 0,method,video,image,test/train,true label,probability,predicted label,acc
10,Original,0,150,test,0,0.57228,-,-
11,Original,0,165,test,0,0.243268,-,-
12,Original,0,180,test,0,0.158013,-,-


In [16]:
def get_mean_of_a_fraction(lst, fraction = 1.0):
    '''Returns the mean of the largest `fraction` of the values in `lst`.

    With fraction equal to 1 (the default) this is simply the mean of all
    values. With e.g. fraction = 0.33 only the top third of the values is
    averaged (the count is rounded up, and at least one value is always kept).

    This mimics considering only the highest probabilities among the images
    of a video to classify that video: if only part of a video is
    manipulated, averaging every frame tends to classify it as original,
    whereas the top predictions are more representative.

    Inputs:
    1. lst -- a list, array or single prediction
    2. fraction -- share of the largest values to average

    Returns the mean as a float (nan for an empty input).'''

    # Accept scalars and any sequence, then sort from largest to smallest
    values = np.sort(np.atleast_1d(np.asarray(lst, dtype=float)).ravel())[::-1]

    if fraction != 1 and values.size:
        # Round up and keep at least one value so the mean is never taken
        # over an empty slice
        keep = max(1, math.ceil(values.size * fraction))
        values = values[:keep]

    return np.mean(values)

def get_mean_with_confident_strategy(lst, fraction = 0.75, t = 0.5):
    '''Confident strategy is implemented from first-place solution in DFDC.

    The main idea is that if there are a lot of predictions for one class,
    then the average is taken of those predictions only. If that's not the
    case, then a simple mean is outputted.

    Inputs:
    1. list of predictions (a single prediction is treated as a list of one)
    2. fraction -- (between 0 and 1) what fraction of the list should predict
    the same class for other predictions to be disregarded when taking a mean
    3. t -- threshold cutoff value between two classes (note: whole notebook
    is structured for a binary classification problem only)

    Returns the resulting mean.'''

    # atleast_1d lets a bare scalar through; len() of a 0-d array would raise
    lst = np.atleast_1d(np.array(lst))
    num_pred = len(lst)
    num_fakes = np.count_nonzero(lst >= t)
    num_authentic = num_pred - num_fakes
    # number of same-class predictions that must be exceeded to be "confident"
    required = int(num_pred * fraction)

    # if more than `fraction` of all predictions are at or above the threshold
    if num_fakes > required:
        # take predictions which are at or above the threshold and average them
        return np.mean(lst[lst >= t])

    # else if more than `fraction` of all predictions are below the threshold
    elif num_authentic > required:
        # take these predictions and return their mean
        return np.mean(lst[lst < t])

    else: # simple mean
        return np.mean(lst)
    
def get_mean_of_transformed_predictions(lst):
    '''Returns a weighted mean of the predictions in which each value is
    weighted by its distance from the centre (0.5) plus 1e-4, so
    predictions close to the extremes (0.0 and 1.0) count for more.

    Anything that is not a list is treated as a list with that one element.'''

    values = lst if type(lst) == list else [lst]

    # Distance from 0.5; the exponent 1.0 leaves the distances unchanged
    distances = [abs(prediction - 0.5) for prediction in values]
    weights = np.power(distances, 1.0) + 1e-4

    weighted_total = (values * weights).sum()
    return float(weighted_total / weights.sum())

In [17]:
def convert_predictions(df, threshold = 0.5, option = None, fraction = 0.33):
    ''' Takes in a per-image dataframe, regroups it by video (collecting all
    predictions for a video in one list), reduces each list to a single
    mean prediction (optionally using one of three strategies), converts
    that mean into a label using the threshold, and fills an accuracy column
    by comparing the true label with the predicted one.

    Inputs:
    1. df -- dataframe with columns 'method', 'video', 'test/train',
    'true label', 'predicted label' and 'probability'
    2. threshold -- cutoff probability value between classes (by default 0.5);
    a mean at or below it gives label 0, above it gives label 1. It is also
    passed to the confident strategy as its class cutoff
    3. option -- (by default None) choices are 'transform', 'confident strategy'
    or 'fraction'; any other value gives a simple mean per video
    4. fraction (= 0.33) -- only used with option = 'fraction' (1.0 means a
    simple mean, 0.33 means taking the top third, et cetera)

    Returns a new dataframe with one row per video; the input is not modified.'''

    # regroup based on method, video title, and test/train category
    grouped = df.groupby(['method', 'video','test/train', 'true label', 'predicted label'])\
                    ['probability'].apply(list).reset_index()

    per_video_predictions = list(grouped['probability']) # one list per video

    # next, reduce every list to a single mean prediction

    if option == 'transform':
        mean_labels_pred = [get_mean_of_transformed_predictions(el) for el in per_video_predictions]

    elif option == 'confident strategy':
        # use the same class cutoff here as for the final labels below
        mean_labels_pred = [get_mean_with_confident_strategy(el, t = threshold)
                            for el in per_video_predictions]

    elif option == 'fraction':
        mean_labels_pred = [get_mean_of_a_fraction(el, fraction) for el in per_video_predictions]

    else: # if no option is chosen (or not from the list), output a simple mean per video
        mean_labels_pred = [np.mean(el) for el in per_video_predictions]

    grouped['predicted label'] = [0 if el <= threshold else 1 for el in mean_labels_pred]

    # accuracy per video: 1 if the classification is correct, 0 otherwise
    grouped['acc'] = (grouped['true label'] == grouped['predicted label']).astype(int)

    return grouped

In [19]:
def show_accuracy(df):
    '''Displays and returns the mean of the 'acc' column, first per
    ('test/train', 'method') group and then per 'test/train' group.

    Returns acc_per_method, acc_total'''
    groupings = (['test/train', 'method'], ['test/train'])
    summaries = [df.groupby(keys)['acc'].mean() for keys in groupings]

    # Show the per-method breakdown first, then the overall figure
    for summary in summaries:
        display(summary)

    return summaries[0], summaries[1]

In [21]:
# Rebuild the per-image dataframe, then report per-video accuracy for every
# aggregation strategy.
data = build_dataframe(filenames)
print('\n')
# 'None' here is a string, not the None object: it matches no named option,
# so convert_predictions falls back to the simple mean per video.
for option in ['transform', 'confident strategy', 'fraction', 'None']:
    print('#'*38)
    print("Option:", option)
    new = convert_predictions(data, option = option)
    acc_per_method, acc_total = show_accuracy(new)
    print('#'*38)
    if option != 'None': print('\n')
print('#'*61)
print('\n')



######################################
Option: transform


test/train  method        
test        Deepfakes         0.978571
            Face2Face         0.585714
            FaceSwap          1.000000
            NeuralTextures    0.828571
            Original          0.828571
Name: acc, dtype: float64

test/train
test    0.844286
Name: acc, dtype: float64

######################################


######################################
Option: confident strategy


test/train  method        
test        Deepfakes         0.971429
            Face2Face         0.514286
            FaceSwap          1.000000
            NeuralTextures    0.814286
            Original          0.878571
Name: acc, dtype: float64

test/train
test    0.835714
Name: acc, dtype: float64

######################################


######################################
Option: fraction


test/train  method        
test        Deepfakes         1.000000
            Face2Face         0.814286
            FaceSwap          1.000000
            NeuralTextures    0.942857
            Original          0.592857
Name: acc, dtype: float64

test/train
test    0.87
Name: acc, dtype: float64

######################################


######################################
Option: None


test/train  method        
test        Deepfakes         0.971429
            Face2Face         0.514286
            FaceSwap          1.000000
            NeuralTextures    0.814286
            Original          0.878571
Name: acc, dtype: float64

test/train
test    0.835714
Name: acc, dtype: float64

######################################
#############################################################


