# Deepfake detection using Deep learning



### 1. Installing and importing the necessary packages

In [None]:

# %pip install tensorflow
# %pip install matplotlib
# %pip install scipy
# %pip install keras-tuner

In [1]:
%matplotlib inline
import IPython.core.display         
# setup output image format (Chrome works best)
IPython.core.display.set_matplotlib_formats("svg")
import matplotlib.pyplot as plt
import matplotlib
import zipfile
import fnmatch
import os.path
import random
import shutil
import scipy
import keras_tuner as kt
import numpy as np
import os

from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, Model
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.layers import Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, GlobalMaxPooling2D,MaxPool2D, Dropout
from keras import regularizers
from keras import optimizers
from keras.applications.resnet_v2 import ResNet50V2

  after removing the cwd from sys.path.


Check that the Python version and the interpreter bitness (64-bit) are correct.

In [1]:
# TensorFlow may fail to install under some Python versions, so report the
# interpreter that this kernel is actually running.
import struct
from platform import python_version

print("Current Python Version-", python_version())

# A 64-bit Python is required. struct.calcsize("P") is the size of a C pointer
# in bytes, so multiplying by 8 gives the interpreter's bitness.
pointer_bits = 8 * struct.calcsize("P")
print(pointer_bits)

Current Python Version- 3.7.8
64


### 2. Creating the directory structure and extracting the images into their respective directories


In [6]:
# Target layout:
#
# Split Data
# ├── Test
# │   ├── manipulated
# │   └── original
# ├── Training
# │   ├── manipulated
# │   └── original
# └── Validation
#     ├── manipulated
#     └── original

# Rebuild the tree from scratch so files from a previous split never leak into
# the new one. Only the "directory does not exist yet" case is ignored; any
# other failure (e.g. a permission error) is allowed to surface.
try:
    shutil.rmtree('./Split Data')
except FileNotFoundError:
    pass  # first run: nothing to remove

os.mkdir('./Split Data')

# Discrete cumulative distribution used to assign each image to a split:
# 70 % Training, 15 % Validation, 15 % Test.
# OBS! Has to be increment percentages of 5 to make batch size fit
cdf = {"Training": 0.7, "Validation": 0.85, "Test": 1}

# One folder per split, each holding one sub-folder per class label.
for split_name in cdf:
    for class_label in ("manipulated", "original"):
        os.makedirs(os.path.join('./Split Data', split_name, class_label))

In [9]:
def assign_data(cdf):
    """Randomly pick a split name according to a discrete cumulative distribution.

    Parameters
    ----------
    cdf : dict
        Maps split name -> cumulative probability threshold, with the entries
        in increasing threshold order (e.g. ``{"Training": 0.7,
        "Validation": 0.85, "Test": 1}``).

    Returns
    -------
    The first key whose threshold is strictly greater than a uniform draw from
    ``random.random()``. If no threshold exceeds the draw (the last threshold
    is below 1, for instance through rounding), the last key is returned so
    callers never receive ``None``.

    Raises
    ------
    ValueError
        If ``cdf`` is empty.
    """
    if not cdf:
        raise ValueError("cdf must contain at least one split")

    draw = random.random()
    for split_name, upper_bound in cdf.items():
        if draw < upper_bound:
            return split_name
    # No threshold exceeded the draw: treat the last split as the catch-all.
    return split_name

# Number of images written to each split so far.
dist = {"Training": 0, "Validation": 0, "Test": 0}
filename = 'data.zip'
counter = 0
samplesize = 12000  # stop once this many images have been extracted

# Each archive member is visited in sequence and randomly assigned to a split
# directory in the new structure according to cdf; dist keeps track of how many
# images each split received. The archive and every member handle are opened
# with `with` so they are closed even if reading or saving an image fails.
with zipfile.ZipFile(filename, 'r') as zfile:
    for name in zfile.namelist():
        if fnmatch.fnmatch(name, "data/manipulated/*.png"):
            label = "manipulated"
        elif fnmatch.fnmatch(name, "data/original/*.png"):
            label = "original"
        else:
            continue  # not a PNG of either class

        # Strip the "data/<label>/" prefix to get the file name inside the archive folder.
        name_of_file = name[len("data/" + label + "/"):]

        with zfile.open(name) as myfile:
            img = matplotlib.image.imread(myfile)

        rand_assign = assign_data(cdf)
        dist[rand_assign] += 1

        # e.g. "./Split Data/Training/manipulated"
        save_path = os.path.join('./Split Data', rand_assign, label)
        completeName = os.path.join(save_path, name_of_file)
        matplotlib.image.imsave(completeName, img)

        counter += 1
        if counter >= samplesize:
            break

In [None]:
# Sanity check: report how many files ended up in every split/class folder.
for fr in dist:
    for label in ("manipulated", "original"):
        folder = os.path.join('./Split Data', fr, label)
        file_count = len(os.listdir(folder))
        print("Size of {}/{}: {}".format(fr, label, file_count))


### 3. Building the model by initialising the attributes, layers etc.



- A batch size of 32, for example, means that 32 samples from the training dataset are used to estimate the error gradient before the model weights are updated (this notebook uses `BATCH_SIZE = 50`).
- One training epoch means that the learning algorithm has made one full pass through the training dataset, with the examples split into randomly selected groups of “batch size” samples each.

In [34]:
# Constant fields shared by the model-building and data-generator cells.
BATCH_SIZE = 50        # number of images per batch requested from the generators
imgsize = (299, 299)   # target size the generators resize every image to
COUNT_DENSE = 3        # number of hidden Dense layers stacked in build_model()


In [26]:
# Transfer-learning classifier: an ImageNet-pretrained ResNet50V2 feature
# extractor followed by a small dense head.
model = Sequential()

# include_top=False drops ResNet's own ImageNet classifier so a new head can be
# attached; pooling='avg' applies global average pooling so the base outputs a
# flat feature vector.
model.add(ResNet50V2(weights="imagenet", include_top=False, pooling='avg'))

# Hidden layer of the head, with L2 weight regularisation.
model.add(Dense(69, kernel_regularizer=regularizers.l2(0.001), activation="relu"))
# Dropout reduces overfitting: rate 0.5 drops half of the inputs during training.
model.add(Dropout(0.5))
# One softmax output per class (manipulated / original).
model.add(Dense(2, activation='softmax'))

# Freeze the ResNet50V2 base so that only the dense head is trained.
model.layers[0].trainable = False

# SGD is one choice among several (Adam, etc.).
sgd = optimizers.SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True)

# The head is a 2-unit softmax and the generators yield one-hot labels
# (class_mode="categorical"), so categorical cross-entropy is the matching loss;
# it also makes the 'accuracy' metric resolve to categorical accuracy.
model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])

In [27]:
# Print each layer's output shape and parameter count, plus the trainable vs.
# non-trainable totals.
# NOTE(review): the saved output below lists a Dense layer with 13 units while the
# model cell defines Dense(69) -- the output looks stale; re-run to refresh it.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resnet50v2 (Functional)     (None, 2048)              23564800  
                                                                 
 dense_4 (Dense)             (None, 13)                26637     
                                                                 
 dropout_1 (Dropout)         (None, 13)                0         
                                                                 
 dense_5 (Dense)             (None, 2)                 28        
                                                                 
Total params: 23,591,465
Trainable params: 26,665
Non-trainable params: 23,564,800
_________________________________________________________________


In [14]:
def build_model(freeze_base=True):
    """Build and compile a ResNet50V2-based two-class classifier (functional API).

    The model is an ImageNet-pretrained ResNet50V2 base (without its original
    classifier, with global average pooling) followed by ``COUNT_DENSE`` hidden
    Dense layers, a Dropout layer and a 2-unit softmax output.

    Parameters
    ----------
    freeze_base : bool, default True
        When True, every layer of the pretrained base is marked non-trainable so
        only the new head is updated during training. Pass False to fine-tune
        the whole network.

    Returns
    -------
    A compiled Keras ``Model`` using SGD with Nesterov momentum and categorical
    cross-entropy, which expects one-hot labels such as those produced by
    generators with ``class_mode="categorical"``.
    """
    # pooling='avg': global average pooling is applied to the output of the last
    # convolutional block, so the base already yields a flat feature vector and
    # no extra pooling/Flatten layer is needed in the head.
    baseModel = ResNet50V2(
        weights="imagenet",
        include_top=False,
        pooling='avg',
        # Same spatial size as the data generators use, with 3 colour channels.
        input_tensor=Input(shape=(*imgsize, 3)),
    )

    # Construct the head of the model that is placed on top of the base model.
    # TODO - determine the headModel layers! (What layers we choose to add)
    headModel = baseModel.output
    for _ in range(COUNT_DENSE):
        headModel = Dense(23, activation="relu")(headModel)

    # Dropout reduces overfitting: rate 0.5 drops half of the inputs during training.
    headModel = Dropout(0.5)(headModel)

    # 2 = number of classes. Softmax turns the outputs into class probabilities,
    # which the cross-entropy loss requires (relu outputs are not probabilities).
    headModel = Dense(2, activation="softmax")(headModel)

    # Place the head on top of the base model; this is the model that is trained.
    model = Model(inputs=baseModel.input, outputs=headModel)

    # Freeze the pretrained base so its ImageNet weights are not updated during
    # training. This must happen before compile() to take effect.
    if freeze_base:
        for layer in baseModel.layers:
            layer.trainable = False

    optimizer = optimizers.SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=['accuracy'])
    return model

In [28]:
from keras.applications.resnet_v2 import preprocess_input

# Folders holding the training and validation splits.
trainpath, valpath = (
    os.path.join("./Split Data", split) for split in ("Training", "Validation")
)

# A single generator configuration serves both splits: it only applies the
# ResNetV2 input preprocessing function to each image.
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Options shared by both directory iterators; class_mode="categorical" yields
# one-hot labels for the two class folders.
flow_options = dict(target_size=imgsize, batch_size=BATCH_SIZE, class_mode="categorical")

train_gen = train_datagen.flow_from_directory(trainpath, **flow_options)
val_gen = train_datagen.flow_from_directory(valpath, **flow_options)

Found 8403 images belonging to 2 classes.
Found 1812 images belonging to 2 classes.


#### Initialising the callbacks (early stopping and model checkpointing)

- monitor – This allows us to specify the performance measure to monitor in order to end training.

- mode – It is used to specify whether the objective for the chosen metric is to maximize or to minimize it.

- verbose – To discover the training epoch on which training was stopped, the “verbose” argument can be set to 1. Once stopped, the callback will print the epoch number.

- patience – The first sign of no further improvement may not be the best time to stop training. This is because the model may coast into a plateau of no improvement or even get slightly worse before getting much better. We can account for this by adding a delay to the trigger in terms of the number of epochs on which we would like to see no improvement. This can be done by setting the “patience” argument.

In [29]:
# Make sure the checkpoint directory exists. exist_ok=True tolerates re-running
# the cell when "Working Model" already exists, without hiding other errors
# (e.g. permission problems) the way a bare `except` would.
os.makedirs(os.path.join('.', 'Working Model'), exist_ok=True)

# Stop training once val_loss has not improved for 4 consecutive epochs.
cb_early_stopper = EarlyStopping(monitor='val_loss', patience=4)

# Keep only the best model seen so far (judged by val_loss) on disk.
cb_checkpointer = ModelCheckpoint(
    filepath=os.path.join(".", "Working Model", "best.hdf5"),
    monitor='val_loss',
    save_best_only=True,
    mode='auto',
)

In [30]:
# Double check: for the training and then the validation generator, show the
# batch size followed by the number of batches per epoch.
print(*(value for gen in (train_gen, val_gen) for value in (BATCH_SIZE, len(gen))))

# Maximum number of passes over the training data.
EPOCHS = 20

50 169 50 37


### 4. Fitting, training the model

In [None]:
# Train the model defined above, validating after every epoch. The callbacks
# checkpoint the best model and stop early when val_loss stops improving.
# To train the functional-API variant instead, first run: model = build_model()
#
# Model.fit accepts generators directly; Model.fit_generator is deprecated.
fit_history = model.fit(
    train_gen,
    epochs=EPOCHS,
    validation_data=val_gen,
    callbacks=[cb_checkpointer, cb_early_stopper],
)

In [None]:
# Plot the training and validation curves over the epochs: accuracy in the left
# panel, loss in the right panel. Both panels are built by the same loop instead
# of two copy-pasted blocks, which also avoids echoing a Legend repr as output.
plt.figure(1, figsize=(15, 8))

for position, metric in ((221, 'accuracy'), (222, 'loss')):
    plt.subplot(position)
    plt.plot(fit_history.history[metric])           # training curve
    plt.plot(fit_history.history['val_' + metric])  # validation curve
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'valid'])