# Deepfake detection using Deep learning



### 1. installing and importing the necessary packages

In [None]:

# %pip install tensorflow
# %pip install matplotlib
# %pip install scipy
# %pip install keras-tuner
# %pip install scikit-learn

In [1]:
%matplotlib inline
import IPython.core.display         
# setup output image format (Chrome works best)
IPython.core.display.set_matplotlib_formats("svg")
import matplotlib.pyplot as plt
import matplotlib
import zipfile
import fnmatch
import os.path
import random
import shutil
import pandas as pd
import sklearn
import numpy as np
import os
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.layers import Dense, Flatten, AveragePooling2D

from keras import optimizers
from keras.applications.inception_v3 import InceptionV3, preprocess_input

from sklearn import metrics
# from sklearn import datasets
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA

  after removing the cwd from sys.path.


Check that the version and python bits is correct

In [2]:
# Tensorflow might not be able to be installed under different python versions
from platform import python_version
print("Current Python Version-", python_version())


# python 64 bits required
import struct
print(struct.calcsize("P") * 8)

Current Python Version- 3.7.8
64


### 2. Creating file directories, extract image into respective directories


In [6]:
# Split Data
# ├── Test  
# │   ├── manipulated
# │   └── original
# ├── Training
# │   ├── manipulated 
# │   └── original
# └── Validation
#     ├── manipulated 
#     └── original

# Creates the appropriate directory structures for training, validation and test sets.
try:
  shutil.rmtree('./Split Data')      
except:
  pass                #Split Data didn't exist
     
os.mkdir('./Split Data')
cdf={"Training":0.7,"Validation":0.85,"Test":1} #OBS! Has to be increment percentages of 5 to make batch size fit
for dir in list(cdf.keys()):
    os.mkdir('./Split Data/{}'.format(dir))
    os.mkdir('./Split Data/{}/manipulated'.format(dir))
    os.mkdir('./Split Data/{}/original'.format(dir))

In [9]:
# Function which assigns the file to the correct directory based on the discrete cumulative distribution function cdf

def assign_data(cdf):
    nbr=random.random()
    for set in list(cdf.keys()):
        if nbr<cdf[set]:
            return set

dist={"Training":0,"Validation":0,"Test":0}
filename = 'data.zip'
zfile = zipfile.ZipFile(filename, 'r')
counter=0
samplesize=12000
# Each file is loaded in sequence and randomly assigned to the corresponding directory 
# in the new straucture according to cdf. Dictionary dist keeps track of number of each set.
for name in zfile.namelist():
    save_path = './Split Data/'
    name_of_file=""
    label=""
    if fnmatch.fnmatch(name, "data/manipulated/*.png"):
        name_of_file=name[len("data/manipulated/"):]
        label="manipulated"
    elif fnmatch.fnmatch(name,"data/original/*.png"):
        name_of_file=name[len("data/original/"):]
        label="original"
    if name_of_file != "":
        myfile = zfile.open(name)
        img = matplotlib.image.imread(myfile)
        rand_assign=assign_data(cdf)
        dist[rand_assign]+=1
        save_path+=rand_assign+"/"+label # eg. "Split Data/Training/manipulated
        completeName = os.path.join(save_path, name_of_file)         
        matplotlib.image.imsave(completeName,img)
        counter+=1
        if counter>=samplesize:     
            break

zfile.close()

In [None]:
# Just for checking the set sizes
for fr in list(dist.keys()):
    for label in ["manipulated","original"]:
        print("Size of {}/{}: {}".format(fr,label,len(os.listdir(os.path.join('./Split Data',fr,label)))))


### 3. Building the model by initialising the attributes, layers etc.



- A batch size of 32 means that 32 samples from the training dataset will be used to estimate the error gradient before the model weights are updated. 
- One training epoch means that the learning algorithm has made one pass through the training dataset, where examples were separated into randomly 
selected “batch size” groups.

In [1]:
# constant fields
BATCH_SIZE= 64 
imgsize=(299,299)
COUNT_DENSE = 3


In [5]:
# Building inception Model

model=Sequential()

model.add(InceptionV3(
    include_top=False,
    weights='imagenet',
    classes=1000,
    classifier_activation='softmax',
    pooling='avg'
))

model.add(Flatten())
model.add(Dense(2, activation = 'relu'))
model.add(Dense(2,activation='softmax'))  
model.layers[0].trainable = True    # train all the layers
model.summary()
#optimizer = optimizers.SGD(learning_rate = 0.01, decay = 1e-6, momentum = 0.9, nesterov = True) 
optimizer1 = optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-07,amsgrad=False,name="Adam")
model.compile(optimizer = optimizer1, loss = 'binary_crossentropy', metrics = ['categorical_accuracy'])   #Loss function arbitrary now

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inception_v3 (Functional)   (None, 2048)              21802784  
                                                                 
 flatten (Flatten)           (None, 2048)              0         
                                                                 
 dense (Dense)               (None, 2)                 4098      
                                                                 
 dense_1 (Dense)             (None, 2)                 6         
                                                                 
Total params: 21,806,888
Trainable params: 21,772,456
Non-trainable params: 34,432
_________________________________________________________________


In [8]:
from keras.applications.resnet_v2 import preprocess_input

trainpath=os.path.join("./Split Data", "Training")
valpath=os.path.join("./Split Data", "Validation")

train_datagen=ImageDataGenerator(preprocessing_function=preprocess_input)

train_gen=train_datagen.flow_from_directory(trainpath, target_size=imgsize, batch_size=BATCH_SIZE, class_mode="categorical")
val_gen=train_datagen.flow_from_directory(valpath, target_size=imgsize, batch_size=BATCH_SIZE, class_mode="categorical")

Found 8403 images belonging to 2 classes.
Found 1812 images belonging to 2 classes.


#### initialising callbacks, stoppers

- monitor – This allows us to specify the performance measure to monitor in order to end training.

- mode – It is used to specify whether the objective of the chosen metric is to increase maximize or to minimize.

- verbose – To discover the training epoch on which training was stopped, the “verbose” argument can be set to 1. Once stopped, the callback will print the epoch number.

- patience – The first sign of no further improvement may not be the best time to stop training. This is because the model may coast into a plateau of no improvement or even get slightly worse before getting much better. We can account for this by adding a delay to the trigger in terms of the number of epochs on which we would like to see no improvement. This can be done by setting the “patience” argument.

In [6]:
try:
  os.mkdir(os.path.join('.','Working Model'))       
except:
  pass                #Working Model aleardy exists
cb_early_stopper = EarlyStopping(monitor = 'val_loss', patience = 4)

cb_checkpointer = ModelCheckpoint(filepath =os.path.join(".","Working Model","best.hdf5") , monitor = 'val_loss', save_best_only = True, mode = 'auto')

In [9]:
#Just for double checking
print(BATCH_SIZE, len(train_gen), BATCH_SIZE, len(val_gen))



# os.mkdir(os.path.join('.','Working Model'))         used to create the Working Model directory
# TRAIN_STEPS=int(len(train_gen)/EPOCHS)
# VAL_STEPS=len(val_gen)
# print("EPOCHS: {}, TRAINING STEPS: {}, VAL STEPS: {}".format(EPOCHS, TRAIN_STEPS, VAL_STEPS))

50 169 50 37


### 4. Fitting, training the model

In [None]:
EPOCHS=20
fit_history = model.fit_generator(
        train_gen,
        epochs = EPOCHS,
        validation_data=val_gen,
        callbacks=[cb_checkpointer, cb_early_stopper]
)

In [None]:
# Plotting training & validation error and loss function over epochs
plt.figure(1, figsize = (15,8)) 
    
plt.subplot(221)  
plt.plot(fit_history.history['accuracy'])  
plt.plot(fit_history.history['val_accuracy'])  
plt.title('model accuracy')  
plt.ylabel('accuracy')  
plt.xlabel('epoch')  
plt.legend(['train', 'valid']) 
    
plt.subplot(222)  
plt.plot(fit_history.history['loss'])  
plt.plot(fit_history.history['val_loss'])  
plt.title('model loss')  
plt.ylabel('loss')  
plt.xlabel('epoch')  
plt.legend(['train', 'valid']) 

### 5. Testing the trained weights

In [18]:
def test(model,test_path):
  # fields required
  Accuracy, Recall, Precision, AUC = 0, 0, 0, 0

  # init constants  
  BATCH_SIZE=64
  imgsize=(299,299)

  # Model used
  built_model=Sequential()
  built_model.add(InceptionV3(
    include_top=False,
    weights='imagenet',
    classes=1000,
    classifier_activation='softmax',
    pooling='avg'
    ))

  built_model.add(Flatten())
  built_model.add(Dense(2, activation = 'relu'))
  built_model.add(Dense(2,activation='softmax'))  
  built_model.layers[0].trainable = True    
  built_model.summary()
  optimizer1 = optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-07,amsgrad=False,name="Adam")
  built_model.compile(optimizer = optimizer1, loss = 'binary_crossentropy', metrics = ['categorical_accuracy'])  

  test_img_gen = ImageDataGenerator(preprocessing_function=preprocess_input)
  test_generator=test_img_gen.flow_from_directory(
    directory = test_path,
    target_size = imgsize,
    batch_size = BATCH_SIZE,   
    class_mode = None,    
    shuffle = False,      
    seed = 123            
    )
  
  built_model.load_weights(model)
  print("model loaded")

  predicted = built_model.predict(test_generator,steps = len(test_generator), verbose = 1)
  predicted_class_indices = np.argmax(predicted, axis = 1)

  y = test_generator.classes
  Accuracy = metrics.accuracy_score(y,predicted_class_indices)
  print("Model Accuracy: ", Accuracy)

  Precision = metrics.precision_score(y, predicted_class_indices, average='binary')
  print("Model Precision: ", Precision)

  Recall = metrics.recall_score(y, predicted_class_indices, average='binary')
  print("Model Recall: ", Recall)

  AUC = metrics.roc_auc_score(y, predicted_class_indices)
  print("AUC: ", AUC)

  filenames = [i.split('\\')[1] for i in test_generator.filenames] 
  actualLabel = [i.split('\\')[0] for i in test_generator.filenames]
  for i in range(len(actualLabel)):
    if actualLabel[i] == 'manipulated':
        actualLabel[i] = '0'
    else:
        actualLabel[i]  = '1' 
  results_df = pd.DataFrame(
    {
        'id': pd.Series(filenames), 
        'actual label': pd.Series(actualLabel),
        'pred label': pd.Series(predicted_class_indices)
    }
  )
  os.makedirs('Result', exist_ok=True)
  results_df.to_csv('Result/out.csv')

  return Accuracy, Recall, Precision, AUC



In [5]:
test_path=os.path.join("./Split Data", "Test")
test_path

'./Split Data\\Test'

In [6]:
model = os.path.join(".","Working Model","final.hdf5")
model

'.\\Working Model\\final.hdf5'

In [17]:
test(model,test_path)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inception_v3 (Functional)   (None, 2048)              21802784  
                                                                 
 flatten_1 (Flatten)         (None, 2048)              0         
                                                                 
 dense_2 (Dense)             (None, 2)                 4098      
                                                                 
 dense_3 (Dense)             (None, 2)                 6         
                                                                 
Total params: 21,806,888
Trainable params: 21,772,456
Non-trainable params: 34,432
_________________________________________________________________
Found 1785 images belonging to 2 classes.
model loaded
Model Accuracy:  0.9876750700280112
Model Precision:  0.9862542955326461
Model Recall:  0.9761904761904762
AUC:  0

(0.9876750700280112,
 0.9761904761904762,
 0.9862542955326461,
 0.9847535505430243)