In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming each archive to disk.
CHUNK_SIZE = 40960
# DATA_SOURCE_MAPPING (next line) is a comma-separated list of
# "<directory>:<URL-encoded download URL>" entries; the download loop below
# splits on ',' and ':' and URL-decodes the second part.
# NOTE(review): the URLs carry X-Goog-Date=20240927T165301Z and
# X-Goog-Expires=259200 in their query strings, so they are presumably
# time-limited signed links that must be regenerated before re-running.
DATA_SOURCE_MAPPING = 'deepfake-detection-challenge:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F16880%2F858837%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240927%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240927T165301Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Da1fcd6e41aecadd2560a3775d24a6b0b0640b8a8695a3846b9b98ecadb0fb01f72ffa0127cf3c423e94d20c25adb5776e027a1e60f629f301aee5c4df5563c95effb7f108000bfd931a161bdcbb1feba307f003cdb77a2ce954234cc825991f6bd681fccc06171380a48399c46978bbea239799133496bd1bdd2f0096b59def2ff5c5d819d4bfa84b8010dbe0bcee50fa1d7becedcfba167c276156a7ef743cd617846fc041cbbfd297e5789cc243da2998470eb7a677effb8ce629183441642a98014a5c1554d38070a88424e3cced473f52361711084f508d02d9a5c41999af724f6b65d5894ebc17cfd582c0b5ad6e404a69973d6163b8c4b1530d1a5a9ae,deepfake-detection-challenge-dataset-face-images:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5682694%2F9370003%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240927%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240927T165301Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D3872304d74814b5c0405ec7dbbcc2287f1cd87432bbea303461dc97cbb152eb8b6a944a9d5353e7ac2c4a293546f223668ed4af3e2770e991190efc29cb2364833f62b124d501801d8c94ebc59d99c233092960ecc0d69ec2f31b4dcc9f615b3cc3af4b36b97782b7727517d9066d27c44a7188ac21350d2528f67f9850df9950fe891c6c20e61ea570e2c8b50d438cbdee13db62db9b65779fb266b3cd08b8376d7e118f22505cfb0da8613086558620b5af67a3011dcb059926e67b70b61d6bb96fd759b2babd428c20751d593e551d9ec488a7e736fe271272f3809c932e85ee856c4603376b95ffcb70a0fdb7b1383604830a70174e19c8a0c67503db414'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<URL-encoded URL>" entry of DATA_SOURCE_MAPPING
# and unpack it into KAGGLE_INPUT_PATH/<directory>. A failing source is
# reported and skipped so the remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a literal ':' in the URL part cannot
    # break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it only the byte counter
            # is shown instead of crashing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')

            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                if total_length:
                    done = min(50, int(50 * dl / total_length))
                    progress_bar = f"[{'=' * done}{' ' * (50 - done)}] "
                else:
                    progress_bar = ''
                sys.stdout.write(f"\r{progress_bar}{dl} bytes downloaded")
                sys.stdout.flush()

            # Make sure every buffered byte is visible to the archive reader
            # and start reading from the beginning of the file.
            tfile.flush()
            tfile.seek(0)

            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # The handle must not be named `tarfile`: that would rebind
                # the module and break the next non-zip source.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


Downloading deepfake-detection-challenge, 4439352219 bytes compressed
Downloaded and uncompressed: deepfake-detection-challenge
Downloading deepfake-detection-challenge-dataset-face-images, 126132864 bytes compressed
Downloaded and uncompressed: deepfake-detection-challenge-dataset-face-images
Data source import complete.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the input directory,
# then print one path per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/deepfake-detection-challenge-dataset-face-images/real/eqnoqyfquo_6.png
/kaggle/input/deepfake-detection-challenge-dataset-face-images/real/atkdltyyen_8.png
/kaggle/input/deepfake-detection-challenge-dataset-face-images/real/dbnygxtwek_7.png
/kaggle/input/deepfake-detection-challenge-dataset-face-images/real/dzyuwjkjui_7.png
/kaggle/input/deepfake-detection-challenge-dataset-face-images/real/cyxlcuyznd_5.png
/kaggle/input/deepfake-detection-challenge-dataset-face-images/real/ahqqqilsxt_2.png
/kaggle/input/deepfake-detection-challenge-dataset-face-images/real/efwfxwwlbw_7.png
/kaggle/input/deepfake-detection-challenge-dataset-face-images/real/anpuvshzoo_1.png
/kaggle/input/deepfake-detection-challenge-dataset-face-images/real/bwhlgysghg_0.png
/kaggle/input/deepfake-detection-challenge-dataset-face-images/real/bgvhtpzknn_1.png
/kaggle/input/deepfake-detection-challenge-dataset-face-images/real/egghxjjmfg_5.png
/kaggle/input/deepfake-detection-challenge-dataset-face-images/re

# Training Data Preparation

In [None]:
!pip install keras-nightly
!pip install tensorflow
!pip install keras
!pip install --upgrade tensorflow



In [None]:
import os
import cv2
import numpy as np
import pickle
import tensorflow
from collections import defaultdict
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.inception_v3 import preprocess_input


# Constants
IMG_SIZE = (299, 299)  # Target image size for InceptionV3
MAX_FRAMES = 10        # Maximum number of frames per sequence
REAL_DIR = 'real'      # Directory containing images of real samples
FAKE_DIR = 'fake'      # Directory containing images of fake samples


def _parse_frame_filename(filename):
    """Return (video_name, frame_index) for a '<videoname>_<index>.png' file name.

    The video name is the text before the first underscore. Names without a
    numeric index sort after all numbered frames.
    """
    video_name = filename.split('_')[0]
    index_text = os.path.splitext(filename)[0].rsplit('_', 1)[-1]
    try:
        frame_index = int(index_text)
    except ValueError:
        frame_index = float('inf')
    return video_name, frame_index


def load_images_from_directory(directory, label):
    """
    Load the '.png' frames in a directory as fixed-length, zero-padded sequences.

    Frames are grouped by video name and ordered by their numeric frame index,
    so every sequence is deterministic and in temporal order regardless of
    the order in which the file system lists the files. At most MAX_FRAMES
    readable frames are decoded per video; shorter videos are padded with
    all-zero frames.

    Parameters:
    - directory (str): Path to the directory containing images.
    - label (int): Label for the samples (0 for real, 1 for fake).

    Returns:
    - data (np.array): float32 array of shape
      (num_videos, MAX_FRAMES, height, width, 3). The shape is kept even when
      no image is found, so results can always be concatenated.
    - labels (np.array): Array with one `label` entry per video.
    """
    frame_files = sorted(
        (name for name in os.listdir(directory) if name.endswith('.png')),
        key=lambda name: (*_parse_frame_filename(name), name),
    )

    video_frames = defaultdict(list)  # video name -> preprocessed frames
    for filename in frame_files:
        video_name, _ = _parse_frame_filename(filename)

        # Skip decoding once a video already has enough frames. `.get` avoids
        # creating an entry for a video without any readable frame.
        if len(video_frames.get(video_name, ())) >= MAX_FRAMES:
            continue

        # NOTE(review): cv2.imread returns BGR channel order by default and no
        # conversion to RGB happens here; extract_frames_from_video feeds BGR
        # frames too, so training and inference are at least consistent.
        img = cv2.imread(os.path.join(directory, filename))
        if img is None:
            continue  # unreadable or corrupt image

        img = cv2.resize(img, IMG_SIZE)
        img = preprocess_input(img_to_array(img))  # InceptionV3 preprocessing
        video_frames[video_name].append(img)

    # cv2.resize takes (width, height) and returns (height, width, channels).
    frame_shape = (IMG_SIZE[1], IMG_SIZE[0], 3)

    # Pre-allocating in float32 zero-pads short videos for free and avoids the
    # float64 upcast that mixing in float64 padding frames would cause.
    data = np.zeros((len(video_frames), MAX_FRAMES, *frame_shape), dtype=np.float32)
    for row, frames in enumerate(video_frames.values()):
        data[row, :len(frames)] = frames

    labels = np.full(len(video_frames), label)
    return data, labels


In [None]:
# Point the directory constants at the extracted face-image dataset.
REAL_DIR = "/kaggle/input/deepfake-detection-challenge-dataset-face-images/real"
FAKE_DIR = "/kaggle/input/deepfake-detection-challenge-dataset-face-images/fake"

# Build one frame sequence per video: label 0 marks real, label 1 marks fake.
x_real, y_real = load_images_from_directory(directory=REAL_DIR, label=0)
x_fake, y_fake = load_images_from_directory(directory=FAKE_DIR, label=1)


In [None]:

# Stack the real and fake samples along the sample axis
x_data = np.concatenate((x_real, x_fake), axis=0)
y_data = np.concatenate((y_real, y_fake), axis=0)

# Hold out 20% of the videos for validation, keeping the class ratio
# (stratified) and a fixed seed for a reproducible split
split_parts = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=42,
    stratify=y_data,
)
x_train, x_val, y_train, y_val = split_parts



In [None]:
import tensorflow as tf
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU, TimeDistributed, Dropout, GlobalAveragePooling2D, Input

# Define the custom feature extractor as a Keras Model
def create_feature_extractor(trainable=False):
    """
    Build a per-frame feature extractor from ImageNet-pretrained InceptionV3.

    The classification head is removed and the final feature maps are
    global-average-pooled into one feature vector per image.

    Parameters:
    - trainable (bool): Whether the InceptionV3 weights are updated during
      training. Defaults to False so the backbone stays a fixed feature
      extractor: the training log of this notebook (13 batches per epoch)
      stays near chance accuracy when the whole backbone is trained.
      Pass True to fine-tune the backbone instead.

    Returns:
    - feature_extractor (tf.keras.Model): Model mapping (299, 299, 3) images
      to pooled feature vectors.
    """
    base_model = InceptionV3(weights='imagenet', include_top=False, input_shape=(299, 299, 3))
    # Setting `trainable` on the model applies it to every layer it contains.
    base_model.trainable = trainable
    pooled_features = GlobalAveragePooling2D()(base_model.output)
    feature_extractor = tf.keras.Model(inputs=base_model.input, outputs=pooled_features)
    return feature_extractor

# Assemble the frame-sequence classifier around the TimeDistributed extractor
def create_model():
    """
    Build and compile the video-level binary classifier.

    Every frame of a variable-length sequence of 299x299x3 images is passed
    through the shared feature extractor. The resulting sequence goes through
    an LSTM and a GRU, followed by dropout and two dense layers ending in a
    single sigmoid unit.

    Returns:
    - The compiled Sequential model (Adam, binary cross-entropy, accuracy).
    """
    frame_encoder = create_feature_extractor()

    layer_stack = [
        Input(shape=(None, 299, 299, 3)),
        TimeDistributed(frame_encoder),
        LSTM(128, return_sequences=True),  # keeps one output per frame
        GRU(128),                          # collapses the sequence to one vector
        Dropout(0.5),                      # regularization
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid'),    # binary classification output
    ]
    sequence_classifier = Sequential(layer_stack)

    sequence_classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return sequence_classifier

# Instantiate the model and print its layer overview
model = create_model()
model.summary()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m87910968/87910968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# Keep the returned History so the per-epoch metrics stay available after
# training; assigning it also stops the cell from echoing the object's repr.
history = model.fit(x_train, y_train, epochs=20, batch_size=10, validation_data=(x_val, y_val))

Epoch 1/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m307s[0m 7s/step - accuracy: 0.5548 - loss: 0.7098 - val_accuracy: 0.4839 - val_loss: 0.8270
Epoch 2/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 5s/step - accuracy: 0.5226 - loss: 0.7139 - val_accuracy: 0.4839 - val_loss: 0.7658
Epoch 3/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 5s/step - accuracy: 0.5901 - loss: 0.6616 - val_accuracy: 0.5806 - val_loss: 0.7366
Epoch 4/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 5s/step - accuracy: 0.6428 - loss: 0.6645 - val_accuracy: 0.5161 - val_loss: 0.7024
Epoch 5/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 5s/step - accuracy: 0.5729 - loss: 0.6920 - val_accuracy: 0.4194 - val_loss: 0.8629
Epoch 6/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 5s/step - accuracy: 0.5851 - loss: 0.6578 - val_accuracy: 0.5161 - val_loss: 0.7316
Epoch 7/20
[1m13/13[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x7d3af4c37b20>

In [None]:
# Persist the complete model as a single HDF5 (.h5) file
model.save(filepath='/kaggle/working/deepfake_detection_model.h5')



In [None]:
import tensorflow as tf

# Restore the model from the .h5 file written by the previous cell
model_path = '/kaggle/working/deepfake_detection_model.h5'
model = tf.keras.models.load_model(filepath=model_path)




In [None]:
# Serialize the architecture (no weights) to a JSON file
model_json = model.to_json()
with open('/kaggle/working/model_architecture.json', 'w') as architecture_file:
    architecture_file.write(model_json)

# Store the weights separately
model.save_weights(filepath='/kaggle/working/model.weights.h5')

In [None]:
import tensorflow as tf
from tensorflow.keras.models import model_from_json

# Load model architecture
with open('/kaggle/working/model_architecture.json', 'r') as json_file:
    model_json = json_file.read()

# Recreate the model from the architecture
model = model_from_json(model_json, custom_objects={'TimeDistributed': tf.keras.layers.TimeDistributed})

# Load model weights
model.load_weights('/kaggle/working/model.weights.h5')

# Compile with the same optimizer, loss and metrics as create_model(), so
# that evaluating or resuming training does not silently switch optimizer.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
import cv2
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.inception_v3 import preprocess_input

# Load the trained model
#model = load_model('/kaggle/working/deepfake_detection_model_2.h5')

# Constants
IMG_SIZE = (299, 299)  # Image size expected by the model
MAX_FRAMES = 10        # Max frames to consider per video

def extract_frames_from_video(video_path, max_frames=MAX_FRAMES):
    """
    Extract and preprocess the first frames of a video for prediction.

    Up to `max_frames` frames are read from the start of the video, resized
    to IMG_SIZE and passed through the InceptionV3 preprocessing. If the
    video has fewer frames, the sequence is padded with all-zero frames.

    NOTE(review): frames are used whole and in OpenCV's default BGR channel
    order. The training images come from a directory of face images, so
    confirm that full frames match what the model was trained on.

    Parameters:
    - video_path (str): Path to the input video file.
    - max_frames (int): Maximum number of frames to process from the video.

    Returns:
    - processed_frames (np.array): float32 array of shape
      (1, max_frames, height, width, 3) ready for model prediction.

    Raises:
    - ValueError: If `max_frames` is smaller than 1, or if no frame could be
      read (missing file, unsupported format, empty video). Previously this
      returned an all-zero batch that the model scored like real data.
    """
    if max_frames < 1:
        raise ValueError(f"max_frames must be at least 1, got {max_frames}")

    cap = cv2.VideoCapture(video_path)
    processed_frames = []
    try:
        while cap.isOpened() and len(processed_frames) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break

            # Resize and preprocess the frame
            resized_frame = cv2.resize(frame, IMG_SIZE)
            processed_frames.append(preprocess_input(img_to_array(resized_frame)))
    finally:
        # Release the capture even when resizing or preprocessing raises.
        cap.release()

    if not processed_frames:
        raise ValueError(f"Could not read any frames from video: {video_path!r}")

    # cv2.resize takes (width, height) and returns (height, width, channels).
    # The float32 zero batch supplies the padding for short videos.
    batch = np.zeros((1, max_frames, IMG_SIZE[1], IMG_SIZE[0], 3), dtype=np.float32)
    batch[0, :len(processed_frames)] = processed_frames
    return batch

def predict_video(model, video_path, threshold=0.5):
    """
    Predict whether the video is REAL or FAKE based on extracted frames.

    The raw model output and its mean are printed before the label is
    returned.

    Parameters:
    - model: The trained deepfake detection model; must provide `predict`.
    - video_path (str): Path to the input video file.
    - threshold (float): Decision threshold on the mean predicted score.
      Scores strictly above it are classified as FAKE. Defaults to 0.5.

    Returns:
    - prediction (str): 'REAL' or 'FAKE' based on model prediction.
    """
    # Extract frames from the video
    frames = extract_frames_from_video(video_path)

    # Score the frame sequence
    predictions = model.predict(frames)
    print(predictions)

    # Collapse the model output to a single score for the video
    avg_prediction = np.mean(predictions)
    print(avg_prediction)

    return 'FAKE' if avg_prediction > threshold else 'REAL'

In [None]:
from google.colab import drive
# Make Google Drive available under /content/drive
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Example usage: classify a single video and report the verdict
# video_path = '/kaggle/input/deepfake-detection-challenge/test_videos/sqixhnilfm.mp4'  # Replace with your video file path
video_path = '/content/drive/MyDrive/Media.mp4'
result = predict_video(model=model, video_path=video_path)
print(f'The video is predicted to be: {result}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 251ms/step
[[0.16868123]]
0.16868123
The video is predicted to be: REAL
