## **Import required libraries**

In [1]:
# Import libraries 
import numpy as np  # Linear algebra
import os    # Library for interacting with operating system
import cv2   # Computer vision library

## **Define functions**
* **standard_bbox_values()**
  * **inputs:** bounding box path, the width and height of image
  * **output:** standard values for the entries of bounding box (bbox[0], bbox[1], bbox[2], bbox[3])
 
 
* **get_padding_bbox_elements()**
  * **inputs:** bounding box entries: x1, y1, w1, h1, the width and height of image: real_w, real_h, ratio_bbox_and_image
  * **output:** padded bounding box entries

In [3]:
#This function takes the elements of bounding box and returns their standard values based on README file provided with the dataset
def standard_bbox_values(bbox_path, real_w, real_h, ref_size=224):
    """Read a bounding box text file and rescale its values to the real image size.

    Only the first line of the file is used. It is split on whitespace and its
    first four fields are parsed as integers (x, y, w, h). The x and w values
    are multiplied by ``real_w / ref_size`` and the y and h values by
    ``real_h / ref_size``; each result is truncated to an int.

    Args:
        bbox_path: Path of the bounding box ``.txt`` file.
        real_w: Width of the corresponding image, in pixels.
        real_h: Height of the corresponding image, in pixels.
        ref_size: Side length of the reference grid the stored values are
            expressed in. Defaults to 224, the value previously hard-coded here.

    Returns:
        Tuple ``(x1, y1, w1, h1)`` of ints scaled to the real image.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If the first line has fewer than four fields (or the file is empty).
        ValueError: If one of the first four fields is not an integer literal.
    """
    # Use a context manager so the file handle is closed deterministically;
    # this function is called once per image, i.e. tens of thousands of times.
    with open(bbox_path, "r") as bbox_file:
        bbox = bbox_file.readline().split()   # only the first line is needed

    scale_w = float(real_w) / ref_size
    scale_h = float(real_h) / ref_size

    x1 = int(int(bbox[0]) * scale_w)  # bbox[0]
    y1 = int(int(bbox[1]) * scale_h)  # bbox[1]
    w1 = int(int(bbox[2]) * scale_w)  # bbox[2]
    h1 = int(int(bbox[3]) * scale_h)  # bbox[3]

    return x1, y1, w1, h1

#This function adds padding to the bounding box while ensuring it is inside the image
def get_padding_bbox_elements(x1, y1, w1, h1, real_w, real_h, ratio_bbox_and_image):
    """Return the bounding box entries after padding and clamping.

    The horizontal margin is ``int(w1 * (1 + ratio_bbox_and_image))`` and the
    vertical margin is ``int(h1 * (1 + ratio_bbox_and_image))``. The origin is
    moved left/up by the margin (never below 0) and the width/height are grown
    by the margin (never above ``real_w`` / ``real_h``).

    Returns:
        Tuple ``(x1_padding, y1_padding, w1_padding, h1_padding)``.
    """
    growth = 1 + ratio_bbox_and_image
    margin_w = int(w1 * growth)
    margin_h = int(h1 * growth)

    # Shift the top-left corner, keeping it at or after the image origin.
    left = max(0, x1 - margin_w)
    top = max(0, y1 - margin_h)

    # Enlarge the extent, capping it at the image dimensions.
    width = min(real_w, w1 + margin_w)
    height = min(real_h, h1 + margin_h)

    return left, top, width, height

## **Extract the image samples - Part 1:**
* Get the standard values of the face bounding box provided along with each image
* Add padding to the bounding box
* Crop the original image by the bounding box, resize it, and store it 
* Assign label '1' to *live* image and label '0' to *spoof* image and store it

In [4]:
import gc                         # garbage collector

img_face = []                    # extracted face images (each resized to 224x224)
img_label = []                   # labels (spoof:0, live:1), index-aligned with img_face
folders = []                     # every entry of train_path visited by this run
count_live = 0                   # live counter
count_spoof = 0                  # spoof counter
count_limit_live = 2             # number of live samples to extract
count_limit_spoof = 2            # number of spoof samples to extract

train_path = '/kaggle/input/celeba-spoof-for-face-antispoofing/CelebA_Spoof_/CelebA_Spoof/Data/train'
for folder in os.listdir(train_path):       # names of all the folders present in train_path (folders: 1, 10, 1000, ...)
    folders.append(folder)                  # remember the folder so a later run can skip it
    gc.collect()                            # release unreferenced memory

    d = os.path.join(train_path, folder)    # concatenate train_path and folder
    if os.path.isdir(d):
        for e in os.listdir(d):
            imgs_path = os.path.join(d, e)  # live or spoof folder path
            # Only 'live' and 'spoof' directories are labelled below; anything else
            # would add an image without a label and misalign img_face / img_label.
            if e not in ('live', 'spoof') or not os.path.isdir(imgs_path):
                continue

            for img_path in os.listdir(imgs_path):
                # Stop as soon as this class has reached its quota, *before* reading
                # and cropping any further image of that class.
                if (e == 'live' and count_live >= count_limit_live) or (e == 'spoof' and count_spoof >= count_limit_spoof):
                    break
                if not img_path.endswith('.jpg'):   # skip the bounding box txt files
                    continue

                full_img_path = os.path.join(imgs_path, img_path)
                bound_box_path = full_img_path[0:-4] + '_BB.txt'    # bbox path and img path differ only in their suffix
                img = cv2.imread(full_img_path)                     # read the image
                if img is None:                                     # unreadable / corrupt image: nothing to crop
                    continue
                real_h, real_w = img.shape[:2]                      # image height and width

                x1, y1, w1, h1 = standard_bbox_values(bound_box_path, real_w, real_h)   # bbox entries scaled to the real image size
                ratio_bbox_and_image = (w1 * h1) / (real_w * real_h)                    # the ratio of bbox area to img area
                x1_padding, y1_padding, w1_padding, h1_padding = get_padding_bbox_elements(
                    x1, y1, w1, h1, real_w, real_h, ratio_bbox_and_image)               # add padding to the bounding box
                cropped_img = img[y1_padding:y1+h1_padding, x1_padding:x1+w1_padding]   # crop the original image by the padded bounding box

                try:
                    resized_cropped_img = cv2.resize(cropped_img, (224, 224), interpolation = cv2.INTER_AREA)   # resize the cropped face image to (224,224)
                except cv2.error:
                    # An empty crop (degenerate bounding box) cannot be resized: skip the sample.
                    # Catching only cv2.error keeps a kernel interrupt and genuine bugs visible.
                    continue

                img_face.append(resized_cropped_img)
                if e == 'live':
                    count_live = count_live + 1
                    img_label.append(1)                             # assign label '1' to live image
                else:
                    count_spoof = count_spoof + 1
                    img_label.append(0)                             # assign label '0' to spoof image

            if count_live >= count_limit_live and count_spoof >= count_limit_spoof:
                break
    if count_live >= count_limit_live and count_spoof >= count_limit_spoof:
        print("DONE Extracting ")
        break


## **Save images and labels into one single NPZ file**

In [11]:
# Turn the collected crops and their labels into numpy arrays.
X, y = np.asarray(img_face), np.asarray(img_label)

# Both arrays go into one archive; they are passed positionally, so numpy
# stores them under its default names ('arr_0' for X, 'arr_1' for y).
np.savez('anti_spoofing_data.npz', X, y)
print("DONE SAVING NPZ FILE")

## **Download the NPZ file**

In [6]:
from IPython.display import FileLink
# Last expression of the cell: the notebook displays it as a link to the saved archive.
FileLink(r'anti_spoofing_data.npz')

In [7]:
# Persist the names of the folders visited in part 1 so that a later run can skip them.
# The list is passed positionally, so it is stored under numpy's default key 'arr_0'.
np.savez('folders.npz',folders)
from IPython.display import FileLink
# Last expression of the cell: the notebook displays it as a link to the saved file.
FileLink(r'folders.npz')

## **Extract the image samples - Part 2**
Due to the limitations of Kaggle resources, I extracted 80k images in two parts: 40k in part 1 and 40k in part 2. The code below is therefore the same as above, except that it skips the folders already processed in part 1.

In [10]:
# np.load on an .npz file returns an archive object keyed by array name, so a
# membership test on it checks archive keys (e.g. 'arr_0'), not the stored folder
# names. Extract the array that np.savez stored positionally (default key 'arr_0')
# and keep the names in a set of plain strings for correct, fast membership tests.
with np.load('../input/foldernamepart1/folders.npz') as folders_archive:
    folders = set(folders_archive['arr_0'].tolist())   # folder names extracted in part 1

In [20]:
import gc

img_face = []     # Extracted face images (each resized to 224x224)
img_label = []    # Labels (spoof:0, live:1), index-aligned with img_face
folders_part2 = []               # every entry of train_path visited by this run
count_live = 0                   # live counter
count_spoof = 0                  # spoof counter
count_limit_live = 20000         # Number of live samples to extract
count_limit_spoof = 20000        # Number of spoof samples to extract

train_path = '/kaggle/input/celeba-spoof-for-face-antispoofing/CelebA_Spoof_/CelebA_Spoof/Data/train'
for folder in os.listdir(train_path):   # names of all the folders present in train_path (folders: 1, 10, 1000, ...)
    # NOTE: this test only skips part-1 folders if `folders` is a collection of
    # folder-name strings; verify what the loading cell assigns to `folders`.
    if folder in folders:               # this folder has already been extracted in part 1
        continue
    folders_part2.append(folder)        # remember the folder visited in this run
    gc.collect()                        # release unreferenced memory

    d = os.path.join(train_path, folder)                                  # concatenate train_path and folder
    if os.path.isdir(d):
        for e in os.listdir(d):
            imgs_path = os.path.join(d, e)                                # live or spoof folder path
            # Only 'live' and 'spoof' directories are labelled below; anything else
            # would add an image without a label and misalign img_face / img_label.
            if e not in ('live', 'spoof') or not os.path.isdir(imgs_path):
                continue

            for img_path in os.listdir(imgs_path):
                # Stop as soon as this class has reached its quota, *before* reading
                # and cropping any further image of that class.
                if (e == 'live' and count_live >= count_limit_live) or (e == 'spoof' and count_spoof >= count_limit_spoof):
                    break
                if not img_path.endswith(".jpg"):                         # skip the bounding box txt files
                    continue

                full_img_path = os.path.join(imgs_path, img_path)
                bound_box_path = full_img_path[0:-4] + '_BB.txt'          # bbox path and img path differ only in their suffix
                img = cv2.imread(full_img_path)                           # read the image
                if img is None:                                           # unreadable / corrupt image: nothing to crop
                    continue
                real_h, real_w = img.shape[:2]                            # image height and width

                x1, y1, w1, h1 = standard_bbox_values(bound_box_path, real_w, real_h)   # bbox entries scaled to the real image size
                ratio_bbox_and_image = (w1 * h1) / (real_w * real_h)                    # the ratio of bbox area to img area
                x1_padding, y1_padding, w1_padding, h1_padding = get_padding_bbox_elements(
                    x1, y1, w1, h1, real_w, real_h, ratio_bbox_and_image)               # add padding to the bounding box
                cropped_img = img[y1_padding:y1+h1_padding, x1_padding:x1+w1_padding]   # crop the original image by the padded bounding box

                try:
                    resized_cropped_img = cv2.resize(cropped_img, (224, 224), interpolation = cv2.INTER_AREA)   # resize the cropped face image to (224,224)
                except cv2.error:
                    # An empty crop (degenerate bounding box) cannot be resized: skip the sample.
                    # Catching only cv2.error keeps a kernel interrupt and genuine bugs visible.
                    continue

                img_face.append(resized_cropped_img)
                if e == 'live':
                    count_live = count_live + 1
                    img_label.append(1)                                   # assign label '1' to live image
                else:
                    count_spoof = count_spoof + 1
                    img_label.append(0)                                   # assign label '0' to spoof image

            if count_live >= count_limit_live and count_spoof >= count_limit_spoof:
                break
    if count_live >= count_limit_live and count_spoof >= count_limit_spoof:
        print("DONE Extracting ")
        break


In [None]:
# Turn the collected crops and their labels into numpy arrays.
X, y = np.asarray(img_face), np.asarray(img_label)

# Both arrays go into one archive; they are passed positionally, so numpy
# stores them under its default names ('arr_0' for X, 'arr_1' for y).
np.savez('anti_spoofing_data_part2.npz', X, y)
print("DONE SAVING NPZ FILE")

In [None]:
from IPython.display import FileLink
# Last expression of the cell: the notebook displays it as a link to the saved archive.
FileLink(r'anti_spoofing_data_part2.npz')

In [None]:
# Persist the names of the folders visited in part 2.
# The list is passed positionally, so it is stored under numpy's default key 'arr_0'.
np.savez('folders_part2.npz',folders_part2)
from IPython.display import FileLink
# Last expression of the cell: the notebook displays it as a link to the saved file.
FileLink(r'folders_part2.npz')