# Extracting video into Pdf
The purpose of this notebook is to read a video showing a book in PDF form, separate its pages into images, and finally join them into a PDF. The video used in this example is an e-book, so recognizing the different pages is relatively easy.

Import modules needed

In [1]:
import cv2
import matplotlib.pyplot as plt
import numpy as np

## Functions

Function to show a specific image

In [2]:
def show_image(image, color=None):
    """Display ``image`` in a large matplotlib figure with the gray colormap.

    Args:
        image: Image data accepted by ``Axes.imshow``.
        color: Accepted for compatibility; not used by this function.
    """
    # One 22x20 figure holding a single subplot that fills it.
    axes = plt.figure(figsize=(22, 20)).add_subplot(1, 1, 1)
    axes.imshow(image, cmap='gray')
    plt.show()

The next function reads the video, which is stored in MP4 format, separates it into frames, and saves each frame as an image.

In [3]:
def video_to_frames_of(video, in_directory):
    """Split a video into frames and save each frame as a JPEG image.

    Frames are written to ``{in_directory}/frame{N}.jpg``, where ``N`` starts
    at 0 and increases by one for every frame read.

    Args:
        video: Source passed to ``cv2.VideoCapture`` (e.g. the path of an
            mp4 file).
        in_directory: Directory that receives the frame images.
            NOTE(review): the directory is not created here; presumably it
            must already exist - confirm against the caller.
    """
    vidcap = cv2.VideoCapture(video)
    try:
        count = 0
        success, image = vidcap.read()
        while success:
            cv2.imwrite(f"{in_directory}/frame{count}.jpg", image)  # save frame as JPEG file
            count += 1
            success, image = vidcap.read()
    finally:
        # Always free the capture handle, even if reading or writing fails.
        vidcap.release()

This function shows the video for a human to see

In [4]:
def watch_video_in(directory):
    """Play a video in an OpenCV window until it ends or 'q' is pressed.

    Prints the frame count reported by OpenCV before playback starts. If the
    video cannot be opened, an error message is printed and the function
    returns without starting playback.

    Args:
        directory: Source passed to ``cv2.VideoCapture``; despite the name,
            this is the video to play (e.g. the path of the video file).
    """
    cap = cv2.VideoCapture(directory)
    if not cap.isOpened():
        print('Error: File not found or Wrong')
        cap.release()
        return

    print(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print("Press 'q' for exiting the program")
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:  # no more frames to show
                break

            cv2.imshow('frame', frame)

            # Poll the keyboard for 10 ms between frames; 'q' stops playback.
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
    finally:
        # Release the capture and close the window even if playback is
        # interrupted (e.g. by stopping the notebook cell).
        cap.release()
        cv2.destroyAllWindows()

This function reads all the frames stored in the previous folder and returns an array with all the images stored.

In [39]:
import glob
from functools import cmp_to_key

def _frame_number(path):
    """Return the integer embedded in the file name of ``path``.

    Only the file name itself is inspected (directories and the extension are
    ignored), so digits or dots elsewhere in the path cannot corrupt the
    result. When the name holds several digit runs, the last one is used.

    Raises:
        ValueError: If the file name contains no digits.
    """
    import os
    import re

    stem = os.path.splitext(os.path.basename(path))[0]
    digit_runs = re.findall(r"\d+", stem)
    if not digit_runs:
        raise ValueError(f"no frame number found in file name: {path!r}")
    return int(digit_runs[-1])


def custom_sort_function(fileA, fileB):
    """Compare two file paths by the number embedded in their file names.

    Intended for use with ``functools.cmp_to_key`` so that, for example,
    ``frame2.jpg`` sorts before ``frame10.jpg``.

    Args:
        fileA: First file path.
        fileB: Second file path.

    Returns:
        -1 if ``fileA``'s number is smaller, 1 if it is larger, and 0 when
        both numbers are equal.

    Raises:
        ValueError: If either file name contains no digits.
    """
    num_file_a = _frame_number(fileA)
    num_file_b = _frame_number(fileB)

    # Classic three-way comparison: yields -1, 0 or 1.
    return (num_file_a > num_file_b) - (num_file_a < num_file_b)

def load_images_from(folder):
    """Read every ``.jpg`` file of ``folder`` as a grayscale image.

    The matching files are ordered with ``custom_sort_function`` before being
    read, and the images are returned as a list in that order.
    """
    pattern = f"{folder}/*.jpg"
    ordered_paths = sorted(glob.glob(pattern), key=cmp_to_key(custom_sort_function))

    # Flag 0 asks OpenCV to read each file in gray scale.
    return [cv2.imread(path, 0) for path in ordered_paths]
    

Next, given an array of images, save those images into a folder for analysis

In [28]:
import os
import shutil

def save(images, in_directory):
    """Write ``images`` as ``img0.jpg``, ``img1.jpg``, ... into a fresh directory.

    WARNING: if ``in_directory`` already exists it is deleted, together with
    everything inside it, before being recreated.

    Args:
        images: Sequence of images accepted by ``cv2.imwrite``.
        in_directory: Directory (relative or absolute) that receives the
            image files.
    """
    dirname = in_directory

    # Remove and recreate folder
    if os.path.exists(dirname):
        shutil.rmtree(dirname)
    os.makedirs(dirname)

    for num, image in enumerate(images):
        # Build the path from the same directory that was just created, so
        # absolute paths work as well as relative ones.
        cv2.imwrite(os.path.join(dirname, f"img{num}.jpg"), image)

One way of comparing images is the mean squared error (MSE) computed over every pixel. A result close to zero should indicate that two images are similar, but this is not always the case. We will keep this function and test it afterwards. Here we implement it manually, although scikit-image also provides an implementation of the algorithm (`skimage.metrics.mean_squared_error`).

In [7]:
def mse(imageA, imageB):
    """Return the mean squared error between two images.

    Both inputs are converted to ``float64`` before subtracting. Images read
    by OpenCV are unsigned 8-bit arrays, and subtracting or squaring those
    directly wraps around instead of producing the true difference.

    Args:
        imageA: First image (array-like).
        imageB: Second image (array-like), broadcastable against ``imageA``.

    Returns:
        The sum of squared pixel differences divided by the number of
        elements in ``imageA``; 0 means the images are identical.
    """
    pixels_a = np.asarray(imageA, dtype=np.float64)
    pixels_b = np.asarray(imageB, dtype=np.float64)

    squared_error = (pixels_a - pixels_b) ** 2
    return np.sum(squared_error) / pixels_a.size

The Structural Similarity Index, developed by Wang et al, focuses on comparing sub-samples of the image to compare the structure of each image. 

Parameters
- (x, y) location of the NXN window
- mean of the pixel intensity in xy direction
- variance of intensities in the xy direction
- covariance

Use the scikit-image implementation

In [12]:
from skimage.metrics import structural_similarity as ssim

In [13]:
def using_ssmi_compare(images, threshold=0.9):
    """Keep only the images that differ structurally from their predecessor.

    Each image is compared with the image immediately before it in ``images``
    using the structural similarity index (``ssim``). The first image is
    always kept; every later image is kept when its similarity to the
    previous one is below ``threshold``.

    Args:
        images: Sequence of images in playback order.
        threshold: Similarity score below which two consecutive images are
            considered different. Defaults to 0.9.

    Returns:
        A list with the kept images, in their original order. An empty input
        yields an empty list.
    """
    if len(images) == 0:
        return []

    unique_imgs = [images[0]]  # the first image has nothing to be compared with
    for prev_image, image in zip(images, images[1:]):
        if ssim(image, prev_image) < threshold:
            unique_imgs.append(image)

    return unique_imgs

## Implementation

Load the images from the folder `frames`, which contains all the frames from the video book.mp4

In [9]:
# Load every saved frame from the 'frames' folder as a grayscale image,
# without any filtering yet.
images = load_images_from('frames')

In [14]:
# Keep only the frames whose structural similarity to the previous frame
# falls below the threshold used by using_ssmi_compare.
unique_imgs = using_ssmi_compare(images)

In [16]:
# Save the filtered images into the 'frames_ssmi' folder.
# Note: save() deletes and recreates that folder on every run.
save(unique_imgs, in_directory='frames_ssmi')

## Further clean the images
Now let's further clean up the images in the `frames_ssmi` folder. At this point the folder contains the unique images.

In [31]:
def separate_images_using_strict_borders_of(
    folder_images, row=10, left_col=11, right_col=902, border_range=(120, 180)
):
    """Keep the images that show a border pixel on both the left and right side.

    An image qualifies when the pixels at ``(row, left_col)`` and
    ``(row, right_col)`` both have an intensity inside ``border_range``
    (lower bound inclusive, upper bound exclusive) and the image is not
    identical to the last image already kept. The first input image is
    always kept, whether or not it has borders.

    Args:
        folder_images: Sequence of 2-D grayscale images; each must be large
            enough to contain the probed pixel positions.
        row: Row of the probed pixels.
        left_col: Column of the left border pixel.
        right_col: Column of the right border pixel.
        border_range: ``(low, high)`` intensity interval accepted as border.

    Returns:
        A list with the kept images in their original order. An empty input
        yields an empty list.
    """
    if len(folder_images) == 0:
        return []

    low, high = border_range
    images = [folder_images[0]]

    # The first image is already kept, so scanning starts at the second one.
    for img in folder_images[1:]:
        have_left_border = low <= img[row, left_col] < high
        have_right_border = low <= img[row, right_col] < high
        if not (have_left_border and have_right_border):
            continue

        # Skip exact repeats of the most recently kept image.
        if not np.array_equal(images[-1], img):
            images.append(img)
    return images

In [32]:
# Load the images produced by the SSIM filtering step from the
# 'frames_ssmi' folder.
ssmi_images = load_images_from('frames_ssmi')

In [None]:
# Keep only the images that pass the left/right border pixel check and are
# not exact repeats of the previously kept image.
uniq_images = separate_images_using_strict_borders_of(ssmi_images)

In [35]:
# Save the border-filtered images into the 'strict_borders' folder.
# Note: save() deletes and recreates that folder on every run.
save(uniq_images, in_directory='strict_borders')

## Create PDF
Now we will take the images that were filtered and join them all into a single PDF that will contain the whole book.

In [40]:
def load_files_directory(folder):
    """Return the paths of all ``.jpg`` files in ``folder``, in numeric order.

    The paths come from ``glob`` and are ordered with
    ``custom_sort_function``; the files themselves are not opened.
    """
    jpg_paths = glob.glob(f"{folder}/*.jpg")
    jpg_paths.sort(key=cmp_to_key(custom_sort_function))
    return jpg_paths

Obtain all the images of the book into a single array

In [43]:
from PIL import Image

# Open every image of the 'strict_borders' folder with PIL, in the order
# returned by load_files_directory.
images = [
    Image.open(file)
    for file in load_files_directory('strict_borders')
]
# Put the cover at the start of the list so it becomes the first page.
cover = Image.open('./cover/cover.jpeg')
images.insert(0, cover)

In [44]:
pdf_path = "./book.pdf" # path to the output PDF

# Save all the images into one PDF: the first image starts the document and
# the remaining ones are appended as additional pages.
images[0].save(
    pdf_path, "PDF" ,resolution=100.0, save_all=True, append_images=images[1:]
)