<a href="https://colab.research.google.com/github/Hoale2908/PDF_Convert_and_Extract/blob/main/PDF_Convert_and_Extract_(1_0).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title Set up { vertical-output: true, display-mode: "form" }

# Environment bootstrap for the notebook: imports the general-purpose
# libraries, then installs and imports the OCR / PDF tooling used by the
# later cells. Lines starting with "!" are shell commands run by the notebook.

import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker

# Install the pytesseract Python package before importing it
!pip install pytesseract --quiet
import pytesseract

# Install the Tesseract OCR packages through apt
# (presumably the engine that pytesseract drives -- confirm)
! apt install tesseract-ocr --quiet
! apt install libtesseract-dev --quiet

# Install pdf2image
!pip install pdf2image --quiet
from pdf2image import convert_from_path
# NOTE(review): poppler-utils is presumably the PDF rendering backend that
# pdf2image relies on -- confirm against the pdf2image documentation.
!apt-get install poppler-utils --quiet

# Install PyPDF2
!apt-get install python3-pypdf2 --quiet
import PyPDF2
# NOTE(review): the PdfFile* names look like the legacy PyPDF2 API; confirm
# that the PyPDF2 version installed above still exposes them.
from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter

print('\n Done! Go to the next step!')

In [None]:
# @title Upload files { vertical-output: true, display-mode: "form" }

# Folder that holds the uploaded PDFs; created on first use
data = '/content/data/'

if not os.path.isdir(data):
    os.makedirs(data)

# Work from inside the data folder
# (presumably so files.upload() stores the uploads there -- confirm)
os.chdir(data)

print('PLEASE UPLOAD YOUR FILES:')

from google.colab import files
uploaded = files.upload()

# Count every regular file currently sitting in the data folder
nfiles = sum(1 for entry in os.scandir(data) if entry.is_file())

print(f'\n TOTAL: {nfiles} FILE(S) UPLOADED.')

# Inventory of the PDFs: one name and one page count per file
file_name = []
no_pages = []

for pdf_name in os.listdir(data):
    if not pdf_name.endswith(".pdf"):
        continue
    reader = PdfFileReader(data + "/" + pdf_name)
    file_name.append(pdf_name)
    no_pages.append(len(reader.pages))

file_df = pd.DataFrame({'File name': file_name, 'No. of Pages': no_pages})

print(file_df)
print('\n Done! Go to the next step!\n')

# Empty table that the extraction step fills in, one column per field
result = pd.DataFrame()

In [None]:
# @title Remove unwanted pages & Merge { vertical-output: true, display-mode: "form" }

# Keeps one page from every uploaded PDF and merges those pages into a single
# file (when more than one file was uploaded), then records the path of the
# PDF to convert in PDF_file.
#
# Reads:  file_df, nfiles, data (set by the upload step)
# Writes: file_df['Page to keep'], PDF_file, and all_merged.pdf in the data folder


def _ask_page_to_keep(total_pages):
    """Prompt until the user enters a page number between 1 and total_pages."""
    prompt = 'For the files with ' + str(total_pages) + ' pages, which page contain the data?'
    while True:
        answer = input(prompt).strip()
        if answer.isdigit() and 1 <= int(answer) <= total_pages:
            return int(answer)
        print('Please enter a whole number between 1 and ' + str(total_pages) + '.')


# Fail early with a readable message instead of a KeyError further down
if file_df.empty:
    raise FileNotFoundError('No PDF file was found in ' + data + '. Please upload at least one PDF first.')

no_pages = file_df.groupby('No. of Pages').count()
list_no_pages = list(no_pages.index)

all_merged = 'all_merged.pdf'

# If there are multiple files, merge them into one file.
if nfiles > 1:
    keep_pages = {}
    print('The uploaded files will be merged.')

    # Ask once per distinct page count which page holds the data;
    # single-page files need no question.
    for i in list_no_pages:
        i = int(i)
        keep_pages[i] = _ask_page_to_keep(i) if i > 1 else 1

    # Record the page to keep for each file (direct lookup by page count)
    file_df['Page to keep'] = file_df['No. of Pages'].map(keep_pages)

    # Print out the summary
    print('\n A summary of your selection is as following:')
    print(file_df)

    # Merge the selected pages. Iterate over the inventory rather than the
    # folder listing so that files which appeared after the upload step
    # (notably a previous all_merged.pdf) cannot break the lookup.
    PDF_file = os.path.join(data, all_merged)
    outputfile = PdfFileMerger()

    for name, x in zip(file_df['File name'], file_df['Page to keep']):
        if name == all_merged:
            # Never feed an earlier merged output back into itself
            continue
        inputfile = PdfFileReader(os.path.join(data, name))
        x = int(x)
        outputfile.append(fileobj=inputfile, pages=(x - 1, x))

    # Write to the same absolute path that PDF_file points at, regardless of
    # the current working directory.
    with open(PDF_file, 'wb') as output:
        outputfile.write(output)
    outputfile.close()
    print('Merging completed.')

# No need to merge if there is only one file
else:
    print('Go to the next step.')
    PDF_file = os.path.join(data, file_df['File name'].iloc[0])


In [None]:
# @title Convert to image { vertical-output: true, display-mode: "form" }

# Renders every page of PDF_file to /content/data/pages/page_<n>.jpg, then
# lets the user delete the pages that should not be processed.
#
# Reads:  PDF_file (set by the merge step)
# Writes: pages, images, n_pages_to_del, and the page_<n>.jpg files

pages = '/content/data/pages/'
os.makedirs(pages, exist_ok=True)
os.chdir(pages)

images = convert_from_path(PDF_file)

# Remove page images left over from an earlier run; otherwise a shorter PDF
# would leave stale pages behind that the extraction step would still read.
for stale in os.listdir(pages):
    if stale.startswith('page_') and stale.endswith('.jpg'):
        os.remove(os.path.join(pages, stale))

for page_number, image in enumerate(images, start=1):
    image.save(os.path.join(pages, 'page_' + str(page_number) + '.jpg'), 'JPEG')

reply = input('Which pages should be removed, if any? Separating them by a comma. If not, input 0.')

# Parse the answer leniently: blank input and stray commas mean "nothing",
# and anything that is not a whole number is reported and skipped.
n_pages_to_del = []
for token in reply.split(','):
    token = token.strip()
    if not token:
        continue
    try:
        n_pages_to_del.append(int(token))
    except ValueError:
        print('Ignoring invalid page number: ' + token)

for i in n_pages_to_del:
    if i <= 0:
        # 0 (or a negative number) means "remove nothing"
        continue
    target = os.path.join(pages, 'page_' + str(i) + '.jpg')
    if os.path.exists(target):
        os.remove(target)
    else:
        # Out-of-range or repeated page numbers are not an error
        print('Page ' + str(i) + ' does not exist; nothing removed.')

print('\n Done! Go on to the next step!')

In [None]:
# @title Preview and get data location { run: "auto", vertical-output: true, form-width: "200px", display-mode: "form" }

!pip install -I pillow==7.2.0 --quiet
from PIL import Image

# @markdown This step will put a grid on your file so that you can get the coordinates of the area containing the data.

Horizontal_scale = 50 #@param ["200", "100", "50", "20", "10"] {type:"raw"}
Vertical_scale = 20 #@param ["50", "20", "10"] {type:"raw"}

# Open image file
import random
random_page = random.choice(os.listdir('/content/data/pages/'))
image = Image.open(random_page)
zoom_w = 1200/image.width
zoom_h = 1200/image.height
zoom = round(max(zoom_w, zoom_h))
new_size = (image.width*max(zoom,1),image.height*max(zoom,1))
image = image.resize(new_size, Image.NEAREST)

# Set up figure
fit = 50
fig=plt.figure(figsize=(float(image.size[0])/fit,float(image.size[1])/fit),dpi=100)
ax=fig.add_subplot(111)

# Set the gridding interval: here we use the major tick interval
loc = plticker.MultipleLocator(base=Horizontal_scale)
ax.xaxis.set_major_locator(loc)
loc_y = plticker.MultipleLocator(base=Vertical_scale)
ax.yaxis.set_major_locator(loc_y)

# Add the grid
ax.grid(which='major', axis='both', linestyle='-')

# Add the image
ax.imshow(image)

In [None]:
#@title Input the coordinates of the data

# Asks for the four borders of the data area (in pixels of the page image),
# crops the previewed page accordingly and displays the crop.
#
# Reads:  random_page (chosen by the preview step)
# Writes: left, right, top, bottom, img, img_cropped

left = input('LEFT border:')
left = int(left)

right = input('RIGHT border:')
right = int(right)

top = input('TOP border:')
top = int(top)

bottom = input('BOTTOM border:')
bottom = int(bottom)

# An empty or inverted box cannot be OCR'd; stop here with a clear message
if right <= left or bottom <= top:
    raise ValueError('RIGHT must be greater than LEFT and BOTTOM greater than TOP. Please re-run this step.')

# Open the previewed page by absolute path: this step is re-run after the
# confirmation step, when the working directory may no longer be the pages folder.
img = Image.open(os.path.join('/content/data/pages/', random_page))
img_cropped = img.crop((left, top, right, bottom))

print('\nPlease make sure the image shown contain ONLY the data to be extracted (no borders, no other characters). If not, adjust the coordinators by re-run this step.\n')

img_cropped

In [None]:
# @title Are there any spaces in data?
spaces = "Yes" # @param ["Yes", "No"]
# Data that may contain spaces is handled as text, anything else as a number
datatype = "Text" if spaces == "Yes" else "Number"

In [None]:
#@title Confirm if the data extracted is correct

# Runs OCR on the crop chosen in the coordinates step and prints the text so
# the user can check it before extracting from every page.
#
# Reads:  img_cropped, datatype
# Writes: zoom, new_size, img_resized, text, and the file /content/test.jpg
#
# The image is prepared exactly as in the extraction step (same enlargement
# rule, same JPEG round trip), so what is confirmed here is what will be
# extracted. The working directory is left alone so that the coordinates step
# can be re-run afterwards.

test_image = '/content/test.jpg'
is_number = datatype == "Number"

# Numbers are read with page segmentation mode 6; text uses Tesseract's default
ocr_config = '--psm 6' if is_number else ''

shortest_side = min(img_cropped.size)
if shortest_side == 0:
    raise ValueError('The selected area is empty. Please re-run the coordinates step.')

# Enlarge crops whose shortest side is under 10 pixels (integer factor, at least 1)
zoom = round(max(10/shortest_side, 1))
new_size = (img_cropped.width*zoom, img_cropped.height*zoom)
img_resized = img_cropped.resize(new_size, Image.NEAREST)
img_resized.save(test_image)

# Re-open the saved file for OCR and release the handle right after
with Image.open(test_image) as ocr_input:
    text = str(pytesseract.image_to_string(ocr_input, config=ocr_config))

text = text.replace("\n","")
text = text.replace("\x0c","")
if is_number:
    text = text.replace(" ","")                   # no space in a number

print('\nPlease check if the data extracted below is correct. \nIf it is incorrect, re-do the last step until you get the right data. \nIf it is correct, move on to the next step. \n')
print(text)

In [None]:
#@title Extract the data { vertical-output: true, display-mode: "form" }
# @markdown - Give a name for the data being extracted. It will be the header of the column in the output CSV file.

# @markdown - If you want to extract another type of data, re-do "Insert the coordinates" step with  the new data.

# @markdown - If you are done with extraction, run the next step to export the data to a .csv file.

# Crops the selected area out of every page image, runs OCR on it and stores
# the texts as a new column of `result`, one row per page in page order.
#
# Reads:  left, top, right, bottom, datatype, pages, result
# Writes: fieldname, templist, text, zoom, new_size, result[fieldname],
#         and the scratch file /content/test.jpg


def _page_order(name):
    """Sort key that orders 'page_<n>.jpg' by n; other names come last, alphabetically."""
    digits = os.path.splitext(name)[0].rpartition('_')[2]
    if digits.isdigit():
        return (0, int(digits), name)
    return (1, 0, name)


fieldname = input("Insert the name of the data, then hit Enter:")

# An empty box would otherwise fail with a ZeroDivisionError inside the loop
if right <= left or bottom <= top:
    raise ValueError('The selected area is empty. Please re-run the coordinates step.')

templist = []
pages_dir = "/content/data/pages/"

os.chdir(pages)

is_number = datatype != "Text"
# Numbers are read with page segmentation mode 6; text uses Tesseract's default
ocr_config = '--psm 6' if is_number else ''

# The folder listing comes in arbitrary order; sort numerically so that row k
# of the result is page k (page_2 before page_10).
page_files = sorted(
    (name for name in os.listdir(pages_dir) if name.endswith('.jpg')),
    key=_page_order,
)

for file in page_files:
    with Image.open(os.path.join(pages_dir, file)) as page_img:
        page_crop = page_img.crop((left, top, right, bottom))
        # Enlarge crops whose shortest side is under 10 pixels (integer factor, at least 1)
        zoom = round(max(10/min(page_crop.size), 1))
        new_size = (page_crop.width*zoom, page_crop.height*zoom)
        page_crop.resize(new_size, Image.NEAREST).save('/content/test.jpg')

    # OCR the saved scratch file and release the handle right after
    with Image.open('/content/test.jpg') as ocr_input:
        text = pytesseract.image_to_string(ocr_input, config=ocr_config)

    text = text.replace("\n","")
    text = text.replace("\x0c","")
    if is_number:
        text = text.replace(" ","")               # no space in a number
    templist.append(text)

result[fieldname] = templist
result


In [None]:
#@title Export to CSV { vertical-output: true, display-mode: "form" }

# Save the collected table as a CSV file, then ask Colab to download it

from pathlib import Path
result_path = Path('/content') / 'result.csv'
result.to_csv(result_path)
files.download(str(result_path))

In [None]:
#@title Restarter { vertical-output: true, display-mode: "form" }
#@markdown Run this step if you want to re-upload the files, or delete all uploaded files.

# Deletes every .jpg, .pdf and .csv file produced or uploaded by the notebook
# and discards the collected result table, so that a new run starts clean.
import sys
import os

# Same value (with trailing slash) as the upload step uses
data = '/content/data/'

# The pages folder is included on purpose: page images left behind there would
# be picked up by the next extraction.
folders_to_clean = ('/content', '/content/data', '/content/data/pages')
removable = ('.jpg', '.pdf', '.csv')

for folder in folders_to_clean:
    if not os.path.isdir(folder):
        # Nothing to clean if the folder was never created
        continue
    for file in os.listdir(folder):
        target = os.path.join(folder, file)
        if file.endswith(removable) and os.path.isfile(target):
            os.remove(target)

# Discard the result table; it may already be gone if this step ran before
try:
    del result
except NameError:
    pass

print('OK! Go back to step 2 to reupload your files, or exit the program.')
