# PDF->Page->Text

This notebook processes PDFs to TXT by splitting the page and performing OCR for each page of the PDF.

## Outline of the Workflow
* Mount the notebook to the Google Colab
* Import Libraries and packages
* Format params, search, and get files from Google Drive
* Convert pdf pages to jpeg files
* Convert jpeg files to txt (string); save them in output_file

In [1]:
### Modify this cell only
top = '1GalyWQAGD03OivOT6gMpRskYxEbXpkQi' 
input_dir = '/content/drive/My Drive/AWCA/Colab_notebooks/OCR/OCR_PDF2Page/k9_input/'  ##must start and end with a "/"
output_dir = '/content/drive/My Drive/AWCA/Colab_notebooks/OCR/OCR_PDF2Page/k9_input_results/' ##must start and end with a "/"

In [2]:
from google.colab import auth
auth.authenticate_user()

from googleapiclient.discovery import build
service = build('drive', 'v3')

In [3]:
# Run this to mount the Notebook in your Google Drive account 
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


One-time installations

In [5]:
!pip3 install pdf2image
!apt-get install poppler-utils 
!pip3 install pytesseract
!pip3 install pdf2image
!sudo apt-get install tesseract-ocr
!pip install PyPDF2

Reading package lists... Done
Building dependency tree       
Reading state information... Done
poppler-utils is already the newest version (0.62.0-2ubuntu2.12).
0 upgraded, 0 newly installed, 0 to remove and 10 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
tesseract-ocr is already the newest version (4.00~git2288-10f4998a-2).
0 upgraded, 0 newly installed, 0 to remove and 10 not upgraded.


Imports

In [6]:
import os
import json
import csv
import sys
from apiclient.discovery import build  # pip install google-api-python-client
from pdf2image import convert_from_path # the module can convert PDF to a PIL Image object
from PIL import Image
import pytesseract #OCR tool; recognizes the text embedded in images
import sys 
from PyPDF2 import PdfFileReader, PdfFileWriter #the classes can read PDFs, split PDFs, and extract data
from collections import Counter # track the number of occurence of a value
import pandas as pd

Logging setup

In [7]:
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(threadName)-10s %(message)s',)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

### Formatting and searching the files

In [9]:
def iterfiles(name=None, is_folder=None, parent=None, order_by='folder,name,createdTime'):  
    q = []
    if name is not None:
        q.append("name = '%s'" % name.replace("'", "\\'"))
    if is_folder is not None:
        q.append("mimeType %s '%s'" % ('=' if is_folder else '!=', FOLDER))
    if parent is not None:
        q.append("'%s' in parents" % parent.replace("'", "\\'"))
    fields = ['id', 'title', 'mimeType']
    params = {'pageToken': None, 'orderBy': order_by,
              'fields': 'kind, nextPageToken, incompleteSearch, files(id, name, mimeType, md5Checksum, webViewLink, createdTime, modifiedTime, size)'}
    if q:
        params['q'] = ' and '.join(q)
    while True:
        logger.debug('params {}'.format(params))
        response = service.files().list(**params).execute() # searches the files which match **params
        logger.debug('response {}'.format(response))        
        for f in response['files']:
            yield f
        try:
            params['pageToken'] = response['nextPageToken'] # changes the page token of the file to next page token
        except KeyError:
            return

### Search all the sub-directories and files inside the initial directory

Uses `iterfiles()`

In [10]:
def walk(top='root', by_name=False, snapshot_file='stack_snapshot.json'):
    print("walk function is called")
  # Search all the sub-directories and files inside the initial directory
    try:
        with open(snapshot_file) as f:
            stack = json.loads(f.read()) #accepts a string, read the json file
    except FileNotFoundError:
        logger.info('no stack snapshot found')
        stack = None
    if not stack:
      # if reading the json file was unsuccessful, check the reasons
        if by_name:
            top, = iterfiles(name=top, is_folder=True) # top as name
        else:
            top = service.files().get(fileId=top).execute() #top as fileId
            if top['mimeType'] != FOLDER:
                raise ValueError('not a folder: %r' % top)
        stack = [([top['name']], top)]
    while stack:
      # if stack is not empty, noting down its information
        logger.info('stack size {}'.format(len(stack)))
        with open(snapshot_file, 'w') as file:
            file.write(json.dumps(stack))
        path, top = stack.pop()
        logger.info('dir {}'.format(path))
        dirs, files = is_file = [], []
        for f in iterfiles(parent=top['id']):
            is_file[f['mimeType'] != FOLDER].append(f)
        logger.info('subdirs {} files {}'.format(len(dirs), len(files)))
        yield path, top, dirs, files
        if dirs:
            logger.debug('dirs {}'.format(dirs))
            logger.debug('path {}'.format(path))            
            newstuff = [(path + [d['name']], d) for d in reversed(dirs)]
            logger.debug('extend: {}'.format(newstuff))
            stack.extend(newstuff)

### Gives information about a file, including its ID, md5, size, create/modify time. 

In [11]:
def extract_metadata(node):
    md5Checksum = node.get('md5Checksum', 'None')
    size = node.get('size', 'None')
    mimeType = node.get('mimeType', 'None')
    webViewLink = node.get('webViewLink', 'None')
    id = node.get('id', 'None')
    createdTime = node.get('createdTime', 'None')
    modifiedTime = node.get('modifiedTime', 'None')
    name = node.get('name', 'None')
    return id, md5Checksum, size, mimeType, createdTime, modifiedTime

### OCR on a Directory

<br/>

#### *Main Idea*: 
Takes in a directory, creates a folder of pages for each PDF/document, and uses OCR to convert all pages to computer text (txt).

<br/>

#### *Imports*: 
* PdfFileReader / PdfFileWriter: Useful for reading and saving PDFs.
* Counter: Counts the number of each element and stores the each unique as a key with the frequency as dictionary values. Subclass of dict. Used to count frequency of words in a PDF.

<br/>

#### *Outer loop*: Reads in all pdfs from the input directory from `walk()`. Splits a pdf into individual pages which will each be run into the inner for loop. Gets specific information about a file using `extract_metadata` and saves that in a CSV. Creates folder for output if one does not already exist.
* convert_from_path: takes in a PDF and outputs an image for each page in the PDF
* img_count: index for inner for loop. Tracks page number for a pdf.

<br/>

#### *Inner `page` loop*: For each page, sets Flag to False. Runs OCR with `pytesseract.[FILENAME]` on the default orientation and all 90-degree rotations: 0°, 90°, 180°, 270°. If any orientation can be read, Flag is set to True and the text of the page is written to output. If none of the rotations generates a proper output, the page is not used.

<br/>



In [19]:
def big_loop(top, csvwriter, maxfiles):   
  fc = 0
  if os.path.isdir(output_dir) == False:
    os.mkdir(output_dir)
  for path, root, dirs, files in walk(top=top, by_name=False, snapshot_file=snapshot_file):
      # note 'by_name' is currently not used
      dirpath = '/'.join(path)
      id, _, _, mimeType, createdTime, modifiedTime = extract_metadata(root)
      # for directories we use the md5Checksum field to hold the dir count
      md5Checksum = len(dirs) 
      # for directories we use the size field to hold the plain file count
      size = len(files)
      row = [id, md5Checksum, size, mimeType, createdTime, modifiedTime, dirpath]
      csvwriter.writerow(row)
      # we count each dir as a file for the purpose of maxfiles
      fc = fc + 1
      for pdf_index in range(len(files)):
        #Here we conduct page split and put it in a folder
        folder_name = str(id)+'/' ## This is how google id of the pdf gets to be the folder's name of that pdf
        folder_name_failed = str(id)+'/failed_pages.txt'  ## failed paged are specified in terms of filename and pages
        pdf_numbers = len(files)
        pages = convert_from_path(input_dir[:-1]+'/'+files[pdf_index]['name'])
        if os.path.isdir(output_dir+folder_name) == False:
          print("New folder is created")
          os.mkdir(output_dir+folder_name)
        csv_id = []
        csv_text = []
        output_error_file = output_dir +folder_name+ 'failed_pages.txt' #Page the are failed in ocr process specified to pdf name and page number
        for page_idx, page in enumerate(pages):
          img_count = page_idx + 1
          print("image idx: ", img_count)
          save_name = output_dir+folder_name+"page_"+str(img_count)+".jpg"
          page.save(save_name, 'JPEG')
          ##Save the images and csv. 
          flag = True ##Currently disabling the oritation checker. Replace with a better scheme
          # for i in range(4):
          #   text = str(((pytesseract.image_to_string(Image.open(save_name)))))
          #   cnt = Counter()
          #   cnt.update(text.split())
          #   # print(cnt['the'],cnt['de'],cnt['bu'],cnt['im'],cnt['che'],cnt['del'],cnt['e'],cnt['la'])
          #   if ((cnt['the'] < 3) == True and (cnt['de'] < 3) == True and (cnt['bu'] < 3) == True and (cnt['im'] < 3) == True and ((cnt['e'] + cnt['che'] + cnt['del'] + cnt['la'] < 5) == True)):
          #     page2 = page.rotate(90*i)
          #     page2.save(save_name, 'JPEG')
          #   else:
          #     flag = True
          #     break
          if flag == False:
            f_error = open(output_error_file, "a")
            message = "Page " + str(img_count) + " failed to find the correct orientation.\n"
            f_error.write(message)
            f_error.close() 
          else:
            # converts images to strings
            text = str(((pytesseract.image_to_string(Image.open(save_name)))))
            csv_text.append(text)
            csv_id.append(img_count)
      
        df = pd.DataFrame({'page id':csv_id, 'page text':csv_text})
        print("csv saved at " + output_dir+folder_name+"page.csv")
        df.to_csv(output_dir+folder_name+"page.csv")


### Sets up output CSV file and calls `big_loop()`

In [20]:
def catalog_walk(top):    ## pass in folder link
  snapshot_file = '/snapshot2021'
  csvoptions = {'dialect': csv.excel}
 
  OUTFILE = open("/content/drive/My Drive/catalog_not_in_use.csv", "a")
  csvwriter = csv.writer(OUTFILE)
  maxfiles = 100
  big_loop(top, csvwriter, maxfiles)
  OUTFILE.close()

### Specifies input and output directories and calls `catalog_walk()`

In [21]:
snapshot_file = '/snapshot2021.txt'
FOLDER = 'application/vnd.google-apps.folder'
catalog_walk(top)

2021-02-19 06:48:38,384 INFO MainThread stack size 1
2021-02-19 06:48:38,386 INFO MainThread dir ['k9_input']
2021-02-19 06:48:38,393 INFO MainThread URL being requested: GET https://www.googleapis.com/drive/v3/files?orderBy=folder%2Cname%2CcreatedTime&fields=kind%2C+nextPageToken%2C+incompleteSearch%2C+files%28id%2C+name%2C+mimeType%2C+md5Checksum%2C+webViewLink%2C+createdTime%2C+modifiedTime%2C+size%29&q=%271GalyWQAGD03OivOT6gMpRskYxEbXpkQi%27+in+parents&alt=json


walk function is called


2021-02-19 06:48:38,828 INFO MainThread subdirs 0 files 2


New folder is created
image idx:  1
image idx:  2
image idx:  3
/content/drive/My Drive/AWCA/Colab_notebooks/OCR/OCR_PDF2Page/k9_input_results/1GalyWQAGD03OivOT6gMpRskYxEbXpkQi/page.csv
image idx:  1
image idx:  2
image idx:  3
/content/drive/My Drive/AWCA/Colab_notebooks/OCR/OCR_PDF2Page/k9_input_results/1GalyWQAGD03OivOT6gMpRskYxEbXpkQi/page.csv
