# PDF->Page->Text

This notebook converts PDFs to text by splitting each PDF into pages and running OCR on every page.

## Outline of the Workflow
* Mount Google Drive in the Google Colab notebook
* Import Libraries and packages
* Format params, search, and get files from Google Drive
* Convert pdf pages to jpeg files
* Convert jpeg files to txt (string); save them as `page.csv` in a per-PDF folder under `output_dir`

In [None]:
### Modify this cell only
# Google Drive ID of the folder to process (walk() raises ValueError if the ID is not a folder).
# 1u-uLRd-O0uMpis0r_vPxGcfw3uwv6QUa is the last PDF that was processed in SecondarySources (2/20/21)
top = '1yei52gRB7vJZCD7RjlTIp0urGnmQCQ20'
# Directory on the mounted Drive that holds the input PDFs; must start and end with a "/"
input_dir = '/content/drive/My Drive/AWCA/Colab_notebooks/OCR/OCR_PDF2Page/180_input/'
# Directory that receives one sub-folder per processed PDF; must start and end with a "/"
output_dir = '/content/drive/My Drive/AWCA/Colab_notebooks/OCR/OCR_PDF2Page/180_output/'

In [None]:
# Authenticate this Colab session with the user's Google account.
from google.colab import auth
auth.authenticate_user()

# Build the Drive v3 API client; iterfiles() and walk() use it as the global `service`.
from googleapiclient.discovery import build
service = build('drive', 'v3')

In [None]:
# Run this to mount your Google Drive at /content/drive so that input_dir and
# output_dir can be read and written as ordinary file paths.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


One-time installations

In [None]:
!pip3 install pdf2image
!apt-get install poppler-utils 
!pip3 install pytesseract
!pip3 install pdf2image
!sudo apt-get install tesseract-ocr
!pip install PyPDF2

Collecting pdf2image
  Downloading https://files.pythonhosted.org/packages/03/62/089030fd16ab3e5c245315d63c80b29250b8f9e4579b5a09306eb7e7539c/pdf2image-1.14.0-py3-none-any.whl
Installing collected packages: pdf2image
Successfully installed pdf2image-1.14.0
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 13 not upgraded.
Need to get 154 kB of archives.
After this operation, 613 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 poppler-utils amd64 0.62.0-2ubuntu2.12 [154 kB]
Fetched 154 kB in 1s (168 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 149406 files and directories currently installed.)
Preparing to unpack .../poppler-utils_0.62.0-2ubuntu2.12_amd64.deb ...
Unpacking poppler-utils (0.62.0-2ubuntu2.12) ...
Setting up poppler-utils (0.

Imports

In [None]:
import os
import json
import csv
import sys
from apiclient.discovery import build  # pip install google-api-python-client
from pdf2image import convert_from_path # the module can convert PDF to a PIL Image object
from PIL import Image
import pytesseract #OCR tool; recognizes the text embedded in images
import sys 
from PyPDF2 import PdfFileReader, PdfFileWriter #the classes can read PDFs, split PDFs, and extract data
from collections import Counter # track the number of occurence of a value
import pandas as pd

In [None]:
# Show which language packs the installed Tesseract reports as available.
print(pytesseract.get_languages(config=''))

['osd', 'eng']


Logging setup

In [None]:
import logging

# Configure the root logger: INFO level, each line prefixed with timestamp, level and thread name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(threadName)-10s %(message)s',)
# Logger used by iterfiles() and walk(); at INFO their logger.debug() calls are suppressed.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

### Formatting and searching the files

In [None]:
def iterfiles(name=None, is_folder=None, parent=None, order_by='folder,name,createdTime'):
    """Yield Drive file resources that match the given filters, page by page.

    Uses the module-level ``service`` (Drive v3 client), ``FOLDER`` (folder
    MIME type) and ``logger``.

    Args:
        name: exact file name to match, or None to match any name.
        is_folder: True to return only folders, False to return only
            non-folders, None to return both.
        parent: Drive ID of the containing folder, or None for any parent.
        order_by: value sent as the ``orderBy`` parameter of ``files.list``.

    Yields:
        dict: one entry of the response's ``files`` list, carrying the fields
        requested below (id, name, mimeType, md5Checksum, webViewLink,
        createdTime, modifiedTime, size) where the API returns them.
    """
    clauses = []
    if name is not None:
        # Single quotes delimit string values in a Drive query, so escape them.
        clauses.append("name = '%s'" % name.replace("'", "\\'"))
    if is_folder is not None:
        clauses.append("mimeType %s '%s'" % ('=' if is_folder else '!=', FOLDER))
    if parent is not None:
        clauses.append("'%s' in parents" % parent.replace("'", "\\'"))

    params = {'pageToken': None, 'orderBy': order_by,
              'fields': 'kind, nextPageToken, incompleteSearch, files(id, name, mimeType, md5Checksum, webViewLink, createdTime, modifiedTime, size)'}
    if clauses:
        params['q'] = ' and '.join(clauses)

    while True:
        logger.debug('params %s', params)
        response = service.files().list(**params).execute()  # one page of matches
        logger.debug('response %s', response)
        # A page without a 'files' key is treated as an empty page.
        yield from response.get('files', [])
        if 'nextPageToken' not in response:
            # No further pages to fetch.
            return
        params['pageToken'] = response['nextPageToken']

### Search all the sub-directories and files inside the initial directory

Uses `iterfiles()`

In [None]:
def walk(top='root', by_name=False):
    """Traverse a Drive folder tree depth-first, yielding one tuple per folder.

    Uses the module-level ``service``, ``FOLDER`` and ``logger`` and calls
    ``iterfiles()`` to list each folder's children.

    Args:
        top: Drive file ID of the starting folder, or its name when
            ``by_name`` is true.
        by_name: look the starting folder up by name instead of by ID.

    Yields:
        (path, folder, dirs, files): ``path`` is the list of folder names from
        the start folder down to ``folder``; ``folder`` is its Drive resource;
        ``dirs`` and ``files`` are the resources of its sub-folders and of its
        other children.

    Raises:
        ValueError: if ``top`` (by ID) is not a folder, or if the name lookup
            does not return exactly one folder.
    """
    print("walk function is called")

    if by_name:
        # The single-element unpacking fails unless exactly one folder matches.
        (top,) = iterfiles(name=top, is_folder=True)
    else:
        top = service.files().get(fileId=top).execute()
        if top['mimeType'] != FOLDER:
            raise ValueError('not a folder: %r' % top)

    pending = [([top['name']], top)]
    while pending:
        logger.info('stack size {}'.format(len(pending)))
        path, top = pending.pop()
        logger.info('dir {}'.format(path))

        # Split the children of this folder into sub-folders and everything else.
        dirs = []
        files = []
        for entry in iterfiles(parent=top['id']):
            if entry['mimeType'] == FOLDER:
                dirs.append(entry)
            else:
                files.append(entry)
        logger.info('subdirs {} files {}'.format(len(dirs), len(files)))

        yield path, top, dirs, files

        if not dirs:
            continue
        logger.debug('dirs {}'.format(dirs))
        logger.debug('path {}'.format(path))
        # Pushed in reverse so that pop() visits sub-folders in listing order.
        children = [(path + [sub['name']], sub) for sub in reversed(dirs)]
        logger.debug('extend: {}'.format(children))
        pending.extend(children)

### Gives information about a file, including its ID, md5, size, create/modify time. 

In [None]:
def extract_metadata(node):
    """Return selected metadata fields of a Drive file resource as a tuple.

    Args:
        node: mapping with a ``get`` method (a Drive file resource dict).

    Returns:
        tuple: ``(id, md5Checksum, size, mimeType, createdTime, modifiedTime)``.
        Any field missing from ``node`` is reported as the string ``'None'``.
    """
    wanted = ('id', 'md5Checksum', 'size', 'mimeType', 'createdTime', 'modifiedTime')
    return tuple(node.get(key, 'None') for key in wanted)

### OCR on a Directory

<br/>

#### *Main Idea*: 
Takes in a directory, creates a folder of pages for each PDF/document, and uses OCR to convert all pages to computer text (txt).

<br/>

#### *Imports*: 
* PdfFileReader / PdfFileWriter: Useful for reading and saving PDFs.
* Counter: Counts how often each element occurs and stores every unique element as a key with its frequency as the value. Subclass of dict. Used to count the frequency of words in a PDF.

<br/>

#### *Outer loop*: Reads in all pdfs from the input directory from `walk()`. Splits a pdf into individual pages which will each be run into the inner for loop. Gets specific information about a file using `extract_metadata` and saves that in a CSV. Creates folder for output if one does not already exist.
* convert_from_path: takes in a PDF and outputs an image for each page in the PDF
* img_count: index for inner for loop. Tracks page number for a pdf.

<br/>

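#### *Inner `page` loop*: Before the pages are processed, a random sample of pages is run through OCR with `pytesseract.image_to_string` at the default orientation and all 90-degree rotations: 0°, 90°, 180°, 270°. The rotation whose most common words score highest (see `dict_common_sum`) is chosen for the whole PDF. Each page is then rotated by that angle, saved as a JPEG, run through OCR, and its text is written to the output CSV. The earlier per-page check, which set a Flag to True when an orientation could be read and skipped pages where no rotation produced proper output, is currently disabled.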

<br/>



In [None]:
import random
def dict_common_sum(cntr):
    """Score a word Counter by a weighted sum of its most common frequencies.

    The two most common words count five times, the third to fifth most
    common count three times, and the sixth to tenth count once. Words
    beyond the tenth are ignored.

    Args:
        cntr: a ``collections.Counter`` (anything with ``most_common``).

    Returns:
        int: the weighted frequency sum; 0 for an empty counter.
    """
    total = 0
    for rank, (_word, freq) in enumerate(cntr.most_common(10)):
        if rank < 2:
            weight = 5
        elif rank < 5:
            weight = 3
        else:
            weight = 1
        total += freq * weight
    return total

def _score_orientations(pages):
    """OCR a random sample of pages at every quarter turn and score each turn.

    Args:
        pages: list of PIL images, one per PDF page.

    Returns:
        (counters, weights): ``counters[i]`` is a ``Counter`` of the
        whitespace-separated OCR tokens found after rotating the sampled pages
        by ``90 * i`` degrees; ``weights[i]`` is ``dict_common_sum`` of it.
    """
    counters = [Counter() for _ in range(4)]
    # Sample 10% of the pages, but at least 5 (or every page of a shorter PDF).
    num_to_select = max(min(5, len(pages)), int(0.1 * len(pages)))
    for sample_idx, page in enumerate(random.sample(pages, num_to_select)):
        print("image idx: ", sample_idx + 1)
        for turns, counter in enumerate(counters):
            # expand=True resizes the canvas so non-square pages are not cropped.
            rotated = page.rotate(90 * turns, expand=True)
            counter.update(str(pytesseract.image_to_string(rotated)).split())
    weights = [dict_common_sum(counter) for counter in counters]
    return counters, weights


def big_loop(top, csvwriter, maxfiles):
    """Catalog a Drive folder tree and OCR every PDF found in it.

    For each folder yielded by ``walk()`` one catalog row is written. For each
    PDF listed in that folder, the local copy in ``input_dir`` is split into
    page images, the best page rotation is estimated from a sample, and every
    page is saved as ``page_<n>.jpg`` and OCR'd. The page texts are written to
    ``page.csv`` in ``output_dir/<drive id of the pdf>/``.

    Uses the module-level ``input_dir`` and ``output_dir``.

    Args:
        top: Drive file ID of the folder to process.
        csvwriter: ``csv.writer``-like object that receives one row per folder.
        maxfiles: accepted for interface compatibility; no limit is enforced.
    """
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    for path, root, dirs, files in walk(top=top, by_name=False):
        dirpath = '/'.join(path)
        folder_id, _, _, mimeType, createdTime, modifiedTime = extract_metadata(root)
        # For directories the md5Checksum column holds the sub-directory count
        # and the size column holds the plain-file count.
        csvwriter.writerow([folder_id, len(dirs), len(files), mimeType,
                            createdTime, modifiedTime, dirpath])

        for pdf in files:
            # NOTE(review): every file is looked up directly under input_dir,
            # including files listed in Drive sub-folders -- confirm the local
            # layout is flat.
            file_name = input_dir.rstrip('/') + '/' + pdf['name']
            if not file_name.lower().endswith('.pdf'):
                continue
            pages = convert_from_path(file_name)

            # The Drive ID of the PDF names its output folder.
            pdf_dir = output_dir + str(pdf['id']) + '/'
            if not os.path.isdir(pdf_dir):
                print("New folder is created")
                os.mkdir(pdf_dir)

            counters, weights = _score_orientations(pages)
            # index() returns the first maximum, so ties resolve to the
            # smallest rotation (an unreadable sample leaves pages unrotated).
            best_turns = weights.index(max(weights))
            k = best_turns - 1  # printed value: rotation applied is 90*(k + 1)
            print("Orientation to be used" + str(k))
            for counter, weight in zip(counters, weights):
                print(counter)
                print(len(counter))
                print(weight)

            csv_id, csv_text, pdf_name = [], [], []
            for page_idx, page in enumerate(pages):
                img_count = page_idx + 1
                print("image idx: ", img_count)
                save_name = pdf_dir + "page_" + str(img_count) + ".jpg"
                print("Page")
                page.rotate(90 * best_turns, expand=True).save(save_name, 'JPEG')
                # OCR the saved JPEG, and close the file handle afterwards.
                with Image.open(save_name) as saved_page:
                    text = str(pytesseract.image_to_string(saved_page))
                csv_text.append(text)
                csv_id.append(str(pdf['id']) + "_" + str(img_count))
                pdf_name.append(pdf['name'])

            df = pd.DataFrame({'page id': csv_id, 'name': pdf_name, 'page text': csv_text})
            print("csv saved at " + pdf_dir + "page.csv")
            df.to_csv(pdf_dir + "page.csv")


### Sets up output CSV file and calls `big_loop()`

In [None]:
def catalog_walk(top):
    """Append the folder catalog to the catalog CSV and run ``big_loop()``.

    Args:
        top: Drive file ID of the folder to process (passed to ``big_loop``).
    """
    maxfiles = 10
    # newline="" is what the csv module asks for; the with-block closes the
    # file even when big_loop() raises.
    with open("/content/drive/My Drive/catalog_not_in_use.csv", "a", newline="") as outfile:
        csvwriter = csv.writer(outfile)
        big_loop(top, csvwriter, maxfiles)

### Defines the Drive folder MIME type and calls `catalog_walk()` on the folder chosen in the first cell

In [None]:
# NOTE(review): snapshot_file is not read by any function visible in this notebook.
snapshot_file = '/snapshot-10.txt'
# MIME type that iterfiles() and walk() compare against to recognise Drive folders.
FOLDER = 'application/vnd.google-apps.folder'
# Process the folder whose Drive ID was set as `top` in the configuration cell.
catalog_walk(top)

2021-02-28 10:03:54,758 INFO MainThread URL being requested: GET https://www.googleapis.com/drive/v3/files/1yei52gRB7vJZCD7RjlTIp0urGnmQCQ20?alt=json


walk function is called


2021-02-28 10:03:55,165 INFO MainThread stack size 1
2021-02-28 10:03:55,166 INFO MainThread dir ['180_input']
2021-02-28 10:03:55,172 INFO MainThread URL being requested: GET https://www.googleapis.com/drive/v3/files?orderBy=folder%2Cname%2CcreatedTime&fields=kind%2C+nextPageToken%2C+incompleteSearch%2C+files%28id%2C+name%2C+mimeType%2C+md5Checksum%2C+webViewLink%2C+createdTime%2C+modifiedTime%2C+size%29&q=%271yei52gRB7vJZCD7RjlTIp0urGnmQCQ20%27+in+parents&alt=json
2021-02-28 10:03:55,492 INFO MainThread subdirs 0 files 6


New folder is created
image idx:  1
image idx:  2
image idx:  3
image idx:  4
image idx:  5
Orientation to be used1
Counter({'pue': 10, 'ay': 7, '<': 6, '03': 6, '2': 6, '¢': 6, 'Jo': 5, 'UT': 5, 'uo': 4, 'a': 4, '-': 4, '_': 3, 'Sd': 3, 'z': 3, '&': 3, 'JO': 3, '8': 3, '3nq': 3, 'y': 3, 'zo': 3, 'ayy': 3, '4': 3, 'e': 3, '=': 3, 'a8uey>': 2, 'SMNd': 2, 'UF': 2, 'amos': 2, 'aie': 2, 'g': 2, 'ay3': 2, 'aaey': 2, 'ue': 2, '7': 2, 'og': 2, '(4)': 2, '“II': 2, 'yons': 2, 'aq': 2, 'Aq': 2, 'sayz': 2, 'go': 2, 'of': 2, 'aut': 2, 'i': 2, '|': 2, '“Toa': 2, '‘+g': 2, 'adj': 2, 'aq)': 2, 'jou': 2, '°33N3': 2, '>': 2, '>)': 2, '(9923)': 1, 'aatTo': 1, 'atAgz': 1, '-ARzy': 1, 'aea': 1, 'uezo,': 1, '*qeg': 1, '—Ugn,»': 1, 'tZ': 1, 'pa8rem': 1, '*z': 1, 'eat': 1, 'Wry': 1, '38T': 1, 'usIatd': 1, '-qeH': 1, '¢)oUT33ee': 1, 'SMNG': 1, 'upquedy': 1, 'sa': 1, 'sjuBUOSUOD': 1, 'BuTMOTTOJZ': 1, 'B07': 1, '-U-': 1, 'UOTIMFUTSse': 1, 'TeTNZa1': 1, 'sy': 1, 'sang': 1, 'Atqeqoad': 1, 'osty': 1, 'Preage': 1, 

Ideas for orientation:
(A represents a constant)
- Compare the number of words for each of the orientations, and if one has Ax more than each of the others (A set to a constant), then use that one. Only works if orientations that are wrong output very low word counts. 
- Randomly select A documents and produce a unique list of all the words that appear in these pages. Then for each orientation, compare if it has enough overlap between the list and itself. Does not work for documents that are short. Use min(page count, 5, 0.1*doc length)
- Same idea as #2, but take only the A most frequent words to put in the unique list. Then compare based on some logic statement similar to the one used previously.