**Importing Libraries and setting up**

# New Section

# Importing Important Libraries

In [2]:
# Standard utility libraries
import numpy as np
import cv2
import re
from matplotlib import pyplot as plt
import matplotlib.image as mpimg
from google.colab.patches import cv2_imshow

# Tesseract libraries
!pip install tesseract
!pip install pytesseract
!sudo apt install tesseract-ocr
import pytesseract

# adds image processing capabilities
from PIL import Image

# Tf-idf libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

Reading package lists... Done
Building dependency tree       
Reading state information... Done
tesseract-ocr is already the newest version (4.00~git2288-10f4998a-2).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.


In [3]:
# Doc-tr library setup for OCR
!pip install python-doctr[tf]
!sudo apt-get install fonts-freefont-ttf -y

Collecting python-doctr[tf]
  Using cached python_doctr-0.5.1-py3-none-any.whl (205 kB)
Collecting tensorflow>=2.4.0
  Downloading tensorflow-2.8.0-cp37-cp37m-manylinux2010_x86_64.whl (497.5 MB)
[K     |████████████████████████████████| 497.5 MB 24 kB/s 
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Using cached tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
Collecting tensorflow>=2.4.0
  Using cached tensorflow-2.7.1-cp37-cp37m-manylinux2010_x86_64.whl (495.0 MB)
  Using cached tensorflow-2.7.0-cp37-cp37m-manylinux2010_x86_64.whl (489.6 MB)
  Using cached tensorflow-2.6.3-cp37-cp37m-manylinux2010_x86_64.whl (463.8 MB)
Installing collected packages: tensorflow, python-doctr
Successfully installed python-doctr-0.5.1 tensorflow-2.6.3
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  fonts-freefont-ttf
0 upgraded, 1 newly installed, 0 to remove and 39 not upgrade

In [4]:
#Setting up the OCR tool- Doc tr. Using this to run with Tensorflow.
%matplotlib inline
import os

# Let's pick the desired backend
os.environ['USE_TF'] = '1'
# os.environ['USE_TORCH'] = '1'

import matplotlib.pyplot as plt

from doctr.io import DocumentFile
from doctr.models import ocr_predictor

**Reading the input image, cleaning it up and fixing its skew**


In [5]:
def preprocess_image(image_file, output_path="rotated.png"):
  """Deskew a scanned form image and save the straightened copy to disk.

  The skew angle is estimated from the minimum-area rectangle enclosing all
  foreground pixels of the Otsu-thresholded, inverted grayscale image. The
  original colour image is then rotated about its centre by that angle.

  Args:
    image_file: Path of the image to read with ``cv2.imread``.
    output_path: Where the rotated image is written. Defaults to
      "rotated.png", the file that ``perform_ocr`` reads.

  Returns:
    The rotated image as returned by ``cv2.warpAffine`` (same width and
    height as the input).

  Raises:
    FileNotFoundError: If ``image_file`` cannot be read as an image.
  """
  image = cv2.imread(image_file)
  if image is None:
    # cv2.imread reports failure by returning None rather than raising.
    raise FileNotFoundError("Could not read image file: {}".format(image_file))

  # Convert to grayscale and invert, so that dark pixels (presumably the text)
  # become the non-zero foreground after thresholding.
  gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
  gray = cv2.bitwise_not(gray)

  thres = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

  # Find the angle of rotation from the (row, col) coordinates of all
  # foreground pixels.
  coords = np.column_stack(np.where(thres > 0))
  if coords.size == 0:
    # Nothing to fit a rectangle to (blank page): leave the image unrotated.
    angle = 0.0
  else:
    angle = cv2.minAreaRect(coords)[-1]
    # NOTE(review): this mapping assumes minAreaRect reports angles in
    # [-90, 0). Newer OpenCV releases appear to use a different range --
    # confirm against the installed version before relying on the sign.
    if angle < -45:
      angle = -(90 + angle)
    else:
      angle = -angle

  print("angle: {:.3f}".format(angle))

  # Straighten the file by rotating about the image centre; border pixels are
  # replicated so the corners exposed by the rotation are not left black.
  (h, w) = image.shape[:2]
  center = (w // 2, h // 2)
  M = cv2.getRotationMatrix2D(center, angle, 1.0)
  rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

  cv2.imwrite(output_path, rotated)

  return rotated

**Reading the standard 'unfilled' 126, 131 and 140 forms and preparing the TF-IDF word-vector model used for classification**

> Indented block



In [6]:
# Blank reference forms, keyed by the label used in the similarity table.
# The order here fixes the row order of doc_vec_mat.
reference_forms = {
  "Form 126": "Acord126.png",
  "Form 131": "Acord131.png",
  "Form 140": "Acord140.png",
}

# OCR each reference form with Tesseract to get its raw text.
reference_texts = []
for form_label, form_path in reference_forms.items():
  img = cv2.imread(form_path)
  if img is None:
    # cv2.imread returns None instead of raising when the file is unreadable.
    raise FileNotFoundError("Could not read reference image for {}: {}".format(form_label, form_path))
  reference_texts.append(pytesseract.image_to_string(img))

# Kept under their original names for any cell that still refers to them.
A, B, C = reference_texts

# TF-IDF matrix of the reference forms: every run of non-whitespace is a
# token, term frequency is sublinear, IDF is unsmoothed, and rows are left
# un-normalised (norm=None).
vectorizer = TfidfVectorizer(token_pattern=r'\S+', use_idf=True, smooth_idf=False, sublinear_tf=True, norm=None)
doc_vec = vectorizer.fit_transform(reference_texts)

# get_feature_names() was removed in newer scikit-learn releases; prefer the
# replacement when it exists and fall back for older installations.
if hasattr(vectorizer, "get_feature_names_out"):
  feature_names = vectorizer.get_feature_names_out()
else:
  feature_names = vectorizer.get_feature_names()

doc_vec_mat = pd.DataFrame(doc_vec.toarray(), columns = feature_names, index = list(reference_forms))



**Classify the form type into 126/140/131**

In [7]:
# Earlier we built the document vectors (doc_vec_mat) from the reference form images; here we build the query vector from the given input image.
# We then compute the cosine similarity between the query vector and each document vector, and classify the input as the form with the highest similarity score.
def classify_form(img):
  """Classify a scanned form by TF-IDF cosine similarity to the reference forms.

  The image is OCR'd with Tesseract, the text is projected into the
  vocabulary of the module-level ``vectorizer``, and the result is compared
  against every row of the module-level ``doc_vec_mat``.

  Side effects:
    Writes the OCR text to 'abc.txt' and prints the similarity table and the
    winning label.

  Args:
    img: Image passed straight to ``pytesseract.image_to_string``.

  Returns:
    The second space-separated token of the best-matching row label of
    ``doc_vec_mat`` (e.g. "126" for "Form 126").
  """
  query_1 = pytesseract.image_to_string(img)

  # write text in a text file and save it to source path
  # (explicit encoding so non-ASCII OCR output cannot fail on the platform default)
  with open('abc.txt', mode='w', encoding='utf-8') as file:
    file.write(query_1)

  query_vec = vectorizer.transform([query_1])
  # Reuse the reference matrix's columns as feature names: they are the
  # vectorizer vocabulary, and this avoids get_feature_names(), which newer
  # scikit-learn releases no longer provide.
  query_vec_mat = pd.DataFrame(query_vec.toarray(), columns = doc_vec_mat.columns, index = ["Scanned Form"])

  # cosine similarity table: one column per reference form
  cosine_matrix = pd.DataFrame(cosine_similarity(query_vec_mat, doc_vec_mat), columns = doc_vec_mat.index, index = ["Scanned Form"])
  print(cosine_matrix)

  # NOTE(review): if the OCR text shares no token with the vocabulary, all
  # scores are 0 and idxmax falls back to the first column -- consider
  # rejecting such inputs.
  # .iloc[0] selects by position; plain [0] on a label-indexed Series is
  # deprecated in recent pandas.
  similar_form = cosine_matrix.idxmax(axis=1).iloc[0]
  print("\nClassified Form Type: " + similar_form)
  return similar_form.split(" ")[1]

**As the classification is done, perform OCR on the input image**

In [8]:
import json
def perform_ocr(formClass):
  """Run docTR OCR on "rotated.png" and turn the useful words into JSON.

  Words listed in the form-specific skip file ('<formClass>SkipWords.txt')
  and purely numeric words are dropped; the remaining words are numbered in
  reading order and handed to ``parseForm``.

  Args:
    formClass: Form type as a string: "126", "131" or "140".

  Returns:
    Whatever ``parseForm`` returns for the filtered words (a JSON string).

  Raises:
    ValueError: If ``formClass`` is not one of the supported form types.
  """
  # Validate before doing any expensive work. Raising (instead of calling
  # exit()) reports the problem without shutting the notebook kernel down.
  if formClass not in ("126", "131", "140"):
    raise ValueError("Classified file doesn't belong to any category, hence aborting.")
  filepath = formClass + 'SkipWords.txt'

  form = DocumentFile.from_images("rotated.png")
  print(f"Number of pages: {len(form)}")

  predictor = ocr_predictor(pretrained=True)
  print(predictor)

  result = predictor(form)
  # result.show(form)

  # Skip the repeating and non useful words from the image file's OCR.
  # One entry per line of the skip file; a set gives constant-time lookups.
  with open(filepath) as fp:
    words_to_be_skipped = {entry.strip() for entry in fp}

  # Create a map of useful words (to be included in key-value json)
  # Map(useful words) = Total words from result - (Skiplist + digits)
  # Keys are consecutive integers in reading order, starting at 0.
  useful_words = {}
  for page in result.pages:
    for block in page.blocks:
      for text_line in block.lines:
        for word in text_line.words:
          if word.value not in words_to_be_skipped and not word.value.isdigit():
            print(word.value)
            useful_words[len(useful_words)] = word.value
  print(useful_words)

  return parseForm(useful_words)

**Declare Form Class which is to be rendered as JSON output and assign the key value pairs as per iteration**

In [9]:
import json

class Form:
  """Plain record of the fields extracted from a scanned form.

  Every field is a string and defaults to "". The fields are created on the
  instance in ``__init__`` so that ``instance.__dict__`` (which is what gets
  serialised to JSON) always contains all of them, including fields that
  were never filled in.
  """

  # Class-level defaults, kept so that code reading ``Form.<field>`` directly
  # keeps working.
  customer_id = ""
  date = ""
  agency_name = ""
  carrier = ""
  naic_code = ""
  policy_number = ""
  effective_date = ""
  applicant_name = ""
  description = ""

  def __init__(self):
    # Instance attributes, in the order they should appear in the JSON output.
    self.customer_id = ""
    self.date = ""
    self.agency_name = ""
    self.carrier = ""
    self.naic_code = ""
    self.policy_number = ""
    self.effective_date = ""
    self.applicant_name = ""
    self.description = ""

# Creating the JSON based on the list of useful words(map) and the form class we created above.
def parseForm(map):
  """Assign OCR'd words to ``Form`` fields by position and return them as JSON.

  The words are consumed in key order using these heuristics:

  1. Words containing "/" are treated as dates: the first becomes ``date``,
     the second ``effective_date``, and any further ones are discarded.
  2. Of the remaining words, the first two that are not purely alphabetic
     become ``customer_id`` and ``naic_code``.
  3. The rest are read sequentially: a run of alphabetic words with
     consecutive keys is the agency name, the word ending that run is the
     policy number, the next word is the carrier, the following run of
     alphabetic words with consecutive keys is the applicant name, and
     everything after that is the description.

  Args:
    map: Dict of integer position -> word (string). It is not modified.

  Returns:
    A JSON object string containing every ``Form`` field; fields that were
    not found are empty strings.
  """
  form = Form()

  # Set the date and effective date by identifying the date patterns.
  # A filtered copy is built so the caller's dictionary is left untouched.
  dates_seen = 0
  without_dates = {}
  for key, value in map.items():
    if "/" in value:
      if dates_seen == 0:
        form.date = value
      elif dates_seen == 1:
        form.effective_date = value
      dates_seen += 1
      continue
    without_dates[key] = value

  # Set the customer id and naic_code from the first 2 non-alphabetic values.
  ids_seen = 0
  words = {}
  for key, value in without_dates.items():
    if ids_seen < 2 and not value.isalpha():
      if ids_seen == 0:
        form.customer_id = value
      else:
        form.naic_code = value
      ids_seen += 1
      continue
    words[key] = value

  # Set other key values by walking the remaining words in order.
  AGENCY, CARRIER, APPLICANT, DESCRIPTION = range(4)
  stage = AGENCY
  prev_key = None  # key of the last word added to the current name run
  agency_words = []
  applicant_words = []
  description_words = []
  for key, value in words.items():
    # A word continues a name only if it directly follows the previous word of
    # that name; a gap in the keys marks where a date/id was removed above.
    continues_run = prev_key is None or key == prev_key + 1
    if stage == AGENCY:
      if continues_run and value.isalpha():
        agency_words.append(value)
        prev_key = key
      else:
        form.policy_number = value
        stage = CARRIER
        prev_key = None
    elif stage == CARRIER:
      form.carrier = value
      stage = APPLICANT
    elif stage == APPLICANT:
      if continues_run and value.isalpha():
        applicant_words.append(value)
        prev_key = key
      else:
        description_words.append(value)
        stage = DESCRIPTION
    else:
      description_words.append(value)

  form.agency_name = " ".join(agency_words)
  form.applicant_name = " ".join(applicant_words)
  form.description = " ".join(description_words)

  # json.dumps is used to create the JSON output from the form class.
  # Start from the class-level defaults so that fields which were never
  # assigned still appear, then overlay the values set on this instance.
  payload = {
    name: getattr(form, name)
    for name, default in vars(Form).items()
    if not name.startswith("_") and not callable(default)
  }
  payload.update(vars(form))
  return json.dumps(payload)

**Since all functions are defined; call preprocessing and classification function on the given input image**

In [10]:
# Running OCR on all test files
#imageList = ["form126_test1.jpg", "form126_test2.jpg", "form131_test1.jpg", "form131_test2.jpg", "form140_test1.jpg", "form140_test2.jpg"]

# Running OCR on just one file
# imageList = ["form131_test1.jpg"]

# Running OCR on 3 selected files of each form type
imageList = ["form126_test2.jpg", "form131_test2.jpg", "form140_test2.jpg"]

# Pipeline per image: deskew -> classify the form type -> OCR and extract fields.
for image in imageList:
  processed_form = preprocess_image(image)
  formClass = classify_form(processed_form)
  # Stored under its own name so the imported `json` module is not overwritten
  # (parseForm relies on it), which also removes the need to re-import it here.
  form_json = perform_ocr(formClass)
  print(form_json)
  print("------------------\n-------------\n------------------")

-0.0
angle: -0.000




              Form 126  Form 131  Form 140
Scanned Form  0.890433  0.263755  0.153888

Classified Form Type: Form 126
Number of pages: 1
Downloading https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip to /root/.cache/doctr/models/db_resnet50-adcafc63.zip


  0%|          | 0/94178964 [00:00<?, ?it/s]

Downloading https://github.com/mindee/doctr/releases/download/v0.3.0/crnn_vgg16_bn-76b7f2c6.zip to /root/.cache/doctr/models/crnn_vgg16_bn-76b7f2c6.zip


  0%|          | 0/58758994 [00:00<?, ?it/s]

OCRPredictor(
  (det_predictor): DetectionPredictor(
    (pre_processor): PreProcessor(
      (resize): Resize(output_size=(1024, 1024), method='bilinear')
      (normalize): Normalize(mean=[0.7979999780654907, 0.7850000262260437, 0.7720000147819519], std=[0.2639999985694885, 0.27489998936653137, 0.28700000047683716])
    )
    (model): DBNet(
      (feat_extractor): IntermediateLayerGetter()
      (fpn): FeaturePyramidNetwork(channels=128)
      (probability_head): <keras.engine.sequential.Sequential object at 0x7f65d68536d0>
      (threshold_head): <keras.engine.sequential.Sequential object at 0x7f65d6832d90>
      (postprocessor): DBPostProcessor(bin_thresh=0.3, box_thresh=0.1)
    )
  )
  (reco_predictor): RecognitionPredictor(
    (pre_processor): PreProcessor(
      (resize): Resize(output_size=(32, 128), method='bilinear', preserve_aspect_ratio=True, symmetric_pad=False)
      (normalize): Normalize(mean=[0.6940000057220459, 0.6949999928474426, 0.6930000185966492], std=[0.298999



              Form 126  Form 131  Form 140
Scanned Form  0.329495  0.720188  0.200646

Classified Form Type: Form 131
Number of pages: 1




OCRPredictor(
  (det_predictor): DetectionPredictor(
    (pre_processor): PreProcessor(
      (resize): Resize(output_size=(1024, 1024), method='bilinear')
      (normalize): Normalize(mean=[0.7979999780654907, 0.7850000262260437, 0.7720000147819519], std=[0.2639999985694885, 0.27489998936653137, 0.28700000047683716])
    )
    (model): DBNet(
      (feat_extractor): IntermediateLayerGetter()
      (fpn): FeaturePyramidNetwork(channels=128)
      (probability_head): <keras.engine.sequential.Sequential object at 0x7f65d2f91850>
      (threshold_head): <keras.engine.sequential.Sequential object at 0x7f65d2f19610>
      (postprocessor): DBPostProcessor(bin_thresh=0.3, box_thresh=0.1)
    )
  )
  (reco_predictor): RecognitionPredictor(
    (pre_processor): PreProcessor(
      (resize): Resize(output_size=(32, 128), method='bilinear', preserve_aspect_ratio=True, symmetric_pad=False)
      (normalize): Normalize(mean=[0.6940000057220459, 0.6949999928474426, 0.6930000185966492], std=[0.298999



              Form 126  Form 131  Form 140
Scanned Form  0.264366  0.270102  0.683746

Classified Form Type: Form 140
Number of pages: 1
OCRPredictor(
  (det_predictor): DetectionPredictor(
    (pre_processor): PreProcessor(
      (resize): Resize(output_size=(1024, 1024), method='bilinear')
      (normalize): Normalize(mean=[0.7979999780654907, 0.7850000262260437, 0.7720000147819519], std=[0.2639999985694885, 0.27489998936653137, 0.28700000047683716])
    )
    (model): DBNet(
      (feat_extractor): IntermediateLayerGetter()
      (fpn): FeaturePyramidNetwork(channels=128)
      (probability_head): <keras.engine.sequential.Sequential object at 0x7f65d5a8d390>
      (threshold_head): <keras.engine.sequential.Sequential object at 0x7f65d5a6e490>
      (postprocessor): DBPostProcessor(bin_thresh=0.3, box_thresh=0.1)
    )
  )
  (reco_predictor): RecognitionPredictor(
    (pre_processor): PreProcessor(
      (resize): Resize(output_size=(32, 128), method='bilinear', preserve_aspect_ratio