<!-- Ref: https://github.com/shreeshiv/AIESI -->
# Readme 
*   File setup: first move all json and jpg files into one single folder, then aggregate all the json files to one single csv file for the future evaluation.

*   Preprocess: turn the image into a binary image (pixel value 0 or 255, one channel), then apply erosion and dilation (morphological opening) to reduce the noise in the image. In addition, the project filters out connected components whose area is too small (area < 10).

*   Recognized words: first find the bounding boxes of the text in the image. Then loop through the bounding boxes, recognize the words in each box using pytesseract, and return all the text in the image.

*   LSTM field matching: use LSTM to estimate if each character belongs to the corresponding fields (company, date, address, total).

*   Evaluation: calculate and plot the F1 score and precision.

*   Main: estimate the f1 score and precision for training data, and output the result of validation data as csv file

*   Future Direction:

  1.   To further improve the performance, we can try to adjust the kernel size at the preprocessing step and at the text recognizing step (contour).

  2.   Should train the model. The project uses the pretrained model to do the LSTM field matching since the project can't train it properly. The reason is that the project uses pytesseract to recognize words, and if the words aren't recognized correctly, the true labels can't be matched to the word vector, which makes the model untrainable. In the future, we should change the word recognizing step to a CNN structure and concatenate the CNN to the LSTM so that both sets of weights are updated together. In this way, the problem can be avoided.

**I attach the validation result as csv file, and the pretrained model in the folder as well**

# Setup


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!sudo apt install tesseract-ocr
!pip install colorama
!pip install -U git+https://github.com/madmaze/pytesseract.git

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 30 not upgraded.
Need to get 4,795 kB of archives.
After this operation, 15.8 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-eng all 4.00~git24-0e00fe6-1.2 [1,588 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-osd all 4.00~git24-0e00fe6-1.2 [2,989 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr amd64 4.00~git2288-10f4998a-2 [218 kB]
Fetched 4,795 kB in 2s (2,418 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl

In [3]:
import shutil
import pytesseract
import glob
import numpy as np
import pandas as pd
import cv2
import torch 
from torch import nn
import matplotlib.pyplot as plt
from skimage.filters import threshold_local
import json
from string import ascii_uppercase, digits, punctuation
import regex
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

# Preprocess

In [None]:
# move every json and jpg file from the per-sample folders into one single folder
def move_file(start, destination):
  """Collect the json and jpg files of every matched folder into one folder.

  Args:
    start: glob pattern matching the folders to scan
      (e.g. ".../dataset/val/*").
    destination: folder that receives the files; it is created when missing.

  Raises:
    shutil.Error: propagated from shutil.move, e.g. when a file with the
      same name already exists in `destination`.
  """
  import os  # local import: the notebook's import cell does not bring in os

  # Without an existing directory shutil.move would rename the first file to
  # `destination` itself instead of moving it inside a folder.
  os.makedirs(destination, exist_ok=True)

  for folder in glob.glob(start):
    for path in glob.glob(f"{folder}/*"):
      if path.endswith(('json', 'jpg')):
        shutil.move(path, destination)

# flatten the validation and the training split, one (pattern, target folder) pair each
for sample_glob, flat_dir in (
  ("/content/drive/MyDrive/Job/Infrred/dataset/val/*", "/content/drive/MyDrive/Job/Infrred/dataset/val_all"),
  ("/content/drive/MyDrive/Job/Infrred/dataset/train/*", "/content/drive/MyDrive/Job/Infrred/dataset/train_all"),
):
  move_file(sample_glob, flat_dir)

In [None]:
# aggregate all json file into one csv
def agg_json(destination, source="/content/drive/MyDrive/Job/Infrred/dataset/train_all/*"):
  """Merge the per-sample json label files into one csv file.

  Args:
    destination: path of the csv file to write.
    source: glob pattern listing the candidate files; only names ending in
      'json' are read. Defaults to the flattened training folder.

  The csv has the columns index, company, date, address, total (plus any
  extra keys found in the json files), one row per json file.
  """
  frames = []
  for file in glob.glob(source):
    if not file.endswith('json'):
      continue

    # Sample id: the last three characters of the file stem, or the whole
    # stem when it is shorter. NOTE(review): longer stems are truncated, so
    # ids are only unique while file names have at most three characters.
    stem = file[:-len('.json')].rsplit('/', 1)[-1]

    row = pd.DataFrame([pd.read_json(file, typ='series')])
    row['index'] = stem[-3:]
    frames.append(row)

  # The empty frame goes first so that it fixes the column order; a single
  # concat avoids re-copying the accumulated table for every file.
  header = pd.DataFrame(columns=['index', 'company', 'date', 'address', 'total'])
  pd.concat([header] + frames).to_csv(destination)

# write the aggregated labels into the flattened training folder
train_labels_csv = '/content/drive/MyDrive/Job/Infrred/dataset/train_all/all_json.csv'
agg_json(train_labels_csv)

In [4]:
def img_preprocess(path, kernel_size, min_area=10):
  """Load an image and return a denoised binary mask of its dark regions.

  Args:
    path: image file to read.
    kernel_size: side length of the square kernel used for the opening.
    min_area: connected components with fewer pixels are removed.

  Returns:
    uint8 array with the image's height and width; 255 where a pixel belongs
    to a kept component, 0 elsewhere.

  Raises:
    FileNotFoundError: if OpenCV cannot read `path`.
  """
  # read the img as grey scale image
  img = cv2.imread(path, 0)
  if img is None:
    # cv2.imread signals failure by returning None instead of raising
    raise FileNotFoundError(f"cannot read image: {path}")

  # turn into binary img using adaptive threshold; the inversion makes the
  # pixels darker than their local threshold the foreground (255)
  T = threshold_local(img, 15, offset = 6, method = "gaussian")
  thresh = (img > T).astype("uint8") * 255
  thresh = ~thresh

  # erode and dilate the image
  kernel = np.ones((kernel_size, kernel_size), np.uint8)
  opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)

  # find the connected components in the image
  nlabels, labels, stats, centroids = cv2.connectedComponentsWithStats(opening, None, None, None, 8, cv2.CV_32S)

  # One lookup-table entry per label: 255 when the component is large enough.
  # Label 0 is never kept (OpenCV assigns it to the background). Indexing the
  # table with `labels` paints all components in a single pass instead of one
  # full-image comparison per component.
  lut = np.where(stats[:, -1] >= min_area, 255, 0).astype(np.uint8)
  lut[0] = 0

  return lut[labels]
  

# Recognized words

In [5]:
 def img_words(img):
   """Recognise the text of a binary image, one output line per text box.

   Args:
     img: single-channel uint8 image as produced by img_preprocess.

   Returns:
     A string in which every sufficiently large text box contributes its
     recognised words, each followed by a space, and a closing newline.
     Boxes are visited top to bottom, then left to right.
   """
   # find contours: the wide kernel merges neighbouring characters into blobs
   kernel = np.ones((5, 20), np.uint8)
   img_dilate = cv2.dilate(img, kernel, iterations=1)
   # [-2] is the contour list on OpenCV 4 (2 return values) and OpenCV 3 (3)
   contours = cv2.findContours(img_dilate, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)[-2]

   # find the corresponding bounding box for each contour, ordered by (y, x)
   rects = sorted((cv2.boundingRect(cnt) for cnt in contours), key=lambda r: (r[1], r[0]))

   # loop thru each bounding box and find the words
   lines = []
   for x, y, w, h in rects:
     # if rect is too small, skip the rect
     if w < 20 or h < 20:
       continue

     # The crop is a one-channel mask, so it is expanded to three channels
     # with GRAY2BGR; a Bayer conversion would demosaic it as raw sensor data.
     crop = cv2.cvtColor(img[y:y+h, x:x+w], cv2.COLOR_GRAY2BGR)
     tsv = pytesseract.image_to_data(crop, config=r'--psm 6')

     # Parse the TSV row by row (header skipped). Column 10 is the confidence
     # and column 11 the text; splitting on tabs keeps rows aligned even when
     # a row has a blank text cell.
     words = []
     for row in tsv.splitlines()[1:]:
       cols = row.split('\t', 11)
       if len(cols) < 12:
         continue
       try:
         conf = float(cols[10])  # may be printed as an int or as a float
       except ValueError:
         continue
       word = cols[11].strip()
       # a negative confidence (-1) marks rows that carry no word
       if conf < 0 or not word:
         continue
       words.append(word)

     lines.append("".join(f"{word} " for word in words) + "\n")

   return "".join(lines)

# LSTM field matching

In [6]:
# define model, a pretrained and predefined model from reference
class MyModel(nn.Module):
  """Per-position tagger: embedding, 2-layer bidirectional LSTM, linear head with 5 outputs."""

  def __init__(self, vocab_size, embed_size, hidden_size):
    """Build the three layers.

    vocab_size: number of distinct input ids.
    embed_size: width of the embedding vectors.
    hidden_size: LSTM hidden width per direction.
    """
    super().__init__()
    # the attribute names are the state_dict prefixes of a saved checkpoint
    self.embed = nn.Embedding(vocab_size, embed_size)
    self.lstm = nn.LSTM(
      embed_size,
      hidden_size,
      num_layers=2,
      bidirectional=True,
    )
    # forward and backward LSTM features are concatenated: hidden_size * 2
    self.linear = nn.Linear(hidden_size * 2, 5)

  def forward(self, inpt):
    """Return the 5 class scores for every position of `inpt`."""
    sequence_features = self.lstm(self.embed(inpt))[0]
    return self.linear(sequence_features)

In [7]:
def transform(text, targets, vocab=None):
  """Build the per-character class labels for a recognised text.

  Each field value is located in the upper-cased text through its first five
  characters; the characters from that position on, as many as the value is
  long, get the field's class id (company=1, date=2, address=3, total=4).
  Everything else is class 0. Positions whose character is not part of the
  vocabulary are dropped afterwards, the same filter get_text_tensor applies
  to the text itself.

  Args:
    text: recognised text.
    targets: mapping (dict or pandas Series) with the keys 'company', 'date',
      'address' and 'total'.
    vocab: string of known characters; defaults to the module-level
      `all_vocab`.

  Returns:
    1-D torch.long tensor with one label per in-vocabulary character.
  """
  if vocab is None:
    vocab = all_vocab

  text = text.upper()
  labels = torch.zeros(len(text), dtype=torch.long)

  for class_id, field in enumerate(('company', 'date', 'address', 'total'), start=1):
    raw = targets[field]
    # missing label: None, or NaN (the only value that differs from itself)
    if raw is None or raw != raw:
      continue

    target = str(raw).upper()
    start = text.find(target[0:5]) if target else -1
    if start == -1:
      # the OCR text does not contain this field: leave its labels at 0
      continue

    # clamp so that a value running past the end of the text cannot overflow
    stop = min(start + len(target), len(text))
    labels[start:stop] = class_id

  keep = torch.tensor([vocab.find(c) >= 0 for c in text], dtype=torch.bool)
  return labels[keep]

In [8]:
def train(epoch):
  """Train the module-level `model` on the flattened training folder.

  Every jpg file is paired with the json file of the same stem. The text is
  recognised once per image (img_preprocess + img_words), encoded with
  get_text_tensor and labelled with transform; the resulting samples are then
  replayed for `epoch` passes, one optimizer step per sample.

  Uses the module-level `model`, `optimizer` and `criterion`.

  Args:
    epoch: number of passes over the training samples.
  """
  files = sorted(glob.glob("/content/drive/MyDrive/Job/Infrred/dataset/train_all/*"))
  available = set(files)

  # OCR and label construction do not depend on the model weights, so they
  # are done once instead of once per epoch.
  samples = []
  for image_path in files:
    if not image_path.endswith('jpg'):
      continue
    label_path = image_path[:-len('jpg')] + 'json'
    if label_path not in available:
      continue  # no labels for this image

    text = img_words(img_preprocess(image_path, 1)).upper()
    text_tensor = get_text_tensor(text)
    if text_tensor.shape[0] == 0:
      continue  # nothing recognised: no sequence to feed to the LSTM

    labels = pd.read_json(label_path, typ='series')
    target = transform(text, labels).reshape(-1)
    samples.append((text_tensor, target))

  for current_epoch in range(epoch):
    running_loss = 0.0
    for text_tensor, target in samples:
      # zero the parameter gradients
      optimizer.zero_grad()

      # forward: the singleton middle axis is dropped so that the scores
      # line up with the 1-D target. The raw scores go to the criterion;
      # the CrossEntropyLoss built in the main cell normalises them itself,
      # so no softmax here.
      logits = model(text_tensor).squeeze(1)

      # backward + optimize
      loss = criterion(logits, target)
      loss.backward()
      optimizer.step()

      running_loss += loss.item()

    if samples:
      print('[%d] loss: %.3f' % (current_epoch + 1, running_loss / len(samples)))

  print('Finished Training')

In [15]:
# convert text to text tensor
def get_text_tensor(text):
  """Encode `text` as a column of vocabulary positions.

  The text is upper-cased and every character is replaced by its position in
  the module-level `all_vocab`; characters that are not in the vocabulary are
  left out. Returns a torch.long tensor with one row per kept character and a
  single column.
  """
  positions = [all_vocab.find(ch) for ch in text.upper()]
  column = torch.LongTensor(positions).unsqueeze(1)

  # str.find yields -1 for unknown characters; keep the valid rows only
  known = column[:, 0] >= 0
  return column[known]

In [16]:
# convert the per-character prediction into a dictionary of fields: group consecutive equal class ids in pred into runs and, for each field, keep the run whose maximum probability is highest
def pred_to_dict(text, pred, prob):
  """Turn per-character class predictions into one string per field.

  Consecutive positions with the same class id form a run. Class 0 means
  "no field"; classes 1-4 map to company, date, address and total. For each
  field the run with the highest peak probability wins and the matching slice
  of `text` is returned, stripped, with tabs and newlines turned into spaces.

  Args:
    text: the characters the predictions refer to, position by position.
    pred: array of class ids; a 0-d value (single character) is accepted.
    prob: array of the probabilities of the predicted classes, same length.

  Returns:
    dict with the keys company, date, address, total; "" for a field that
    was not predicted.
  """
  fields = ("company", "date", "address", "total")

  # squeeze() in the caller collapses a one-character prediction to 0-d
  pred = np.atleast_1d(pred)
  prob = np.atleast_1d(prob)

  # run boundaries: wherever the class id changes, plus both ends
  bounds = [0] + (np.nonzero(np.diff(pred))[0] + 1).tolist() + [len(pred)]

  best = {name: ("", 0) for name in fields}
  for start, stop in zip(bounds, bounds[1:]):
    if start == stop:
      continue  # only happens for an empty prediction

    field_id = int(pred[start]) - 1
    # no class
    if field_id == -1:
      continue

    name = fields[field_id]
    peak = prob[start:stop].max()
    if peak > best[name][1]:
      best[name] = (text[start:stop], peak)

  blanks = str.maketrans("\t\n", "  ")
  return {name: chunk.strip().translate(blanks) for name, (chunk, _) in best.items()}

In [17]:
def predict(model, text, text_tensor):
  """Run `model` on `text_tensor` and return the extracted fields.

  The model scores are normalised with a softmax over axis 2; the most
  probable class of every position and its probability are handed to
  pred_to_dict together with `text`. No gradients are recorded.
  """
  with torch.no_grad():
    scores = torch.nn.functional.softmax(model(text_tensor), dim=2)
    top = scores.max(dim=2)
    confidences = top.values.squeeze().numpy()
    classes = top.indices.squeeze().numpy()

  return pred_to_dict(text, classes, confidences)

# Evaluation

In [50]:
def evaluate(df, index, result):
  """Record ground truth and prediction of one sample for the later scoring.

  Looks up the row of `df` whose 'index' column equals int(index) and appends
  its company, date, address and total values, together with the values of
  `result`, to the module-level *_truth / *_pred lists.

  Args:
    df: table of ground-truth labels with the columns index, company, date,
      address, total.
    index: id of the sample; converted with int() for the lookup.
    result: dict of predicted field strings (as returned by predict).
  """
  rows = df[df['index'] == int(index)]
  if rows.empty:
    # keep the truth and prediction lists aligned: record neither
    import warnings
    warnings.warn(f"no ground truth for sample {index}; sample skipped")
    return

  # one scalar per field; appending the row slices themselves would hand
  # pandas Series to the metric functions instead of labels
  truth = rows.iloc[0]
  for field, truths, preds in (
    ('company', company_truth, company_pred),
    ('date', date_truth, date_pred),
    ('address', address_truth, address_pred),
    ('total', total_truth, total_pred),
  ):
    truths.append(truth[field])
    preds.append(result[field])

In [52]:
def compute_f1_and_precision(company_truth, company_pred, date_truth, date_pred, address_truth, address_pred, total_truth, total_pred):
  """Score the four fields with macro-averaged F1 and precision.

  Returns two lists, (f1, precision), each ordered company, date, address,
  total.
  """
  per_field = (
    (company_truth, company_pred),
    (date_truth, date_pred),
    (address_truth, address_pred),
    (total_truth, total_pred),
  )

  f1 = [f1_score(truth, pred, average='macro') for truth, pred in per_field]
  precision = [precision_score(truth, pred, average='macro') for truth, pred in per_field]

  return f1, precision

In [54]:
def plot_result(result, title):
  """Show a bar chart of `result`, one bar per field, under `title`.

  `result` holds four values ordered company, date, address, total.
  """
  field_names = ['Company', 'Date', 'Address', 'Total']

  figure = plt.figure()
  # NOTE(review): left=1 puts the axes to the right of the figure area;
  # kept as is, confirm that this is intended for the notebook output
  axes = figure.add_axes([1,0,1,1])
  axes.bar(field_names, result)

  plt.title(title)
  plt.show()

# Main

In [66]:
if __name__ == "__main__":

  # character vocabulary read by get_text_tensor; its length sizes the embedding
  all_vocab = ascii_uppercase+digits+punctuation+" \t\n"
  model = MyModel(len(all_vocab), 16, 256)
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
  # accumulators filled by evaluate()
  company_truth, company_pred, date_truth, date_pred, address_truth, address_pred, total_truth, total_pred = [], [], [], [], [], [], [], []

  # map_location lets a checkpoint that was saved from a GPU load on a
  # CPU-only runtime; the model itself is never moved off the CPU here
  model.load_state_dict(torch.load("/content/drive/MyDrive/Job/Infrred/AIESI-master/AIESI_using_tesseract/model.pth", map_location="cpu"))
  # all json files; the label columns are read as text so that they compare
  # against the predicted strings (e.g. a total of 9.50 is not parsed as 9.5)
  df = pd.read_csv(
    '/content/drive/MyDrive/Job/Infrred/dataset/train_all/all_json.csv',
    index_col=0,
    dtype={'company': str, 'date': str, 'address': str, 'total': str},
  )
  # train model
  # train(5)
  # torch.save(model.state_dict(), '/content/drive/MyDrive/Job/Infrred/dataset')

  # # compute score for training dataset
  # files = glob.glob('/content/drive/MyDrive/Job/Infrred/dataset/train_all/*')
  # for file in files:
  #   if file.endswith('jpg'):
  #     # return index
  #     index = file[-7:-4]
  #     if index.find('/') != -1:
  #       index = index[index.find('/')+1:]

  #     img = img_preprocess(file, 1)
  #     text = img_words(img)
  #     text = text.upper()
  #     text_tensor = get_text_tensor(text)
  #     pred = predict(model, text, text_tensor)
  #     print(pred)
  #     evaluate(df, index, pred)

  #     print(index)
  # # print(company_truth, company_pred, date_truth, date_pred, address_truth, address_pred, total_truth, total_pred)
  # f1, precision = compute_f1_and_precision(company_truth, company_pred, date_truth, date_pred, address_truth, address_pred, total_truth, total_pred)
  # plot_result(f1, 'F1 Score')
  # plot_result(precision, 'Precision')


  # output json for validation data set
  # sorted: glob order is arbitrary, sorting makes the csv row order reproducible
  files = sorted(glob.glob('/content/drive/MyDrive/Job/Infrred/dataset/val_all/*'))
  df = pd.DataFrame(columns=['index', 'company', 'date', 'address', 'total'])
  rows = []
  for file in files:
    if file.endswith('jpg'):
      # sample id: the last three characters before ".jpg", cut at the folder
      # separator for shorter names (the same rule agg_json applies).
      # NOTE(review): longer file names are truncated to three characters.
      index = file[-7:-4]
      if index.find('/') != -1:
        index = index[index.find('/')+1:]

      img = img_preprocess(file, 1)
      text = img_words(img)
      text_tensor = get_text_tensor(text)
      pred = predict(model, text, text_tensor)
      pred['index'] = index
      rows.append(pd.DataFrame([pred]))

  # one concat at the end instead of re-copying the growing table per image;
  # the empty frame goes first so that it fixes the column order
  df = pd.concat([df] + rows)
  df.to_csv('/content/drive/MyDrive/Job/Infrred/dataset/val_all/val_all_json.csv')