In [None]:
# !pip install easyocr -U
# !pip install python-Levenshtein -U
# !pip install pytesseract -U
# !apt install tesseract-ocr
# !apt install libtesseract-dev
# !python3 -m pip install paddlepaddle-gpu -U
# !pip install paddleocr -U

In [None]:
import cv2 as cv
from google.colab.patches import cv2_imshow
import easyocr
import numpy as np
import pandas as pd
import Levenshtein
import matplotlib.pyplot as plt
import pytesseract
from paddleocr import PaddleOCR, draw_ocr

# Import Images

In [None]:
fig = plt.figure(figsize=(25,40))

for i in range(1,4):
  img = cv.imread(f'/content/drive/MyDrive/4th_Year/DSC4996/Claim_forms/{i}.png')
  fig.add_subplot(1, 3, i)
  plt.imshow(img)
  plt.axis('off')
  plt.title(f'Image {i}')

# Prepair Image Set

In [None]:
img = {}

for i in range(1,16):
  img[f'BGR image {i}'] = cv.imread(f'/content/drive/MyDrive/4th_Year/DSC4996/Claim_forms/{i}.png')
  img[f'RGB image {i}'] = cv.cvtColor(img[f'BGR image {i}'], cv.COLOR_BGR2RGB)
  img[f'GrayScale image {i}'] = cv.cvtColor(img[f'BGR image {i}'], cv.COLOR_BGR2GRAY)
  _ , img[f'OTSU_85 image {i}'] = cv.threshold(img[f'GrayScale image {i}'], 85, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)

raw_image_type = ['BGR', 'RGB', 'GrayScale', 'OTSU_85']
kernel = np.array([[0, -1, 0],
                   [-1, 5,-1],
                   [0, -1, 0]])

for image_type in raw_image_type:
  for i in range(1,16):
    img[f"{image_type} Sharpen image {i}"] = cv.filter2D(img[f'{image_type} image {i}'], ddepth=-1, kernel=kernel)

# Plot the Image Set

In [None]:
image_type = ['BGR', 'RGB', 'GrayScale', 'OTSU_85', 'BGR Sharpen', 'RGB Sharpen', 'GrayScale Sharpen', 'OTSU_85 Sharpen']

position = 1
fig = plt.figure(figsize=(23, 50))

for img_type in image_type:
  fig.add_subplot(4, 3, position)

  plt.imshow(img[f'{img_type} image 1'])
  plt.axis('off')
  plt.title(f"{img_type} image 1")

  position += 1

# **Finding the best image input type among BGR, RGB, GrayScale, BGR Sharpen, RGB Sharpen, and GrayScale Sharpen using EasyOCR, PaddlePaddleOCR, and PyTesseract with Levenshtein's CER, WER values**

### EasyOCR Model

In [None]:
def easyOCR(image):
  reader = easyocr.Reader(['en'])
  result = reader.readtext(image)

  text = ''
  for i in range(0,len(result)):
    text += result[i][1] + " "

  text = text.replace('  ', ' ').replace('   ', ' ').replace('    ', ' ')

  return text.strip()

### Predicting Text using EasyOCR for each image input type

In [None]:
predicted_text = {}

for i in range(1,16):
  for input_img in image_type:
    predicted_text[f'{input_img} image {i} EasyOCR text'] = easyOCR(img[f'{input_img} image {i}'])
    if i==1:
      print("")
      print(f"{input_img} image {i} EasyOCR text :-")
      print(predicted_text[f'{input_img} image {i} EasyOCR text'])
      print("")

### pyTesseract Model

In [None]:
def pytes(image):
  result = pytesseract.image_to_string(image)
  result = ' '.join(result.split())
  result = result.replace('  ', ' ').replace('   ', ' ').replace('    ', ' ')
  return result

In [None]:
for i in range(1,16):
  for input_img in image_type:
    predicted_text[f'{input_img} image {i} pyTesseract text'] = pytes(img[f'{input_img} image {i}'])
    if i == 1:
      print(f"{input_img} image {i} pyTesseract text :-")
      print(predicted_text[f'{input_img} image {i} pyTesseract text'])
      print("")

### PaddlePaddleOCR Model

In [None]:
def ppOCR(image):
  ocr = PaddleOCR(use_angle_cls = True, lang = 'en', use_gpu=True)
  result = ocr.ocr(image)

  text = ''
  for i in range(len(result[0])):
    text += result[0][i][1][0] + " "

  text = text.replace('  ', ' ').replace('   ', ' ').replace('    ', ' ')

  return text.strip()

In [None]:
for i in range(1,16):
  for input_img in image_type:
    predicted_text[f'{input_img} image {i} PaddlePaddleOCR text'] = ppOCR(img[f'{input_img} image {i}'])
    if i == 1:
      print('')
      print(f"{input_img} image {i} PaddlePaddleOCR text :-")
      print(predicted_text[f'{input_img} image {i} PaddlePaddleOCR text'])
      print("")
      print('')

### **Get CER (Character Error Rate) and WER (Word Error Rate) for every input image type using Levenshtein Method**

### Levenshtein Method

In [None]:
def LevenshteinDist(ref, pred):
  cer = Levenshtein.distance(ref, pred) / len(ref)
  wer = Levenshtein.distance(ref.split(), pred.split()) / len(ref.split())

  return round(cer,4), round(wer,4)

### Ground Truth texts

In [None]:
original_text = {}

with open('/content/drive/MyDrive/4th_Year/DSC4996/Claim_forms/original_text.txt', 'r') as f:
  txt = f.read()
  txt = txt.split('\n\n')

for i in range(len(txt)):
  original_text[f'image {i+1}'] = txt[i]

for i in range(1,4):
  print(original_text[f'image {i}'])
  print('\n')

In [None]:
df = {}

df['Image input type and model'] = []
for input_img in image_type:
  for model in ['EasyOCR', 'pyTesseract', 'PaddlePaddleOCR']:
    df['Image input type and model'].append(input_img + ' ' + model)

for i in range(1, 16):
  df[f'CER for image {i}'], df[f'WER for image {i}'] = [], []

### Calculating CER and WER for EasyOCR model predicted text

In [None]:
for i in range(1,16):
  for input_img in image_type:
    cer, wer = LevenshteinDist(original_text[f'image {i}'], predicted_text[f'{input_img} image {i} EasyOCR text'])
    df[f'CER for image {i}'].append(cer)
    df[f'WER for image {i}'].append(wer)

### Calculating CER and WER for pyTesseract model predicted text

In [None]:
for i in range(1,16):
  for input_img in image_type:
    cer, wer = LevenshteinDist(original_text[f'image {i}'], predicted_text[f'{input_img} image {i} pyTesseract text'])
    df[f'CER for image {i}'].append(cer)
    df[f'WER for image {i}'].append(wer)

### Calculating CER and WER for PaddlePaddleOCR model predicted text

In [None]:
for i in range(1,16):
  for input_img in image_type:
    cer, wer = LevenshteinDist(original_text[f'image {i}'], predicted_text[f'{input_img} image {i} PaddlePaddleOCR text'])
    df[f'CER for image {i}'].append(cer)
    df[f'WER for image {i}'].append(wer)

### Creating DataFrame

In [None]:
df = pd.DataFrame.from_dict(df)
df

In [None]:
temp_cer = 0
temp_wer = 0

for i in range(1, 16):
  temp_cer += df[f'CER for image {i}']
  temp_wer += df[f'WER for image {i}']

df['AVG CER'] = temp_cer/15
df['AVG WER'] = temp_wer/15

df[['Image input type and model', 'AVG CER', 'AVG WER']]

Unnamed: 0,Image input type and model,AVG CER,AVG WER
0,BGR EasyOCR,0.013833,0.05708
1,BGR pyTesseract,0.013673,0.055947
2,BGR PaddlePaddleOCR,0.013833,0.05708
3,RGB EasyOCR,0.024507,0.0853
4,RGB pyTesseract,0.012113,0.060233
5,RGB PaddlePaddleOCR,0.013393,0.071187
6,GrayScale EasyOCR,0.01238,0.062133
7,GrayScale pyTesseract,0.024507,0.0853
8,GrayScale PaddlePaddleOCR,0.03288,0.11922
9,OTSU_85 EasyOCR,0.03182,0.101553


### Best image input type and OCR model based on Average CER Value

In [None]:
df[df['AVG CER'] == min(df['AVG CER'])][['Image input type and model', 'AVG CER']]

Unnamed: 0,Image input type and model,AVG CER
4,RGB pyTesseract,0.012113


### Best image input type and OCR model based on Average WER Value

In [None]:
df[df['AVG WER'] == min(df['AVG WER'])][['Image input type and model', 'AVG WER']]

Unnamed: 0,Image input type and model,AVG WER
1,BGR pyTesseract,0.055947


# 🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑
### *So we can conclude that best OCR model is **pyTesseract** OCR model and best image input types are **RGB** or **BGR** based on average CER and WER values for this case. Here after I choose **BGR** as a image input type and **pyTesseract** as a OCR model for future tasks.*
# 🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑🛑

---

