In [None]:
# Environment setup: each line is a shell escape that installs a system
# package (apt / apt-get) or a Python package (pip, quiet mode via -q).
# System packages: Tesseract development files and the Italian language data.
!sudo apt install libtesseract-dev
!sudo apt install tesseract-ocr-ita
# Python packages for OCR and PDF handling.
!pip install pytesseract -q
!pip install PyPDF2 -q
!pip install pdf2image -q
!pip install pymupdf -q
# NOTE(review): poppler-utils is presumably needed by pdf2image at runtime — confirm.
!apt-get install poppler-utils -q
!pip install easyocr -q

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libarchive-dev libleptonica-dev
The following NEW packages will be installed:
  libarchive-dev libleptonica-dev libtesseract-dev
0 upgraded, 3 newly installed, 0 to remove and 23 not upgraded.
Need to get 3,343 kB of archives.
After this operation, 15.7 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal-updates/main amd64 libarchive-dev amd64 3.4.0-2ubuntu1.2 [491 kB]
Get:2 http://archive.ubuntu.com/ubuntu focal/universe amd64 libleptonica-dev amd64 1.79.0-1 [1,389 kB]
Get:3 http://archive.ubuntu.com/ubuntu focal/universe amd64 libtesseract-dev amd64 4.1.1-2build2 [1,463 kB]
Fetched 3,343 kB in 1s (3,426 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm li

In [None]:
import cv2
import numpy as np
import random
import string
import time
import pytesseract

In [None]:
import io
import requests
import fitz
import easyocr

In [None]:
def string_similarity(s1, s2):
    """
    Return the normalized Levenshtein similarity between two strings.

    The similarity is ``1 - distance / max(len(s1), len(s2))`` where
    ``distance`` is the Levenshtein edit distance (insertions, deletions and
    substitutions each cost 1).

    Parameters:
        s1: first string.
        s2: second string.

    Returns:
        A value between 0 and 1, where 1 means the strings are identical and
        0 means every position has to be edited. Two empty strings are
        identical and score 1; an empty string against a non-empty one
        scores 0.
    """
    n = len(s1)
    m = len(s2)
    # Two empty strings are identical; handle this first so the division
    # below never sees max(n, m) == 0.
    if n == 0 and m == 0:
        return 1.0
    if n == 0 or m == 0:
        return 0.0
    # Only the previous row of the edit-distance table is needed to compute
    # the current one, so keep two rows instead of the full (n+1) x (m+1) matrix.
    previous = list(range(m + 1))
    for i in range(1, n + 1):
        current = [i] + [0] * m
        for j in range(1, m + 1):
            if s1[i-1] == s2[j-1]:
                current[j] = previous[j-1]
            else:
                current[j] = min(previous[j], current[j-1], previous[j-1]) + 1
        previous = current
    # calculate similarity score as 1 - normalized edit distance
    return 1 - (previous[m] / max(n, m))


In [None]:
def initialize_readers():
  """
  Create the EasyOCR readers used by the benchmark.

  Returns:
      A two-element list of Italian (``'it'``) ``easyocr.Reader`` objects:
      index 0 is created with ``gpu=False`` and index 1 with ``gpu=True``.
  """
  # Build the CPU reader first, then the GPU reader, so the list index
  # doubles as the "use GPU" flag.
  return [easyocr.Reader(['it'], gpu = use_gpu) for use_gpu in (False, True)]

In [None]:
def get_tesseract_text(img, line_length = 23):
  """
  Run Tesseract on an image and return the text re-wrapped to fixed-width lines.

  Parameters:
      img: image passed straight to ``pytesseract.image_to_string``.
      line_length: number of characters per output line (default 23, the
          width previously hard-coded here).

  Returns:
      The recognized text with all whitespace removed, split into lines of
      ``line_length`` characters joined by a newline.
  """
  text = pytesseract.image_to_string(img)
  # str.split() with no argument drops every whitespace character, not just
  # spaces and newlines. Tesseract 4.x may leave a trailing form-feed page
  # separator in the output, which would otherwise count as an extra character.
  text = ''.join(text.split())
  return '\n'.join(text[i:i+line_length] for i in range(0, len(text), line_length))

In [None]:
def get_easyOcr_text(img, readers, gpu = False, line_length = 23):
  """
  Run EasyOCR on an image and return the text re-wrapped to fixed-width lines.

  Parameters:
      img: image passed straight to the reader's ``readtext``.
      readers: sequence of readers where index 0 is the CPU reader and
          index 1 is the GPU reader (as built by ``initialize_readers``).
      gpu: truthy to use ``readers[1]``, falsy to use ``readers[0]``.
      line_length: number of characters per output line (default 23, the
          width previously hard-coded here).

  Returns:
      The recognized fragments concatenated, with all whitespace removed,
      split into lines of ``line_length`` characters joined by a newline.
  """
  # Select by truthiness so values such as None or 0/1 pick a reader instead
  # of leaving the result unbound.
  reader = readers[1] if gpu else readers[0]
  fragments = reader.readtext(img, batch_size = 50, detail = 0)
  # Join the fragments, then drop every whitespace character (spaces,
  # newlines, tabs, ...) before re-wrapping.
  text_easy = ''.join(''.join(fragments).split())
  return '\n'.join(text_easy[i:i+line_length] for i in range(0, len(text_easy), line_length))

In [None]:
def test_ocr():
  """
  Benchmark Tesseract and EasyOCR (CPU and GPU) on a synthetic text image.

  A random string of 1000 uppercase letters and digits is wrapped at 23
  characters per line, rendered in black on a white image and written to
  ``random_text.png``. Each OCR engine is then run on the in-memory image
  and timed.

  Returns:
      A list of three dicts, in the order tesseract, easyCPU, easyGPU, each
      with the keys ``'text'`` (recognized text), ``'time'`` (elapsed
      seconds), ``'method'`` (engine label) and ``'similarity'`` (a string
      of the form ``"Similarity score: NN.NN%"`` comparing the recognized
      text with the generated text).
  """
  # Image and text layout settings
  width = 500
  line_height = 30
  chars_per_line = 23
  font = cv2.FONT_HERSHEY_SIMPLEX
  font_scale = 1
  thickness = 2

  readers = initialize_readers()

  # Generate a random string of 1000 characters
  random_string = ''.join(random.choices(string.ascii_uppercase + string.digits, k=1000))

  # Insert a newline character after every 23 characters
  random_string = '\n'.join(random_string[i:i+chars_per_line]
                            for i in range(0, len(random_string), chars_per_line))

  # Split the text into lines
  lines = random_string.split('\n')

  # Measure every line once; widen the canvas if a line of wide glyphs would
  # not fit, so no text is clipped at the image border.
  text_widths = [cv2.getTextSize(line, font, font_scale, thickness)[0][0] for line in lines]
  width = max(width, max(text_widths))

  # Calculate the required height of the image (one spare line at the bottom)
  height = (len(lines) * line_height) + line_height

  # Create a white image
  img = np.full((height, width, 3), 255, np.uint8)

  # Draw each line horizontally centered, one baseline per line_height
  for i, (line, text_width) in enumerate(zip(lines, text_widths)):
    x = int((width - text_width) / 2)
    y = int(line_height * (i+1))
    cv2.putText(img, line, (x, y), font, font_scale, (0, 0, 0), thickness)

  # Save the image
  cv2.imwrite('random_text.png', img)

  # Engines to compare: (label, zero-argument callable returning the OCR text)
  engines = [
    ('tesseract', lambda: get_tesseract_text(img)),
    ('easyCPU', lambda: get_easyOcr_text(img, readers, gpu = False)),
    ('easyGPU', lambda: get_easyOcr_text(img, readers, gpu = True)),
  ]

  results = []
  for method, run_ocr in engines:
    # perf_counter is the clock intended for measuring short intervals
    start_time = time.perf_counter()
    ocr_text = run_ocr()
    elapsed = time.perf_counter() - start_time
    # string_similarity returns a 0-1 fraction; scale it to match the '%' suffix
    similarity = 100 * string_similarity(random_string, ocr_text)
    results.append({'text': ocr_text, 'time': elapsed, 'method': method,
                    'similarity': "Similarity score: {:.2f}%".format(similarity)})

  return results


In [None]:
# Run the benchmark; the returned list of per-engine result dicts is shown as the cell output.
test_ocr()



[{'text': '76OKSNSBPYO4YMYDXEOGOZE\nD4IBAFIVOIAIXLIWRXZ50GI\nBAJQMEUBIZEEVI12PSDPVSX\nI948HSLHEVDIPQ1PE60Z617\n877BG8MDOSPV2W7HWPKSN92\n8BKSAM4CUGZIZIREHZCE6QS\nT2YD1KNI4RSYURSMBK6UORS\nOBEUWXWQSUKDZJOLX52X805\nSBHSN6JQOSPS8SSS9NI4UID\nUSBXIY4EOPPNL84YIZOQGWL\nLAXIDBWK25BLWW8NUOVGHAQ\nWGKMORJR27ZPFTPS6C104R8\n5USMSOJGOO22W86KYWX64G2\n0QMCJHGYJLYPO6BNRKAYRGT\nNLUGMBXBSE7VOTKS3DCSYXM\nW6TD1WSAHOCSM8T49IRSZ8C\nPTIWJQWSFXOPJ9SU7UY1SR4\nAPQHA1HSIIEBDEHJ4UGOUX7\nN4LTLOMY9M2HWASRMGZWUQX\nP7CR4HS1B3Z2G6GY11C6SJ8\nJZG7N3WC7X074BOEDTEOXXR\nJ11S36QYROCAL16DNW68FKQ\nSF6VWC9ZM7RGESQXAVXMBUT\nOB5EGAR1UQR9KZ4NMIGZ4SI\nXNXW68XDV3GPIL7YYMJNBW1\nYUPXAZKSZXJEWPXG6GHXFF2\n6X1308BTIWO2J66N3JD6HO8\nWMI4N3USCRT7QY7HBEYHMHC\nDFHIV4NISOK6FXCHOPWO116\nUH4U810QOCZ1CV8P9NM7611\nCTOS7XUJ7LWKYSSDYZMULDD\nUSOAQSPWOWX1SLURWNH9OVP\nE8WS9LDM31WHCUUG70YPEI9\nSB46UD6FO7TR9O1AZXSFHSO\nIPQSDUJIQ7M3EBE6YZ4LKHM\nZGYO35R1I6KPFH365D3RSN6\n00VUHWG6RPENXOBNO4JCKSF\nEUJSCRNUKW4K1FZE3GCOM9X\n87DE1HKMEOYMFUQEALWMZG1\nE9VWXLGYTH1FK9