### Acceso a drive

In [10]:
# Mount Google Drive into the Colab filesystem at /content/drive; the data and
# output folders used further down live under its MyDrive directory.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Bibliotecas 

In [11]:
# Tesseract OCR engine and its Python wrapper.
!sudo apt install tesseract-ocr
!pip3 install pytesseract
# Copy the Spanish trained-data file from Drive into the Tesseract 4.00 tessdata folder.
# NOTE(review): the source path is relative, so this presumably relies on the
# working directory being /content — confirm.
!cp drive/MyDrive/HackathonRIIAA2021/Data/spa.traineddata /usr/share/tesseract-ocr/4.00/tessdata/

# Enchant spell-checking library, its Python bindings and the myspell-es package
# (presumably the Spanish dictionaries behind the "es_MX" checker — confirm).
!apt install enchant
!pip install pyenchant
!sudo apt-get install myspell-es

Reading package lists... Done
Building dependency tree       
Reading state information... Done
tesseract-ocr is already the newest version (4.00~git2288-10f4998a-2).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 40 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
enchant is already the newest version (1.6.0-11.1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 40 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
myspell-es is already the newest version (1.11-14).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remov

In [12]:
import os 
import nltk
import re
import string
import distance
import pytesseract as ocr
import cv2 as cv
import numpy as np
from tqdm import tqdm
from enchant.checker import SpellChecker

Función para identificar palabras incorrectas en el texto resultante del Tesseract-OCR

In [13]:
# cleanup text
def get_personslist(text):
    """Return the distinct first tokens of every PERSON entity NLTK finds in ``text``.

    The text is split into sentences, each sentence is tokenised, POS-tagged
    and passed through ``nltk.ne_chunk``; for every chunk labelled ``PERSON``
    only the first leaf's token is kept. Duplicates are removed, so the order
    of the returned list is not meaningful.
    """
    found = []
    for sentence in nltk.sent_tokenize(text):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
        for node in nltk.ne_chunk(tagged_tokens):
            if not isinstance(node, nltk.tree.Tree):
                continue
            if node.label() != 'PERSON':
                continue
            first_token, _tag = node.leaves()[0]
            found.append(first_token)
    # Newest match first, then de-duplicate through a set.
    found.reverse()
    return list(set(found))


# using enchant.checker.SpellChecker, identify incorrect words
def identify_incorrect_words(text, spell=None):
  """Mask every word of ``text`` that the spell checker rejects.

  The text is first normalised: line breaks, backslashes and hyphens become
  spaces, and punctuation marks are padded with spaces so they form their own
  tokens. Each whitespace-delimited token longer than one character that
  ``spell.check`` rejects is then replaced by ' [MASK] ' and its suggestions
  are recorded.

  Masking is done token by token, so a rejected word can never be matched
  inside another (correct) word, and words followed by a tab or sitting at the
  very end of the text are masked as well. This guarantees that the n-th
  '[MASK]' in the returned text corresponds to the n-th entry of the returned
  suggestions list.

  Args:
    text: raw text, e.g. the output of Tesseract.
    spell: optional object exposing ``check(word)`` and ``suggest(word)``.
      Defaults to ``SpellChecker("es_MX")``.

  Returns:
    A tuple ``(masked_text, suggestedwords)`` where ``suggestedwords`` is a
    list of single-entry dicts ``{rejected_word: [suggestions, ...]}`` in
    order of appearance.
  """
  # NOTE(review): the original literal listed the '"' key several times; the
  # extra entries were presumably typographic quotes lost in an encoding
  # conversion — confirm. Only the last value (' " ') ever took effect.
  replacements = {
      '\n': ' ', '\\': ' ', '"': ' " ', '-': ' ',
      ',': ' , ', '.': ' . ', '!': ' ! ', '?': ' ? ', '*': ' * ',
      '(': ' ( ', ')': ' ) ', '=-\n': '',
  }
  pattern = re.compile("|".join(re.escape(key) for key in replacements))
  text = pattern.sub(lambda m: replacements[m.group(0)], text)

  ignorewords = {"!", ",", ".", "\"", "?", '(', ')', '*'}
  if spell is None:
    spell = SpellChecker("es_MX")

  suggestedwords = []

  def _mask_if_incorrect(match):
    # group(1) is the token, group(2) the single space after it (if any).
    word = match.group(1)
    if len(word) > 1 and word not in ignorewords and not spell.check(word):
      suggestedwords.append({word: spell.suggest(word)})
      # The mask swallows the trailing space and is padded on both sides so
      # that it always survives as a standalone token after text.split(" ").
      return ' [MASK] '
    return match.group(0)

  text = re.sub(r'(\S+)( ?)', _mask_if_incorrect, text)
  return text, suggestedwords

Función para corregir las palabras previamente identificadas como incorrectas.

La corrección se realiza encontrando la palabra más similar, mediante la distancia de Jaccard, dentro de la lista de sugerencias proporcionada por un diccionario.

In [14]:
def get_close_matches(word, word_options, metric=None):
  """Return the entry of ``word_options`` that is closest to ``word``.

  Every candidate is compared, and the one with the smallest distance wins;
  on ties the earliest candidate is kept. (The previous version returned from
  inside the loop and therefore always yielded the first candidate.)

  Args:
    word: the word to be replaced.
    word_options: iterable of candidate replacements.
    metric: optional callable ``metric(word, option) -> number``. Defaults to
      ``distance.jaccard``.

  Returns:
    The closest candidate, or "" when ``word_options`` is empty.
  """
  if metric is None:
    metric = distance.jaccard
  return min(word_options, key=lambda option: metric(word, option), default="")

def set_suggestion(text, suggestedwords):
  """Fill the '[MASK]' placeholders of ``text`` with corrected words.

  ``suggestedwords`` is a list of single-entry dicts
  ``{original_word: [suggestions, ...]}``; the n-th standalone '[MASK]' token
  (tokens are obtained by splitting on a single space) consumes the n-th
  entry. When an entry has suggestions the one chosen by
  ``get_close_matches`` is used, otherwise the original word is restored.
  """
  mask = '[MASK]'
  mask_total = text.split(" ").count(mask)
  for position in range(mask_total):
    entry = suggestedwords[position]
    original, candidates = list(entry.items())[0]
    if len(candidates):
      replacement = get_close_matches(original, candidates)
    else:
      replacement = original
    # Always substitute the left-most remaining placeholder.
    text = text.replace(mask, replacement, 1)
  return text


### Función para procesar imágenes y utilizar Tesseract-OCR

In [15]:
def _prepare_for_ocr(image):
  """Return ``(gray_image, thr_img, bin_image)`` derived from a BGR image."""
  # Remove the noise caused by shadows in the images: subtract a dilated and
  # median-blurred copy from the grayscale image, then stretch the result
  # back to the 0-255 range.
  gray_image = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
  dilated_image = cv.dilate(gray_image, np.ones((7,7), np.uint8))
  blur_image = cv.medianBlur(dilated_image, 21)
  diff_image = 255 - cv.absdiff(gray_image, blur_image)
  norm_image = diff_image.copy()
  cv.normalize(diff_image, norm_image, alpha=0, beta=255, norm_type=cv.NORM_MINMAX, dtype=cv.CV_8UC1)
  # Truncate everything above 230, re-normalise, then binarise with Otsu.
  thr_img = cv.threshold(norm_image, 230, 0, cv.THRESH_TRUNC)[1]
  cv.normalize(thr_img, thr_img, alpha=0, beta=255, norm_type=cv.NORM_MINMAX, dtype=cv.CV_8UC1)
  bin_image = cv.threshold(thr_img, 0, 255, cv.THRESH_BINARY | cv.THRESH_OTSU)[1]
  return gray_image, thr_img, bin_image


def _save_text(path, text, mode):
  """Write ``text`` to ``path`` as UTF-8 using the given file mode."""
  with open(path, mode, encoding='utf-8') as textfile:
    textfile.write(text)


def get_processed_images(src_path, processed_path, texts_path, bin_option=True, border_option=False,
                         min_crop_height=1500, min_crop_width=2500):
  """Pre-process every image in ``src_path`` and extract its text with Tesseract.

  Each file is upscaled by 1.3, cleaned and binarised, and then handled in one
  of two ways:

  * ``border_option`` false: the whole binarised image is OCR'd; the image is
    saved as ``<name>.png`` in ``processed_path`` and the raw text overwrites
    ``<name>.txt`` in ``texts_path``. No spell correction is applied.
  * ``border_option`` true: external contours of a heavily dilated inverse
    mask are used to crop regions; every region larger than the minimum size
    is OCR'd, passed through ``identify_incorrect_words`` / ``set_suggestion``
    and, if any text remains, saved as ``<name>_<contour index>.png`` while
    its text is APPENDED to ``<name>.txt`` (re-running therefore duplicates
    text in this mode).

  Files that OpenCV cannot decode are reported and skipped instead of
  aborting the run.

  Args:
    src_path: directory containing the source images.
    processed_path: directory that receives the processed images.
    texts_path: directory that receives the extracted text files.
    bin_option: in border mode, crop from the binarised image when true and
      from the grayscale image otherwise.
    border_option: enable the contour-based cropping mode described above.
    min_crop_height: a crop is only processed when its height exceeds this.
    min_crop_width: a crop is only processed when its width exceeds this.
  """
  source = [f for f in os.listdir(src_path) if os.path.isfile(os.path.join(src_path, f))]
  print('\n[INFO] Se encontraron {} imagenes para procesar.\n'.format(len(source)))

  config_options = r'--oem 1'

  for file_name in tqdm(source):
    # Read and upscale the image. splitext keeps dots that belong to the file
    # name, so "scan.01.png" and "scan.02.png" no longer collapse to "scan".
    image_name = os.path.splitext(file_name)[0]
    image = cv.imread(os.path.join(src_path, file_name))
    if image is None:
      # cv.imread signals an unreadable / non-image file by returning None.
      tqdm.write('[WARN] No se pudo leer la imagen: {}'.format(file_name))
      continue
    image = cv.resize(image, None, fx=1.3, fy=1.3, interpolation=cv.INTER_CUBIC)

    # Image pre-processing.
    gray_image, thr_img, bin_image = _prepare_for_ocr(image)
    text_file_path = os.path.join(texts_path, '{}.txt'.format(image_name))

    if not border_option:
      # OCR on the full image, without border detection.
      text = ocr.image_to_string(bin_image, lang='spa', config=config_options)
      cv.imwrite(os.path.join(processed_path, '{}.png'.format(image_name)), bin_image)
      _save_text(text_file_path, text, 'w')
      continue

    # OCR with border detection: dilate the inverted binary image with a large
    # rectangular kernel and take the bounding box of each external contour.
    bin_image_inv = cv.threshold(thr_img, 0, 255, cv.THRESH_OTSU | cv.THRESH_BINARY_INV)[1]
    rect_kernel = cv.getStructuringElement(cv.MORPH_RECT, (90, 90))
    dilation = cv.dilate(bin_image_inv, rect_kernel, iterations = 1)

    # [-2] selects the contour list on both OpenCV 3 (3 return values) and
    # OpenCV 4 (2 return values).
    contours = cv.findContours(dilation, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_NONE)[-2]
    image_copy = bin_image.copy() if bin_option else gray_image.copy()

    for count, cnt in enumerate(contours):
      x, y, w, h = cv.boundingRect(cnt)
      cropped = image_copy[y:y + h, x:x + w]
      h, w = cropped.shape
      if h > min_crop_height and w > min_crop_width:
        text = ocr.image_to_string(cropped, lang='spa', config=config_options)
        text, suggestedwords = identify_incorrect_words(text)
        text = set_suggestion(text, suggestedwords)
        if len(text) > 0:
          cv.imwrite(os.path.join(processed_path, '{}_{}.png'.format(image_name, count)), cropped)
          _save_text(text_file_path, text, 'a')

In [16]:
# Folder with the source images, and the output folders for the processed
# images and the extracted text files.
# NOTE(review): these paths are relative, so they presumably rely on the
# working directory being /content (where Drive is mounted) — confirm.
IMAGES_PATH = 'drive/MyDrive/Datos - Hackathon JusticIA/Fichas_auto/'
PROCESSED_PATH = 'drive/MyDrive/HackathonRIIAA2021/Processed_images/Fichas_auto_dic/'
TEXTS_PATH = 'drive/MyDrive/HackathonRIIAA2021/Texts/Fichas_auto_dic/'

In [17]:
# Run the pipeline over every file in IMAGES_PATH with the default options
# (bin_option=True, border_option=False). With border_option=False the OCR
# text is written as-is; the spell-correction helpers are only used in the
# border-detection branch. The recorded run below took about three hours for
# 1000 images.
get_processed_images(src_path=IMAGES_PATH, processed_path=PROCESSED_PATH, texts_path=TEXTS_PATH)


[INFO] Se encontraron 1000 imagenes para procesar.



100%|██████████| 1000/1000 [2:59:54<00:00, 10.79s/it]
