### Acceso a Google Drive

In [1]:
# Mount Google Drive at /content/drive so the notebook can read the input
# data and write its results there (Colab-only: requires google.colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Bibliotecas

In [2]:
# Install the Tesseract engine. apt-get is the script-stable front end and -y
# answers the confirmation prompt, so a fresh runtime does not stall on "Run all".
!sudo apt-get install -y tesseract-ocr
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install pytesseract
# Copy the Spanish language model into Tesseract's data directory.
# NOTE: the source path is relative, so it only resolves while the working
# directory is the parent of the Drive mount (/content); the destination is
# tied to the Tesseract 4.00 directory layout.
!cp drive/MyDrive/HackathonRIIAA2021/Data/spa.traineddata /usr/share/tesseract-ocr/4.00/tessdata/

Reading package lists... Done
Building dependency tree       
Reading state information... Done
tesseract-ocr is already the newest version (4.00~git2288-10f4998a-2).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 40 not upgraded.


In [3]:
import os 
import pytesseract as ocr
import cv2 as cv
import numpy as np
from tqdm import tqdm

### Función para procesar imágenes y utilizar Tesseract-OCR

In [4]:
def _prepare_ocr_images(image):
  """Suppress shadow noise in a BGR image and binarize it for OCR.

  Args:
    image: BGR image as returned by ``cv.imread``.

  Returns:
    Tuple ``(gray_image, thr_img, bin_image)``: the grayscale image, the
    shadow-corrected image truncated at 230 and re-normalized to 0-255, and
    its Otsu binarization.
  """
  gray_image = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
  # Shadow removal: take the difference between the grayscale image and a
  # dilated + median-blurred copy of it, then invert so the result is light
  # with dark strokes.
  dilated_image = cv.dilate(gray_image, np.ones((7,7), np.uint8))
  blur_image = cv.medianBlur(dilated_image,21)
  diff_image = 255 - cv.absdiff(gray_image, blur_image)
  norm_image = diff_image.copy()
  cv.normalize(diff_image, norm_image, alpha=0, beta=255, norm_type=cv.NORM_MINMAX, dtype=cv.CV_8UC1)
  # Clip everything brighter than 230 and stretch the range back to 0-255.
  thr_img = cv.threshold(norm_image, 230, 0, cv.THRESH_TRUNC)[1]
  cv.normalize(thr_img, thr_img, alpha=0, beta=255, norm_type=cv.NORM_MINMAX, dtype=cv.CV_8UC1)
  bin_image = cv.threshold(thr_img, 0, 255, cv.THRESH_BINARY | cv.THRESH_OTSU)[1]
  return gray_image, thr_img, bin_image


def get_processed_images(src_path, processed_path, texts_path, bin_option=True, border_option=False):
  """Preprocess every image in a folder and extract its text with Tesseract.

  Each regular file in ``src_path`` is read, upscaled by 1.3x, cleaned of
  shadow noise and binarized. The OCR result is written to
  ``<texts_path>/<name>.txt`` and the image(s) that were fed to the OCR to
  ``processed_path``. ``<name>`` is the file name without its last extension.

  Args:
    src_path: Folder containing the input images.
    processed_path: Folder for the preprocessed images (created if missing).
    texts_path: Folder for the extracted text files (created if missing).
    bin_option: Only used when ``border_option`` is true. If true, regions
      are cropped from the binarized image, otherwise from the grayscale one.
    border_option: If false, OCR runs once on the whole binarized image. If
      true, large regions are detected through contours and OCR runs on each
      region taller than 1500 px and wider than 2500 px; their texts are
      concatenated into a single file.

  Returns:
    None. Results are written to disk.

  Notes:
    * Files that OpenCV cannot decode are skipped with a warning.
    * Text files are always (re)written as UTF-8, so re-running the function
      replaces previous results instead of appending to them.
  """
  # Minimum size (in pixels) a detected region must exceed to be sent to OCR.
  min_region_height = 1500
  min_region_width = 2500
  config_options = r'--oem 1'

  source = [f for f in os.listdir(src_path) if os.path.isfile(os.path.join(src_path, f))]
  print('\n[INFO] Se encontraron {} imagenes para procesar.\n'.format(len(source)))

  # cv.imwrite fails silently and open() raises when the target folder is missing.
  os.makedirs(processed_path, exist_ok=True)
  os.makedirs(texts_path, exist_ok=True)

  for file_name in tqdm(source):
    # splitext keeps inner dots, so 'a.1.png' and 'a.2.png' do not collide.
    image_name = os.path.splitext(file_name)[0]
    image = cv.imread(os.path.join(src_path, file_name))
    if image is None:
      # cv.imread returns None (it does not raise) for unreadable files.
      tqdm.write('[WARN] No se pudo leer la imagen {}; se omite.'.format(file_name))
      continue
    image = cv.resize(image, None, fx=1.3, fy=1.3, interpolation=cv.INTER_CUBIC)

    gray_image, thr_img, bin_image = _prepare_ocr_images(image)
    text_file_path = os.path.join(texts_path, '{}.txt'.format(image_name))

    if not border_option:
      # OCR over the whole binarized image, without region detection.
      text = ocr.image_to_string(bin_image, lang='spa', config=config_options)
      cv.imwrite(os.path.join(processed_path, '{}.png'.format(image_name)), bin_image)
      with open(text_file_path, 'w', encoding='utf-8') as textfile:
        textfile.write(text)
      continue

    # OCR per detected region: a large rectangular dilation of the inverted
    # binarization merges nearby strokes into blobs whose outer contours
    # delimit the candidate regions.
    bin_image_inv = cv.threshold(thr_img, 0, 255, cv.THRESH_OTSU | cv.THRESH_BINARY_INV)[1]
    rect_kernel = cv.getStructuringElement(cv.MORPH_RECT, (90, 90))
    dilation = cv.dilate(bin_image_inv, rect_kernel, iterations = 1)
    # [-2] picks the contour list whether findContours returns 2 or 3 values.
    contours = cv.findContours(dilation, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_NONE)[-2]

    ocr_input = bin_image if bin_option else gray_image
    region_texts = []
    for count, cnt in enumerate(contours):
      x, y, w, h = cv.boundingRect(cnt)
      cropped = ocr_input[y:y + h, x:x + w]
      crop_height, crop_width = cropped.shape[:2]
      if crop_height > min_region_height and crop_width > min_region_width:
        text = ocr.image_to_string(cropped, lang='spa', config=config_options)
        if len(text) > 0:
          cv.imwrite(os.path.join(processed_path, '{}_{}.png'.format(image_name, count)), cropped)
          region_texts.append(text)

    # Written once per image (not appended), so a re-run does not duplicate text.
    if region_texts:
      with open(text_file_path, 'w', encoding='utf-8') as textfile:
        textfile.write(''.join(region_texts))


In [5]:
# Input folder (Fichas_manual image set) and the output folders for its
# preprocessed images and OCR text files.
# The paths are relative, so they resolve under the Drive mount only while the
# working directory is the parent of the mount point (/content).
IMAGES_PATH = 'drive/MyDrive/Datos - Hackathon JusticIA/Fichas_manual/'
PROCESSED_PATH = 'drive/MyDrive/HackathonRIIAA2021/Processed_images_v3/Fichas_manual/'
TEXTS_PATH = 'drive/MyDrive/HackathonRIIAA2021/Texts_v3/Fichas_manual/'

In [6]:
# Run the OCR pipeline on the folders currently held by the path constants
# (the Fichas_manual set in a top-to-bottom run), using the default options.
get_processed_images(
  src_path=IMAGES_PATH,
  processed_path=PROCESSED_PATH,
  texts_path=TEXTS_PATH,
)


[INFO] Se encontraron 1000 imagenes para procesar.



100%|██████████| 1000/1000 [2:46:29<00:00,  9.99s/it]


In [7]:
# Input folder (Fichas_auto image set) and the output folders for its
# preprocessed images and OCR text files.
# NOTE: this rebinds the same three names used for the Fichas_manual run, so
# any later cell sees these values; run the notebook top to bottom.
IMAGES_PATH = 'drive/MyDrive/Datos - Hackathon JusticIA/Fichas_auto/'
PROCESSED_PATH = 'drive/MyDrive/HackathonRIIAA2021/Processed_images_v3/Fichas_auto/'
TEXTS_PATH = 'drive/MyDrive/HackathonRIIAA2021/Texts_v3/Fichas_auto/'

In [8]:
# Run the OCR pipeline on the folders currently held by the path constants
# (the Fichas_auto set in a top-to-bottom run), using the default options.
get_processed_images(
  src_path=IMAGES_PATH,
  processed_path=PROCESSED_PATH,
  texts_path=TEXTS_PATH,
)


[INFO] Se encontraron 1000 imagenes para procesar.



100%|██████████| 1000/1000 [2:29:00<00:00,  8.94s/it]
