# Avaliando a identificação de campos relevantes do RG (OCR + algoritmo de identificação)

## Constantes

In [1]:
# Root folder of the RG dataset: dataset.csv and the image paths listed in it
# are resolved relative to this folder.
rg_folder = '../RG-Dataset'

## Importações

In [2]:
import os

import cv2
import pandas as pd
from tqdm import tqdm
import pytesseract
import numpy as np

from classes.Result import Result

from image_preprocessing.filters import to_gray, decrease_noise
from image_preprocessing.rotations import rotate_90_if_vertical_rectangle, rotate_180

from result_selection.result_selection import select_result

from relevant_info_identification.relevant_info_identification import get_document_info

In [3]:
# Location of the Tesseract executable used by pytesseract. It can be overridden
# with the TESSERACT_CMD environment variable so the notebook is not tied to a
# single machine; the fallback is the previously hard-coded Windows install path.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    r'C:/Program Files/Tesseract-OCR/tesseract.exe',
)

## Código de avaliação

In [4]:
def execute_pipeline(img_path):
    """Run preprocessing and relevant-field identification on one image.

    The image is read with ``cv2.imread``, passed through ``to_gray``,
    ``decrease_noise`` and ``rotate_90_if_vertical_rectangle``. Field
    identification is then run on that image and on its ``rotate_180``
    counterpart, and ``select_result`` picks one of the two candidates.

    Args:
        img_path: Path of the image file to load.

    Returns:
        The ``relevant_infos_json`` attribute of the ``Result`` chosen by
        ``select_result``.

    Raises:
        OSError: If ``cv2.imread`` could not load ``img_path``.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports a missing or undecodable file by returning None
        # rather than raising, so fail here while the path is still known.
        raise OSError(f'cv2.imread could not load image: {img_path}')

    preprocessed_img = rotate_90_if_vertical_rectangle(decrease_noise(to_gray(img)))

    # Two candidate orientations: as preprocessed, and turned by 180 degrees.
    candidate_imgs = (preprocessed_img, rotate_180(preprocessed_img))

    # get_document_info returns a pair; only its first element is used here.
    candidate_infos = [get_document_info(candidate)[0] for candidate in candidate_imgs]

    # Only the identified info is supplied to Result; the other two arguments
    # are passed as None and 0, exactly as before.
    candidate_results = [Result(None, info, 0) for info in candidate_infos]

    return select_result(candidate_results).relevant_infos_json

dataset = pd.read_csv(f'{rg_folder}/dataset.csv', sep=';')
documents_qnt = len(dataset)
na_count = dataset.isna().sum()

# Ground-truth CSV column -> key under which the pipeline reports that field.
field_to_info_key = {
    'cpf': 'cpf',
    'name': 'nome',
    'rg': 'rg',
    'birthdate': 'data de nascimento',
}


def _extracted_value(info, column):
    """Return the pipeline's value for ``column``, or None when the key is absent."""
    info_key = field_to_info_key[column]
    if info_key not in info:
        return None
    value = info[info_key]
    # The birthdate entry is compared through its 'formated_date' item.
    return value['formated_date'] if column == 'birthdate' else value


# Per-field counters; 'total' only counts documents whose ground truth is not NA.
final_results = {
    column: {'total': documents_qnt - na_count[column], 'corrects': 0}
    for column in field_to_info_key
}
final_results['total_documents'] = {'total': documents_qnt, 'corrects': 0}

for _, row in tqdm(dataset.iterrows(), total=documents_qnt):
    info = execute_pipeline(f'{rg_folder}/{row["image_path"]}')
    all_labeled_fields_correct = True

    for column in field_to_info_key:
        # Only string ground-truth cells are evaluated; anything else
        # (e.g. NaN for a missing label) is skipped for this document.
        if not isinstance(row[column], str):
            continue
        if _extracted_value(info, column) == row[column]:
            final_results[column]['corrects'] += 1
        else:
            all_labeled_fields_correct = False

    # A document counts as correct when every evaluated field matched
    # (this includes documents with no evaluated field at all).
    if all_labeled_fields_correct:
        final_results['total_documents']['corrects'] += 1

final_results


100%|██████████| 3505/3505 [1:26:46<00:00,  1.49s/it]  


{'cpf': {'total': 2965, 'corrects': 2336},
 'name': {'total': 3505, 'corrects': 497},
 'rg': {'total': 3505, 'corrects': 2338},
 'birthdate': {'total': 3505, 'corrects': 1711},
 'total_documents': {'total': 3505, 'corrects': 272}}

In [5]:
# One-row table with the accuracy (corrects / total) of each field; the tuple
# below fixes the column order of the table.
results_df = pd.DataFrame({
    field: [final_results[field]['corrects'] / final_results[field]['total']]
    for field in ('cpf', 'rg', 'name', 'birthdate', 'total_documents')
})
results_df

Unnamed: 0,cpf,rg,name,birthdate,total_documents
0,0.787858,0.667047,0.141797,0.48816,0.077603


In [6]:
# Render the table before opening the file, so that a failure in to_latex()
# cannot leave an empty/truncated results.tex behind.
latex_table = results_df.to_latex()

# Explicit encoding: without it open() falls back to the platform's locale encoding.
with open('relevant_info_identification/results.tex', 'w', encoding='utf-8') as result_file:
    result_file.write(latex_table)

  result_file.write(results_df.to_latex())
