# Modificando a base de dados para melhor avaliação

## Importações

In [1]:
import os
import shutil
import json

import pandas as pd
import cv2

from relevant_info_identification.birthdate_identification import find_birthdate_text
from relevant_info_identification.cpf_identification import find_cpf_text
from relevant_info_identification.name_indentification import find_name_text
from relevant_info_identification.rg_identification import find_rg_text

## Tentando ler um arquivo de OCR

In [2]:
# The file cannot be loaded directly with pandas: its layout is irregular
# (the x and y fields are lists on some rows and plain numbers on others).
encoding = 'ISO 8859-1'
example_file_path = '../BID Dataset/RG_Verso/000111111_gt_ocr.txt'
with open(example_file_path, encoding=encoding) as example_file:
    example_text = example_file.read()
print(example_text)

x, y, width, height, transcription
[50, 59, 41, 33], [596, 229, 229, 597], -1, -1, VÁLIDA EM TODO O TERRITÓRIO NACIONAL
74, 519, 18, 138, 08.096.661-5
141, 261, 20, 479, REBELO RONEI NAKAMURAKARE
[83, 86, 71, 73], [749, 681, 682, 750], -1, -1, REGISTRO
[99, 100, 88, 87], [749, 704, 704, 750], -1, -1, GERAL
[127, 129, 116, 115], [749, 707, 706, 750], -1, -1, NOME
[196, 198, 183, 182], [751, 688, 688, 751], -1, -1, FILIAÇÃO
206, 391, 18, 348, ADALTO CASCADAN FRASAO
240, 283, 20, 455, JANDHER SALLES GASPERAZZO
[305, 310, 297, 293], [250, 94, 94, 252], -1, -1, DATA DE NASCIMENTO
316, 157, 16, 106, 02/01/1963
342, 305, 20, 337, C.NAS=481 LV=36 FL=435
[361, 363, 350, 348], [755, 664, 664, 754], -1, -1, DOC ORIGEM
[292, 296, 283, 281], [752, 646, 646, 753], -1, -1, NATURALIDADE
303, 586, 21, 155, GUAIRAÇÁ-PR
372, 585, 22, 157, JOAÇABA-SC
406, 539, 18, 162, 354.205.532-87
[429, 430, 417, 418], [756, 728, 729, 756], -1, -1, CPF
470, 694, 15, 54, NH 54
458, 383, 11, 167, CEDRICK VALMIR CIRULLI
[

In [3]:
def read_with_list(line: str):
    """Parse an OCR line whose x and y fields are bracketed lists.

    Expected layout, as seen in the sample file:
    ``[x1, x2, ...], [y1, y2, ...], width, height, transcription``.

    Args:
        line: One stripped line of the OCR file that starts with ``[``.

    Returns:
        dict with ``left`` and ``top`` (the parsed lists), ``width`` and
        ``height`` (ints) and ``transcription`` (str).

    Raises:
        ValueError: if a bracketed list or a numeric field is malformed.
        IndexError: if fewer than three fields follow the second list.
    """
    first_rbracket_index = line.find(']')
    second_rbracket_index = line.find(']', first_rbracket_index + 1)

    # json.loads parses the list literals without executing file content,
    # which eval() would do.
    x = json.loads(line[:first_rbracket_index + 1])
    # "+ 3" skips the "], " separator so the slice starts at the next "[".
    y = json.loads(line[first_rbracket_index + 3:second_rbracket_index + 1])

    # maxsplit=2 keeps a transcription that itself contains ", " intact.
    final_split = line[second_rbracket_index + 3:].split(', ', 2)

    return {
        'left': x,
        'top': y,
        'width': int(final_split[0]),
        'height': int(final_split[1]),
        'transcription': final_split[2]
    }

def read_without_list(line):
    """Parse an OCR line whose five fields are plain comma-separated values.

    Expected layout, as seen in the sample file:
    ``x, y, width, height, transcription``.

    Args:
        line: One stripped line of the OCR file that does not start with ``[``.

    Returns:
        dict with ``left``, ``top``, ``width`` and ``height`` (ints) and
        ``transcription`` (str).

    Raises:
        ValueError: if one of the first four fields is not an integer.
        IndexError: if the line has fewer than five fields.
    """
    # maxsplit=4 keeps a transcription that itself contains ", " intact.
    final_split = line.split(', ', 4)

    return {
        'left': int(final_split[0]),
        'top': int(final_split[1]),
        'width': int(final_split[2]),
        'height': int(final_split[3]),
        'transcription': final_split[4]
    }

def read_ocr_file(file_path):
    """Load an OCR ground-truth text file into a DataFrame.

    The first line is treated as the header and discarded. Each remaining
    non-blank line is parsed by ``read_with_list`` when it starts with ``[``
    and by ``read_without_list`` otherwise. The file is opened with the
    notebook-level ``encoding`` setting.

    Args:
        file_path: Path of the OCR text file.

    Returns:
        pandas.DataFrame with one row per parsed line.
    """
    rows = []
    with open(file_path, encoding=encoding) as ocr_file:
        ocr_file.readline()  # discard the header line
        for line in ocr_file:
            line = line.strip()
            # Blank lines (e.g. a trailing newline) would make line[0] fail.
            if not line:
                continue
            if line.startswith('['):
                rows.append(read_with_list(line))
            else:
                rows.append(read_without_list(line))
    return pd.DataFrame(rows)
                

read_ocr_file(example_file_path)

Unnamed: 0,left,top,width,height,transcription
0,"[50, 59, 41, 33]","[596, 229, 229, 597]",-1,-1,VÁLIDA EM TODO O TERRITÓRIO NACIONAL
1,74,519,18,138,08.096.661-5
2,141,261,20,479,REBELO RONEI NAKAMURAKARE
3,"[83, 86, 71, 73]","[749, 681, 682, 750]",-1,-1,REGISTRO
4,"[99, 100, 88, 87]","[749, 704, 704, 750]",-1,-1,GERAL
5,"[127, 129, 116, 115]","[749, 707, 706, 750]",-1,-1,NOME
6,"[196, 198, 183, 182]","[751, 688, 688, 751]",-1,-1,FILIAÇÃO
7,206,391,18,348,ADALTO CASCADAN FRASAO
8,240,283,20,455,JANDHER SALLES GASPERAZZO
9,"[305, 310, 297, 293]","[250, 94, 94, 252]",-1,-1,DATA DE NASCIMENTO


## Tratando cada documento

Para analisar a taxa de acerto por CPF, RG, nome e data de nascimento, é preciso coletar essas informações.
Como elas não estão separadas dessa forma na base original, é necessário extraí-las do próprio arquivo
de OCR esperado.

In [4]:
# Output locations of the rebuilt dataset.
new_dataset_folder = '../RG-Dataset'
files_dataset_folder = '../RG-Dataset/files'
csv_dataset_path = '../RG-Dataset/dataset.csv'
# Source folder and the file that persists labelling progress between sessions.
old_dataset_folder = '../BID Dataset/RG_Verso'
index_file_path = '../current_index.txt'
# Field separator used for every CSV written by this notebook.
sep = ';'

# os.listdir returns entries in arbitrary order, but later cells slice this
# list in groups of three and index each group by position, so it is sorted.
# Assumes each document's three files share a prefix and sort as
# OCR, segmentation, image - TODO confirm against the dataset folder.
old_rg_files = sorted(os.listdir(old_dataset_folder))

# Mutable state shared by the widget callbacks.
saved_infos = {
    'image': None,
    'current_index': None
}

In [5]:
def get_current_index_value():
    """Return the integer stored on the first line of the index file."""
    with open(index_file_path, 'r') as index_file:
        first_line = index_file.readline()
    return int(first_line)

def write_index(index):
    """Overwrite the index file with the text form of ``index``."""
    index_text = str(index)
    with open(index_file_path, 'w') as index_file:
        index_file.write(index_text)

In [6]:
# Create the output folders; exist_ok makes the cell safe to re-run.
os.makedirs(new_dataset_folder, exist_ok=True)
os.makedirs(files_dataset_folder, exist_ok=True)

# Start the dataset CSV with only its header row the first time through.
if not os.path.exists(csv_dataset_path):
    pd.DataFrame(columns=['id', 'image_path', 'ocr_path', 'segmentation_path', 'info_path', 'CPF', 'RG', 'birthdate', 'name']).to_csv(csv_dataset_path, index=False, sep=sep)

# Resume from the saved index, or start at 0 on the first run. The check uses
# index_file_path so it always looks at the same file the helpers read and write.
if not os.path.exists(index_file_path):
    write_index(0)
    saved_infos['current_index'] = 0
else:
    saved_infos['current_index'] = get_current_index_value()

In [7]:
import ipywidgets as widgets

In [8]:
# Preview of the document image currently being labelled.
img_widget = widgets.Image(format='jpg', width=600, height=400)

# Button that rotates the preview.
rotate_img_btn = widgets.Button(description='Rotacionar', icon='undo')

# Progress label; holds placeholder text until a document is loaded.
text_widget = widgets.Label(value='Texto')

# Editable fields for the four values recorded per document.
cpf_input = widgets.Text(description='CPF')
rg_input = widgets.Text(description='RG')
name_input = widgets.Text(description='Nome')
birthdate_input = widgets.Text(description='Data de nascimento')

# Button that stores the current document.
save_btn = widgets.Button(description='Salvar', button_style='success', icon='check')


In [9]:

def load_current_info():
    """Show the document at ``saved_infos['current_index']`` in the widgets.

    Reads the document's OCR file, pre-fills the CPF, RG, name and birthdate
    inputs with the values returned by the ``find_*_text`` helpers, and loads
    the image into ``img_widget``. When the index is past the last complete
    group of three files, only the label is updated and the save button is
    disabled, instead of failing with an IndexError.
    """
    total_documents = len(old_rg_files)//3
    first_file = saved_infos['current_index']*3
    current_files = old_rg_files[first_file:first_file+3]

    # Past the end of the dataset (e.g. right after saving the last document).
    if len(current_files) < 3:
        text_widget.value = f'Todos os {total_documents} documentos já foram processados'
        save_btn.disabled = True
        return

    text_widget.value = f'Documento {saved_infos["current_index"]+1} de {total_documents}'

    # Position 0 is used as the OCR file and position 2 as the image.
    ocr_file = f'{old_dataset_folder}{os.path.sep}{current_files[0]}'
    original_image_file = f'{old_dataset_folder}{os.path.sep}{current_files[2]}'

    ocr_dataframe = read_ocr_file(ocr_file)
    text_col = 'transcription'
    # NOTE(review): presumably the cpf/rg/name helpers return str and the
    # birthdate helper returns a dict with 'formated_date' - confirm.
    cpf = find_cpf_text(ocr_dataframe, text_col)
    rg = find_rg_text(ocr_dataframe, text_col)
    name = find_name_text(ocr_dataframe, text_col)
    birthdate = find_birthdate_text(ocr_dataframe, text_col)

    cpf_input.value = cpf
    rg_input.value = rg
    name_input.value = name
    birthdate_input.value = birthdate['formated_date']

    # Keep the decoded image so rotate_img can transform it later.
    saved_infos['image'] = cv2.imread(original_image_file)
    img_widget.value = cv2.imencode('.jpg', saved_infos['image'])[1].tobytes()

def save_current_info():
    """Persist the current document and the values typed into the inputs.

    Writes the parsed OCR table as CSV, copies the segmentation and image
    files, dumps the four input values to ``<id>_info.json`` (all under
    ``files_dataset_folder``) and appends one row to the dataset CSV.

    Raises:
        IndexError: if the current index is past the last complete group of
            three files.
    """
    first_file = saved_infos['current_index']*3
    current_files = old_rg_files[first_file:first_file+3]
    if len(current_files) < 3:
        raise IndexError('no document left to save at the current index')

    # The id is the part of the CURRENT document's file name before the first
    # "_". Taking it from old_rg_files[0] gave every document the same id and
    # overwrote the same info JSON on each save.
    doc_id = current_files[0].split('_', 1)[0]

    cpf = cpf_input.value
    rg = rg_input.value
    name = name_input.value
    birthdate = birthdate_input.value

    ocr_file = f'{old_dataset_folder}{os.path.sep}{current_files[0]}'
    seg_file = f'{old_dataset_folder}{os.path.sep}{current_files[1]}'
    original_image_file = f'{old_dataset_folder}{os.path.sep}{current_files[2]}'

    # The OCR file is re-written as a regular CSV; the other two are copied as-is.
    ocr_dataframe = read_ocr_file(ocr_file)
    ocr_dataframe.to_csv(f'{files_dataset_folder}{os.path.sep}{current_files[0]}', index=False, sep=sep)

    shutil.copyfile(seg_file, f'{files_dataset_folder}{os.path.sep}{current_files[1]}')

    shutil.copyfile(original_image_file, f'{files_dataset_folder}{os.path.sep}{current_files[2]}')

    info_json = {
        'cpf': cpf,
        'rg': rg,
        'name': name,
        'birthdate': birthdate
    }

    json_file = f'{doc_id}_info.json'
    with open(f'{files_dataset_folder}{os.path.sep}{json_file}', 'w') as outfile:
        json.dump(info_json, outfile)

    # Append without a header: the header row is written when the CSV is created.
    pd.DataFrame([{
        'id': doc_id,
        'image_path': f'files/{current_files[2]}',
        'ocr_path': f'files/{current_files[0]}',
        'segmentation_path': f'files/{current_files[1]}',
        'info_path': f'files/{json_file}',
        'CPF': cpf,
        'RG': rg,
        'birthdate': birthdate,
        'name': name
    }]).to_csv(csv_dataset_path, index=False, sep=sep, mode='a', header=False)


def rotate_img():
    """Rotate the stored image 90 degrees clockwise and refresh the preview."""
    rotated = cv2.rotate(saved_infos['image'], cv2.ROTATE_90_CLOCKWISE)
    saved_infos['image'] = rotated
    encoded = cv2.imencode('.jpg', rotated)
    img_widget.value = encoded[1].tobytes()

def increment_index():
    """Advance the current document index by one and persist it."""
    next_index = saved_infos['current_index'] + 1
    saved_infos['current_index'] = next_index
    write_index(next_index)

def save_and_load_next():
    """Save the current document, advance the index, then show the next one."""
    # Steps run strictly in this order; an exception stops the remaining ones.
    for step in (save_current_info, increment_index, load_current_info):
        step()

load_current_info()

In [10]:
# on_click passes the clicked button to the callback; both handlers ignore it.
rotate_img_btn.on_click(lambda _clicked: rotate_img())
save_btn.on_click(lambda _clicked: save_and_load_next())

In [11]:
# Render the labelling form: each widget is displayed in the order given.
display(
    img_widget,
    text_widget,
    rotate_img_btn,
    cpf_input,
    rg_input,
    name_input,
    birthdate_input,
    save_btn,
)

Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00\xff\xdb\x00C\x00\x02\x01\x0…

Label(value='Documento 2 de 3600')

Button(description='Rotacionar', icon='undo', style=ButtonStyle())

Text(value='188.354.397-52', description='CPF')

Text(value='29.227.222-4', description='RG')

Text(value='', description='Nome')

Text(value='19/06/2004', description='Data de nascimento')

Button(button_style='success', description='Salvar', icon='check', style=ButtonStyle())