# Verificando se a "nova" base de dados está ok

O notebook disponível em `src/tratando_base_dados.ipynb` modifica a base de dados original BID, adicionando um novo arquivo para cada imagem, que é um arquivo JSON contendo informações relevantes do documento, como nome, data de nascimento, RG e CPF.

Além disso, também é criado um arquivo `.csv` com cada linha contendo informações sobre cada documento, como os arquivos correspondentes a eles, os seus dados importantes e o seu ID.

## Importações

In [1]:
import os
import json

import pandas as pd
import cv2
import ipywidgets as widgets

## Constantes

In [2]:
# Root folder of the dataset, relative to this notebook's location.
DATASET_FOLDER_PATH = '../../RG-Dataset'
# CSV index file that sits directly inside the dataset folder.
DATASET_CSV_PATH = '/'.join([DATASET_FOLDER_PATH, 'dataset.csv'])

## Lendo o arquivo "dataset.csv"

In [3]:
# The index file uses ';' as its field separator.
dataset = pd.read_csv(filepath_or_buffer=DATASET_CSV_PATH, sep=';')
# Leave the frame as the last expression so the notebook renders it.
dataset

Unnamed: 0,id,image_path,ocr_path,segmentation_path,info_path,cpf,rg,birthdate,name
0,111111,files/000111111_in.jpg,files/000111111_gt_ocr.txt,files/000111111_gt_segmentation.jpg,files/000111111_info.json,354.205.532-87,08.096.661-5,02/01/1963,Rebelo Ronei Nakamurakare
1,230000,files/000230000_in.jpg,files/000230000_gt_ocr.txt,files/000230000_gt_segmentation.jpg,files/000230000_info.json,188.354.397-52,29.227.222-4,05/05/1984,Kohatsu Liberatti Ivan
2,233025,files/000233025_in.jpg,files/000233025_gt_ocr.txt,files/000233025_gt_segmentation.jpg,files/000233025_info.json,370.678.495-51,73.377.624-3,16/02/1976,Chicaro Okubaro Salvo
3,233331,files/000233331_in.jpg,files/000233331_gt_ocr.txt,files/000233331_gt_segmentation.jpg,files/000233331_info.json,624.476.345-95,84.941.430-1,20/11/2008,Hochun Cerdeira Crema
4,250000,files/000250000_in.jpg,files/000250000_gt_ocr.txt,files/000250000_gt_segmentation.jpg,files/000250000_info.json,,48.753.318-5,20/07/1978,Scrignoli Petenusci Rombach
...,...,...,...,...,...,...,...,...,...
2695,28684,files/00028684_in.jpg,files/00028684_gt_ocr.txt,files/00028684_gt_segmentation.jpg,files/00028684_info.json,001.127.634-72,31.140.515-0,14/06/2007,Franci Bergamo Tao
2696,28685,files/00028685_in.jpg,files/00028685_gt_ocr.txt,files/00028685_gt_segmentation.jpg,files/00028685_info.json,240.081.530-55,68.733.923-6,14/11/1994,Chilo Vesper Hernan
2697,28687,files/00028687_in.jpg,files/00028687_gt_ocr.txt,files/00028687_gt_segmentation.jpg,files/00028687_info.json,533.942.588-62,33.916.203-6,11/03/2005,Aysa Bomgiovani Toretti
2698,28688,files/00028688_in.jpg,files/00028688_gt_ocr.txt,files/00028688_gt_segmentation.jpg,files/00028688_info.json,877.703.295-07,33.468.765-2,14/05/1991,Meng Ando Bannach


In [4]:
# One row per document, so the number of rows is the number of documents.
docs_count = len(dataset)
print(f'Existem {docs_count} documentos no dataset')

Existem 2700 documentos no dataset


In [5]:
# Number of missing values in each column.
dataset.isnull().sum()

id                     0
image_path             0
ocr_path               0
segmentation_path      0
info_path              0
cpf                  410
rg                     0
birthdate              0
name                   2
dtype: int64

In [6]:
# Count ROWS that have at least one missing field. Summing the per-column
# null tallies would count null cells instead, so a document missing more
# than one field would be counted once per missing field.
nan_docs_count = int(dataset.isna().any(axis=1).sum())
print(f'Existem {nan_docs_count} documentos com dados nulos')

Existem 412 documentos com dados nulos


In [7]:
# Count complete rows directly instead of subtracting a tally of null values:
# a row missing several fields must be excluded only once.
not_nan_docs_count = int(dataset.notna().all(axis=1).sum())
print(f'Logo, temos {not_nan_docs_count} documentos sem nenhum dado nulo')

Logo, temos 2288 documentos sem nenhum dado nulo


## Código para visualizar informações do dataset

In [8]:
# Image widget that shows the selected document; its bytes are set later.
img_widget = widgets.Image(format='jpg', width=600, height=400)

# Button used to rotate the displayed image.
rotate_img_btn = widgets.Button(description='Rotacionar', icon='undo')

In [11]:
# Positional (0-based) index of the document to inspect.
DOC_NUMBER = 2699
current_doc = dataset.iloc[DOC_NUMBER]

image_path = f'{DATASET_FOLDER_PATH}/{current_doc["image_path"]}'
img = cv2.imread(image_path)
# cv2.imread does not raise on a missing/unreadable file; it returns None,
# which would otherwise surface as an opaque AttributeError on `img.shape`.
if img is None:
    raise FileNotFoundError(f'Não foi possível ler a imagem: {image_path}')
# Images taller than they are wide are turned 90 degrees before display.
if img.shape[0] > img.shape[1]:
    img = cv2.rotate(img, cv2.ROTATE_90_COUNTERCLOCKWISE)
img_widget.value = cv2.imencode('.jpg', img)[1].tobytes()

with open(f'{DATASET_FOLDER_PATH}/{current_doc["ocr_path"]}') as ocr_file:
    ocr_info = ocr_file.read()

with open(f'{DATASET_FOLDER_PATH}/{current_doc["info_path"]}') as info_file:
    info_json = json.load(info_file)

# Re-running this cell must not stack click handlers on the button: with two
# handlers registered, one click rotates by 180 degrees twice and the image
# appears unchanged. Unregister the handler left by a previous run first.
_previous_rotate_handler = globals().get('rotate_img')
if _previous_rotate_handler is not None:
    rotate_img_btn.on_click(_previous_rotate_handler, remove=True)

def rotate_img(_button=None):
    """Rotate the current image by 180 degrees and refresh the image widget.

    Args:
        _button: The widget passed by ``Button.on_click``; unused. Optional so
            the function can still be called directly with no arguments.
    """
    global img
    img = cv2.rotate(img, cv2.ROTATE_180)
    img_widget.value = cv2.imencode('.jpg', img)[1].tobytes()

# Register the function object itself (not a fresh lambda) so that the next
# run of this cell can find and remove it.
rotate_img_btn.on_click(rotate_img)

In [12]:
# Metadata row of the selected document.
display(current_doc)

# Image on the left, rotation button on the right, centred on the cross axis.
controls_box = widgets.VBox([rotate_img_btn])
viewer_layout = widgets.Layout(align_items='center')
display(widgets.HBox([img_widget, controls_box], layout=viewer_layout))

# Raw text of the OCR annotation file, followed by the parsed info JSON.
print(ocr_info)
print(info_json)

id                                                28689
image_path                        files/00028689_in.jpg
ocr_path                      files/00028689_gt_ocr.txt
segmentation_path    files/00028689_gt_segmentation.jpg
info_path                      files/00028689_info.json
cpf                                      095.513.782-96
rg                                         21.686.087-8
birthdate                                    29/08/1985
name                             Vioti Purcineli Zicari
Name: 2699, dtype: object

HBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00\xff\xdb\x00C…

left;top;width;height;transcription
91;324;17;198;LEI Nº7 116 DE 29/08/83
105;92;19;70;NH 84
121;327;14;176;ASSINATURA DO DIRETOR
139;261;14;313;VALLADES GALVANI BERNARDO
172;133;18;164;095.513.782-96
173;89;14;29;CPF
205;95;19;214;ENTRE RIOS-BA
240;88;15;94;DOC ORIGEM
238;193;20;304;C.NAS=103 LV=259 FL=211
273;92;18;184;VEREDINHA-MG
307;88;14;112;NATURALIDADE
343;92;16;277;LUSHIUEN ALENO STOLFI
375;90;18;296;CURIEL NICIOLI SOMERA
405;88;15;66;FILIAÇÃO
442;91;15;248;VIOTI PURCINELI ZICARI
471;88;13;48;NOME
509;174;16;125;21.686.087-8
517;90;12;70;REGISTRO
501;90;14;48;GERAL
498;480;17;81;EXPEDIÇÃO
514;481;14;60;DATA DE
504;578;19;131;26/06/1991
270;573;17;114;29/08/1985
304;588;14;155;DATA DE NASCIMENTO
556;237;16;372;VÁLIDA EM TODO O TERRITÓRIO NACIONAL
103;652;19;67;2 VIA

{'cpf': '095.513.782-96', 'rg': '21.686.087-8', 'name': 'Vioti Purcineli Zicari', 'birthdate': '29/08/1985'}
