# Verificando se a "nova" base de dados está ok

O notebook disponível em `src/tratando_base_dados.ipynb` modifica a base de dados original BID, adicionando, para cada imagem, um novo arquivo JSON com as informações relevantes do documento: nome, data de nascimento, RG e CPF.

Além disso, é criado um arquivo `.csv` em que cada linha descreve um documento: os caminhos dos arquivos correspondentes, os seus dados importantes e o seu ID.

## Importações

In [1]:
import os
import json

import pandas as pd
import cv2
import ipywidgets as widgets

## Constantes

In [2]:
# Root folder of the dataset, relative to this notebook's working directory.
DATASET_FOLDER_PATH = '../../RG-Dataset'
# CSV index of the dataset; it lives directly inside the dataset root folder.
DATASET_CSV_PATH = DATASET_FOLDER_PATH + '/dataset.csv'

## Lendo o arquivo "dataset.csv"

In [3]:
# Load the dataset index; the file uses ';' as its field separator.
dataset = pd.read_csv(
    DATASET_CSV_PATH,
    delimiter=';',
)
# Bare last expression so the notebook renders the frame as a rich table.
dataset

Unnamed: 0,id,image_path,ocr_path,segmentation_path,info_path,cpf,rg,birthdate,name
0,111111,files/000111111_in.jpg,files/000111111_gt_ocr.txt,files/000111111_gt_segmentation.jpg,files/000111111_info.json,354.205.532-87,08.096.661-5,02/01/1963,Rebelo Ronei Nakamurakare
1,230000,files/000230000_in.jpg,files/000230000_gt_ocr.txt,files/000230000_gt_segmentation.jpg,files/000230000_info.json,188.354.397-52,29.227.222-4,05/05/1984,Kohatsu Liberatti Ivan
2,233025,files/000233025_in.jpg,files/000233025_gt_ocr.txt,files/000233025_gt_segmentation.jpg,files/000233025_info.json,370.678.495-51,73.377.624-3,16/02/1976,Chicaro Okubaro Salvo
3,233331,files/000233331_in.jpg,files/000233331_gt_ocr.txt,files/000233331_gt_segmentation.jpg,files/000233331_info.json,624.476.345-95,84.941.430-1,20/11/2008,Hochun Cerdeira Crema
4,250000,files/000250000_in.jpg,files/000250000_gt_ocr.txt,files/000250000_gt_segmentation.jpg,files/000250000_info.json,,48.753.318-5,20/07/1978,Scrignoli Petenusci Rombach
...,...,...,...,...,...,...,...,...,...
2247,28226,files/00028226_in.jpg,files/00028226_gt_ocr.txt,files/00028226_gt_segmentation.jpg,files/00028226_info.json,437.205.174-38,54.418.427-0,15/04/1977,Koivisto Barreto Malvao
2248,28227,files/00028227_in.jpg,files/00028227_gt_ocr.txt,files/00028227_gt_segmentation.jpg,files/00028227_info.json,015.092.775-44,78.972.039-5,16/02/1998,Tairone Simabukulo Kusaba
2249,28228,files/00028228_in.jpg,files/00028228_gt_ocr.txt,files/00028228_gt_segmentation.jpg,files/00028228_info.json,795.486.901-60,25.209.967-9,31/01/1958,Bouskela Morishigue Seithi
2250,28229,files/00028229_in.jpg,files/00028229_gt_ocr.txt,files/00028229_gt_segmentation.jpg,files/00028229_info.json,790.790.752-72,17.546.850-3,11/10/1977,Ballarin Suenori Frigori


In [4]:
# Each row of the CSV describes one document, so the row count is the
# number of documents in the dataset.
docs_count = len(dataset)
print(f'Existem {docs_count} documentos no dataset')

Existem 2252 documentos no dataset


In [5]:
# Number of missing values in each column (summing the null mask down the rows).
dataset.isnull().sum(axis=0)

id                     0
image_path             0
ocr_path               0
segmentation_path      0
info_path              0
cpf                  348
rg                     0
birthdate              0
name                   2
dtype: int64

In [6]:
# Count the rows (documents) that have at least one null field.
# Summing the per-column null counts would count null *cells* instead, so a
# document missing more than one field would be counted more than once.
nan_docs_count = dataset.isna().any(axis=1).sum()
print(f'Existem {nan_docs_count} documentos com dados nulos')

Existem 350 documentos com dados nulos


In [7]:
# Count the rows (documents) in which every field is present.
# This is computed straight from the frame rather than by subtracting a null
# count, so the result stays correct even when a single document is missing
# several fields at once.
not_nan_docs_count = dataset.notna().all(axis=1).sum()
print(f'Logo, temos {not_nan_docs_count} documentos sem nenhum dado nulo')

Logo, temos 1902 documentos sem nenhum dado nulo


## Código para visualizar informações do dataset

In [8]:
# Image widget that shows the currently selected document; its `value` is
# filled with JPEG-encoded bytes by the cell that loads a document.
img_widget = widgets.Image(format='jpg', width=600, height=400)

# Button used to rotate the displayed image; the click handler is attached
# in the cell that loads a document.
rotate_img_btn = widgets.Button(description='Rotacionar', icon='undo')

In [13]:
# Positional (0-based) index of the document to inspect.
DOC_NUMBER = 2251
current_doc = dataset.iloc[DOC_NUMBER]

# cv2.imread does not raise when the file is missing or unreadable: it returns
# None, which would otherwise surface as an opaque AttributeError on `img.shape`.
image_path = f'{DATASET_FOLDER_PATH}/{current_doc["image_path"]}'
img = cv2.imread(image_path)
if img is None:
    raise FileNotFoundError(f'Não foi possível ler a imagem: {image_path}')

# Images taller than they are wide are turned 90 degrees so they are shown in
# landscape orientation.
if img.shape[0] > img.shape[1]:
    img = cv2.rotate(img, cv2.ROTATE_90_COUNTERCLOCKWISE)
img_widget.value = cv2.imencode('.jpg', img)[1].tobytes()

# Raw OCR ground-truth text associated with the document.
with open(f'{DATASET_FOLDER_PATH}/{current_doc["ocr_path"]}') as ocr_file:
    ocr_info = ocr_file.read()

# JSON file with the document's extracted fields.
with open(f'{DATASET_FOLDER_PATH}/{current_doc["info_path"]}') as info_file:
    info_json = json.load(info_file)


def rotate_img():
    """Rotate the current image by 180 degrees and refresh the image widget.

    Rebinds the module-level ``img`` so that successive calls keep rotating
    the already-rotated image.
    """
    global img
    img = cv2.rotate(img, cv2.ROTATE_180)
    img_widget.value = cv2.imencode('.jpg', img)[1].tobytes()


# Re-running this cell must not stack click handlers on the button: with two
# handlers registered, one click would rotate by 180 degrees twice and the
# image would appear unchanged. Unregister the handler left by a previous run
# (a no-op if the button was recreated in the meantime).
if '_rotate_click_handler' in globals():
    rotate_img_btn.on_click(_rotate_click_handler, remove=True)


def _rotate_click_handler(_button):
    """Button callback: ignore the button argument and rotate the image."""
    rotate_img()


rotate_img_btn.on_click(_rotate_click_handler)

IndexError: single positional indexer is out-of-bounds

In [12]:
# Row of the dataset index for the selected document.
display(current_doc)

# Image on the left, controls stacked in a column on the right, vertically
# centred against each other.
controls_box = widgets.VBox([rotate_img_btn])
centered_layout = widgets.Layout(align_items='center')
viewer_box = widgets.HBox([img_widget, controls_box], layout=centered_layout)
display(viewer_box)

# Raw OCR ground truth followed by the extracted fields, for visual comparison
# against the image above.
print(ocr_info)
print(info_json)

id                                                28230
image_path                        files/00028230_in.jpg
ocr_path                      files/00028230_gt_ocr.txt
segmentation_path    files/00028230_gt_segmentation.jpg
info_path                      files/00028230_info.json
cpf                                      440.992.127-42
rg                                         21.173.877-3
birthdate                                    16/09/1965
name                             Mercaldi Matrai Scalon
Name: 2251, dtype: object

HBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00\xff\xdb\x00C…

left;top;width;height;transcription
[271, 271, 719, 717];[91, 69, 74, 94];-1;-1;VÁLIDA EM TODO O TERRITÓRIO NACIONAL
[682, 683, 868, 868];[384, 368, 370, 387];-1;-1;DATA DE NASCIMENTO
83;608;88;23;NH 66
369;631;211;20;ASSINATURA DO DIRETOR
367;662;233;24;LEI Nº7 116 DE 29/08/83
131;525;236;24;440.992.127-42
85;530;39;22;CPF
86;474;197;33;VITÓRIA-ES
89;445;111;20;DOC ORIGEM
208;439;482;28;C.NAS=184 LV=166 FL=23
90;395;211;26;ARARAS-SP
89;362;131;17;NATURALIDADE
658;400;176;26;16/09/1965
666;103;185;28;20/09/1990
558;118;98;20;EXPEDIÇÃO
559;98;73;21;DATA DE
96;151;54;19;NOME
98;114;55;20;GERAL
98;96;83;17;REGISTRO
93;232;78;23;FILIAÇÃO
87;311;406;22;MITHUHIRO DIOGO LOYANE
90;268;462;23;DALECIO VITERBO ALAVARCE
91;188;401;20;MERCALDI MATRAI SCALON
187;103;158;20;21.173.877-3
752;611;62;19;9 VIA

{'cpf': '440.992.127-42', 'rg': '21.173.877-3', 'name': 'Mercaldi Matrai Scalon', 'birthdate': '16/09/1965'}
