# Verificando se a "nova" base de dados está ok

O notebook disponível em `src/tratando_base_dados.ipynb` modifica a base de dados original BID, adicionando um novo arquivo para cada imagem, que é um arquivo JSON contendo informações relevantes do documento, como nome, data de nascimento, RG e CPF.

Além disso, também é criado um arquivo `.csv` com cada linha contendo informações sobre cada documento, como os arquivos correspondentes a eles, os seus dados importantes e o seu ID.

## Importações

In [1]:
import os
import json

import pandas as pd
import cv2
import ipywidgets as widgets

## Constantes

In [2]:
# Root folder of the dataset, relative to this notebook's location.
DATASET_FOLDER_PATH = '../../RG-Dataset'
# Index CSV that lives directly inside the dataset root folder.
DATASET_CSV_PATH = DATASET_FOLDER_PATH + '/dataset.csv'

## Lendo o arquivo "dataset.csv"

In [3]:
# The index CSV uses ';' as its field separator.
dataset = pd.read_csv(DATASET_CSV_PATH, delimiter=';')
dataset

Unnamed: 0,id,image_path,ocr_path,segmentation_path,info_path,cpf,rg,birthdate,name
0,111111,files/000111111_in.jpg,files/000111111_gt_ocr.txt,files/000111111_gt_segmentation.jpg,files/000111111_info.json,354.205.532-87,08.096.661-5,02/01/1963,Rebelo Ronei Nakamurakare
1,230000,files/000230000_in.jpg,files/000230000_gt_ocr.txt,files/000230000_gt_segmentation.jpg,files/000230000_info.json,188.354.397-52,29.227.222-4,05/05/1984,Kohatsu Liberatti Ivan
2,233025,files/000233025_in.jpg,files/000233025_gt_ocr.txt,files/000233025_gt_segmentation.jpg,files/000233025_info.json,370.678.495-51,73.377.624-3,16/02/1976,Chicaro Okubaro Salvo
3,233331,files/000233331_in.jpg,files/000233331_gt_ocr.txt,files/000233331_gt_segmentation.jpg,files/000233331_info.json,624.476.345-95,84.941.430-1,20/11/2008,Hochun Cerdeira Crema
4,250000,files/000250000_in.jpg,files/000250000_gt_ocr.txt,files/000250000_gt_segmentation.jpg,files/000250000_info.json,,48.753.318-5,20/07/1978,Scrignoli Petenusci Rombach
...,...,...,...,...,...,...,...,...,...
1880,27853,files/00027853_in.jpg,files/00027853_gt_ocr.txt,files/00027853_gt_segmentation.jpg,files/00027853_info.json,332.531.534-87,44.686.825-5,04/09/1972,Shiguti Welsh Pedao
1881,27854,files/00027854_in.jpg,files/00027854_gt_ocr.txt,files/00027854_gt_segmentation.jpg,files/00027854_info.json,001.127.634-72,31.140.515-0,14/06/2007,Franci Bergamo Tao
1882,27855,files/00027855_in.jpg,files/00027855_gt_ocr.txt,files/00027855_gt_segmentation.jpg,files/00027855_info.json,424.882.456-66,09.346.392-3,13/02/1998,Goularte Pantaroto Loscalzo
1883,27856,files/00027856_in.jpg,files/00027856_gt_ocr.txt,files/00027856_gt_segmentation.jpg,files/00027856_info.json,427.210.606-60,56.101.779-7,15/12/1984,Liara Diago Riga


In [4]:
# One row per document, so the row count is the document count.
docs_count = len(dataset)
print(f'Existem {docs_count} documentos no dataset')

Existem 1885 documentos no dataset


In [5]:
# Number of missing values in each column.
dataset.isnull().sum()

id                     0
image_path             0
ocr_path               0
segmentation_path      0
info_path              0
cpf                  287
rg                     0
birthdate              0
name                   2
dtype: int64

In [6]:
# Count DOCUMENTS (rows) that have at least one missing field.
# Summing the per-column null counts would count missing CELLS instead, so a
# document missing more than one field would be counted more than once.
nan_docs_count = int(dataset.isna().any(axis=1).sum())
print(f'Existem {nan_docs_count} documentos com dados nulos')

Existem 289 documentos com dados nulos


In [7]:
# Count the documents (rows) in which every field is present. This is computed
# directly from the frame rather than by subtraction, so the figure stays
# correct even when a single document is missing more than one field.
not_nan_docs_count = int(dataset.notna().all(axis=1).sum())
print(f'Logo, temos {not_nan_docs_count} documentos sem nenhum dado nulo')

Logo, temos 1596 documentos sem nenhum dado nulo


## Código para visualizar informações do dataset

In [8]:
# Widget that shows the JPEG-encoded document image (600x400 display size).
img_widget = widgets.Image(format='jpg', width=600, height=400)

# Button wired up in a later cell to rotate the displayed image.
rotate_img_btn = widgets.Button(description='Rotacionar', icon='undo')

In [11]:
# Positional (0-based) row of the document to inspect; edit and re-run the cell
# to look at a different document.
DOC_NUMBER = 1884
current_doc = dataset.iloc[DOC_NUMBER]

# cv2.imread does not raise on a missing/unreadable file, it returns None.
# Fail early with a clear message instead of an AttributeError on `.shape`.
img_path = f'{DATASET_FOLDER_PATH}/{current_doc["image_path"]}'
img = cv2.imread(img_path)
if img is None:
    raise FileNotFoundError(f'Não foi possível ler a imagem: {img_path}')

# Images taller than wide are turned 90 degrees so they are shown in landscape.
if img.shape[0] > img.shape[1]:
    img = cv2.rotate(img, cv2.ROTATE_90_COUNTERCLOCKWISE)
img_widget.value = cv2.imencode('.jpg', img)[1].tobytes()

# NOTE(review): both files are opened with the platform default encoding;
# confirm the files' actual encoding before pinning one explicitly.
with open(f'{DATASET_FOLDER_PATH}/{current_doc["ocr_path"]}') as ocr_file:
    ocr_info = ocr_file.read()

with open(f'{DATASET_FOLDER_PATH}/{current_doc["info_path"]}') as info_file:
    info_json = json.load(info_file)

def rotate_img():
    """Rotate the current image by 180 degrees and refresh the image widget."""
    global img
    img = cv2.rotate(img, cv2.ROTATE_180)
    img_widget.value = cv2.imencode('.jpg', img)[1].tobytes()

# Every run of this cell creates a new handler object. Without removing the
# previous one, handlers pile up on the button and a single click rotates the
# image once per accumulated handler (two handlers = 360 degrees = no change).
_previous_rotate_handler = globals().get('_rotate_click_handler')
if _previous_rotate_handler is not None:
    rotate_img_btn.on_click(_previous_rotate_handler, remove=True)
_rotate_click_handler = lambda _: rotate_img()
rotate_img_btn.on_click(_rotate_click_handler)

In [12]:
# Metadata row of the selected document.
display(current_doc)

# Image on the left, rotate button in a column on the right, centred vertically.
controls_box = widgets.VBox([rotate_img_btn])
viewer_layout = widgets.Layout(align_items='center')
display(widgets.HBox([img_widget, controls_box], layout=viewer_layout))

# Raw OCR ground-truth text, then the parsed info JSON.
print(ocr_info)
print(info_json)

id                                                27857
image_path                        files/00027857_in.jpg
ocr_path                      files/00027857_gt_ocr.txt
segmentation_path    files/00027857_gt_segmentation.jpg
info_path                      files/00027857_info.json
cpf                                      427.938.677-30
rg                                         64.697.408-7
birthdate                                    17/10/1956
name                          Marcuns Wlademir Altemani
Name: 1884, dtype: object

HBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00\xff\xdb\x00C…

left;top;width;height;transcription
[178, 460, 460, 176];[53, 56, 71, 68];-1;-1;VÁLIDA EM TODO O TERRITÓRIO NACIONAL
115;85;92;17;64.697.408-7
[62, 112, 113, 61];[86, 85, 96, 95];-1;-1;REGISTRO
[62, 96, 96, 61];[106, 107, 97, 97];-1;-1;GERAL
[61, 94, 92, 61];[127, 128, 120, 118];-1;-1;NOME
54;130;264;17;MARCUNS WLADEMIR ALTEMANI
[60, 110, 108, 61];[178, 180, 168, 168];-1;-1;FILIAÇÃO
54;186;222;11;CAZOLLATO GENARIO UMEKI
55;211;227;11;PANEQUE MASAKAZU RINGER
[62, 144, 143, 61];[243, 244, 254, 252];-1;-1;NATURALIDADE
56;258;89;12;BONITO-MS
131;279;206;12;C.NAS=477 LV=485 FL=446
[61, 131, 131, 61];[293, 295, 306, 304];-1;-1;DOC ORIGEM
55;307;114;11;JAGUARIÚNA-SP
85;336;88;9;427.938.677-30
[61, 82, 81, 60];[345, 346, 356, 356];-1;-1;CPF
53;382;44;12;NH 36
195;375;126;9;SACCHI DORO GADIA
[244, 377, 377, 244];[398, 399, 390, 388];-1;-1;ASSINATURA DO DIRETOR
[242, 388, 388, 242];[420, 422, 409, 408];-1;-1;LEI Nº7 116 DE 29/08/83
477;386;36;11;4 VIA
421;260;69;11;17/10/1956
[442, 563, 562, 442