In [1]:
# Load Packages
from __future__ import unicode_literals, print_function

import plac #  wrapper over argparse
import random
from pathlib import Path
import spacy
from tqdm import tqdm, tqdm_notebook # loading bar 
import pandas as pd
import re
from pprint import pprint
from nltk.tokenize import sent_tokenize
import numpy as np

from spacy.util import minibatch, compounding

# Abrindo e gerando Datasets

### Entidade logradouro

In [2]:
# Obtendo dados crus de endereço

dataset = pd.read_csv("201906AGENCIAS.CSV")
dset = dataset.iloc[:-2,4:10].values  # tipo nparray object
pd_dset = pd.DataFrame(dset) # caso queira visualizar

pd_dset['Join'] = pd_dset[pd_dset.columns[0:]].apply(
    lambda x: ';'.join(x.dropna().astype(str)),
    axis=1
)
pd_dset.head(5)

Unnamed: 0,0,1,2,3,4,5,Join
0,"R.GUILHERME MOREIRA,315","SUBLOJA,LOJA E 2.ANDAR ...",CENTRO,69005-300,MANAUS ...,AM,"R.GUILHERME MOREIRA,315 ;SUBLO..."
1,"AV.PRES.VARGAS,248",1.E 2.ANDARES ...,CAMPINA,66010-900,BELEM ...,PA,"AV.PRES.VARGAS,248 ;1.E 2..."
2,"R.QUINZE DE NOVEMBRO,195",...,CENTRO,11010-908,SANTOS ...,SP,"R.QUINZE DE NOVEMBRO,195 ; ..."
3,"PCA.DAS QUATRO JORNADAS,11",MEZANINO ...,CENTRO,28010-000,CAMPOS DOS GOYTACAZES ...,RJ,"PCA.DAS QUATRO JORNADAS,11 ;MEZAN..."
4,"SEXTA AVENIDA,600",SECRETARIA DA EDUCACAO-TERREO ...,CAB,41745-002,SALVADOR ...,BA,"SEXTA AVENIDA,600 ;SECRE..."


In [3]:
# Tratando endereço completo do DataFrame
dset = np.array(pd_dset)
end_lista = []

for i in range(len(dset)):
    str_raw = dset[i][6]
    str_tratada = re.sub(r'[ ]{2,}', "",str_raw) # Tirando espaços excedentes no final do endereço
    str_tratada = re.sub(r'[;]{1,}', "; ", str_tratada) # Para complementos vazios, para não ter 2 ";"
    str_tratada = str_tratada.lower()
    
    end_lista.append(str_tratada)

end_lista[:3]

['r.guilherme moreira,315; subloja,loja e 2.andar; centro; 69005-300; manaus; am',
 'av.pres.vargas,248; 1.e 2.andares; campina; 66010-900; belem; pa',
 'r.quinze de novembro,195; centro; 11010-908; santos; sp']

In [4]:
# Lógica para extrair posição do logradouro:
# Dividir a String inteira por ";", pegar o len do primeiro split
# len de LOGRA é de 0 até len do primeiro split

print(end_lista[0])
split = end_lista[0].split(";")
print(split)
print(split[0])
print(len(split[0]))

r.guilherme moreira,315; subloja,loja e 2.andar; centro; 69005-300; manaus; am
['r.guilherme moreira,315', ' subloja,loja e 2.andar', ' centro', ' 69005-300', ' manaus', ' am']
r.guilherme moreira,315
23


In [5]:
# Extraindo iob com entidade apenas de logradouro

iob = []

for i in range(len(end_lista)):
    split = end_lista[i].split(";")
    len_logra = len(split[0])
    iob_dict = {"entities": [(0, len_logra-1,'LOGRA')]}
    end_lista[i] = end_lista[i].replace(";", " ")
    end_lista[i] = end_lista[i].replace(",", " ")
    end_lista[i] = re.sub(r'[ ]{2,}', " ", end_lista[i])

    
    tupla = (end_lista[i], iob_dict)
    iob.append(tupla)

FULL_DATA = iob[:2000]

In [6]:
print(FULL_DATA[0])
print(FULL_DATA[1])
print(FULL_DATA[2])
print(FULL_DATA[5:15])

('r.guilherme moreira 315 subloja loja e 2.andar centro 69005-300 manaus am', {'entities': [(0, 22, 'LOGRA')]})
('av.pres.vargas 248 1.e 2.andares campina 66010-900 belem pa', {'entities': [(0, 17, 'LOGRA')]})
('r.quinze de novembro 195 centro 11010-908 santos sp', {'entities': [(0, 23, 'LOGRA')]})
[('av.rio branco 240 1.andar recife antigo 50030-310 recife pe', {'entities': [(0, 16, 'LOGRA')]}), ('av.santos dumont 2828 5.andar aldeota 60150-162 fortaleza ce', {'entities': [(0, 20, 'LOGRA')]}), ('pca.tiradentes 410 1.andar centro 80020-100 curitiba pr', {'entities': [(0, 17, 'LOGRA')]}), ('r.uruguai 185 5.andar centro 90010-901 porto alegre rs', {'entities': [(0, 12, 'LOGRA')]}), ('pca.1817 129 1.andar centro 58013-010 joao pessoa pb', {'entities': [(0, 11, 'LOGRA')]}), ('pca.odilon resende andrade 76 centro 37410-000 tres coracoes mg', {'entities': [(0, 28, 'LOGRA')]}), ('av fernandes lima 2591 terreo farol 57057-972 maceio al', {'entities': [(0, 22, 'LOGRA')]}), ('r.treze de junho 91

In [7]:
# Criação da base de teste e treinamento

n_test= 0.1 # Porcentagem para base de teste
test_n = round(len(FULL_DATA) * n_test)

# Divisao em Train Test Val

def gerador_bases(dataset, n):
    indices_random = random.sample(range(0,len(dataset)-1), n)
    base_teste_n = []
    base_treinamento_n = []
    
    for i in range(n):
        base_teste_n.append(dataset[indices_random[i]])

    for j in range(len(dataset)):
        if(j not in indices_random):
            base_treinamento_n.append(dataset[j])
            
    return base_teste_n, base_treinamento_n


base_teste, base_treinamento = gerador_bases(FULL_DATA, test_n)

random.shuffle(base_treinamento)
random.shuffle(base_teste)

print("Treinamento: " + str(len(base_treinamento)), "\nTeste: " + str(len(base_teste)), "\nTotal: " + str(len(FULL_DATA)))

Treinamento: 1800 
Teste: 200 
Total: 2000


In [8]:
for i in range(5):
    print(base_treinamento[i])

('r.barao do paraim 10 esq.c/av.curimata centro 64960-000 curimata pi', {'entities': [(0, 19, 'LOGRA')]})
('av.sao paulo 666 centro 19840-000 maracai sp', {'entities': [(0, 15, 'LOGRA')]})
('r.cel.alexandrino 860 centro 62800-000 aracati ce', {'entities': [(0, 20, 'LOGRA')]})
('r.paulo de frontin 4 centro 27123-120 barra do pirai rj', {'entities': [(0, 19, 'LOGRA')]})
('pca.salviano leite 10 centro 58765-000 pianco pb', {'entities': [(0, 20, 'LOGRA')]})


# Carregando o modelo

In [9]:
# Define our variables

model = None
output_dir=Path(".")
n_iter= 100 # número de épocas
batch_size = 32

In [10]:
# Setting up the pipeline and entity recognizer.
if model is not None:
    nlp = spacy.load(model)  # load existing spacy model
    print("Loaded model '%s'" % model)
else:
    lan = 'pt'
    nlp = spacy.blank(lan)  # create blank Language class
    print("Created blank '%s' model" % lan)
    
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
    print('Added new NER')
else:
    ner = nlp.get_pipe('ner')
    print('Got an old NER')

Created blank 'pt' model
Added new NER


In [11]:
# create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
else:
    ner = nlp.get_pipe('ner')

## Treinamento

In [12]:
print("Batch size: ", batch_size)
print("Épocas: ", n_iter)
print()

# add labels
for _, annotations in base_treinamento:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])


# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(base_treinamento)
        losses = {}
        batches = minibatch(base_treinamento, size=batch_size)
        
        for batch in batches:
            texts, annotations = zip(*batch)
            try:
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)
            except:
                pass     
        
        print(itn+1, ' Losses', losses)        

Batch size:  32
Épocas:  100

1  Losses {'ner': 3061.0399953189285}
2  Losses {'ner': 2402.4427291956613}
3  Losses {'ner': 2757.6768878702205}
4  Losses {'ner': 1677.6563149470055}
5  Losses {'ner': 476.04927054536967}
6  Losses {'ner': 383.00027009827147}
7  Losses {'ner': 319.7894831095469}
8  Losses {'ner': 211.68919261949603}
9  Losses {'ner': 188.1920694739461}
10  Losses {'ner': 492.0586673295421}
11  Losses {'ner': 183.32801750524828}
12  Losses {'ner': 179.98726840813137}
13  Losses {'ner': 294.4811910471}
14  Losses {'ner': 119.9748945521456}
15  Losses {'ner': 135.87121025706242}
16  Losses {'ner': 90.55791834476798}
17  Losses {'ner': 73.62073223020415}
18  Losses {'ner': 117.23457776089228}
19  Losses {'ner': 188.47480340024214}
20  Losses {'ner': 135.79660283812822}
21  Losses {'ner': 203.80691031773193}
22  Losses {'ner': 212.94218746089447}
23  Losses {'ner': 77.31186337532179}
24  Losses {'ner': 114.58661471232105}
25  Losses {'ner': 134.4135958596508}
26  Losses {'ner

In [13]:
# test the trained model
for text, _ in base_teste:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
    print()

Entities [('r.rui barbosa 275', 'LOGRA')]
Tokens [('r.rui', 'LOGRA', 3), ('barbosa', 'LOGRA', 1), ('275', 'LOGRA', 1), ('predio', '', 2), ('centro', '', 2), ('47400-000', '', 2), ('xique-xique', '', 2), ('ba', '', 2)]

Entities [('av.do engenho 309', 'LOGRA')]
Tokens [('av.do', 'LOGRA', 3), ('engenho', 'LOGRA', 1), ('309', 'LOGRA', 1), ('centro', '', 2), ('89893-000', '', 2), ('mondai', '', 2), ('sc', '', 2)]

Entities [('r.israel fonseca 4', 'LOGRA')]
Tokens [('r.israel', 'LOGRA', 3), ('fonseca', 'LOGRA', 1), ('4', 'LOGRA', 1), ('terreo', '', 2), ('centro', '', 2), ('55730-000', '', 2), ('bom', '', 2), ('jardim', '', 2), ('pe', '', 2)]

Entities [('r.bento goncalves 783', 'LOGRA')]
Tokens [('r.bento', 'LOGRA', 3), ('goncalves', 'LOGRA', 1), ('783', 'LOGRA', 1), ('centro', '', 2), ('95520-000', '', 2), ('osorio', '', 2), ('rs', '', 2)]

Entities [('av.dr.jairo sento-se s/', 'LOGRA')]
Tokens [('av.dr.jairo', 'LOGRA', 3), ('sento-se', 'LOGRA', 1), ('s', 'LOGRA', 1), ('/', 'LOGRA', 1), ('

Entities [('av.joao cesar de oliveira 1045', 'LOGRA')]
Tokens [('av.joao', 'LOGRA', 3), ('cesar', 'LOGRA', 1), ('de', 'LOGRA', 1), ('oliveira', 'LOGRA', 1), ('1045', 'LOGRA', 1), ('predio', '', 2), ('eldorado', '', 2), ('32315-000', '', 2), ('contagem', '', 2), ('mg', '', 2)]

Entities [('av.antonio afonso de lima 475', 'LOGRA')]
Tokens [('av.antonio', 'LOGRA', 3), ('afonso', 'LOGRA', 1), ('de', 'LOGRA', 1), ('lima', 'LOGRA', 1), ('475', 'LOGRA', 1), ('terreo', '', 2), ('centro', '', 2), ('07400-000', '', 2), ('aruja', '', 2), ('sp', '', 2)]

Entities [('av.erasto gaertner 160', 'LOGRA')]
Tokens [('av.erasto', 'LOGRA', 3), ('gaertner', 'LOGRA', 1), ('160', 'LOGRA', 1), ('bacacheri', '', 2), ('82510-160', '', 2), ('curitiba', '', 2), ('pr', '', 2)]

Entities [('pca.bolivar andrade 20', 'LOGRA')]
Tokens [('pca.bolivar', 'LOGRA', 3), ('andrade', 'LOGRA', 1), ('20', 'LOGRA', 1), ('centro', '', 2), ('35537-000', '', 2), ('passa', '', 2), ('tempo', '', 2), ('mg', '', 2)]

Entities [('av.pres

Entities [('av.rio de janeiro 720', 'LOGRA')]
Tokens [('av.rio', 'LOGRA', 3), ('de', 'LOGRA', 1), ('janeiro', 'LOGRA', 1), ('720', 'LOGRA', 1), ('centro', '', 2), ('86220-000', '', 2), ('assai', '', 2), ('pr', '', 2)]

Entities [('av.pres.getulio vargas 708', 'LOGRA')]
Tokens [('av.pres.getulio', 'LOGRA', 3), ('vargas', 'LOGRA', 1), ('708', 'LOGRA', 1), ('centro', '', 2), ('86900-000', '', 2), ('jandaia', '', 2), ('do', '', 2), ('sul', '', 2), ('pr', '', 2)]

Entities [('r.da padroeira 499', 'LOGRA')]
Tokens [('r.da', 'LOGRA', 3), ('padroeira', 'LOGRA', 1), ('499', 'LOGRA', 1), ('centro', '', 2), ('13201-026', '', 2), ('jundiai', '', 2), ('sp', '', 2)]

Entities [('pca.dr.arnolfo de azevedo 93', 'LOGRA')]
Tokens [('pca.dr.arnolfo', 'LOGRA', 3), ('de', 'LOGRA', 1), ('azevedo', 'LOGRA', 1), ('93', 'LOGRA', 1), ('centro', '', 2), ('12600-210', '', 2), ('lorena', '', 2), ('sp', '', 2)]

Entities [('r.campo grande 255', 'LOGRA')]
Tokens [('r.campo', 'LOGRA', 3), ('grande', 'LOGRA', 1), ('25

In [None]:
base_teste_final[0]

In [17]:
phrase = "SQSW 102, bloco D - O juventino foi na rua comprar pao "

doc = nlp(phrase)
print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Entities [('SQSW 102', 'LOGRA')]
Tokens [('SQSW', 'LOGRA', 3), ('102', 'LOGRA', 1), (',', '', 2), ('bloco', '', 2), ('D', '', 2), ('-', '', 2), ('O', '', 2), ('juventino', '', 2), ('foi', '', 2), ('na', '', 2), ('rua', '', 2), ('comprar', '', 2), ('pao', '', 2)]


In [None]:
from spacy.gold import GoldParse
from spacy.scorer import Scorer

def evaluate(ner_model, examples):
    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot)
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores

In [None]:
# save model to output directory
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

In [None]:
loaded_model = spacy.load(output_dir)
evaluate(loaded_model, base_teste_final)