In [1]:
# Load Packages
from __future__ import unicode_literals, print_function

import plac #  wrapper over argparse
import random
from pathlib import Path
import spacy
from tqdm import tqdm, tqdm_notebook # loading bar 
import pandas as pd
import re
from pprint import pprint
from nltk.tokenize import sent_tokenize
import numpy as np

from spacy.util import minibatch, compounding

# Abrindo e gerando Datasets

### Entidade logradouro

In [2]:
# Obtendo dados crus de endereço

dataset = pd.read_csv("201906AGENCIAS.CSV")
dset = dataset.iloc[:-2,4:10].values  # tipo nparray object
pd_dset = pd.DataFrame(dset) # caso queira visualizar

pd_dset['Join'] = pd_dset[pd_dset.columns[0:]].apply(
    lambda x: ';'.join(x.dropna().astype(str)),
    axis=1
)
pd_dset.head(5)

Unnamed: 0,0,1,2,3,4,5,Join
0,"R.GUILHERME MOREIRA,315","SUBLOJA,LOJA E 2.ANDAR ...",CENTRO,69005-300,MANAUS ...,AM,"R.GUILHERME MOREIRA,315 ;SUBLO..."
1,"AV.PRES.VARGAS,248",1.E 2.ANDARES ...,CAMPINA,66010-900,BELEM ...,PA,"AV.PRES.VARGAS,248 ;1.E 2..."
2,"R.QUINZE DE NOVEMBRO,195",...,CENTRO,11010-908,SANTOS ...,SP,"R.QUINZE DE NOVEMBRO,195 ; ..."
3,"PCA.DAS QUATRO JORNADAS,11",MEZANINO ...,CENTRO,28010-000,CAMPOS DOS GOYTACAZES ...,RJ,"PCA.DAS QUATRO JORNADAS,11 ;MEZAN..."
4,"SEXTA AVENIDA,600",SECRETARIA DA EDUCACAO-TERREO ...,CAB,41745-002,SALVADOR ...,BA,"SEXTA AVENIDA,600 ;SECRE..."


In [3]:
# Tratando endereço completo do DataFrame
dset = np.array(pd_dset)
end_lista = []

for i in range(len(dset)):
    str_raw = dset[i][6]
    str_tratada = re.sub(r'[ ]{2,}', "",str_raw) # Tirando espaços excedentes no final do endereço
    str_tratada = re.sub(r'[;]{1,}', "; ", str_tratada) # Para complementos vazios, para não ter 2 ";"
    str_tratada = str_tratada.lower()
    
    end_lista.append(str_tratada)

end_lista[:3]

['r.guilherme moreira,315; subloja,loja e 2.andar; centro; 69005-300; manaus; am',
 'av.pres.vargas,248; 1.e 2.andares; campina; 66010-900; belem; pa',
 'r.quinze de novembro,195; centro; 11010-908; santos; sp']

In [4]:
# Lógica para extrair posição do logradouro:
# Dividir a String inteira por ";", pegar o len do primeiro split
# len de LOGRA é de 0 até len do primeiro split

print(end_lista[0])
split = end_lista[0].split(";")
print(split)
print(split[0])
print(len(split[0]))

r.guilherme moreira,315; subloja,loja e 2.andar; centro; 69005-300; manaus; am
['r.guilherme moreira,315', ' subloja,loja e 2.andar', ' centro', ' 69005-300', ' manaus', ' am']
r.guilherme moreira,315
23


In [5]:
# Extraindo iob com entidade apenas de logradouro

iob = []

for i in range(len(end_lista)):
    split = end_lista[i].split(";")
    len_logra = len(split[0])
    iob_dict = {"entities": [(0, len_logra-1,'LOGRA')]}
    end_lista[i] = end_lista[i].replace(";", " ")
    end_lista[i] = end_lista[i].replace(",", " ")
    end_lista[i] = re.sub(r'[ ]{2,}', " ", end_lista[i])

    
    tupla = (end_lista[i], iob_dict)
    iob.append(tupla)

FULL_DATA = iob

In [6]:
print(FULL_DATA[0])
print(FULL_DATA[1])
print(FULL_DATA[2])
print(FULL_DATA[5:15])

('r.guilherme moreira 315 subloja loja e 2.andar centro 69005-300 manaus am', {'entities': [(0, 22, 'LOGRA')]})
('av.pres.vargas 248 1.e 2.andares campina 66010-900 belem pa', {'entities': [(0, 17, 'LOGRA')]})
('r.quinze de novembro 195 centro 11010-908 santos sp', {'entities': [(0, 23, 'LOGRA')]})
[('av.rio branco 240 1.andar recife antigo 50030-310 recife pe', {'entities': [(0, 16, 'LOGRA')]}), ('av.santos dumont 2828 5.andar aldeota 60150-162 fortaleza ce', {'entities': [(0, 20, 'LOGRA')]}), ('pca.tiradentes 410 1.andar centro 80020-100 curitiba pr', {'entities': [(0, 17, 'LOGRA')]}), ('r.uruguai 185 5.andar centro 90010-901 porto alegre rs', {'entities': [(0, 12, 'LOGRA')]}), ('pca.1817 129 1.andar centro 58013-010 joao pessoa pb', {'entities': [(0, 11, 'LOGRA')]}), ('pca.odilon resende andrade 76 centro 37410-000 tres coracoes mg', {'entities': [(0, 28, 'LOGRA')]}), ('av fernandes lima 2591 terreo farol 57057-972 maceio al', {'entities': [(0, 22, 'LOGRA')]}), ('r.treze de junho 91

In [7]:
# Criação da base de teste e treinamento

n_test= 0.1 # Porcentagem para base de teste
test_n = round(len(FULL_DATA) * n_test)

# Divisao em Train Test Val

def gerador_bases(dataset, n):
    indices_random = random.sample(range(0,len(dataset)-1), n)
    base_teste_n = []
    base_treinamento_n = []
    
    for i in range(n):
        base_teste_n.append(dataset[indices_random[i]])

    for j in range(len(dataset)):
        if(j not in indices_random):
            base_treinamento_n.append(dataset[j])
            
    return base_teste_n, base_treinamento_n


base_teste, base_treinamento = gerador_bases(FULL_DATA, test_n)

random.shuffle(base_treinamento)
random.shuffle(base_teste)

print("Treinamento: " + str(len(base_treinamento)), "\nTeste: " + str(len(base_teste)), "\nTotal: " + str(len(FULL_DATA)))

Treinamento: 19202 
Teste: 2134 
Total: 21336


In [8]:
for i in range(5):
    print(base_treinamento[i])

('av. cardeal eugenio pacelli 735 cidade industrial 32210-000 contagem mg', {'entities': [(0, 31, 'LOGRA')]})
('av. ceci 1850 tambor� 06460-120 barueri sp', {'entities': [(0, 13, 'LOGRA')]})
('r.miguel braz arroteia 661 centro 17260-000 itaju sp', {'entities': [(0, 25, 'LOGRA')]})
('av brasil zona 01 zona 01 87013-000 maringa pr', {'entities': [(0, 9, 'LOGRA')]})
('rua jose pires neto 314 lojas 01 e 02 cambui 13025-170 campinas sp', {'entities': [(0, 23, 'LOGRA')]})


# Carregando o modelo

In [9]:
# Define our variables

model = None
output_dir=Path(".")
n_iter= 100 # número de épocas
batch_size = 16

In [10]:
# Setting up the pipeline and entity recognizer.
if model is not None:
    nlp = spacy.load(model)  # load existing spacy model
    print("Loaded model '%s'" % model)
else:
    lan = 'pt'
    nlp = spacy.blank(lan)  # create blank Language class
    print("Created blank '%s' model" % lan)
    
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
    print('Added new NER')
else:
    ner = nlp.get_pipe('ner')
    print('Got an old NER')

Created blank 'pt' model
Added new NER


In [11]:
# create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
else:
    ner = nlp.get_pipe('ner')

## Treinamento

In [None]:
print("Batch size: ", batch_size)
print("Épocas: ", n_iter)
print()

# add labels
for _, annotations in base_treinamento:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])


# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(base_treinamento)
        losses = {}
        batches = minibatch(base_treinamento, size=batch_size)
        
        for batch in batches:
            texts, annotations = zip(*batch)
            try:
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)
            except:
                pass     
        
        print(itn+1, ' Losses', losses)        

Batch size:  16
Épocas:  100

1  Losses {'ner': 11602.628621428197}
2  Losses {'ner': 5276.184860599323}
3  Losses {'ner': 4595.871697298568}
4  Losses {'ner': 4241.3442721682695}
5  Losses {'ner': 3900.618381412886}
6  Losses {'ner': 3493.3100356395607}
7  Losses {'ner': 3618.9451190538234}
8  Losses {'ner': 3205.481979475259}
9  Losses {'ner': 3086.0042172057842}
10  Losses {'ner': 2962.6198100986962}
11  Losses {'ner': 2850.363329297907}
12  Losses {'ner': 2762.507619019205}
13  Losses {'ner': 2706.1839191763024}
14  Losses {'ner': 2394.040338692879}
15  Losses {'ner': 2100.4768702094952}
16  Losses {'ner': 2116.5966580680883}
17  Losses {'ner': 2010.862363741636}
18  Losses {'ner': 1856.1133115363568}
19  Losses {'ner': 2295.4676639147715}
20  Losses {'ner': 2164.5152972484448}
21  Losses {'ner': 1976.617942370528}
22  Losses {'ner': 1774.0753300385845}
23  Losses {'ner': 1615.6813327383754}
24  Losses {'ner': 1773.4604692836442}
25  Losses {'ner': 1724.1240108527331}
26  Losses {'

In [13]:
# test the trained model
for text, _ in base_teste:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
    print()

Entities [('av. andre araujo', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('.', 'LOGRA', 1), ('andre', 'LOGRA', 1), ('araujo', 'LOGRA', 1), ('adrianopolis', '', 2), ('69057-025', '', 2), ('manaus', '', 2), ('am', '', 2)]

Entities [('praca padre francisco mira(bandeira) 6', 'LOGRA')]
Tokens [('praca', 'LOGRA', 3), ('padre', 'LOGRA', 1), ('francisco', 'LOGRA', 1), ('mira(bandeira', 'LOGRA', 1), (')', 'LOGRA', 1), ('6', 'LOGRA', 1), ('centro', '', 2), ('a', '', 2), ('37466-000', '', 2), ('itamonte', '', 2), ('mg', '', 2)]

Entities [('av. leopoldino de oliveira 3446', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('.', 'LOGRA', 1), ('leopoldino', 'LOGRA', 1), ('de', 'LOGRA', 1), ('oliveira', 'LOGRA', 1), ('3446', 'LOGRA', 1), ('60746948', '', 2), ('centro', '', 2), ('38010-000', '', 2), ('uberaba', '', 2), ('mg', '', 2)]

Entities [('av.antonio piranga 143', 'LOGRA')]
Tokens [('av.antonio', 'LOGRA', 3), ('piranga', 'LOGRA', 1), ('143', 'LOGRA', 1), ('centro', '', 2), ('09911-160', '', 2), ('diadema',

Entities [('pra�a santos dumont 225 qd. 41 a', 'LOGRA')]
Tokens [('pra', 'LOGRA', 3), ('�', 'LOGRA', 1), ('a', 'LOGRA', 1), ('santos', 'LOGRA', 1), ('dumont', 'LOGRA', 1), ('225', 'LOGRA', 1), ('qd', 'LOGRA', 1), ('.', 'LOGRA', 1), ('41', 'LOGRA', 1), ('a', 'LOGRA', 1), ('-', '', 2), ('lote', '', 2), ('12', '', 2), ('setor', '', 2), ('aeroporto', '', 2), ('74070-050', '', 2), ('goiania', '', 2), ('go', '', 2)]

Entities [('rua marechal deodoro 212', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('marechal', 'LOGRA', 1), ('deodoro', 'LOGRA', 1), ('212', 'LOGRA', 1), ('centro', '', 2), ('47500-000', '', 2), ('paratinga', '', 2), ('ba', '', 2)]

Entities [('r.jose cleto 28', 'LOGRA')]
Tokens [('r.jose', 'LOGRA', 3), ('cleto', 'LOGRA', 1), ('28', 'LOGRA', 1), ('palmares', '', 2), ('31160-470', '', 2), ('belo', '', 2), ('horizonte', '', 2), ('mg', '', 2)]

Entities [('pca. prado filho 30', 'LOGRA')]
Tokens [('pca', 'LOGRA', 3), ('.', 'LOGRA', 1), ('prado', 'LOGRA', 1), ('filho', 'LOGRA', 1), ('30'

Entities [('avenida matteo bei 2592 e 2598', 'LOGRA')]
Tokens [('avenida', 'LOGRA', 3), ('matteo', 'LOGRA', 1), ('bei', 'LOGRA', 1), ('2592', 'LOGRA', 1), ('e', 'LOGRA', 1), ('2598', 'LOGRA', 1), ('sao', '', 2), ('mateus', '', 2), ('03949-200', '', 2), ('sao', '', 2), ('paulo', '', 2), ('sp', '', 2)]

Entities [('rua bocai�va n� 1600', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('bocai', 'LOGRA', 1), ('�', 'LOGRA', 1), ('va', 'LOGRA', 1), ('n', 'LOGRA', 1), ('�', 'LOGRA', 1), ('1600', 'LOGRA', 1), ('centro', '', 2), ('88015-530', '', 2), ('florianopolis', '', 2), ('sc', '', 2)]

Entities [('praca emaculada conceicao 458', 'LOGRA')]
Tokens [('praca', 'LOGRA', 3), ('emaculada', 'LOGRA', 1), ('conceicao', 'LOGRA', 1), ('458', 'LOGRA', 1), ('centro', '', 2), ('46500-000', '', 2), ('macaubas', '', 2), ('ba', '', 2)]

Entities [('avenida barao de studart', 'LOGRA')]
Tokens [('avenida', 'LOGRA', 3), ('barao', 'LOGRA', 1), ('de', 'LOGRA', 1), ('studart', 'LOGRA', 1), ('meireles', '', 2), ('60120-0

Entities [('pra�a wilson sales 163', 'LOGRA')]
Tokens [('pra', 'LOGRA', 3), ('�', 'LOGRA', 1), ('a', 'LOGRA', 1), ('wilson', 'LOGRA', 1), ('sales', 'LOGRA', 1), ('163', 'LOGRA', 1), ('quadra', '', 2), ('586', '', 2), ('-', '', 2), ('lotes', '', 2), ('3', '', 2), ('e', '', 2), ('4', '', 2), ('-', '', 2), ('parte', '', 2), ('setor', '', 2), ('nova', '', 2), ('sui', '', 2), ('�', '', 2), ('a', '', 2), ('74280-370', '', 2), ('goiania', '', 2), ('go', '', 2)]

Entities [('rua presidente vargas 537', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('presidente', 'LOGRA', 1), ('vargas', 'LOGRA', 1), ('537', 'LOGRA', 1), ('centro', '', 2), ('17690-000', '', 2), ('bastos', '', 2), ('sp', '', 2)]

Entities [('rua mal. jos� bernardino bormann 1348', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('mal', 'LOGRA', 1), ('.', 'LOGRA', 1), ('jos', 'LOGRA', 1), ('�', 'LOGRA', 1), ('bernardino', 'LOGRA', 1), ('bormann', 'LOGRA', 1), ('1348', 'LOGRA', 1), ('bigorrilho', '', 2), ('80710-500', '', 2), ('curitiba', '', 2), 

Entities [('rua percilio santana 188', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('percilio', 'LOGRA', 1), ('santana', 'LOGRA', 1), ('188', 'LOGRA', 1), ('centro', '', 2), ('47990-000', '', 2), ('formosa', '', 2), ('do', '', 2), ('rio', '', 2), ('preto', '', 2), ('ba', '', 2)]

Entities [('rua cardoso de morais 60', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('cardoso', 'LOGRA', 1), ('de', 'LOGRA', 1), ('morais', 'LOGRA', 1), ('60', 'LOGRA', 1), ('loja', '', 2), ('a', '', 2), ('bonsucesso', '', 2), ('21032-000', '', 2), ('rio', '', 2), ('de', '', 2), ('janeiro', '', 2), ('rj', '', 2)]

Entities [('av. guajajaras n� 1.000', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('.', 'LOGRA', 1), ('guajajaras', 'LOGRA', 1), ('n', 'LOGRA', 1), ('�', 'LOGRA', 1), ('1.000', 'LOGRA', 1), ('tirirical', '', 2), ('65055-285', '', 2), ('sao', '', 2), ('luis', '', 2), ('ma', '', 2)]

Entities [('av. nossa senhora medianeira 1844', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('.', 'LOGRA', 1), ('nossa', 'LOGRA', 1), ('senhora

Entities [('rua 25 de agosto 500', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('25', 'LOGRA', 1), ('de', 'LOGRA', 1), ('agosto', 'LOGRA', 1), ('500', 'LOGRA', 1), ('centro', '', 2), ('85801-060', '', 2), ('cascavel', '', 2), ('pr', '', 2)]

Entities [('av. nossa sra. da penha 699', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('.', 'LOGRA', 1), ('nossa', 'LOGRA', 1), ('sra', 'LOGRA', 1), ('.', 'LOGRA', 1), ('da', 'LOGRA', 1), ('penha', 'LOGRA', 1), ('699', 'LOGRA', 1), ('ed', '', 2), ('.', '', 2), ('century', '', 2), ('towers', '', 2), ('praia', '', 2), ('do', '', 2), ('canto', '', 2), ('29055-131', '', 2), ('vitoria', '', 2), ('es', '', 2)]

Entities [('rua presidente nereu ramos', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('presidente', 'LOGRA', 1), ('nereu', 'LOGRA', 1), ('ramos', 'LOGRA', 1), ('centro', '', 2), ('88015-010', '', 2), ('florianopolis', '', 2), ('sc', '', 2)]

Entities [('av baltazar de oliveira garcia 943/955', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('baltazar', 'LOGRA', 1), ('de',

Entities [('rua dr. cesar castiglioni junior 186', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('dr', 'LOGRA', 1), ('.', 'LOGRA', 1), ('cesar', 'LOGRA', 1), ('castiglioni', 'LOGRA', 1), ('junior', 'LOGRA', 1), ('186', 'LOGRA', 1), ('conjunto', '', 2), ('2', '', 2), ('casa', '', 2), ('verde', '', 2), ('02515-000', '', 2), ('sao', '', 2), ('paulo', '', 2), ('sp', '', 2)]

Entities [('pca.orlando gomes dos santos 352', 'LOGRA')]
Tokens [('pca.orlando', 'LOGRA', 3), ('gomes', 'LOGRA', 1), ('dos', 'LOGRA', 1), ('santos', 'LOGRA', 1), ('352', 'LOGRA', 1), ('centro', '', 2), ('49200-000', '', 2), ('estancia', '', 2), ('se', '', 2)]

Entities [('rua 28 de agosto', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('28', 'LOGRA', 1), ('de', 'LOGRA', 1), ('agosto', 'LOGRA', 1), ('loja', '', 2), ('02', '', 2), ('centro', '', 2), ('89270-000', '', 2), ('guaramirim', '', 2), ('sc', '', 2)]

Entities [('r.vinte e nove de dezembro 909', 'LOGRA')]
Tokens [('r.vinte', 'LOGRA', 3), ('e', 'LOGRA', 1), ('nove', 'LOGRA', 

Entities [('r.dois s/', 'LOGRA')]
Tokens [('r.dois', 'LOGRA', 3), ('s', 'LOGRA', 1), ('/', 'LOGRA', 1), ('n', '', 2), ('rubiao', '', 2), ('jr', '', 2), ('18618-970', '', 2), ('botucatu', '', 2), ('sp', '', 2)]

Entities [('rua cel carlos pioli', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('cel', 'LOGRA', 1), ('carlos', 'LOGRA', 1), ('pioli', 'LOGRA', 1), ('centro', '', 2), ('83540-000', '', 2), ('rio', '', 2), ('branco', '', 2), ('do', '', 2), ('sul', '', 2), ('pr', '', 2)]

Entities [('rua francisco lindner 189', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('francisco', 'LOGRA', 1), ('lindner', 'LOGRA', 1), ('189', 'LOGRA', 1), ('centro', '', 2), ('89600-000', '', 2), ('joacaba', '', 2), ('sc', '', 2)]

Entities [('rua andrade neves 100', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('andrade', 'LOGRA', 1), ('neves', 'LOGRA', 1), ('100', 'LOGRA', 1), ('centro', '', 2), ('95330-000', '', 2), ('veranopolis', '', 2), ('rs', '', 2)]

Entities [('r. padre anchieta 1947', 'LOGRA')]
Tokens [('r.', 'LOGRA',

Entities [('rua heitor penteado 1.393', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('heitor', 'LOGRA', 1), ('penteado', 'LOGRA', 1), ('1.393', 'LOGRA', 1), ('sumarezinho', '', 2), ('05437-001', '', 2), ('sao', '', 2), ('paulo', '', 2), ('sp', '', 2)]

Entities [('av. dr pereira barreto', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('.', 'LOGRA', 1), ('dr', 'LOGRA', 1), ('pereira', 'LOGRA', 1), ('barreto', 'LOGRA', 1), ('centro', '', 2), ('69190-000', '', 2), ('maues', '', 2), ('am', '', 2)]

Entities [('rua padre jose marins 273', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('padre', 'LOGRA', 1), ('jose', 'LOGRA', 1), ('marins', 'LOGRA', 1), ('273', 'LOGRA', 1), ('centro', '', 2), ('18660-000', '', 2), ('pratania', '', 2), ('sp', '', 2)]

Entities [('rua sao joao del rey 150/rua uberlandia', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('sao', 'LOGRA', 1), ('joao', 'LOGRA', 1), ('del', 'LOGRA', 1), ('rey', 'LOGRA', 1), ('150', 'LOGRA', 1), ('/', 'LOGRA', 1), ('rua', 'LOGRA', 1), ('uberlandia', 'LOGRA', 1),

Entities [('avenida sete de setembro 2775', 'LOGRA')]
Tokens [('avenida', 'LOGRA', 3), ('sete', 'LOGRA', 1), ('de', 'LOGRA', 1), ('setembro', 'LOGRA', 1), ('2775', 'LOGRA', 1), ('lojas', '', 2), ('b25', '', 2), ('e', '', 2), ('b26', '', 2), ('centro', '', 2), ('80230-010', '', 2), ('curitiba', '', 2), ('pr', '', 2)]

Entities [('av.brasilia lojas 1 e 2', 'LOGRA')]
Tokens [('av.brasilia', 'LOGRA', 3), ('lojas', 'LOGRA', 1), ('1', 'LOGRA', 1), ('e', 'LOGRA', 1), ('2', 'LOGRA', 1), ('s', '', 2), ('�', '', 2), ('o', '', 2), ('benedito', '', 2), ('33110-580', '', 2), ('santa', '', 2), ('luzia', '', 2), ('mg', '', 2)]

Entities [('av dom lino', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('dom', 'LOGRA', 1), ('lino', 'LOGRA', 1), ('centro', '', 2), ('62900-000', '', 2), ('russas', '', 2), ('ce', '', 2)]

Entities [('r.dos pioneiros 400', 'LOGRA')]
Tokens [('r.dos', 'LOGRA', 3), ('pioneiros', 'LOGRA', 1), ('400', 'LOGRA', 1), ('centro', '', 2), ('88420-000', '', 2), ('agrolandia', '', 2), ('sc', ''

Entities [('av guapira 2117', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('guapira', 'LOGRA', 1), ('2117', 'LOGRA', 1), ('tucuruvi', '', 2), ('02265-002', '', 2), ('sao', '', 2), ('paulo', '', 2), ('sp', '', 2)]

Entities [('av. protasio alves', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('.', 'LOGRA', 1), ('protasio', 'LOGRA', 1), ('alves', 'LOGRA', 1), ('bom', '', 2), ('jesus', '', 2), ('91310-002', '', 2), ('porto', '', 2), ('alegre', '', 2), ('rs', '', 2)]

Entities [('pca.cel.jose vieira 176', 'LOGRA')]
Tokens [('pca.cel.jose', 'LOGRA', 3), ('vieira', 'LOGRA', 1), ('176', 'LOGRA', 1), ('centro', '', 2), ('37660-000', '', 2), ('paraisopolis', '', 2), ('mg', '', 2)]

Entities [('av.doca paraiba 100', 'LOGRA')]
Tokens [('av.doca', 'LOGRA', 3), ('paraiba', 'LOGRA', 1), ('100', 'LOGRA', 1), ('centro', '', 2), ('62670-000', '', 2), ('sao', '', 2), ('goncalo', '', 2), ('do', '', 2), ('amarante', '', 2), ('ce', '', 2)]

Entities [('avenida fernando correa da costa 1899', 'LOGRA')]
Tokens [('avenida

Entities [('r.lauro sodre 193', 'LOGRA')]
Tokens [('r.lauro', 'LOGRA', 3), ('sodre', 'LOGRA', 1), ('193', 'LOGRA', 1), ('centro', '', 2), ('68430-000', '', 2), ('igarape-miri', '', 2), ('pa', '', 2)]

Entities [('av.flores da cunha 1286', 'LOGRA')]
Tokens [('av.flores', 'LOGRA', 3), ('da', 'LOGRA', 1), ('cunha', 'LOGRA', 1), ('1286', 'LOGRA', 1), ('centro', '', 2), ('99500-000', '', 2), ('carazinho', '', 2), ('rs', '', 2)]

Entities [('rua viuva olinda watter', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('viuva', 'LOGRA', 1), ('olinda', 'LOGRA', 1), ('watter', 'LOGRA', 1), ('sala', '', 2), ('17', '', 2), ('centro', '', 2), ('99920-000', '', 2), ('erebango', '', 2), ('rs', '', 2)]

Entities [('av paes de barros 183', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('paes', 'LOGRA', 1), ('de', 'LOGRA', 1), ('barros', 'LOGRA', 1), ('183', 'LOGRA', 1), ('alto', '', 2), ('da', '', 2), ('mooca', '', 2), ('03115-020', '', 2), ('sao', '', 2), ('paulo', '', 2), ('sp', '', 2)]

Entities [('av.trifon hanysz 69

Entities [('av. brasil 660', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('.', 'LOGRA', 1), ('brasil', 'LOGRA', 1), ('660', 'LOGRA', 1), ('centro', '', 2), ('89887-000', '', 2), ('palmitos', '', 2), ('sc', '', 2)]

Entities [('r.coronel souza franco 1185', 'LOGRA')]
Tokens [('r.coronel', 'LOGRA', 3), ('souza', 'LOGRA', 1), ('franco', 'LOGRA', 1), ('1185', 'LOGRA', 1), ('1.andar', '', 2), ('centro', '', 2), ('08780-120', '', 2), ('mogi', '', 2), ('das', '', 2), ('cruzes', '', 2), ('sp', '', 2)]

Entities [('rua julio de castilhos 400', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('julio', 'LOGRA', 1), ('de', 'LOGRA', 1), ('castilhos', 'LOGRA', 1), ('400', 'LOGRA', 1), ('centro', '', 2), ('93510-130', '', 2), ('novo', '', 2), ('hamburgo', '', 2), ('rs', '', 2)]

Entities [('pca.da bandeira 138', 'LOGRA')]
Tokens [('pca.da', 'LOGRA', 3), ('bandeira', 'LOGRA', 1), ('138', 'LOGRA', 1), ('centro', '', 2), ('36830-000', '', 2), ('espera', '', 2), ('feliz', '', 2), ('mg', '', 2)]

Entities [('est do gale

Entities [('av.rio das pedras 1731', 'LOGRA')]
Tokens [('av.rio', 'LOGRA', 3), ('das', 'LOGRA', 1), ('pedras', 'LOGRA', 1), ('1731', 'LOGRA', 1), ('jd.aricanduva', '', 2), ('03453-100', '', 2), ('sao', '', 2), ('paulo', '', 2), ('sp', '', 2)]

Entities [('r.otto niemeyer 2539', 'LOGRA')]
Tokens [('r.otto', 'LOGRA', 3), ('niemeyer', 'LOGRA', 1), ('2539', 'LOGRA', 1), ('esq.c', '', 2), ('/', '', 2), ('silvio', '', 2), ('silveira', '', 2), ('soares', '', 2), ('cavalhada', '', 2), ('91910-971', '', 2), ('porto', '', 2), ('alegre', '', 2), ('rs', '', 2)]

Entities [('rua sete de setembro num 156', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('sete', 'LOGRA', 1), ('de', 'LOGRA', 1), ('setembro', 'LOGRA', 1), ('num', 'LOGRA', 1), ('156', 'LOGRA', 1), ('centro', '', 2), ('14770-000', '', 2), ('colina', '', 2), ('sp', '', 2)]

Entities [('praca capitao gabriel 235', 'LOGRA')]
Tokens [('praca', 'LOGRA', 3), ('capitao', 'LOGRA', 1), ('gabriel', 'LOGRA', 1), ('235', 'LOGRA', 1), ('centro', '', 2), ('07

Entities [('r.alvaro mendes 1313', 'LOGRA')]
Tokens [('r.alvaro', 'LOGRA', 3), ('mendes', 'LOGRA', 1), ('1313', 'LOGRA', 1), ('3', '', 2), ('andar', '', 2), ('centro', '', 2), ('64000-060', '', 2), ('teresina', '', 2), ('pi', '', 2)]

Entities [('av. papa jo�o paulo', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('.', 'LOGRA', 1), ('papa', 'LOGRA', 1), ('jo', 'LOGRA', 1), ('�', 'LOGRA', 1), ('o', 'LOGRA', 1), ('paulo', 'LOGRA', 1), ('i', '', 2), ('cidade', '', 2), ('parque', '', 2), ('s', '', 2), ('�', '', 2), ('o', '', 2), ('luiz', '', 2), ('07170-385', '', 2), ('guarulhos', '', 2), ('sp', '', 2)]

Entities [('praca 24 de outubro 06', 'LOGRA')]
Tokens [('praca', 'LOGRA', 3), ('24', 'LOGRA', 1), ('de', 'LOGRA', 1), ('outubro', 'LOGRA', 1), ('06', 'LOGRA', 1), ('centro', '', 2), ('49200-000', '', 2), ('estancia', '', 2), ('se', '', 2)]

Entities [('avenida luis de camoes num 575', 'LOGRA')]
Tokens [('avenida', 'LOGRA', 3), ('luis', 'LOGRA', 1), ('de', 'LOGRA', 1), ('camoes', 'LOGRA', 1), ('num

Entities [('av.antonio carlos comitre n� 525', 'LOGRA')]
Tokens [('av.antonio', 'LOGRA', 3), ('carlos', 'LOGRA', 1), ('comitre', 'LOGRA', 1), ('n', 'LOGRA', 1), ('�', 'LOGRA', 1), ('525', 'LOGRA', 1), ('conjuntos', '', 2), ('22', '', 2), ('e', '', 2), ('23', '', 2), ('parque', '', 2), ('campolim', '', 2), ('18047-620', '', 2), ('sorocaba', '', 2), ('sp', '', 2)]

Entities [('av. das palmeiras 522', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('.', 'LOGRA', 1), ('das', 'LOGRA', 1), ('palmeiras', 'LOGRA', 1), ('522', 'LOGRA', 1), ('palmeiropolis', '', 2), ('77365-000', '', 2), ('palmeiropolis', '', 2), ('to', '', 2)]

Entities [('av.sao miguel 5092', 'LOGRA')]
Tokens [('av.sao', 'LOGRA', 3), ('miguel', 'LOGRA', 1), ('5092', 'LOGRA', 1), ('jd.cotinha', '', 2), ('03870-100', '', 2), ('sao', '', 2), ('paulo', '', 2), ('sp', '', 2)]

Entities [('rua marechal floriano 237', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('marechal', 'LOGRA', 1), ('floriano', 'LOGRA', 1), ('237', 'LOGRA', 1), ('centro', '',

Entities [('srtvs quadra 701 conjunto d', 'LOGRA')]
Tokens [('srtvs', 'LOGRA', 3), ('quadra', 'LOGRA', 1), ('701', 'LOGRA', 1), ('conjunto', 'LOGRA', 1), ('d', 'LOGRA', 1), ('bloco', '', 2), ('b', '', 2), ('loja', '', 2), ('242', '', 2), ('parte', '', 2), ('a', '', 2), ('setor', '', 2), ('asa', '', 2), ('sul', '', 2), ('70340-000', '', 2), ('brasilia', '', 2), ('df', '', 2)]

Entities [('r.pe.pedro pinto 6711', 'LOGRA')]
Tokens [('r.pe.pedro', 'LOGRA', 3), ('pinto', 'LOGRA', 1), ('6711', 'LOGRA', 1), ('lagoa', '', 2), ('31570-000', '', 2), ('belo', '', 2), ('horizonte', '', 2), ('mg', '', 2)]

Entities [('r 15 de novembro 608', 'LOGRA')]
Tokens [('r', 'LOGRA', 3), ('15', 'LOGRA', 1), ('de', 'LOGRA', 1), ('novembro', 'LOGRA', 1), ('608', 'LOGRA', 1), ('centro', '', 2), ('79002-140', '', 2), ('campo', '', 2), ('grande', '', 2), ('ms', '', 2)]

Entities [('pca da catedral', 'LOGRA')]
Tokens [('pca', 'LOGRA', 3), ('da', 'LOGRA', 1), ('catedral', 'LOGRA', 1), ('centro', '', 2), ('13400-150'

Entities [('r.sta.catarina 1381', 'LOGRA')]
Tokens [('r.sta.catarina', 'LOGRA', 3), ('1381', 'LOGRA', 1), ('centro', '', 2), ('18700-005', '', 2), ('avare', '', 2), ('sp', '', 2)]

Entities [('rua conselheiro antonio prado 56', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('conselheiro', 'LOGRA', 1), ('antonio', 'LOGRA', 1), ('prado', 'LOGRA', 1), ('56', 'LOGRA', 1), ('centro', '', 2), ('16200-000', '', 2), ('birigui', '', 2), ('sp', '', 2)]

Entities [('rua pe. valdevino nogueira 2244', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('pe', 'LOGRA', 1), ('.', 'LOGRA', 1), ('valdevino', 'LOGRA', 1), ('nogueira', 'LOGRA', 1), ('2244', 'LOGRA', 1), ('centro', '', 2), ('62850-000', '', 2), ('cascavel', '', 2), ('ce', '', 2)]

Entities [('rua barao de itapetininga 298', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('barao', 'LOGRA', 1), ('de', 'LOGRA', 1), ('itapetininga', 'LOGRA', 1), ('298', 'LOGRA', 1), ('centro', '', 2), ('01042-000', '', 2), ('sao', '', 2), ('paulo', '', 2), ('sp', '', 2)]

Entities [('av

Entities [('av. jose de alencar 614', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('.', 'LOGRA', 1), ('jose', 'LOGRA', 1), ('de', 'LOGRA', 1), ('alencar', 'LOGRA', 1), ('614', 'LOGRA', 1), ('menino', '', 2), ('deus', '', 2), ('90880-480', '', 2), ('porto', '', 2), ('alegre', '', 2), ('rs', '', 2)]

Entities [('avenida vital brasil', 'LOGRA')]
Tokens [('avenida', 'LOGRA', 3), ('vital', 'LOGRA', 1), ('brasil', 'LOGRA', 1), ('butanta', '', 2), ('05503-000', '', 2), ('sao', '', 2), ('paulo', '', 2), ('sp', '', 2)]

Entities [('av. rio maria 646/av.8 dist', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('.', 'LOGRA', 1), ('rio', 'LOGRA', 1), ('maria', 'LOGRA', 1), ('646', 'LOGRA', 1), ('/', 'LOGRA', 1), ('av.8', 'LOGRA', 1), ('dist', 'LOGRA', 1), ('rio', '', 2), ('maria', '', 2), ('68540-000', '', 2), ('rio', '', 2), ('maria', '', 2), ('pa', '', 2)]

Entities [('rua capitao eleuterio num 429', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('capitao', 'LOGRA', 1), ('eleuterio', 'LOGRA', 1), ('num', 'LOGRA', 1), 

Entities [('rua 15 de novembro', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('15', 'LOGRA', 1), ('de', 'LOGRA', 1), ('novembro', 'LOGRA', 1), ('centro', '', 2), ('96015-000', '', 2), ('pelotas', '', 2), ('rs', '', 2)]

Entities [('av. manoel guimar�es 195', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('.', 'LOGRA', 1), ('manoel', 'LOGRA', 1), ('guimar', 'LOGRA', 1), ('�', 'LOGRA', 1), ('es', 'LOGRA', 1), ('195', 'LOGRA', 1), ('jos', '', 2), ('�', '', 2), ('pinheiro', '', 2), ('58407-363', '', 2), ('campina', '', 2), ('grande', '', 2), ('pb', '', 2)]

Entities [('rodovia d. pedro i km 131', 'LOGRA')]
Tokens [('rodovia', 'LOGRA', 3), ('d.', 'LOGRA', 1), ('pedro', 'LOGRA', 1), ('i', 'LOGRA', 1), ('km', 'LOGRA', 1), ('131', 'LOGRA', 1), ('5', '', 2), ('jardim', '', 2), ('nilopolis', '', 2), ('13097-100', '', 2), ('campinas', '', 2), ('sp', '', 2)]

Entities [('av.fabricio de oliveira pillar 589', 'LOGRA')]
Tokens [('av.fabricio', 'LOGRA', 3), ('de', 'LOGRA', 1), ('oliveira', 'LOGRA', 1), ('pillar', 

Entities [('av. ibirapuera 2220', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('.', 'LOGRA', 1), ('ibirapuera', 'LOGRA', 1), ('2220', 'LOGRA', 1), ('indianopolis', '', 2), ('04028-001', '', 2), ('sao', '', 2), ('paulo', '', 2), ('sp', '', 2)]

Entities [('av paulista 2083 2093 loja 105/106/131a1', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('paulista', 'LOGRA', 1), ('2083', 'LOGRA', 1), ('2093', 'LOGRA', 1), ('loja', 'LOGRA', 1), ('105/106/131a1', 'LOGRA', 1), ('bela', '', 2), ('vista', '', 2), ('01311-940', '', 2), ('sao', '', 2), ('paulo', '', 2), ('sp', '', 2)]

Entities [('rua luiz barreto', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('luiz', 'LOGRA', 1), ('barreto', 'LOGRA', 1), ('centro', '', 2), ('95840-000', '', 2), ('triunfo', '', 2), ('rs', '', 2)]

Entities [('estrada do gale�o 2751', 'LOGRA')]
Tokens [('estrada', 'LOGRA', 3), ('do', 'LOGRA', 1), ('gale', 'LOGRA', 1), ('�', 'LOGRA', 1), ('o', 'LOGRA', 1), ('2751', 'LOGRA', 1), ('loja', '', 2), ('d', '', 2), ('-', '', 2), ('parte', '', 2), 

Entities []
Tokens [(' ', '', 2), ('av', '', 2), ('.', '', 2), ('francisco', '', 2), ('de', '', 2), ('paula', '', 2), ('leite', '', 2), ('2163', '', 2), ('jardim', '', 2), ('kioto', '', 2), ('ii', '', 2), ('13344-700', '', 2), ('indaiatuba', '', 2), ('sp', '', 2)]

Entities [('av. jones do santos neves 47', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('.', 'LOGRA', 1), ('jones', 'LOGRA', 1), ('do', 'LOGRA', 1), ('santos', 'LOGRA', 1), ('neves', 'LOGRA', 1), ('47', 'LOGRA', 1), ('centro', '', 2), ('29930-015', '', 2), ('sao', '', 2), ('mateus', '', 2), ('es', '', 2)]

Entities [('praca melo viana', 'LOGRA')]
Tokens [('praca', 'LOGRA', 3), ('melo', 'LOGRA', 1), ('viana', 'LOGRA', 1), ('centro', '', 2), ('39270-000', '', 2), ('pirapora', '', 2), ('mg', '', 2)]

Entities [('r.rio jari 1043', 'LOGRA')]
Tokens [('r.rio', 'LOGRA', 3), ('jari', 'LOGRA', 1), ('1043', 'LOGRA', 1), ('em', '', 2), ('frente', '', 2), ('a', '', 2), ('o', '', 2), ('superm.ideal', '', 2), ('agreste', '', 2), ('68920-000', '

Entities [('rua da integra��o vereador bernardino pucci 2025', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('da', 'LOGRA', 1), ('integra', 'LOGRA', 1), ('�', 'LOGRA', 1), ('�', 'LOGRA', 1), ('o', 'LOGRA', 1), ('vereador', 'LOGRA', 1), ('bernardino', 'LOGRA', 1), ('pucci', 'LOGRA', 1), ('2025', 'LOGRA', 1), ('jardim', '', 2), ('integra', '', 2), ('�', '', 2), ('�', '', 2), ('o', '', 2), ('14405-265', '', 2), ('franca', '', 2), ('sp', '', 2)]

Entities [('av inglaterra', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('inglaterra', 'LOGRA', 1), ('jd', '', 2), ('sao', '', 2), ('vicente', '', 2), ('86046-000', '', 2), ('londrina', '', 2), ('pr', '', 2)]

Entities [('rua amando de barros 664', 'LOGRA')]
Tokens [('rua', 'LOGRA', 3), ('amando', 'LOGRA', 1), ('de', 'LOGRA', 1), ('barros', 'LOGRA', 1), ('664', 'LOGRA', 1), ('parte', '', 2), ('centro', '', 2), ('18600-050', '', 2), ('botucatu', '', 2), ('sp', '', 2)]

Entities [('pca prof jose azevedo antunes 48', 'LOGRA')]
Tokens [('pca', 'LOGRA', 3), ('prof

In [None]:
base_teste_final[0]

In [None]:
phrase = "SQSW 102, bloco D - O juventino foi na rua comprar pao "

doc = nlp(phrase)
print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

In [None]:
from spacy.gold import GoldParse
from spacy.scorer import Scorer

def evaluate(ner_model, examples):
    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot)
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores

In [None]:
# save model to output directory
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

In [None]:
loaded_model = spacy.load(output_dir)
evaluate(loaded_model, base_teste_final)