In [1]:
# Load Packages
from __future__ import unicode_literals, print_function

import plac #  wrapper over argparse
import random
from pathlib import Path
import spacy
from tqdm import tqdm, tqdm_notebook # loading bar 
import pandas as pd
import re
from pprint import pprint
from nltk.tokenize import sent_tokenize
import numpy as np

from spacy.util import minibatch, compounding

# Abrindo e gerando Datasets

### Entidade logradouro

In [2]:
# Obtendo dados crus de endereço

dataset = pd.read_csv("201906AGENCIAS.CSV")
dset = dataset.iloc[:-2,4:10].values  # tipo nparray object
pd_dset = pd.DataFrame(dset) # caso queira visualizar

pd_dset['Join'] = pd_dset[pd_dset.columns[0:]].apply(
    lambda x: ';'.join(x.dropna().astype(str)),
    axis=1
)
pd_dset.head(5)

Unnamed: 0,0,1,2,3,4,5,Join
0,"R.GUILHERME MOREIRA,315","SUBLOJA,LOJA E 2.ANDAR ...",CENTRO,69005-300,MANAUS ...,AM,"R.GUILHERME MOREIRA,315 ;SUBLO..."
1,"AV.PRES.VARGAS,248",1.E 2.ANDARES ...,CAMPINA,66010-900,BELEM ...,PA,"AV.PRES.VARGAS,248 ;1.E 2..."
2,"R.QUINZE DE NOVEMBRO,195",...,CENTRO,11010-908,SANTOS ...,SP,"R.QUINZE DE NOVEMBRO,195 ; ..."
3,"PCA.DAS QUATRO JORNADAS,11",MEZANINO ...,CENTRO,28010-000,CAMPOS DOS GOYTACAZES ...,RJ,"PCA.DAS QUATRO JORNADAS,11 ;MEZAN..."
4,"SEXTA AVENIDA,600",SECRETARIA DA EDUCACAO-TERREO ...,CAB,41745-002,SALVADOR ...,BA,"SEXTA AVENIDA,600 ;SECRE..."


In [3]:
# Tratando endereço completo do DataFrame
dset = np.array(pd_dset)
end_lista = []

for i in range(len(dset)):
    str_raw = dset[i][6]
    str_tratada = re.sub(r'[ ]{2,}', "",str_raw) # Tirando espaços excedentes no final do endereço
    str_tratada = re.sub(r'[;]{1,}', "; ", str_tratada) # Para complementos vazios, para não ter 2 ";"
    str_tratada = str_tratada.lower()
    
    end_lista.append(str_tratada)

end_lista[:3]

['r.guilherme moreira,315; subloja,loja e 2.andar; centro; 69005-300; manaus; am',
 'av.pres.vargas,248; 1.e 2.andares; campina; 66010-900; belem; pa',
 'r.quinze de novembro,195; centro; 11010-908; santos; sp']

In [4]:
# Lógica para extrair posição do logradouro:
# Dividir a String inteira por ";", pegar o len do primeiro split
# len de LOGRA é de 0 até len do primeiro split

print(end_lista[0])
split = end_lista[0].split(";")
print(split)
print(split[0])
print(len(split[0]))

r.guilherme moreira,315; subloja,loja e 2.andar; centro; 69005-300; manaus; am
['r.guilherme moreira,315', ' subloja,loja e 2.andar', ' centro', ' 69005-300', ' manaus', ' am']
r.guilherme moreira,315
23


In [5]:
# Extraindo iob com entidade apenas de logradouro

iob = []

for i in range(len(end_lista)):
    split = end_lista[i].split(";")
    len_logra = len(split[0])
    iob_dict = {"entities": [(0, len_logra-1,'LOGRA')]}
    end_lista[i] = end_lista[i].replace(";", " ")
    end_lista[i] = end_lista[i].replace(",", " ")
    end_lista[i] = re.sub(r'[ ]{2,}', " ", end_lista[i])

    
    tupla = (end_lista[i], iob_dict)
    iob.append(tupla)

FULL_DATA = iob

In [6]:
print(FULL_DATA[0])
print(FULL_DATA[1])
print(FULL_DATA[2])
print(FULL_DATA[5:15])

('r.guilherme moreira 315 subloja loja e 2.andar centro 69005-300 manaus am', {'entities': [(0, 22, 'LOGRA')]})
('av.pres.vargas 248 1.e 2.andares campina 66010-900 belem pa', {'entities': [(0, 17, 'LOGRA')]})
('r.quinze de novembro 195 centro 11010-908 santos sp', {'entities': [(0, 23, 'LOGRA')]})
[('av.rio branco 240 1.andar recife antigo 50030-310 recife pe', {'entities': [(0, 16, 'LOGRA')]}), ('av.santos dumont 2828 5.andar aldeota 60150-162 fortaleza ce', {'entities': [(0, 20, 'LOGRA')]}), ('pca.tiradentes 410 1.andar centro 80020-100 curitiba pr', {'entities': [(0, 17, 'LOGRA')]}), ('r.uruguai 185 5.andar centro 90010-901 porto alegre rs', {'entities': [(0, 12, 'LOGRA')]}), ('pca.1817 129 1.andar centro 58013-010 joao pessoa pb', {'entities': [(0, 11, 'LOGRA')]}), ('pca.odilon resende andrade 76 centro 37410-000 tres coracoes mg', {'entities': [(0, 28, 'LOGRA')]}), ('av fernandes lima 2591 terreo farol 57057-972 maceio al', {'entities': [(0, 22, 'LOGRA')]}), ('r.treze de junho 91

In [7]:
# Criação da base de teste e treinamento

n_test= 0.1 # Porcentagem para base de teste
test_n = round(len(FULL_DATA) * n_test)

# Divisao em Train Test Val

def gerador_bases(dataset, n):
    indices_random = random.sample(range(0,len(dataset)-1), n)
    base_teste_n = []
    base_treinamento_n = []
    
    for i in range(n):
        base_teste_n.append(dataset[indices_random[i]])

    for j in range(len(dataset)):
        if(j not in indices_random):
            base_treinamento_n.append(dataset[j])
            
    return base_teste_n, base_treinamento_n


base_teste, base_treinamento = gerador_bases(FULL_DATA, test_n)

random.shuffle(base_treinamento)
random.shuffle(base_teste)

print("Treinamento: " + str(len(base_treinamento)), "\nTeste: " + str(len(base_teste)), "\nTotal: " + str(len(FULL_DATA)))

Treinamento: 1800 
Teste: 200 
Total: 2000


In [8]:
for i in range(5):
    print(base_treinamento[i])

('r.epaminondas otoni 655 centro 39800-013 teofilo otoni mg', {'entities': [(0, 22, 'LOGRA')]})
('r.manoel teixeira 104 centro 62690-000 trairi ce', {'entities': [(0, 20, 'LOGRA')]})
('av.treze de maio 1271 fatima 60040-531 fortaleza ce', {'entities': [(0, 20, 'LOGRA')]})
('r.mal.deodoro 250 centro 36800-000 carangola mg', {'entities': [(0, 16, 'LOGRA')]})
('r.osvaldo aranha 1173 centro 95800-000 venancio aires rs', {'entities': [(0, 20, 'LOGRA')]})


# Carregando o modelo

In [9]:
# Define our variables

model = None
output_dir=Path(".")
n_iter= 100 # número de épocas
batch_size = 64

In [10]:
# Setting up the pipeline and entity recognizer.
if model is not None:
    nlp = spacy.load(model)  # load existing spacy model
    print("Loaded model '%s'" % model)
else:
    lan = 'pt'
    nlp = spacy.blank(lan)  # create blank Language class
    print("Created blank '%s' model" % lan)
    
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
    print('Added new NER')
else:
    ner = nlp.get_pipe('ner')
    print('Got an old NER')

Created blank 'pt' model
Added new NER


In [11]:
# create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
else:
    ner = nlp.get_pipe('ner')

## Treinamento

In [12]:
print("Batch size: ", batch_size)
print("Épocas: ", n_iter)
print()

# add labels
for _, annotations in base_treinamento:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])


# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(base_treinamento)
        losses = {}
        batches = minibatch(base_treinamento, size=batch_size)
        
        for batch in batches:
            texts, annotations = zip(*batch)
            try:
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)
            except:
                pass     
        
        print(itn+1, ' Losses', losses)        

Batch size:  64
Épocas:  100

1  Losses {'ner': 1559.5595977943287}
2  Losses {'ner': 1490.0462197471606}
3  Losses {'ner': 1134.8614108696947}
4  Losses {'ner': 687.8915019851787}
5  Losses {'ner': 1376.311012612151}
6  Losses {'ner': 2280.8992327513965}
7  Losses {'ner': 1927.3640766712306}
8  Losses {'ner': 1283.797549846296}
9  Losses {'ner': 1624.361715951236}
10  Losses {'ner': 2852.494131278356}
11  Losses {'ner': 2104.7023815662833}
12  Losses {'ner': 2805.3739378665923}
13  Losses {'ner': 1256.035228499821}
14  Losses {'ner': 1200.4833045062624}
15  Losses {'ner': 535.1601272814876}
16  Losses {'ner': 215.59785919485748}
17  Losses {'ner': 133.67288609961608}
18  Losses {'ner': 89.57682846451706}
19  Losses {'ner': 65.54582043993138}
20  Losses {'ner': 33.82947820466246}
21  Losses {'ner': 49.49752629896843}
22  Losses {'ner': 34.20698050746046}
23  Losses {'ner': 35.13046170945543}
24  Losses {'ner': 23.74079663874369}
25  Losses {'ner': 41.14272138716639}
26  Losses {'ner': 

In [13]:
# test the trained model
for text, _ in base_teste:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
    print()

Entities [('av.barao do rio branco 2108', 'LOGRA')]
Tokens [('av.barao', 'LOGRA', 3), ('do', 'LOGRA', 1), ('rio', 'LOGRA', 1), ('branco', 'LOGRA', 1), ('2108', 'LOGRA', 1), ('terreo', '', 2), ('centro', '', 2), ('68743-050', '', 2), ('castanhal', '', 2), ('pa', '', 2)]

Entities [('av. presidente getulio vargas 2230', 'LOGRA')]
Tokens [('av', 'LOGRA', 3), ('.', 'LOGRA', 1), ('presidente', 'LOGRA', 1), ('getulio', 'LOGRA', 1), ('vargas', 'LOGRA', 1), ('2230', 'LOGRA', 1), ('rec', '', 2), ('lago', '', 2), ('45987-900', '', 2), ('teixeira', '', 2), ('de', '', 2), ('freitas', '', 2), ('ba', '', 2)]

Entities [('av.do engenho 309', 'LOGRA')]
Tokens [('av.do', 'LOGRA', 3), ('engenho', 'LOGRA', 1), ('309', 'LOGRA', 1), ('centro', '', 2), ('89893-000', '', 2), ('mondai', '', 2), ('sc', '', 2)]

Entities [('r.comandai 500 centro', 'LOGRA')]
Tokens [('r.comandai', 'LOGRA', 3), ('500', 'LOGRA', 1), ('centro', 'LOGRA', 1), ('97950-000', '', 2), ('guarani', '', 2), ('das', '', 2), ('missoes', '', 2

Entities [('r.des.souto maior 162', 'LOGRA')]
Tokens [('r.des.souto', 'LOGRA', 3), ('maior', 'LOGRA', 1), ('162', 'LOGRA', 1), ('centro', '', 2), ('58013-190', '', 2), ('joao', '', 2), ('pessoa', '', 2), ('pb', '', 2)]

Entities [('lote 4 loja 01', 'LOGRA')]
Tokens [('lote', 'LOGRA', 3), ('4', 'LOGRA', 1), ('loja', 'LOGRA', 1), ('01', 'LOGRA', 1), ('a', '', 2), ('101', '', 2), ('setor', '', 2), ('hoteleiro', '', 2), ('central', '', 2), ('st', '', 2), ('central', '', 2), ('(', '', 2), ('gama', '', 2), (')', '', 2), ('72405-604', '', 2), ('brasilia', '', 2), ('(', '', 2), ('gama', '', 2), (')', '', 2), ('df', '', 2)]

Entities [('av.benjamin pinto dias 1762', 'LOGRA')]
Tokens [('av.benjamin', 'LOGRA', 3), ('pinto', 'LOGRA', 1), ('dias', 'LOGRA', 1), ('1762', 'LOGRA', 1), ('centro', '', 2), ('26130-000', '', 2), ('belford', '', 2), ('roxo', '', 2), ('rj', '', 2)]

Entities [('av.brasil 1005', 'LOGRA')]
Tokens [('av.brasil', 'LOGRA', 3), ('1005', 'LOGRA', 1), ('centro', '', 2), ('99170-000

Entities [('av.trifon hanysz 189', 'LOGRA')]
Tokens [('av.trifon', 'LOGRA', 3), ('hanysz', 'LOGRA', 1), ('189', 'LOGRA', 1), ('centro', '', 2), ('85170-000', '', 2), ('pinhao', '', 2), ('pr', '', 2)]

Entities [('r.alfredo winck 758', 'LOGRA')]
Tokens [('r.alfredo', 'LOGRA', 3), ('winck', 'LOGRA', 1), ('758', 'LOGRA', 1), ('centro', '', 2), ('99530-000', '', 2), ('chapada', '', 2), ('rs', '', 2)]

Entities [('r.felipe schmidt 312', 'LOGRA')]
Tokens [('r.felipe', 'LOGRA', 3), ('schmidt', 'LOGRA', 1), ('312', 'LOGRA', 1), ('esq.c', '', 2), ('/', '', 2), ('r.major', '', 2), ('vieira', '', 2), ('centro', '', 2), ('89460-000', '', 2), ('canoinhas', '', 2), ('sc', '', 2)]

Entities [('av.nogueira acioly 1495', 'LOGRA')]
Tokens [('av.nogueira', 'LOGRA', 3), ('acioly', 'LOGRA', 1), ('1495', 'LOGRA', 1), ('terreo', '', 2), ('centro', '', 2), ('63430-000', '', 2), ('ico', '', 2), ('ce', '', 2)]

Entities [('r.dr.joao pessoa 677', 'LOGRA')]
Tokens [('r.dr.joao', 'LOGRA', 3), ('pessoa', 'LOGRA', 1

In [None]:
base_teste_final[0]

In [None]:
phrase = "rua leonardo de freitas, 1444,   este levantamento de preços que inclui também produtos de higiene e de limpeza mostra recuo de 1,58 % em relação à sexta-feira anterior \n "

doc = nlp(phrase)
print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

In [None]:
from spacy.gold import GoldParse
from spacy.scorer import Scorer

def evaluate(ner_model, examples):
    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot)
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores

In [None]:
# save model to output directory
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

In [None]:
loaded_model = spacy.load(output_dir)
evaluate(loaded_model, base_teste_final)