In [1]:
# Load Packages
from __future__ import unicode_literals, print_function

import random
import numpy as np
import unicodedata
import pickle
import spacy
import pandas as pd
import re

from spacy.util import minibatch, compounding
from nltk.tokenize import sent_tokenize

In [2]:
# Helper that replaces accented characters with their unaccented ASCII form

def strip_accents(text):
    """Return ``text`` with accents removed and all non-ASCII characters dropped.

    Args:
        text: A text string, or a UTF-8 encoded byte string.

    Returns:
        A ``str`` containing only ASCII characters. Accented letters are
        reduced to their base letter; characters that have no ASCII base
        (after NFD decomposition) are removed entirely, so the result may be
        shorter than the input.

    Raises:
        TypeError: If ``text`` is neither a text string nor a byte string.
        UnicodeDecodeError: If ``text`` is a byte string that is not valid UTF-8.
    """
    # Decode byte strings explicitly. The previous `unicode(...)` guard only
    # worked on Python 2; on Python 3 a bytes argument reached
    # unicodedata.normalize and raised TypeError.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    # NFD splits each accented character into a base letter plus combining marks.
    text = unicodedata.normalize('NFD', text)
    # Encoding to ASCII with 'ignore' discards the combining marks and any
    # other character that is not plain ASCII.
    text = text.encode('ascii', 'ignore').decode('ascii')

    return str(text)

# Abrindo e gerando Datasets

### Entidade logradouro

In [3]:
# Read the pickled dataset from disk.
# NOTE: pickle.load can execute arbitrary code, so only open dataset files
# that come from a trusted source.
with open('../Datasets/Dataset_V2.0.txt', 'rb') as arquivo_dataset:
    dataset = pickle.load(arquivo_dataset)

# Report how many records were loaded
print(len(dataset))

128016


In [4]:
# Peek at the first record: a (text, annotations) tuple whose 'entities'
# list holds (start, end, label) character spans.
dataset[0]

('R.GUILHERME MOREIRA,315 SUBLOJA,LOJA E 2.ANDAR 69005-300 MANAUS CENTRO AM',
 {'entities': [(0, 23, 'END_LOG')]})

In [5]:
# Normalize the text of every record: lowercase first, then strip accents
# and any other non-ASCII characters.
end_lista = [strip_accents(registro[0].lower()) for registro in dataset]

In [6]:
# Show the first three normalized address strings
end_lista[:3]

['r.guilherme moreira,315 subloja,loja e 2.andar 69005-300 manaus centro am',
 '69005-300 manaus am r.guilherme moreira,315 centro subloja,loja e 2.andar',
 'r.guilherme moreira,315 centro 69005-300 subloja,loja e 2.andar manaus am']

In [7]:
# Pair each normalized text with the untouched annotations of the same record.
# NOTE(review): the entity offsets are reused as-is, which presumes the
# normalization kept every string the same length. strip_accents drops
# characters that have no ASCII form, so verify the offsets still line up.
iob = [(end_lista[pos], registro[1]) for pos, registro in enumerate(dataset)]

FULL_DATA = iob

In [8]:
# Print the first three (text, annotations) pairs as a sanity check
for posicao in range(3):
    print(FULL_DATA[posicao])

('r.guilherme moreira,315 subloja,loja e 2.andar 69005-300 manaus centro am', {'entities': [(0, 23, 'END_LOG')]})
('69005-300 manaus am r.guilherme moreira,315 centro subloja,loja e 2.andar', {'entities': [(20, 43, 'END_LOG')]})
('r.guilherme moreira,315 centro 69005-300 subloja,loja e 2.andar manaus am', {'entities': [(0, 23, 'END_LOG')]})


In [9]:
# Build the test and training sets

n_test= 0.1 # Fraction of the full dataset reserved for the test set
test_n = round(len(FULL_DATA) * n_test)  # number of examples to draw for the test set

# Split into training and test sets (no separate validation set is produced)

def gerador_bases(dataset, n, seed=None):
    """Randomly split ``dataset`` into a test set of ``n`` items and a training set.

    Args:
        dataset: Sequence of examples supporting ``len`` and integer indexing.
        n: Number of examples to draw for the test set; must satisfy
            ``0 <= n <= len(dataset)``.
        seed: Optional seed for a private random generator, making the split
            reproducible. When ``None`` (the default) the module-level
            ``random`` state is used.

    Returns:
        A tuple ``(base_teste_n, base_treinamento_n)``: the test examples in
        the order they were drawn, and all remaining examples in their
        original order.

    Raises:
        ValueError: If ``n`` is negative or larger than ``len(dataset)``.
    """
    rng = random if seed is None else random.Random(seed)

    # Sample over every valid index. The previous range(0, len(dataset) - 1)
    # excluded the last index, so the last example could never land in the
    # test set and n == len(dataset) raised ValueError.
    indices_random = rng.sample(range(len(dataset)), n)

    # A set makes the membership check below O(1); with a list the filter
    # was O(len(dataset) * n).
    indices_teste = set(indices_random)

    base_teste_n = [dataset[i] for i in indices_random]
    base_treinamento_n = [
        dataset[j] for j in range(len(dataset)) if j not in indices_teste
    ]

    return base_teste_n, base_treinamento_n


# Fix the RNG state so the split and the shuffles below are identical on
# every run. Without this, re-running the notebook up to this point and then
# loading a previously saved model draws a different base_teste, which can
# contain examples that model was trained on.
random.seed(42)

base_teste, base_treinamento = gerador_bases(FULL_DATA, test_n)

random.shuffle(base_treinamento)
random.shuffle(base_teste)

print("Treinamento: " + str(len(base_treinamento)), "\nTeste: " + str(len(base_teste)), "\nTotal: " + str(len(FULL_DATA)))

Treinamento: 115214 
Teste: 12802 
Total: 128016


In [10]:
# Show the first five training examples after shuffling
for posicao in range(5):
    print(base_treinamento[posicao])

('av. guajajaras, 200 tirirical ma 65055-285 quadra 220 - lojas 01 e 02 sao luis', {'entities': [(0, 19, 'END_LOG')]})
('cambe centro avenida inglaterra  86181-000 pr', {'entities': [(13, 31, 'END_LOG')]})
('centro sp mogi das cruzes 08710-500 av.vol.fernando p.franco,432 sala exclusivo', {'entities': [(36, 64, 'END_LOG')]})
('79965-000 itaquirai ms av.treze de maio,393 centro', {'entities': [(24, 44, 'END_LOG')]})
('rua dos andradas 1143 90020-015 rs porto alegre centro', {'entities': [(0, 21, 'END_LOG')]})


# Criando o modelo

In [11]:
# Training configuration

model = None  # name/path of an existing spaCy model to load; None starts from a blank model
n_iter= 250 # number of epochs
batch_size = 64  # number of examples per minibatch passed to nlp.update

In [12]:
# Build the spaCy pipeline: start from a blank model unless an existing
# one was configured in `model`.
if model is None:
    lan = 'pt'
    nlp = spacy.blank(lan)  # blank Language class for the 'pt' language code
    print("Created blank '%s' model" % lan)
else:
    nlp = spacy.load(model)  # reuse an existing spaCy model
    print("Loaded model '%s'" % model)

# Make sure the pipeline has an entity recognizer and keep a handle to it.
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
    print('Got an old NER')
else:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
    print('Added new NER')

Created blank 'pt' model
Added new NER


In [13]:
# Fetch the NER component so labels can be added to it; create and append it
# to the end of the pipeline only when it is missing.
# (When the preceding setup cell has already run, 'ner' is present and this
# simply retrieves it.)
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    # nlp.create_pipe works for built-ins that are registered with spaCy
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)

## Treinamento

In [14]:
print("Batch size: ", batch_size)
print("Épocas: ", n_iter)
print()

# Register every entity label found in the training annotations with the NER component
for _, annotations in base_treinamento:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])


# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        # Reshuffle in place so each epoch sees the batches in a new order
        random.shuffle(base_treinamento)
        losses = {}
        skipped_batches = 0
        first_error = None
        batches = minibatch(base_treinamento, size=batch_size)

        for batch in batches:
            texts, annotations = zip(*batch)
            try:
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                           losses=losses)
            except Exception as err:
                # Keep training best-effort, but do not hide the failure: a
                # bare `except: pass` silently dropped the batch and also
                # swallowed KeyboardInterrupt, so the run could not be stopped.
                skipped_batches += 1
                if first_error is None:
                    first_error = err

        print(itn+1, '\tLosses', losses)
        if skipped_batches:
            # The reported loss excludes these batches, so surface the count
            print(itn+1, '\tSkipped batches', skipped_batches,
                  '\tFirst error:', repr(first_error))

Batch size:  64
Épocas:  250

1 	Losses {'ner': 42406.08663019806}
2 	Losses {'ner': 15049.701008918018}
3 	Losses {'ner': 11350.498958502678}
4 	Losses {'ner': 9064.326153172273}
5 	Losses {'ner': 7616.480735804446}
6 	Losses {'ner': 6346.333682917659}
7 	Losses {'ner': 5661.365211955623}
8 	Losses {'ner': 5224.147627824183}
9 	Losses {'ner': 4786.875430345193}
10 	Losses {'ner': 4257.502126250984}
11 	Losses {'ner': 4274.676365970277}
12 	Losses {'ner': 4122.700997887913}
13 	Losses {'ner': 3948.6326896470528}
14 	Losses {'ner': 3632.2018594776814}
15 	Losses {'ner': 3526.779333861872}
16 	Losses {'ner': 3383.506706048531}
17 	Losses {'ner': 3158.5789782703105}
18 	Losses {'ner': 3201.920417003539}
19 	Losses {'ner': 2990.0705192696396}
20 	Losses {'ner': 2834.5102037985994}
21 	Losses {'ner': 2756.1972640193785}
22 	Losses {'ner': 2706.5008327412083}
23 	Losses {'ner': 2496.563174540845}
24 	Losses {'ner': 2471.653247965368}
25 	Losses {'ner': 2648.209574222119}
26 	Losses {'ner': 2

213 	Losses {'ner': 563.0215611536013}
214 	Losses {'ner': 520.6180420960169}
215 	Losses {'ner': 659.9350522392068}
216 	Losses {'ner': 579.6235185133171}
217 	Losses {'ner': 564.095032943676}
218 	Losses {'ner': 562.1823636748845}
219 	Losses {'ner': 547.6217024772541}
220 	Losses {'ner': 511.5338387494853}
221 	Losses {'ner': 578.3926111448992}
222 	Losses {'ner': 596.0341605029375}
223 	Losses {'ner': 579.7318777297733}
224 	Losses {'ner': 557.1131084603004}
225 	Losses {'ner': 472.17445095570025}
226 	Losses {'ner': 541.1947102670923}
227 	Losses {'ner': 585.7414298735154}
228 	Losses {'ner': 608.1470068059665}
229 	Losses {'ner': 595.8964407480152}
230 	Losses {'ner': 519.5385208411714}
231 	Losses {'ner': 633.3852996582453}
232 	Losses {'ner': 549.4795733238786}
233 	Losses {'ner': 623.8059795845331}
234 	Losses {'ner': 535.1691910501901}
235 	Losses {'ner': 525.7145849011621}
236 	Losses {'ner': 584.1992667302761}
237 	Losses {'ner': 574.5143701610921}
238 	Losses {'ner': 570.5

## Salvando modelo

In [15]:
# Write the pipeline to disk so it can be reloaded later with spacy.load
nlp.to_disk('../Saved_models/')

## Carregando modelo para teste
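#### Caso o modelo já tenha sido treinado anteriormente, execute apenas as células até antes de `Criando o modelo` e continue a partir daqui. Observação: a base de teste só será a mesma usada no treinamento se a semente aleatória da divisão for fixada.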

In [10]:
# Reload the pipeline from the directory written by nlp.to_disk
nlp = spacy.load('../Saved_models/')

## Teste

Base de Teste

In [20]:
from spacy import displacy

# Render the predicted entities for the first 20 test texts, skipping any
# text in which the model found no entity.
for exemplo in base_teste[:20]:
    doc = nlp(exemplo[0])
    if not doc.ents:
        continue
    displacy.render(doc, style = 'ent')

Frase qualquer

In [17]:
phrase = "SHVP Rua 12 435 casa 22 lote 1"

# The training texts were lowercased and accent-stripped, so apply the same
# normalization here; feeding the raw, mixed-case phrase gives the model
# input unlike anything it was trained on.
doc = nlp(strip_accents(phrase.lower()))
displacy.render(doc, style = 'ent' )