In [None]:
from random import shuffle
from pickle import dump, load
from numpy import array
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout
from keras.optimizers import SGD

In [None]:
def preparing_documents(dict):
    """Flatten a label -> patterns mapping into parallel training lists.

    Args:
        dict: Mapping whose values are iterables of patterns; each key is
            the label attached to every pattern found under it.

    Returns:
        A ``(words, documents)`` tuple. ``documents`` holds one
        ``(pattern, label)`` pair per pattern, in mapping iteration order;
        ``words`` holds the same patterns, in the same order, without labels.
    """
    # Build the labelled pairs first so each value iterable is consumed once.
    documents = [
        (pattern, label)
        for label, patterns in dict.items()
        for pattern in patterns
    ]
    words = [pattern for pattern, _ in documents]
    return words, documents

def train_model(dict):
    """Train a bag-of-words classifier and persist it with its vocabulary.

    The vocabulary is the sorted set of patterns returned by
    ``preparing_documents``; the classes are the sorted keys of ``dict``.
    Each pattern becomes one training row: a 0/1 vector over the vocabulary
    and a one-hot vector over the classes.

    Side effects (all in the current working directory):
        * ``words.pkl``   - pickled, sorted vocabulary list
        * ``classes.pkl`` - pickled, sorted class list
        * ``model.h5``    - the trained Keras model

    Args:
        dict: Mapping of class label -> iterable of hashable patterns.

    Raises:
        ValueError: If ``dict`` yields no training patterns at all.
    """
    words, documents = preparing_documents(dict)
    if not documents:
        # Fail early with a clear message instead of an opaque indexing
        # error further down, and before any artefact is written.
        raise ValueError("train_model requires at least one training pattern")

    words = sorted(set(words))
    classes = sorted(set(dict.keys()))

    # Context managers guarantee the pickle files are flushed and closed.
    with open("words.pkl", "wb") as words_file:
        dump(words, words_file)
    with open("classes.pkl", "wb") as classes_file:
        dump(classes, classes_file)

    training = []
    for pattern, label in documents:
        # NOTE(review): when patterns are strings, `word in pattern` is a
        # substring test, so a vocabulary entry that is contained in another
        # pattern also sets its bit. Kept as-is because any inference code
        # must encode inputs the same way - confirm this is intended.
        bag = [1 if word in pattern else 0 for word in words]
        output_row = [0] * len(classes)
        output_row[classes.index(label)] = 1
        training.append((bag, output_row))
    shuffle(training)

    # Build the two matrices directly from the shuffled pairs. Going through a
    # single object-dtype array breaks when len(words) == len(classes), because
    # numpy then creates a 3-D object array rather than an (n, 2) array of lists.
    x = array([bag for bag, _ in training])
    y = array([output_row for _, output_row in training])

    model = Sequential()
    model.add(Dense(600, input_shape=(len(words),), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(600, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(len(classes), activation='softmax'))

    # `learning_rate` replaces the deprecated `lr` alias.
    # NOTE(review): `decay` is rejected or ignored by some newer Keras
    # optimizer implementations - confirm against the installed version.
    sgd = SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

    model.fit(x, y, epochs=200, batch_size=5, verbose=1)

    # The second positional argument of `save` is `overwrite`, not the
    # training history, so only the path is passed (overwrite defaults to True).
    model.save("model.h5")

## Testando NER

In [15]:
import spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


You should consider upgrading via the 'c:\Users\Semeq\Desktop\chats\chatbot_hipotese\.venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [21]:
# Load the 'pt_core_news_sm' pipeline by name.
nlp = spacy.load('pt_core_news_sm')
# Bare last expression: the notebook displays the pipeline's component names.
nlp.pipe_names

['tok2vec', 'morphologizer', 'parser', 'lemmatizer', 'attribute_ruler', 'ner']

#### Identificando entidades a partir do modelo pré-treinado

In [27]:
# Run the pipeline on a sample sentence, then print one line per detected
# entity in the form "text | label | label description".
doc = nlp('herik muller mora em limeira, são paulo')
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep=' | ')

herik muller | PER | Named person or family.
limeira | LOC | Non-GPE locations, mountain ranges, bodies of water
são paulo | LOC | Non-GPE locations, mountain ranges, bodies of water


In [32]:
from spacy import displacy

# Visualise the entities currently set on `doc` using displaCy's "ent" style.
displacy.render(doc, style="ent")

#### Setando novas entidades

In [30]:
from spacy.tokens import Span

# Build a span over `doc` with start=1, end=2 and the custom label 'SOBRENOME'.
s1 = Span(doc, 1,2, label='SOBRENOME')
# Mutates `doc` in place: sets s1 as an entity; default='unmodified' is passed
# so that tokens outside s1 keep whatever entity annotation they already had.
doc.set_ents([s1], default='unmodified')

In [31]:
# Print the entities again, one "text | label | label description" line each,
# to show the effect of the manual span assignment.
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep=' | ')

herik | PER | Named person or family.
muller | SOBRENOME | None
limeira | LOC | Non-GPE locations, mountain ranges, bodies of water
são paulo | LOC | Non-GPE locations, mountain ranges, bodies of water


