In [8]:
import spacy
import random
from spacy.util import minibatch, compounding

In [9]:
# Load the pretrained Portuguese pipeline `pt_core_news_sm`.
# The recorded run of this cell raised E050: the model could not be found as an
# installed package or as a data directory in that environment.
# NOTE(review): presumably fixed with `python -m spacy download pt_core_news_sm`
# before running — confirm.
# No later visible cell reads this global `nlp`; `train_model` builds its own
# blank pipeline in a local variable of the same name.
nlp = spacy.load("pt_core_news_sm")

OSError: [E050] Can't find model 'pt_core_news_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [16]:

def train_model(data_dict, iterations):
    """Train a blank Portuguese spaCy pipeline that contains only an NER component.

    Args:
        data_dict: Mapping of label -> list of ``(text, annotations)`` pairs,
            where ``annotations`` is a dict accepted by ``Example.from_dict``
            (e.g. ``{"entities": [(start_char, end_char, label), ...]}``).
        iterations: Number of passes over the (reshuffled) training data.

    Returns:
        The trained pipeline object created by ``spacy.blank("pt")``.
    """
    # Public import path for Example; imported locally so the function does not
    # rely on the ``spacy.training.example`` submodule attribute chain.
    from spacy.training import Example

    nlp = spacy.blank("pt")
    ner = nlp.add_pipe("ner", name="ner", last=True)

    # Flatten the per-label example lists into one list of (text, annotations).
    train_data = [
        (text, annotations)
        for examples in data_dict.values()
        for text, annotations in examples
    ]

    # Register the dict keys (as before) plus every label that actually occurs
    # in the annotations, so an entity label that differs from its dict key is
    # still known to the component. A dict keeps insertion order and removes
    # duplicates, which keeps label registration deterministic.
    labels = dict.fromkeys(data_dict)
    for _, annotations in train_data:
        for entity in annotations.get("entities", ()):
            labels.setdefault(entity[2])
    for label in labels:
        ner.add_label(label)

    # `begin_training` is the deprecated spaCy v2 name; `initialize` is the v3
    # API and returns the optimizer that `update` would otherwise create lazily.
    optimizer = nlp.initialize()
    for itn in range(iterations):
        random.shuffle(train_data)
        losses = {}

        # The batch-size schedule is recreated on every pass, as in the original.
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            example_batch = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            nlp.update(example_batch, sgd=optimizer, losses=losses)

        print("Iteration:", itn+1, "Loss:", losses)

    return nlp

#### Exemplo de dados dos parâmetros:

In [17]:
# Minimal sample in the shape train_model consumes: label -> list of
# (text, {"entities": [(start_char, end_char, label)]}) pairs.
# `entidades` is bound to the same dict object as `data_dict`.
data_dict = entidades = {
    "BOMBA": [
    # Characters 0-5 of the text cover the word "Bomba".
    ("Bomba centrífuga", {"entities": [(0, 5, "BOMBA")]}),
    ]}

# Defined here but not referenced again in this cell.
output_dir = r"..\documents"

# Number of passes over the training data passed to train_model.
iterations = 100
model = train_model(data_dict, iterations)

Iteration: 1 Loss: {'ner': 1.6666666865348816}
Iteration: 2 Loss: {'ner': 1.5900554656982422}
Iteration: 3 Loss: {'ner': 1.4327635169029236}
Iteration: 4 Loss: {'ner': 1.220018208026886}
Iteration: 5 Loss: {'ner': 0.9769566357135773}
Iteration: 6 Loss: {'ner': 0.6846418380737305}
Iteration: 7 Loss: {'ner': 0.4011929929256439}
Iteration: 8 Loss: {'ner': 0.16825400292873383}
Iteration: 9 Loss: {'ner': 0.07151926681399345}
Iteration: 10 Loss: {'ner': 0.020669556222856045}
Iteration: 11 Loss: {'ner': 0.003666175529360771}
Iteration: 12 Loss: {'ner': 0.0006613792502321303}
Iteration: 13 Loss: {'ner': 0.0002842040848918259}
Iteration: 14 Loss: {'ner': 1.965474075404927e-05}
Iteration: 15 Loss: {'ner': 4.1652087929833215e-06}
Iteration: 16 Loss: {'ner': 7.758024906934224e-07}
Iteration: 17 Loss: {'ner': 9.059862549065656e-08}
Iteration: 18 Loss: {'ner': 1.0446634401972688e-07}
Iteration: 19 Loss: {'ner': 3.749559968069249e-09}
Iteration: 20 Loss: {'ner': 1.5915142537181737e-09}
Iteration: 21 

#### Preparando os dados

In [22]:
import pandas as pd
import json
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from spacy.lang.pt.stop_words import STOP_WORDS
from nltk.corpus import wordnet
from nltk.stem.snowball import SnowballStemmer
from re import sub
from nltk import download
# Fetch NLTK's "punkt" tokenizer data; the recorded output shows the package
# was already up to date in that environment.
download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Semeq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [23]:
def create_json(json_path, content):
    """Serialize ``content`` as 4-space-indented JSON into ``json_path`` (UTF-8).

    Any existing file at ``json_path`` is overwritten.
    """
    with open(json_path, mode='w', encoding='utf-8') as out_file:
        json.dump(content, out_file, indent=4)


def load_json(file):
    """Read the UTF-8 encoded JSON document at ``file`` and return the parsed object."""
    with open(file, mode='r', encoding='utf-8') as in_file:
        return json.load(in_file)

def load_df(file):
    """Read the Excel workbook at ``file`` with ``pd.read_excel`` defaults and return the DataFrame."""
    return pd.read_excel(file)

def remove_punct(text):
    """Replace ASCII punctuation with spaces and collapse whitespace runs.

    Every run of ASCII punctuation characters becomes a single space, then any
    run of whitespace is collapsed to one space. Leading/trailing spaces are
    not stripped.

    The character class now also covers the double quote, the backslash and the
    closing bracket, which the previous pattern left in the text (so ``[`` was
    removed while its matching ``]`` stayed).
    """
    text = sub(r"""[!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]+""", ' ', text)
    text = sub(r'\s+', ' ', text)
    return text

def preprocess_stem(text):
    """Tokenize ``text`` and return its Portuguese Snowball stems joined by single spaces.

    Args:
        text: Input string.

    Returns:
        A string with one stem per token, separated by single spaces.
    """
    stemmer = SnowballStemmer("portuguese")
    # word_tokenize defaults to language="english"; the text and the stemmer are
    # Portuguese, so ask for the Portuguese sentence model as well.
    tokens = word_tokenize(text, language="portuguese")
    return ' '.join(stemmer.stem(token) for token in tokens)

# Accented vowel variants handled by remove_accent, grouped by their base letter.
_ACCENT_GROUPS = (
    ('a', 'áàãâä'),
    ('e', 'éèêë'),
    ('i', 'íìîï'),
    ('o', 'óòõôö'),
    ('u', 'úùûü'),
)
# Translation table covering both the lowercase variants and their uppercase
# counterparts (uppercase maps to the uppercase base letter).
_ACCENT_TABLE = str.maketrans({
    variant: plain
    for base, accented in _ACCENT_GROUPS
    for variant, plain in zip(
        accented + accented.upper(),
        base * len(accented) + base.upper() * len(accented),
    )
})


def remove_accent(text):
    """Strip accents from vowels and collapse whitespace runs to one space.

    Lowercase accented vowels map to their base letter exactly as before;
    uppercase accented vowels (previously left untouched) now map to the
    uppercase base letter. Other characters, including ``ç``, are preserved.
    """
    text = text.translate(_ACCENT_TABLE)
    text = sub(r'\s+', ' ', text)
    return text

def preprocess(text):
    """Normalize ``text``: strip punctuation, then stem, then remove accents (in that order)."""
    without_punct = remove_punct(text)
    stemmed = preprocess_stem(without_punct)
    return remove_accent(stemmed)

In [31]:
import re
import pandas as pd

def encontrar_palavras(texto, frases):
    """Locate whole-word occurrences of each phrase inside ``texto``.

    Args:
        texto: Text to search in.
        frases: Iterable of phrases to look for. Empty or whitespace-only
            phrases are skipped: they would otherwise compile to a pattern that
            matches at every word boundary and produce zero-length matches.

    Returns:
        A list with one dict per match, grouped by phrase in input order, with
        keys ``text`` (the phrase), ``startc``/``endc`` (character offsets,
        ``endc`` inclusive) and ``startp``/``endp`` (word indexes, ``endp``
        inclusive).
    """
    resultado = []

    for frase in frases:
        if not frase or not frase.strip():
            continue

        padrao = re.compile(r'\b{}\b'.format(re.escape(frase)))
        palavras_na_frase = len(frase.split())

        for correspondencia in padrao.finditer(texto):
            inicio = correspondencia.start()
            # Word index of the match = number of words that precede it.
            palavras_antes = len(re.findall(r'\b\w+\b', texto[:inicio]))
            resultado.append({
                "text": frase,
                "startc": inicio,
                "endc": correspondencia.end() - 1,
                "startp": palavras_antes,
                "endp": palavras_antes + palavras_na_frase - 1,
            })
    return resultado

In [24]:
# Both paths are raw strings so the backslashes are taken literally; the
# spreadsheet path previously relied on the invalid escapes `\d` and `\c`
# being passed through, which newer Python versions warn about.
data = load_json(r'..\documents\entidades2.json')
df = load_df(r'..\documents\classes.xlsx')

In [53]:
# Not populated in this cell; kept so the name stays defined for any other cell
# that may reference it.
new_data = []

# Preprocessed keywords per class id. As before, the class id of a spreadsheet
# row is taken to be its index + 1.
keywords_by_class = {}
for idx, row in df.iterrows():
    raw_keywords = row['keywords']
    if not isinstance(raw_keywords, str):
        # Non-text cell (presumably an empty cell read as NaN): no keywords.
        keywords_by_class[idx + 1] = []
        continue
    cleaned = (preprocess(keyword).strip() for keyword in raw_keywords.split(','))
    # Drop keywords that end up empty after preprocessing (e.g. trailing commas).
    keywords_by_class[idx + 1] = [keyword for keyword in cleaned if keyword]

# words[i] holds the keywords for data[i], looked up by class id so the two
# lists cannot drift apart when a class is missing or out of order.
words = [keywords_by_class.get(entry['classe'], []) for entry in data]

new_dict = {}
for entry, entry_words in zip(data, words):
    label = str(entry['classe'])
    list_tuple = []
    for texto in entry["texts"]:
        texto = preprocess(texto).strip()
        # encontrar_palavras reports an inclusive end offset; +1 makes it exclusive.
        list_find = [
            [found['startc'], found['endc'] + 1, label]
            for found in encontrar_palavras(texto, entry_words)
        ]
        list_tuple.append((texto, {"entities": list_find}))
    new_dict[label] = list_tuple

# Write the file once, after every class has been processed (it used to be
# rewritten on each loop iteration).
create_json(r'..\documents\entidades2-classify.json', new_dict)

#### Treinando o modelo

In [54]:
def corrigir_sobreposicao_entidades(data_dict):
    """Remove overlapping entity spans from every example, keeping longer spans.

    Args:
        data_dict: Mapping of label -> list of ``(text, {"entities": [...]})``
            examples, each entity being ``(start, end, label)`` with ``end``
            exclusive.

    Returns:
        The same ``data_dict`` object; each example's ``"entities"`` list is
        replaced in place by the filtered list (ordered longest span first).
    """
    for examples in data_dict.values():
        for example in examples:
            annotations = example[1]
            # Longest spans first, so a longer span wins over any shorter span
            # it overlaps; the sort is stable for equal lengths.
            by_length = sorted(
                annotations["entities"],
                key=lambda span: span[1] - span[0],
                reverse=True,
            )
            kept = []
            for entity in by_length:
                start, end = entity[0], entity[1]
                if end <= start:
                    # Zero-length (or inverted) spans cover no text; drop them.
                    continue
                # Half-open interval test: spans that merely touch
                # (one ends where the other starts) do not overlap.
                overlaps = any(
                    start < other[1] and other[0] < end for other in kept
                )
                if not overlaps:
                    kept.append(entity)

            annotations["entities"] = kept

    return data_dict


In [55]:
# Load the annotated examples from the JSON file produced by the
# data-preparation cell.
data_train = load_json(r'..\documents\entidades2-classify.json')

# Filter overlapping entity spans. The function mutates and returns its
# argument, so `data_dict` and `data_train` refer to the same object.
data_dict = corrigir_sobreposicao_entidades(data_train)

# Defined here but not referenced again in this cell.
output_dir = r"..\documents"

# Number of passes over the training data passed to train_model.
iterations = 100
model = train_model(data_dict, iterations)

Iteration: 1 Loss: {'ner': 1718.6461276270636}
Iteration: 2 Loss: {'ner': 367.1908256192367}
Iteration: 3 Loss: {'ner': 327.83803614774195}
Iteration: 4 Loss: {'ner': 206.2609648051739}
Iteration: 5 Loss: {'ner': 70.65038252161628}
Iteration: 6 Loss: {'ner': 47.07972406327113}
Iteration: 7 Loss: {'ner': 65.5676245116571}
Iteration: 8 Loss: {'ner': 97.66917580153701}
Iteration: 9 Loss: {'ner': 48.561792174187}
Iteration: 10 Loss: {'ner': 58.35268329232744}
Iteration: 11 Loss: {'ner': 27.516725597068636}
Iteration: 12 Loss: {'ner': 45.275293702007154}
Iteration: 13 Loss: {'ner': 29.091798090263243}
Iteration: 14 Loss: {'ner': 32.93091400832715}
Iteration: 15 Loss: {'ner': 25.606365041901142}
Iteration: 16 Loss: {'ner': 26.592954891271624}
Iteration: 17 Loss: {'ner': 20.300315833996876}
Iteration: 18 Loss: {'ner': 62.497444262652856}
Iteration: 19 Loss: {'ner': 54.406453047254736}
Iteration: 20 Loss: {'ner': 13.691980790213547}
Iteration: 21 Loss: {'ner': 32.71854349355182}
Iteration: 22 

#### Testando o modelo

In [73]:
# Sample sentence; the \uXXXX escapes decode to the accented Portuguese letters.
text = "O disjuntor el\u00e9trico de caixa moldada \u00e9 um dispositivo de prote\u00e7\u00e3o utilizado em sistemas el\u00e9tricos de baixa e m\u00e9dia tens\u00e3o, cuja fun\u00e7\u00e3o \u00e9 interromper corrente el\u00e9trica excessiva para evitar danos aos equipamentos."
# Apply the same preprocess() that was applied to the training texts, so the
# model sees text in the form it was trained on.
text = preprocess(text)

# Run the trained pipeline and print each predicted entity with its label.
doc = model(text)
for ent in doc.ents:
    print('palavra:',ent.text)
    print('classe:',ent.label_)

palavra: disjuntor eletr
classe: 12
palavra: caix mold
classe: 12
