In [1]:
from spacy import load, displacy
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from spacy.lang.pt.stop_words import STOP_WORDS
from nltk.corpus import wordnet
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from re import sub
from nltk import download
# Fetch every NLTK resource the helpers in this cell rely on, so the notebook
# also runs on a machine where the corpora were never installed.
download('punkt')      # tokenizer models used by word_tokenize / sent_tokenize
download('stopwords')  # stopwords.words('portuguese')
download('wordnet')    # WordNetLemmatizer and wordnet.synsets
download('omw-1.4')    # multilingual WordNet data required for lang="por" lookups

def remove_num(text):
    """Delete every run of digits from ``text`` and squeeze whitespace.

    Args:
        text: Input string.

    Returns:
        The string without digits, with each run of whitespace replaced by a
        single space (leading/trailing whitespace is squeezed, not stripped).
    """
    without_digits = sub(r'\d+', '', text)
    return sub(r'\s+', ' ', without_digits)

def remove_punct(text):
    """Replace ASCII punctuation with spaces and squeeze whitespace.

    The character class covers the full ASCII punctuation set, including the
    double quote, backslash and closing square bracket, so bracket and quote
    pairs are removed symmetrically. The hyphen is escaped explicitly rather
    than matched through a character range.

    Args:
        text: Input string.

    Returns:
        The string with every run of punctuation replaced by one space and
        every run of whitespace collapsed to a single space.
    """
    text = sub(r"""[!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]+""", ' ', text)
    return sub(r'\s+', ' ', text)

def extract_keywords(text):
    """Lower-case the tokens of ``text`` and drop Portuguese stop words.

    A token is kept only when it appears in neither the NLTK Portuguese
    stop-word list nor spaCy's Portuguese ``STOP_WORDS``. The previous
    condition joined the two membership tests with ``or``, which kept any
    word missing from just one of the lists.

    NOTE(review): if any model was trained on text produced by the previous
    condition, confirm the stricter filtering does not hurt its predictions.

    Args:
        text: Input string.

    Returns:
        The surviving lower-cased tokens joined by single spaces.
    """
    # stopwords.words() returns a fresh list on every call; build the lookup
    # set once instead of once per token.
    nltk_stop_words = set(stopwords.words('portuguese'))
    keywords = []
    for token in word_tokenize(text):
        token = token.lower()
        if token not in nltk_stop_words and token not in STOP_WORDS:
            keywords.append(token)
    return ' '.join(keywords)

def get_synonyms(text):
    """Collect Portuguese WordNet lemma names for every token of ``text``.

    Args:
        text: Input string; it is split with ``word_tokenize``.

    Returns:
        A flat list of lemma names, in token order then synset order then
        lemma order. Duplicates are not removed.
    """
    return [
        lemma.name()
        for token in word_tokenize(text)
        for synset in wordnet.synsets(token, lang="por")
        for lemma in synset.lemmas(lang="por")
    ]

def remove_accent(text):
    """Replace lower-case accented vowels with their plain form.

    Only the lower-case vowels listed in the patterns below are mapped;
    upper-case letters and other characters (such as ``ç``) pass through
    unchanged. Whitespace runs are then collapsed to a single space.

    Args:
        text: Input string.

    Returns:
        The string with the listed accented vowels replaced.
    """
    # Applied in the same order as the original nested calls (innermost first).
    vowel_patterns = (
        ('[úùûü]', 'u'),
        ('[óòõôö]', 'o'),
        ('[íìîï]', 'i'),
        ('[éèêë]', 'e'),
        ('[áàãâä]', 'a'),
    )
    for pattern, plain in vowel_patterns:
        text = sub(pattern, plain, text)
    return sub(r'\s+', ' ', text)

def preprocess_lemma(text):
    """Lemmatize each token of ``text`` with NLTK's ``WordNetLemmatizer``.

    NOTE(review): ``WordNetLemmatizer`` is presumably backed by the English
    WordNet, so Portuguese tokens may come back unchanged; confirm this is the
    intended lemmatizer.

    Args:
        text: Input string; it is split with ``word_tokenize``.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    wnl = WordNetLemmatizer()
    return ' '.join(wnl.lemmatize(tok) for tok in word_tokenize(text))

def preprocess_stem(text):
    """Stem each token of ``text`` with the Portuguese Snowball stemmer.

    Args:
        text: Input string; it is split with ``word_tokenize``.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    snowball = SnowballStemmer("portuguese")
    return ' '.join(snowball.stem(tok) for tok in word_tokenize(text))



def preprocess(text, tipo=None):
    """Run the full text-cleaning pipeline.

    Steps, in order: punctuation removal, digit removal, stop-word filtering,
    optional normalization, accent removal.

    Args:
        text: Raw input string.
        tipo: ``'lemma'`` to lemmatize, ``'stem'`` to stem; any other value
            (including the default ``None``) skips the normalization step.

    Returns:
        The cleaned string.
    """
    cleaned = extract_keywords(remove_num(remove_punct(text)))
    if tipo == 'lemma':
        cleaned = preprocess_lemma(cleaned)
    elif tipo == 'stem':
        cleaned = preprocess_stem(cleaned)
    return remove_accent(cleaned)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Semeq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Semeq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def classifier_equipament(text, path_model):
    """Run the spaCy pipeline stored at ``path_model`` and render its entities.

    The text is cleaned with ``preprocess(text, 'lemma')`` before being fed to
    the pipeline.

    Args:
        text: Raw input string.
        path_model: Location passed to spaCy's ``load``.

    Returns:
        Whatever ``displacy.render(doc, style="ent")`` returns for the
        processed document.
    """
    cleaned = preprocess(text, 'lemma')
    ner_pipeline = load(path_model)
    return displacy.render(ner_pipeline(cleaned), style="ent")

def classifier_intent(text, path_model, threshold=0.8):
    """Classify the intent of ``text`` with a fine-tuned BERT model.

    Unlike ``classifier_problem``, the text is passed to the tokenizer as-is
    (no ``preprocess`` call).

    Args:
        text: Input string.
        path_model: Directory holding the fine-tuned model and the
            ``label_encoder_classes.pt`` file.
        threshold: The top probability must be strictly greater than this
            value for the prediction to be accepted. Defaults to ``0.8``.

    Returns:
        A dict with keys ``"text"``, ``"class"`` and ``"probability"``. When
        the top probability does not exceed ``threshold``, ``"class"`` is the
        string ``'NaN'`` and ``"probability"`` is reported as ``1.0``.
    """
    import os  # function-scope import keeps this block self-contained

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained(path_model)
    label_encoder = LabelEncoder()
    # os.path.join replaces the hard-coded backslash: '\l' is an invalid escape
    # sequence and the separator only worked on Windows.
    # NOTE: torch.load unpickles the file; only point this at trusted model dirs.
    classes_path = os.path.join(path_model, 'label_encoder_classes.pt')
    label_encoder.classes_ = torch.load(classes_path)
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Explicit inference mode, matching classifier_problem.
    model.eval()
    with torch.no_grad():
        logits = model(**encoded_input).logits
        probabilities = torch.softmax(logits, dim=1)
        predicted_labels = torch.argmax(probabilities, dim=1)
        predicted_classes = label_encoder.inverse_transform(predicted_labels)
    probability = probabilities[0][predicted_labels].item()
    if probability > threshold:
        classe = predicted_classes.item()
    else:
        # Low-confidence fallback kept exactly as before: sentinel class and 1.0.
        classe = 'NaN'
        probability = 1.0
    return {"text":text,"class":classe,"probability":probability}

def classifier_problem(text, path_model, threshold=0.8):
    """Classify the problem described in ``text`` with a fine-tuned BERT model.

    The text is first cleaned with ``preprocess(text, 'lemma')``; the cleaned
    text is what gets tokenized and what is echoed back in the result.

    Args:
        text: Raw input string.
        path_model: Directory holding the fine-tuned model and the
            ``label_encoder_classes.pt`` file.
        threshold: The top probability must be strictly greater than this
            value for the prediction to be accepted. Defaults to ``0.8``.

    Returns:
        A dict with keys ``"text"`` (the preprocessed text), ``"class"`` and
        ``"probability"``. When the top probability does not exceed
        ``threshold``, ``"class"`` is the string ``'NaN'`` and
        ``"probability"`` is reported as ``1.0``.
    """
    import os  # function-scope import keeps this block self-contained

    text = preprocess(text, 'lemma')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained(path_model)
    label_encoder = LabelEncoder()
    # os.path.join replaces the hard-coded backslash: '\l' is an invalid escape
    # sequence and the separator only worked on Windows.
    # NOTE: torch.load unpickles the file; only point this at trusted model dirs.
    classes_path = os.path.join(path_model, 'label_encoder_classes.pt')
    label_encoder.classes_ = torch.load(classes_path)
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    model.eval()
    with torch.no_grad():
        logits = model(**encoded_input).logits
        probabilities = torch.softmax(logits, dim=1)
        predicted_labels = torch.argmax(probabilities, dim=1)
        predicted_classes = label_encoder.inverse_transform(predicted_labels)
    probability = probabilities[0][predicted_labels].item()
    if probability > threshold:
        classe = predicted_classes.item()
    else:
        # Low-confidence fallback kept exactly as before: sentinel class and 1.0.
        classe = 'NaN'
        probability = 1.0
    return {"text":text,"class":classe,"probability":probability}

In [6]:
# Sample input used to exercise the classifiers defined above.
text = "bomba com temperatura elevada e luzes piscando em sequência, disjuntor"

# The equipment and intent classifiers are disabled here; only the problem
# classifier is run on the sample text.
# equipamento = classifier_equipament(text,r'test\model_NER')
# intencao = classifier_intent(text,r'model\model_bert')
problema = classifier_problem(text,r'model/model_bert_problem')

# print(equipamento)
# print(intencao)
# Prints the result dict (keys: "text", "class", "probability").
print(problema)

{'text': 'bomba temperatura elevada luzes piscando sequencia disjuntor', 'class': 'arquivo1', 'probability': 0.9923225045204163}
