In [None]:
from random import shuffle
from pickle import dump, load
from numpy import array
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout
from keras.optimizers import SGD

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from spacy.lang.pt.stop_words import STOP_WORDS
from nltk.corpus import wordnet
from nltk.stem.snowball import SnowballStemmer
from re import sub
from nltk import download
# Fetch the NLTK resources used by the preprocessing helpers in this cell.
# 'punkt_tab' is the tokenizer data that recent NLTK releases look up for
# word_tokenize; 'punkt' is kept for older releases.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    download(resource)

def remove_punct(text):
    """Replace ASCII punctuation with spaces and collapse whitespace runs.

    Args:
        text: input string.

    Returns:
        The string with every run of ASCII punctuation replaced by a single
        space and every run of whitespace collapsed to one space. A leading
        or trailing space produced this way is not stripped.
    """
    # Full ASCII punctuation set. The hyphen, brackets and backslash are
    # escaped so they match literally; the double quote, backslash and
    # closing bracket were missing from the earlier pattern and leaked into
    # the tokens.
    text = sub(r"""[!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]+""", ' ', text)
    text = sub(r'\s+', ' ', text)
    return text

def extract_keywords(text):
    """Lowercase the tokens of ``text`` and drop Portuguese stopwords.

    Args:
        text: input string; it is tokenised with ``word_tokenize``.

    Returns:
        A single string with the remaining lowercase tokens joined by spaces.
    """
    # Build the NLTK list once instead of once per token.
    nltk_stopwords = set(stopwords.words('portuguese'))
    keywords = []
    for token in word_tokenize(text):
        word = token.lower()
        # A token is kept only when NEITHER list contains it. The previous
        # `or` kept every word that was missing from just one of the lists.
        if word not in nltk_stopwords and word not in STOP_WORDS:
            keywords.append(word)
    return ' '.join(keywords)

def preprocess_stem(text):
    """Stem every token of ``text`` with the Portuguese Snowball stemmer.

    Args:
        text: input string; it is tokenised with ``word_tokenize``.

    Returns:
        The stemmed tokens joined by single spaces, as a ``str``.
    """
    pt_stemmer = SnowballStemmer("portuguese")
    stemmed = [pt_stemmer.stem(piece) for piece in word_tokenize(text)]
    return str(' '.join(stemmed))

def remove_accent(text):
    """Replace lowercase accented vowels with their plain form.

    Only the lowercase vowels listed in the patterns below are rewritten;
    other characters (uppercase accented letters, cedilla, ...) are left
    untouched. Whitespace runs are collapsed to a single space afterwards.

    Args:
        text: input string.

    Returns:
        The rewritten string.
    """
    vowel_rules = (
        ('[úùûü]', 'u'),
        ('[óòõôö]', 'o'),
        ('[íìîï]', 'i'),
        ('[éèêë]', 'e'),
        ('[áàãâä]', 'a'),
    )
    for accented, plain in vowel_rules:
        text = sub(accented, plain, text)
    return sub(r'\s+', ' ', text)

def preprocess(text):
    """Run the full text-normalisation pipeline on ``text``.

    The steps are applied in this order: punctuation removal, stopword
    filtering, stemming, accent removal.

    Args:
        text: raw input string.

    Returns:
        The normalised string.
    """
    pipeline = (remove_punct, extract_keywords, preprocess_stem, remove_accent)
    for step in pipeline:
        text = step(text)
    return text


In [None]:
def preparing_documents(list_obj):
    """Flatten labelled sample groups into a word list and (sample, label) pairs.

    Args:
        list_obj: iterable of dicts, each with a ``'label'`` entry and a
            ``'keywords'`` entry holding that label's samples. A sample may
            be a whitespace-separated string or a sequence of tokens.

    Returns:
        A ``(words, documents)`` tuple. ``words`` lists every token of every
        sample (duplicates included); ``documents`` lists one
        ``(sample, label)`` pair per sample, with the sample left as given.
    """
    words = []
    documents = []
    for obj in list_obj:
        label = obj['label']
        for sample in obj['keywords']:
            # list.extend on a str would add its individual characters, so a
            # string sample is split into whole tokens first.
            tokens = sample.split() if isinstance(sample, str) else sample
            words.extend(tokens)
            documents.append((sample, label))
    return words, documents

def train_model1(list_obj):
    """Train the bag-of-words intent classifier and write its artefacts to disk.

    Builds the vocabulary and the label list from ``list_obj``, pickles them
    to ``words.pkl`` and ``labels.pkl``, trains a small dense network on
    binary bag-of-words vectors and saves it to ``model.h5``. All three files
    are written relative to the current working directory.

    Args:
        list_obj: sequence of dicts, each with a ``'label'`` entry and a
            ``'keywords'`` entry holding that label's samples. A sample may
            be a whitespace-separated string or a sequence of tokens.
    """
    def tokens_of(sample):
        # Whole tokens of a sample: a str is split on whitespace, anything
        # else is assumed to already be a sequence of tokens.
        return sample.split() if isinstance(sample, str) else list(sample)

    labels = sorted({obj['label'] for obj in list_obj})

    _, documents = preparing_documents(list_obj)

    # The vocabulary and the per-sample membership test both work on whole
    # tokens, which is what bag_of_words compares against at prediction time.
    # Testing `word in sample` on a raw string is a substring/character test
    # and produces vectors the predictor can never reproduce.
    tokenized = [(set(tokens_of(sample)), label) for sample, label in documents]
    words = sorted(set().union(*(tokens for tokens, _ in tokenized)))

    words_path = "words.pkl"
    labels_path = "labels.pkl"

    with open(words_path, 'wb') as words_file:
        dump(words, words_file)
    with open(labels_path, 'wb') as labels_file:
        dump(labels, labels_file)

    training = []
    for tokens, label in tokenized:
        bag = [1 if word in tokens else 0 for word in words]
        # One-hot target row for this sample's label.
        output_row = [0] * len(labels)
        output_row[labels.index(label)] = 1
        training.append((bag, output_row))

    shuffle(training)
    x = array([bag for bag, _ in training])
    y = array([output_row for _, output_row in training])

    model = Sequential()
    # Input width comes from the vocabulary so it is defined even when there
    # are no samples to index into.
    model.add(Dense(128, input_shape=(len(words),), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(len(labels), activation='softmax'))

    # NOTE(review): `lr` and `decay` are legacy argument names; newer Keras
    # releases expect `learning_rate` and may reject `decay` - confirm against
    # the installed version.
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy',optimizer=sgd, metrics=['accuracy'])

    model.fit(x, y, epochs=200, batch_size=5, verbose=1)

    # save() only needs the path; the fit history that used to be passed as a
    # second positional argument ended up in the `overwrite` slot.
    model_path = "model.h5"
    model.save(model_path)

In [None]:
import json

# Forward slashes resolve on every platform. The previous '..\p...' literal
# relied on an invalid escape sequence and only worked as a Windows path.
with open('../problems_samples.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Normalise every sample of every label with the same pipeline that is used
# at prediction time.
for entry in data:
    entry['keywords'] = [preprocess(sample) for sample in entry['keywords']]

# train_model1 is the bag-of-words trainer that takes this list of dicts;
# the name train_model belongs to a different function with another signature.
train_model1(data)

In [None]:
import os
import pandas as pd
import numpy as np

from pickle import load
from keras.models import load_model
from random import choice


# Returns 0 or 1 for each word of the bag of words.
def bag_of_words(writing, words):
    """Encode ``writing`` as a 0/1 vector aligned with ``words``.

    Args:
        writing: already-cleaned sentence; it is split on whitespace.
        words: vocabulary sequence that fixes the order of the vector.

    Returns:
        A NumPy array with one entry per element of ``words``: 1 when that
        word equals one of the whitespace-separated tokens of ``writing``,
        otherwise 0.
    """
    sentence_tokens = writing.split()
    flags = [1 if vocab_word in sentence_tokens else 0 for vocab_word in words]
    return np.array(flags)

# Predicts the intent of a preprocessed sentence and ranks the classes by
# predicted probability. No probability threshold is applied here.
def class_prediction(input_user, model_path, words_path, classes_path):
    """Classify ``input_user`` with a saved bag-of-words model.

    Args:
        input_user: preprocessed sentence (whitespace-separated tokens).
        model_path: path of the saved Keras model.
        words_path: path of the pickled vocabulary list.
        classes_path: path of the pickled class-label list.

    Returns:
        A list of ``{"intent": <label>, "probability": <str>}`` dicts sorted
        by descending probability. When no token of ``input_user`` is in the
        vocabulary, the list holds a single entry for class index 0.
    """
    model = load_model(model_path)
    # NOTE: pickle can execute arbitrary code while loading; only point these
    # paths at files produced by this notebook.
    with open(words_path, 'rb') as words_file:
        words = load(words_file)
    with open(classes_path, 'rb') as classes_file:
        classes = load(classes_file)

    prevision = bag_of_words(input_user, words)
    response_prediction = model.predict(np.array([prevision]))[0]
    results = [[index, response] for index, response in enumerate(response_prediction)]
    # Fall back to class index 0 when the sentence shares no word with the
    # vocabulary. The check is done on the array itself: searching for "1" in
    # str(prevision) breaks once the vector is long enough for NumPy to
    # summarise it with "..." in its printed form.
    # NOTE(review): index 0 is presumably the default intent (anything_else) -
    # confirm against the saved label order.
    if not np.any(prevision) or len(results) == 0:
        results = [[0, response_prediction[0]]]
    # Strongest prediction first.
    results.sort(key=lambda x: x[1], reverse=True)
    return [{"intent": classes[r[0]], "probability": str(r[1])} for r in results]


# Manual check: classify one preprocessed question with the saved artefacts
# and display the ranked intents as the cell output.
text = preprocess("Existe alguma forma de ajustar a bomba para reduzir as vibrações que estou sentindo?")
a = class_prediction(
    input_user=text,
    model_path=r'model.h5',
    words_path=r'words.pkl',
    classes_path=r'labels.pkl',
)
a

# Novo modelo

In [None]:
# Data Science
import pandas as pd
import numpy as np
import collections

# Deep Learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input
from tensorflow.keras.callbacks import ModelCheckpoint

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.preprocessing.text import hashing_trick, text_to_word_sequence

# Model Components
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout

# Data Splitting
from sklearn.model_selection import train_test_split


In [None]:
# Read the training data.
# NOTE: read_pickle unpickles the file; only load files from a trusted source.
train = pd.read_pickle('objects/train.pkl')

# The targets were read from 'Intent' while the split was stratified on
# 'Intents', so one of the two lookups always failed. Use whichever of the
# two columns the frame actually has for both.
# NOTE(review): assumes `train` is a DataFrame with a 'samples' column - confirm.
intent_column = 'Intent' if 'Intent' in train.columns else 'Intents'

X_train, X_test, y_train, y_test = train_test_split(
    train['samples'],
    train[intent_column],
    test_size=0.3,
    shuffle=True,
    stratify=train[intent_column],
    random_state=7,
)

# Encode the intent labels as integers; the encoder is fitted on the training
# split only and then applied to both splits.
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

# Fit the word index on the training texts only.
tokenizer_ = Tokenizer()
tokenizer_.fit_on_texts(X_train)
print(f"Train Document Count: \n{tokenizer_.document_count}\n")

def convert_to_padded(tokenizer, docs, maxlen=None):
    """Turn texts into integer sequences padded at the end to a fixed length.

    Args:
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        docs: iterable of texts to convert.
        maxlen: target sequence length. When omitted, the notebook-level
            ``max_length`` variable is used, which must be defined before
            the call.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'``.
    """
    if maxlen is None:
        # Backward-compatible default: read the notebook-level setting.
        maxlen = max_length
    sequences = tokenizer.texts_to_sequences(docs)
    return pad_sequences(sequences, maxlen = maxlen, padding = 'post')

vocab_size = len(tokenizer_.word_counts) + 1
print(f'Vocab size:\n{vocab_size}')

# max_length has to exist before convert_to_padded is called, because that
# function reads it as a global. It is taken from the longest tokenised
# training text instead of from the already-padded output, which itself
# depended on max_length.
# NOTE(review): model_intent_classifier builds its Embedding with
# input_length=32 - confirm that this matches max_length for your data.
max_length = max((len(seq) for seq in tokenizer_.texts_to_sequences(X_train)), default=0)
print(f'Max length:\n{max_length}')

padded_X_train = convert_to_padded(tokenizer = tokenizer_, docs = X_train)
padded_X_test = convert_to_padded(tokenizer = tokenizer_, docs = X_test)
print(f'padded_X_train\n{padded_X_train}')
print(f'padded_X_val\n{padded_X_test}')

# Load the pre-trained GloVe vectors into a word -> vector dict. The file is
# read as UTF-8 explicitly (the platform default encoding may not decode it)
# and is closed by the context manager even if parsing fails.
embeddings_index = {}
with open('models/glove.twitter.27B/glove.twitter.27B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

# Build the embedding matrix: row i holds the vector of the word with
# tokenizer index i; words without a pre-trained vector keep an all-zero row.
word_index = tokenizer_.word_index
EMBEDDING_DIM = 50
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
def model_intent_classifier(vocab_size, num_classes=None, input_length=32):
    """Build the (uncompiled) BiLSTM intent classifier.

    The embedding layer is initialised from the notebook-level
    ``embedding_matrix`` and is frozen.

    Args:
        vocab_size: number of rows of the embedding layer; it has to match
            the first dimension of ``embedding_matrix``.
        num_classes: number of output classes. When omitted, it is taken from
            the notebook-level label encoder ``le``.
        input_length: sequence length declared on the embedding layer.

    Returns:
        The ``Sequential`` model.
    """
    if num_classes is None:
        # The output layer was previously sized from an empty local list,
        # which gave it zero units; use the fitted label encoder instead.
        num_classes = len(le.classes_)

    # Define the model.
    model = Sequential()

    # Embedding layer (pre-trained weights, not trainable).
    model.add(Embedding(vocab_size, embedding_matrix.shape[1], input_length=input_length, trainable=False, weights=[embedding_matrix]))

    # LSTM layer (recurrent layer).
    model.add(Bidirectional(LSTM(128)))

    # Dense layers.
    model.add(Dense(224, activation="relu", kernel_regularizer='l2'))
    model.add(Dense(224, activation="relu", kernel_regularizer='l2'))

    # Dropout layer to reduce overfitting.
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation="softmax"))

    return model

In [None]:
def train_model(model, filename, epoch_num):
    """Compile ``model`` and fit it on the notebook-level padded data.

    Training reads the notebook-level ``padded_X_train``, ``y_train``,
    ``padded_X_test`` and ``y_test``. The test split is used as validation
    data for checkpointing and early stopping.

    Args:
        model: Keras model to compile and train.
        filename: path where the best checkpoint (lowest ``val_loss``) is
            written.
        epoch_num: maximum number of epochs.
    """
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

    def scheduler(epoch, lr):
        # Keep the learning rate until the last 10 epochs, then multiply it
        # by exp(-0.1) every epoch. The result is converted to a plain float
        # because that is the type the scheduler callback is documented to
        # receive from the schedule function.
        if epoch < epoch_num-10:
            return lr
        else:
            return float(lr * tf.math.exp(-0.1))

    lr_sched_checkpoint = tf.keras.callbacks.LearningRateScheduler(scheduler)

    early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto',
    baseline=None, restore_best_weights=True
    )

    checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

    # epochs must be passed by keyword: the third positional parameter of
    # fit() is batch_size, so passing epoch_num positionally collided with
    # batch_size=32.
    hist = model.fit(padded_X_train, y_train, epochs=epoch_num, batch_size=32, validation_data=(padded_X_test, y_test), callbacks=[checkpoint, lr_sched_checkpoint, early_stopping])