### Imports

In [None]:
import pickle
import random
import numpy as np
import os
from random import shuffle
from pickle import dump, load
from numpy import array
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout
from keras.optimizers import SGD
from re import sub
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize
import language_tool_python
import pandas as pd

In [None]:
def preprocess_model(dict):
    """Flatten a label -> patterns mapping into training material.

    Args:
        dict: Mapping whose keys are class labels and whose values are
            iterables of patterns. (The name shadows the builtin; it is kept
            because it is part of the function's signature.)

    Returns:
        A ``(words, documents)`` tuple: ``words`` lists every pattern in
        iteration order and ``documents`` lists the matching
        ``(pattern, label)`` pairs in the same order.
    """
    documents = [
        (pattern, label)
        for label, patterns in dict.items()
        for pattern in patterns
    ]
    words = [pattern for pattern, _ in documents]
    return words, documents

### Model

In [None]:
def train_model(dict):
    """Train a bag-of-words classifier and persist it to disk.

    The sorted vocabulary and the sorted class labels are pickled to
    ``words.pkl`` and ``classes.pkl``, and the fitted Keras model is saved to
    ``model.h5``, all relative to the current working directory.

    Args:
        dict: Mapping of class label -> iterable of training patterns. (The
            name shadows the builtin; it is kept because it is part of the
            function's signature.)

    Raises:
        ValueError: If ``dict`` yields no training patterns at all.
    """
    words, documents = preprocess_model(dict)
    if not documents:
        # Fail early with a clear message instead of an IndexError deep in
        # the array slicing, and before any artifact is written.
        raise ValueError("train_model needs at least one training pattern")

    words = sorted(set(words))
    classes = sorted(set(dict.keys()))

    # Context managers guarantee the pickle files are flushed and closed.
    with open("words.pkl", 'wb') as words_file:
        dump(words, words_file)
    with open("classes.pkl", 'wb') as classes_file:
        dump(classes, classes_file)

    class_index = {label: position for position, label in enumerate(classes)}
    training = []
    for pattern, label in documents:
        # Membership is tested against the pattern itself; when patterns are
        # strings this is a substring test.
        bag = [1 if word in pattern else 0 for word in words]
        output_row = [0] * len(classes)
        output_row[class_index[label]] = 1
        training.append((bag, output_row))

    # Shuffle features and targets together so every pair stays aligned.
    shuffle(training)
    x = array([bag for bag, _ in training])
    y = array([row for _, row in training])

    model = Sequential()
    model.add(Dense(128, input_shape=(len(words),), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(len(classes), activation='softmax'))

    # NOTE(review): `lr` and `decay` are legacy argument names; newer Keras
    # releases expect `learning_rate` -- confirm against the installed version.
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

    model.fit(x, y, epochs=200, batch_size=5, verbose=1)

    # The second positional argument of save() is `overwrite`, not the
    # training history, so only the path is passed.
    model.save("model.h5")

In [None]:
# Build the paths with os.path.join so they resolve on every platform; the
# original backslash-separated literals only work on Windows.
df = pd.read_excel(os.path.join('..', 'portugues', 'pt_troubleshooting.xlsx'))
df_pattern = pd.read_excel(os.path.join('..', 'portugues', 'pt_patterns.xlsx'))

In [None]:
# Map "0", "1", ... (one key per distinct value of df['problem']) to the
# 'patterns' cell of the df_pattern row with that index label.
# NOTE(review): assumes df_pattern holds one row per problem, labelled 0..n-1
# in the same order -- confirm against the spreadsheets.
dict = {
    str(position): df_pattern.loc[position, 'patterns']
    for position in range(len(df['problem'].unique()))
}

In [None]:
# Train on the label -> patterns mapping built above; this writes words.pkl,
# classes.pkl and model.h5 to the current working directory.
train_model(dict)

In [None]:
def bag_of_words(writing, words):
    """Encode ``writing`` as a binary bag-of-words vector over ``words``.

    Args:
        writing: Text to encode; it is split on whitespace.
        words: Vocabulary, one entry per position of the output vector.

    Returns:
        A numpy array with one element per vocabulary entry: 1 when that
        entry equals one of the whitespace-separated tokens of ``writing``,
        0 otherwise.
    """
    # A set gives O(1) membership tests, avoiding a scan of the whole
    # vocabulary for every token.
    tokens = set(writing.split())
    return array([1 if word in tokens else 0 for word in words])

def class_prediction(input_user):
    """Rank every known class for ``input_user`` by predicted probability.

    ``model.h5``, ``words.pkl`` and ``classes.pkl`` are loaded from the
    current working directory on every call.

    Args:
        input_user: Whitespace-separated text to classify.

    Returns:
        A list with one ``{"suggestion": <class label>, "probability": <str>}``
        dict per class, ordered from most to least probable. No probability
        threshold is applied; every class is returned.
    """
    model = load_model('model.h5')
    # NOTE: unpickling can execute arbitrary code -- only load vocabulary and
    # class files that come from a trusted source.
    with open('words.pkl', 'rb') as words_file:
        words = load(words_file)
    with open('classes.pkl', 'rb') as classes_file:
        classes = load(classes_file)

    features = bag_of_words(input_user, words)
    # predict() works on batches, so wrap the single vector and unwrap the
    # single result row.
    probabilities = model.predict(array([features]))[0]

    ranked = sorted(enumerate(probabilities), key=lambda pair: pair[1], reverse=True)
    return [
        {"suggestion": classes[index], "probability": str(probability)}
        for index, probability in ranked
    ]

In [None]:
def _apply_first_suggestions(text, matches):
    """Return ``text`` with the first suggestion of each match applied."""
    original = text
    shift = 0  # running length difference introduced by earlier replacements
    for match in sorted(matches, key=lambda m: m.offset):
        if not match.replacements:
            # Nothing to substitute for this match.
            continue
        flagged = original[match.offset:match.offset + match.errorLength]
        start = match.offset + shift
        end = start + match.errorLength
        if text[start:end] != flagged:
            # The span was already rewritten by an overlapping match.
            continue
        replacement = match.replacements[0]
        text = text[:start] + replacement + text[end:]
        shift += len(replacement) - match.errorLength
    return text


def preprocess_semantic(frase):
    """Auto-correct ``frase`` with LanguageTool's Portuguese rules.

    Each reported match is replaced by its first suggestion. Match offsets
    refer to the original sentence, so they are adjusted as earlier
    replacements change the text length; matches without suggestions are
    skipped.

    Args:
        frase: Sentence to correct.

    Returns:
        The corrected sentence.
    """
    tool = language_tool_python.LanguageTool('pt')
    try:
        matches = tool.check(frase)
    finally:
        # Always release the tool, even if the check fails.
        tool.close()
    return _apply_first_suggestions(frase, matches)


def preprocess_stem(text):
    """Return ``text`` with every token replaced by its Portuguese stem.

    The text is tokenized with ``word_tokenize``, each token is stemmed with
    a Portuguese Snowball stemmer, and the stems are joined with single
    spaces.
    """
    # NOTE(review): word_tokenize runs with its default language setting;
    # confirm whether a Portuguese tokenizer model is intended here.
    snowball = SnowballStemmer("portuguese")
    stemmed = (str(snowball.stem(token)) for token in word_tokenize(text))
    return ' '.join(stemmed)

def preprocess_input(text):
    """Normalize raw user text before classification.

    Steps, in order: grammar/spelling correction, punctuation removal,
    stemming, lower-casing and trimming, accent folding for the vowels
    a/e/i/o/u, and whitespace collapsing.

    Args:
        text: Raw input sentence.

    Returns:
        The normalized text.
    """
    text = preprocess_semantic(text)
    # Full ASCII punctuation set. The previous character class left out the
    # double quote, the backslash and the closing bracket, so those
    # characters leaked into the tokens.
    text = sub(r"""[!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]+""", ' ', text)
    text = preprocess_stem(text)
    text = text.lower().strip()
    # Fold accented lowercase vowels onto their plain form in a single pass.
    text = text.translate(
        str.maketrans('áàãâäéèêëíìîïóòõôöúùûü', 'aaaaaeeeeiiiiooooouuuu')
    )
    # Collapse every run of whitespace into one space.
    text = sub(r'\s+', ' ', text)
    return text

In [None]:
# Demo: normalize a sample sentence, then classify each resulting token on
# its own and print the ranked predictions.
texto = 'fortes vibrações durante a operação da bomba'
texto = preprocess_input(texto)
lista = texto.split()
for i in lista:
    # class_prediction returns every class, ordered by probability, for this
    # single token.
    response = class_prediction(i)
    print(i)
    print(response)
    print()
# NOTE(review): disabled draft below, apparently meant to pick the entry with
# the highest probability out of `response`; kept for reference.
# max_value = 0
# classe = ''
# for i in response:
#     for j in i:
#         if 'e' not in i["probability"] or '-' not in i["probability"]:
#             value = float(i["probability"])
#             if value > float(max_value):
#                 max_value = value
#                 classe = i