In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from spacy.lang.pt.stop_words import STOP_WORDS
from nltk.corpus import wordnet
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from re import sub
from nltk import download
# Fetch the NLTK resources this notebook relies on. Each call only reports
# "already up-to-date" when the package is present locally.
#   wordnet / omw-1.4 -> WordNet data (and its multilingual extension) used by
#                        WordNetLemmatizer and wordnet.synsets
#   punkt             -> tokenizer models behind word_tokenize
#   stopwords         -> stopwords.words('portuguese') in extract_keywords
download('wordnet')
download('omw-1.4')
download('punkt')
download('stopwords')

def remove_num(text):
    """Delete every run of digits from ``text`` and squeeze whitespace.

    Args:
        text: Input string.

    Returns:
        The string with all digit runs removed and each whitespace run
        replaced by one space. Leading/trailing whitespace is squeezed to a
        single space, not stripped.
    """
    without_digits = sub(r'\d+', '', text)
    return sub(r'\s+', ' ', without_digits)

def remove_punct(text):
    """Replace ASCII punctuation with spaces and squeeze whitespace.

    The character class covers the full ASCII punctuation set
    (the same characters as ``string.punctuation``). The previous pattern left
    out the double quote, the backslash and the closing bracket, so ``[`` was
    stripped while ``]`` was kept, and it only matched ``-`` through the
    accidental range ``,-.``.

    Args:
        text: Input string.

    Returns:
        The string with every run of punctuation replaced by one space and
        every whitespace run collapsed to one space. The result is not
        stripped, so trailing punctuation leaves a trailing space.
    """
    # Every metacharacter that is special inside a character class is escaped
    # explicitly so the pattern does not depend on character ordering.
    text = sub(r"""[!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]+""", ' ', text)
    text = sub(r'\s+', ' ', text)
    return text

def extract_keywords(text):
    """Return the lowercased tokens of ``text`` that are not Portuguese stop words.

    A token is dropped when it appears in either the NLTK Portuguese stop-word
    list or spaCy's Portuguese ``STOP_WORDS``. The previous condition joined
    the two ``not in`` tests with ``or``, which kept every word that was
    missing from at least one list, i.e. only words present in both lists
    were ever removed.

    Args:
        text: Input string; it is tokenized with ``word_tokenize``.

    Returns:
        The surviving tokens, lowercased and joined by single spaces
        (an empty string when nothing survives).
    """
    # Load the NLTK list once per call and use a set: the original re-read the
    # corpus list and scanned it linearly for every single token.
    nltk_stop_words = set(stopwords.words('portuguese'))
    keywords = []
    for word in word_tokenize(text):
        word = word.lower()
        if word not in nltk_stop_words and word not in STOP_WORDS:
            keywords.append(word)
    return ' '.join(keywords)

# def get_synonyms(text):
#     tokens = word_tokenize(text)
#     synonyms = []
#     for word in tokens:
#         for syn in wordnet.synsets(word, lang="por"):
#             for lemma in syn.lemmas(lang="por"):
#                 synonyms.append(lemma.name())
#     return synonyms

def preprocess_lemma(text):
    """Tokenize ``text`` and return the lemmatized tokens joined by spaces.

    Each token produced by ``word_tokenize`` is passed through
    ``WordNetLemmatizer.lemmatize`` with its default arguments.

    NOTE(review): WordNetLemmatizer works from the English WordNet data;
    confirm that it does anything useful on the Portuguese text this
    notebook processes.

    Args:
        text: Input string.

    Returns:
        A single string of lemmas separated by one space.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    return ' '.join(wordnet_lemmatizer.lemmatize(piece) for piece in word_tokenize(text))

def remove_accent(text):
    """Replace accented vowels with their plain counterparts and squeeze whitespace.

    Both lowercase and uppercase accented vowels are mapped (``á`` -> ``a``,
    ``Á`` -> ``A``). The previous version only handled lowercase, and because
    callers lowercase the text *after* calling this function, an uppercase
    ``Á`` ended up as ``á`` in the final output.

    Only the vowels listed below are touched; other characters such as ``ç``
    are left unchanged, as before.

    Args:
        text: Input string.

    Returns:
        The string with accented vowels replaced and every whitespace run
        collapsed to one space.
    """
    accent_groups = {
        'a': 'áàãâä',
        'e': 'éèêë',
        'i': 'íìîï',
        'o': 'óòõôö',
        'u': 'úùûü',
    }
    # One translation table covering each accented vowel in both cases.
    table = {}
    for plain, accented in accent_groups.items():
        for char in accented:
            table[ord(char)] = plain
            table[ord(char.upper())] = plain.upper()
    text = text.translate(table)
    text = sub(r'\s+', ' ', text)
    return text

def preprocess_stem(text):
    """Tokenize ``text`` and return the Portuguese Snowball stem of each token.

    Args:
        text: Input string; it is tokenized with ``word_tokenize``.

    Returns:
        A single string of stems separated by one space.
    """
    portuguese_stemmer = SnowballStemmer("portuguese")
    return ' '.join(portuguese_stemmer.stem(piece) for piece in word_tokenize(text))



def preprocess(text):
    """Normalize ``text``: strip punctuation, then digits, then accents.

    Keyword extraction and lemmatization are not part of this pipeline;
    ``arquivo`` applies them separately to the normalized text.

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    # Innermost call runs first: remove_punct -> remove_num -> remove_accent.
    return remove_accent(remove_num(remove_punct(text)))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Semeq\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Semeq\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Semeq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Semeq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import json

def arquivo(path,lista):
    """Merge ``lista`` into the text file at ``path`` and rewrite it with variants.

    Every existing line of the file and every entry of ``lista`` is lowercased
    and normalized with ``preprocess``. The normalized text plus its keyword,
    lemma and stem variants (and second-order combinations of those) are
    collected without duplicates, in first-seen order, and written back one
    per line.

    All processing now happens before the file is reopened for writing. The
    previous version truncated the file first, so an exception while
    processing any entry left the file empty.

    Args:
        path: Path of the UTF-8 text file to read and then overwrite.
        lista: Iterable of additional strings to merge in.

    Returns:
        None. The file at ``path`` is rewritten.
    """
    with open(path,'r', encoding='utf-8') as file:
        linhas = [linha.strip() for linha in file]

    content = []
    seen = set()  # O(1) membership test instead of scanning `content`

    def add_unique(value):
        # Empty variants would only produce blank lines in the output file.
        if value and value not in seen:
            seen.add(value)
            content.append(value)

    for j in linhas + list(lista):
        # Lowercase before preprocess so uppercase accented letters are also
        # normalized by remove_accent.
        j = preprocess(j.lower()).strip()
        long_enough = len(j) > 3
        if long_enough:
            add_unique(j)

        j_keyword = extract_keywords(j)
        # NOTE(review): this guard tests the length of `j`, not of
        # `j_keyword`, as in the original; confirm which was intended.
        if long_enough:
            add_unique(j_keyword)
        j_lemma = preprocess_lemma(j)
        add_unique(j_lemma)
        add_unique(preprocess_stem(j))

        # Second-order variants built from the lemma form...
        add_unique(extract_keywords(j_lemma))
        add_unique(preprocess_stem(j_lemma))

        # ...and from the keyword form.
        add_unique(preprocess_lemma(j_keyword))
        add_unique(preprocess_stem(j_keyword))

    # Only now truncate the file: everything to be written is already in memory.
    with open(path,'w', encoding='utf-8') as file:
        for i in content:
            file.write(i.strip() + '\n')

def load_json(path):
    """Parse the UTF-8 JSON file at ``path`` and return the decoded object.

    Args:
        path: Path of the JSON file.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    with open(path,'r', encoding='utf-8') as handle:
        return json.load(handle)

In [None]:
# NOTE(review): `lista` is not assigned anywhere in this cell. The only
# assignment visible in the notebook is in the docx2txt cell further down, so
# that cell must be executed before this one (hidden state; fails on a fresh
# Restart & Run All). `data` is loaded but only referenced by the
# commented-out loop below.
data = load_json('intents_samples2.json')
# for i in data:
    # if 'INSATISFACAO' in i.keys():
    #     lista = i['INSATISFACAO']
    # if 'Reclamação' in list(i.values())[0]:
    #     lista = i['keywords']
# Merge `lista` into OFFENSIVE.txt, rewriting the file with normalized variants.
arquivo(r'..\database\offensive\OFFENSIVE.txt',lista)

In [None]:
import docx2txt

# Extract the document text and split it into raw lines.
result = docx2txt.process(r'..\censura pt-BR.docx')
result = result.split('\n')

# Normalize first, then filter: the previous loop tested `i != ''` before
# stripping, so whitespace-only lines ended up in `lista` as empty strings.
lista = [
    linha
    for linha in (bruta.lower().strip() for bruta in result)
    if linha != ''
]
lista


In [1]:
import nltk
# WordNet data is required by wordnet.synsets() in sintetizar_dados below.
nltk.download('wordnet')
from nltk.corpus import wordnet
from random import choice

def load_txt(path):
    """Return the non-blank lines of a UTF-8 text file, stripped of whitespace.

    Args:
        path: Path of the text file.

    Returns:
        A list of lines in file order; lines that are empty after stripping
        are omitted.
    """
    with open(path,'r', encoding='utf-8') as file:
        stripped = (raw.strip() for raw in file.readlines())
        lines = [entry for entry in stripped if entry != '']
    return lines

def sintetizar_dados(lista, lang="por"):
    """Build an augmented, de-duplicated list of examples from ``lista``.

    For every example three variants are produced: the example itself, a
    copy with each token replaced by a randomly chosen WordNet synonym, and a
    copy with the token order reversed. Each variant and its
    ``extract_keywords`` form are collected, and finally the
    ``preprocess_lemma`` form of every collected entry is added. Entries are
    kept in first-seen order without duplicates.

    Fixes relative to the previous version:
      * The variants were combined with ``zip(lista, sintetizados)``.
        ``sintetizados`` is three times as long as ``lista``, so the pairs
        were misaligned and two thirds of the variants were silently dropped.
        All variants are now used.
      * The lemma pass iterated over an alias of the list it was appending
        to; it now iterates over a snapshot.
      * WordNet was queried with its default language (English) even though
        the data is Portuguese, so short Portuguese words were replaced by
        unrelated English lemmas. The lookup language is now a parameter
        defaulting to Portuguese.

    Args:
        lista: Iterable of example strings.
        lang: Language code passed to ``wordnet.synsets`` and
            ``Synset.lemmas``. Languages other than ``"eng"`` need the
            ``omw-1.4`` NLTK package, which the first cell downloads.

    Returns:
        A list of strings: originals, variants, keyword forms and lemma forms.
        The synonym variant is random, so results differ between runs.
    """
    def substituir_sinonimos(frase):
        # Replace each token by the first lemma of a randomly chosen synset.
        sinonimos = []
        for token in nltk.word_tokenize(frase):
            try:
                synset = choice(wordnet.synsets(token, lang=lang))
                sinonimos.append(synset.lemmas(lang=lang)[0].name())
            except IndexError:
                # No synset for this token (choice on an empty list) or no
                # lemma in `lang`: keep the token as it is.
                sinonimos.append(token)
        return ' '.join(sinonimos)

    def rearranjar_estrutura(frase):
        # "Rearranged" variant: same tokens in reverse order.
        return ' '.join(nltk.word_tokenize(frase)[::-1])

    resultado = []
    vistos = set()  # O(1) duplicate check alongside the ordered list

    def adicionar(item):
        if item not in vistos:
            vistos.add(item)
            resultado.append(item)

    for exemplo in lista:
        variantes = (
            exemplo,
            substituir_sinonimos(exemplo),
            rearranjar_estrutura(exemplo),
        )
        for variante in variantes:
            adicionar(variante)
            adicionar(extract_keywords(variante))

    # Iterate over a snapshot so entries appended here are not revisited.
    for item in list(resultado):
        adicionar(preprocess_lemma(item))
    return resultado

def salvar_lista_em_arquivo(lista, nome_arquivo):
    """Append every entry of ``lista`` to ``nome_arquivo``, one per line.

    If the file already has content that does not end with a newline, a
    newline is written first. Previously the first appended example was glued
    onto the file's last line in that case.

    Args:
        lista: Iterable of strings to append.
        nome_arquivo: Path of the UTF-8 text file; created if missing.

    Returns:
        None.
    """
    precisa_quebra = False
    try:
        # Inspect the last byte in binary mode; a UTF-8 newline always ends
        # with b'\n' (also for '\r\n').
        with open(nome_arquivo, 'rb') as existente:
            existente.seek(0, 2)  # 2 == os.SEEK_END
            if existente.tell() > 0:
                existente.seek(-1, 2)
                precisa_quebra = existente.read(1) != b'\n'
    except FileNotFoundError:
        pass  # nothing to repair; 'a+' below creates the file

    with open(nome_arquivo, 'a+', encoding="utf-8") as arquivo:
        if precisa_quebra:
            arquivo.write('\n')
        for exemplo in lista:
            arquivo.write(exemplo + '\n')

def eliminar_linhas_repetidas(nome_arquivo):
    """Rewrite ``nome_arquivo`` with normalized, de-duplicated lines.

    Each line is lowercased, passed through ``remove_punct`` and
    ``remove_accent`` and stripped; the file is then overwritten with the
    distinct results. This function therefore also normalizes the file's
    content, it does not only drop duplicates.

    Changes relative to the previous version:
      * Lines are written in first-seen order. They used to be written from a
        ``set``, so the order was arbitrary and could differ between runs.
      * Lines that are empty after normalization are dropped instead of being
        kept as one blank line.
      * Lowercasing happens before ``remove_accent`` so uppercase accented
        letters are normalized as well.

    Args:
        nome_arquivo: Path of the UTF-8 text file to rewrite in place.

    Returns:
        None.
    """
    with open(nome_arquivo, 'r',encoding="utf-8") as arquivo:
        normalizadas = [
            remove_accent(remove_punct(linha.lower())).strip()
            for linha in arquivo
        ]
    # dict keys keep insertion order, so the first occurrence of a line wins.
    linhas_unicas = dict.fromkeys(linha for linha in normalizadas if linha)
    with open(nome_arquivo, 'w',encoding="utf-8") as arquivo:
        for linha in linhas_unicas:
            arquivo.write(linha + '\n')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\herik\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
# Augment FEEDBACK_NEGATIVO.txt in place: read its current lines, synthesize
# variants, append them to the same file, then rewrite the file with
# normalized, de-duplicated lines.
# NOTE(review): the input file is also the output file and the synonym step
# uses random.choice, so each re-run feeds earlier variants back in and may
# add new ones — confirm this is intended.
lines = load_txt(r'..\database\intents\intents txt\FEEDBACK_NEGATIVO.txt')
sintetizados = sintetizar_dados(lines)
salvar_lista_em_arquivo(sintetizados,r'..\database\intents\intents txt\FEEDBACK_NEGATIVO.txt')
eliminar_linhas_repetidas(r'..\database\intents\intents txt\FEEDBACK_NEGATIVO.txt')