In [None]:
import unicodedata
from re import sub

import nltk
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from spacy.lang.pt.stop_words import STOP_WORDS
# Fetch every NLTK resource the cells below rely on, so the notebook runs on
# a fresh environment:
#   - 'omw-1.4' and 'wordnet': needed by wordnet.synsets(..., lang="por")
#     and by WordNetLemmatizer
#   - 'stopwords': needed by stopwords.words('portuguese')
#   - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize (older
#     NLTK releases look for 'punkt', newer ones for 'punkt_tab'; requesting
#     both is harmless because nltk.download skips what is already present)
for resource in ('omw-1.4', 'stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
def remove_num(text):
    """Delete every run of digits from ``text`` and squeeze whitespace.

    Digits are removed outright (not replaced by a space); afterwards each
    run of whitespace characters is collapsed into a single space. Leading
    and trailing whitespace is collapsed but not trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    without_digits = sub(r'\d+', '', text)
    return sub(r'\s+', ' ', without_digits)

def remove_punct(text):
    """Replace ASCII punctuation in ``text`` with spaces and squeeze whitespace.

    Every run of ASCII punctuation characters becomes a single space, then
    each run of whitespace is collapsed into one space. The character class
    covers the full ASCII punctuation set, including the closing square
    bracket, the double quote and the backslash, so bracketed or quoted
    words are cleaned on both sides.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (not trimmed: a leading or trailing punctuation
        mark leaves a single space in its place).
    """
    # The hyphen and both brackets are escaped so they are read as literal
    # characters rather than as a range or the end of the class.
    text = sub(r"""[!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]+""", ' ', text)
    text = sub(r'\s+', ' ', text)
    return text

def extract_keywords(text):
    """Tokenize ``text`` and return its lower-cased, non-stopword tokens.

    A token is discarded when it appears in *either* Portuguese stopword
    list: NLTK's ``stopwords.words('portuguese')`` or spaCy's ``STOP_WORDS``.
    (Combining the two membership tests with ``or`` would only discard words
    present in both lists.)

    Args:
        text: The string to tokenize with ``word_tokenize``.

    Returns:
        A list of lower-cased tokens in their original order; duplicates are
        kept.
    """
    # Build the combined stopword set once per call instead of re-reading the
    # NLTK word list for every token.
    stop_set = set(stopwords.words('portuguese')) | set(STOP_WORDS)
    keywords = []
    for token in word_tokenize(text):
        token = token.lower()
        if token not in stop_set:
            keywords.append(token)
    return keywords

def get_synonyms(text):
    """Collect Portuguese WordNet lemma names for every token of ``text``.

    ``text`` is split with ``word_tokenize``; for each token, every synset
    returned by ``wordnet.synsets(token, lang="por")`` contributes the names
    of its ``lang="por"`` lemmas.

    Args:
        text: The string whose tokens are looked up.

    Returns:
        A flat list of lemma names in lookup order. Duplicates are not
        removed, and tokens without synsets contribute nothing.
    """
    return [
        lemma.name()
        for token in word_tokenize(text)
        for synset in wordnet.synsets(token, lang="por")
        for lemma in synset.lemmas(lang="por")
    ]

def preprocess_lemma(text):
    """Lemmatize each token of ``text`` and join the results with spaces.

    The text is split with ``word_tokenize`` and every token is passed to
    ``WordNetLemmatizer.lemmatize`` using its default arguments.

    NOTE(review): the lemmatizer is called without any language argument;
    confirm it behaves as intended on Portuguese input.

    Args:
        text: The string to lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    wnl = WordNetLemmatizer()
    return ' '.join(wnl.lemmatize(tok) for tok in word_tokenize(text))

def remove_accent(text):
    text = sub('[áàãâä]', 'a', sub('[éèêë]', 'e', sub('[íìîï]', 'i', sub('[óòõôö]', 'o', sub('[úùûü]', 'u', text)))))
    text = sub(r'\s+', ' ',text)
    return text

def preprocess_stem(text):
    """Stem each token of ``text`` and join the results with spaces.

    The text is split with ``word_tokenize`` and every token is reduced with
    a ``SnowballStemmer`` configured for Portuguese.

    Args:
        text: The string to stem.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    pt_stemmer = SnowballStemmer("portuguese")
    return ' '.join(map(pt_stemmer.stem, word_tokenize(text)))

In [None]:
# Load the troubleshooting sheet; the cells below read its 'problem' column.
df = pd.read_excel(r'..\portugues\pt_troubleshooting.xlsx')

In [None]:
def _add_variants(term, collected):
    """Append the accent-stripped, lemmatized and stemmed forms of ``term``.

    Each form is derived from the previous one (accent removal -> lemma ->
    stem) and is appended to ``collected`` only when it is not already there,
    so first-seen order is preserved and the list stays free of duplicates.
    """
    variant = remove_accent(term)
    if variant not in collected:
        collected.append(variant)
    for transform in (preprocess_lemma, preprocess_stem):
        variant = transform(variant)
        if variant not in collected:
            collected.append(variant)


# One word list per distinct problem description: the description's keywords
# and their WordNet synonyms, each expanded into its normalized variants.
list_rows = []
# dropna(): unique() keeps missing values, and a NaN (a float) would make
# the regex-based cleaners raise TypeError.
for row in df['problem'].dropna().unique():
    list_words = []
    row = remove_punct(remove_num(row))
    for keyword in extract_keywords(row):
        _add_variants(keyword, list_words)
        for synonym in get_synonyms(keyword):
            # remove_punct turns punctuation inside a lemma name (e.g. an
            # underscore) into a space before the variants are derived.
            _add_variants(remove_punct(synonym), list_words)
    list_rows.append(list_words)

In [None]:
# Collapse each row's word list into one comma-separated string, kept as a
# single-element list so that every row fills the lone 'patterns' column.
# Note that list_rows itself is rewritten in place.
for idx, words in enumerate(list_rows):
    list_rows[idx] = [', '.join(words)]
new_df = pd.DataFrame(list_rows, columns=['patterns'])
new_df

In [None]:
# Save the patterns table as an Excel workbook in the same folder the input
# sheet was read from.
output_path = r'..\portugues\pt_patterns.xlsx'
new_df.to_excel(output_path)