# Importing

In [81]:
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer 
import spacy
import string, re

# Pre-processing the text

In [82]:
def remove_stopwords(text):
    """Return the tokens of *text* that are not Portuguese stop words.

    Args:
        text: Iterable of token strings.

    Returns:
        A new list holding, in their original order, the tokens that are
        absent from NLTK's ``'portuguese'`` stop-word list. Membership is
        an exact string comparison, so case and attached punctuation count.
    """
    # Fetch the stop-word list once and keep it in a set; the previous code
    # called stopwords.words() for every token and scanned the list linearly.
    portuguese_stopwords = set(stopwords.words('portuguese'))
    return [word for word in text if word not in portuguese_stopwords]

In [83]:
def remove_ponctuation(text):
    """Strip every ASCII punctuation character from each token.

    Args:
        text: Iterable of token strings.

    Returns:
        A new list with one cleaned string per input token; a token made
        only of punctuation becomes an empty string.
    """
    # Character class matching any single character of string.punctuation.
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    strip_punctuation = re.compile(punctuation_class).sub
    cleaned = []
    for token in text:
        cleaned.append(strip_punctuation('', token))
    return cleaned

In [84]:
def stemming(text):
    """Stem each token with NLTK's RSLP (Portuguese) stemmer.

    Args:
        text: Iterable of token strings.

    Returns:
        A new list with the stem of every token, in the original order.
        Empty tokens are returned unchanged.
    """
    stemmer = RSLPStemmer()
    # RSLPStemmer.stem looks at the last character of the word, so an empty
    # string (e.g. a token that was nothing but punctuation before cleaning)
    # would raise IndexError; pass such tokens through untouched.
    return [stemmer.stem(word) if word else word for word in text]

In [85]:
def lowercase(text):
    """Return a new list with every token of *text* converted to lower case."""
    lowered = []
    for token in text:
        lowered.append(token.lower())
    return lowered

In [86]:
def lemmatization(text):
    """Replace tokens that spaCy tags as verbs with their lemma.

    Every token is parsed on its own and only the first spaCy token of the
    result is inspected; tokens that are not tagged ``VERB`` are kept as-is.

    Args:
        text: Iterable of token strings.

    Returns:
        A new list, in the original order, where verb tokens are replaced
        by their lemma and every other token is unchanged.
    """
    # NOTE(review): 'pt' is a shortcut model name; newer spaCy releases
    # appear to require the full package name (e.g. 'pt_core_news_sm') --
    # confirm against the installed spaCy version.
    nlp = spacy.load('pt')
    new_text = []
    for word in text:
        doc = nlp(word)
        # An empty string yields a Doc with no tokens, so indexing doc[0]
        # unconditionally would raise IndexError; keep such tokens as-is.
        if len(doc) > 0 and doc[0].pos_ == 'VERB':
            new_text.append(doc[0].lemma_)
        else:
            new_text.append(word)
    return new_text

In [95]:
def preprocess(text):
    """Run the full pre-processing pipeline over a list of tokens.

    The stages are applied in this order: punctuation removal, lower-casing,
    stemming, verb lemmatization and stop-word removal. Each stage receives
    the list produced by the previous one.

    Args:
        text: Iterable of token strings.

    Returns:
        The list of tokens left after the last stage.
    """
    # NOTE(review): stop words are filtered after stemming/lemmatization, so
    # presumably only stop words whose processed form is still in the
    # stop-word list get removed -- confirm this order is intended.
    stages = (
        remove_ponctuation,
        lowercase,
        stemming,
        lemmatization,
        remove_stopwords,
    )
    tokens = text
    for stage in stages:
        tokens = stage(tokens)
    return tokens

In [96]:
# Sample tokens with mixed case and trailing punctuation, reused by the
# demonstration cells below.
text = ["Eu,", "qUeRo,", "uM:", "chocoLATe?", "coM/", "Caramelo."]

In [97]:
# Stop-word filtering applied to the raw tokens: the output below shows that
# nothing is removed, presumably because the tokens still carry mixed case
# and punctuation and so do not match the stop-word list exactly.
print(remove_stopwords(text))

['Eu,', 'qUeRo,', 'uM:', 'chocoLATe?', 'coM/', 'Caramelo.']


In [98]:
# Strip punctuation first; the cleaned tokens are fed to the stemmer next.
no_punc = remove_ponctuation(text)

In [99]:
# Stem the punctuation-free tokens; the output below is also lower-cased.
stemming(no_punc)

['eu', 'quer', 'um', 'chocolat', 'com', 'caramel']

In [100]:
# Lower-casing alone keeps the punctuation in place (see the output below).
lowercase(text)

['eu,', 'quero,', 'um:', 'chocolate?', 'com/', 'caramelo.']

In [101]:
# Lemmatization applied to the raw tokens: in the output below only the
# second token changes (its trailing comma is dropped).
lemmatization(text)

['Eu,', 'qUeRo', 'uM:', 'chocoLATe?', 'coM/', 'Caramelo.']

In [102]:
# Run the whole pipeline on the sample; compared with the stemming output
# above, 'eu', 'um' and 'com' no longer appear in the result below.
print(preprocess(text))

['querer', 'chocolat', 'caramel']
