# Importing

In [6]:
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer 
import spacy
import string, re

# Pre-processing the text

In [77]:
def remove_stopwords(text):
    """Drop Portuguese stopwords from a sequence of tokens.

    Matching is an exact, case-sensitive comparison against NLTK's
    Portuguese stopword list, so tokens that still carry uppercase letters
    or attached punctuation are kept.

    Args:
        text: iterable of word strings.

    Returns:
        A new list with the non-stopword tokens, in their original order.
    """
    # A set gives constant-time membership tests while filtering.
    portuguese_stops = set(stopwords.words('portuguese'))
    return [token for token in text if token not in portuguese_stops]

In [28]:
def remove_punctuation(text):
    """Strip every ASCII punctuation character from each token.

    Args:
        text: iterable of word strings.

    Returns:
        A new list, same length and order as the input, where each token
        has had all characters of ``string.punctuation`` removed. A token
        made only of punctuation becomes an empty string.
    """
    # Character class built from the escaped punctuation set.
    punct_class = re.compile('[%s]' % re.escape(string.punctuation))
    cleaned = []
    for token in text:
        cleaned.append(punct_class.sub('', token))
    return cleaned

In [9]:
def stemming(text):
    """Reduce each token to its stem using NLTK's RSLP stemmer.

    Args:
        text: iterable of word strings.

    Returns:
        A new list with one stem per input token, in input order.
    """
    rslp = RSLPStemmer()
    stems = []
    for token in text:
        stems.append(rslp.stem(token))
    return stems

In [10]:
def lowercase(text):
    """Return the tokens converted to lowercase.

    Args:
        text: iterable of word strings.

    Returns:
        A new list with ``.lower()`` applied to every token, in input order.
    """
    lowered = []
    for token in text:
        lowered.append(token.lower())
    return lowered

In [11]:
def lemmatization(text):
    """Replace verb tokens with their lemma; keep every other token as-is.

    Each input token is run through the spaCy Portuguese pipeline on its
    own, and only the first spaCy token of the result is inspected. When
    that token is tagged ``VERB`` its lemma is emitted; otherwise the
    original input string is emitted unchanged.

    Args:
        text: iterable of word strings.

    Returns:
        A new list with one entry per input token, in input order.
    """
    # NOTE(review): 'pt' is a shortcut model name; newer spaCy releases
    # expect the full package name (e.g. 'pt_core_news_sm') -- confirm
    # against the installed spaCy version before changing it.
    nlp = spacy.load('pt')
    new_text = []
    for word in text:
        doc = nlp(word)
        # An empty string (e.g. a punctuation-only token after cleaning)
        # produces an empty Doc, and indexing it would raise IndexError;
        # such tokens are passed through untouched.
        if len(doc) > 0 and doc[0].pos_ == 'VERB':
            new_text.append(doc[0].lemma_)
        else:
            new_text.append(word)
    return new_text

In [64]:
 def preprocess(text):
     """Run the full pre-processing pipeline over a list of tokens.

     The steps are applied in this order: lowercase, punctuation removal,
     stopword removal, stemming, and verb lemmatization.

     Args:
         text: list of word strings.

     Returns:
         The list of processed tokens produced by the last step.

     Raises:
         AssertionError: if ``text`` is not a list.
     """
     # Raised explicitly (rather than via ``assert``) so the check still
     # runs under ``python -O`` while keeping the same exception type and
     # message.
     if not isinstance(text, list):
         raise AssertionError("input must be a list")
     text_lower = lowercase(text)
     text_punc = remove_punctuation(text_lower)
     text_stop = remove_stopwords(text_punc)
     text_stem = stemming(text_stop)
     text_lemma = lemmatization(text_stem)
     return text_lemma

In [13]:
# Sample tokens with mixed case and attached punctuation, used to try out
# each pre-processing step.
text = ["Eu,", "qUeRo,", "uM:", "chocoLATe?", "coM/", "Caramelo."]

In [78]:
# On the raw tokens nothing is removed: with their punctuation and mixed
# case, none of them matches an entry of the stopword set exactly.
print(remove_stopwords(text))

['Eu,', 'qUeRo,', 'uM:', 'chocoLATe?', 'coM/', 'Caramelo.']


In [31]:
# Strip punctuation only; letter case is left untouched.
no_punc = remove_punctuation(text)
print(no_punc)

['Eu', 'qUeRo', 'uM', 'chocoLATe', 'coM', 'Caramelo']


In [14]:
# Stem the punctuation-free tokens (the displayed result is also lowercased).
stemming(no_punc)

['eu', 'quer', 'um', 'chocolat', 'com', 'caramel']

In [15]:
# Lowercasing alone keeps the punctuation attached to each token.
lowercase(text)

['eu,', 'quero,', 'um:', 'chocolate?', 'com/', 'caramelo.']

In [16]:
# Lemmatize the raw tokens; in the displayed result only 'qUeRo,' changes.
lemmatization(text)

['Eu,', 'qUeRo', 'uM:', 'chocoLATe?', 'coM/', 'Caramelo.']

In [17]:
# Full pipeline: lowercase -> punctuation -> stopwords -> stemming -> lemmatization.
print(preprocess(text))

['querer', 'chocolat', 'caramel']


In [73]:
class Parser:
    """Namespace for page-parsing helpers and the token pre-processing pipeline.

    None of the methods use instance state, so they are declared static:
    they can be called on the class or on an instance with the same
    arguments.
    """

    @staticmethod
    def extract(url):
        """Placeholder: not implemented yet; currently returns None."""

    @staticmethod
    def parse(tags_dict):
        """Placeholder: not implemented yet; currently returns None."""

    @staticmethod
    def get_wdlist(url):
        """Placeholder: not implemented yet; currently returns None."""

    @staticmethod
    def preprocess(text):
        """Run the full pre-processing pipeline over a list of tokens.

        The steps are applied in this order: lowercase, punctuation
        removal, stopword removal, stemming, and verb lemmatization.

        Args:
            text: list of word strings.

        Returns:
            The list of processed tokens produced by the last step.

        Raises:
            AssertionError: if ``text`` is not a list.
        """
        # Raised explicitly (rather than via ``assert``) so the check still
        # runs under ``python -O`` while keeping the same exception type
        # and message.
        if not isinstance(text, list):
            raise AssertionError("input must be a list")
        text_lower = lowercase(text)
        text_punc = remove_punctuation(text_lower)
        text_stop = remove_stopwords(text_punc)
        text_stem = stemming(text_stop)
        text_lemma = lemmatization(text_stem)
        return text_lemma

In [1]:
import unittest as ut

In [80]:
class TestPreProcessTextMethods(ut.TestCase):
    """Unit tests for the individual pre-processing steps and the pipeline."""

    def test_remove_stopwords(self):
        """Stopwords are dropped only on an exact, case-sensitive match."""
        self.assertEqual(remove_stopwords(['este', 'é',  'um',  'teste']), ['é', 'teste'])
        # Capitalised or punctuated tokens do not match and are kept.
        self.assertEqual(remove_stopwords(['Este', 'é', 'UM', 'teste.']), ['Este', 'é', 'UM', 'teste.'])

    def test_remove_punctuation(self):
        """Punctuation is stripped while letter case is preserved."""
        self.assertEqual(remove_punctuation(['Este', 'é', 'UM', 'teste.']), ['Este', 'é', 'UM', 'teste'])

    def test_stemming(self):
        """Each word is reduced to its expected stem."""
        self.assertEqual(stemming(['sabia', 'pedraria', 'casarão', 'ferreiro']), ['sab', 'pedr', 'cas', 'ferr'])

    def test_lowercase(self):
        """Mixed-case tokens, including accented letters, are lowercased."""
        self.assertEqual(lowercase(['TEmoS', 'QUE', 'estAr', 'minÚscULas']), ['temos', 'que', 'estar', 'minúsculas'])

    def test_lemmatization(self):
        """Conjugated verbs are replaced by their infinitive."""
        self.assertEqual(lemmatization(['é', 'quero', 'sei', 'vemos', 'lê']), ['ser', 'querer', 'saber', 'ver', 'ler'])

    def test_preprocess(self):
        """The full pipeline yields the expected tokens."""
        self.assertEqual(preprocess(['Este', 'é', 'uM.', 'tesTe?', 'das', 'FUNÇÔES', 'ACima,', 'em', 'conjunto%']),
                         ['ser', 'test', 'funçô', 'acim', 'conjunt'])

    def test_preprocess_notlist(self):
        """A non-list input is rejected with the documented message."""
        with self.assertRaises(Exception) as context:
            preprocess("este é um teste")
        # assertIn reports both operands on failure, unlike assertTrue(x in y).
        self.assertIn('input must be a list', str(context.exception))
       
    
# Entry point to use when this code is run as a plain script:
# if __name__ == '__main__':
#     ut.main()

# Notebook-friendly variant: an explicit argv is supplied (its first element
# is treated as the program name), presumably so unittest does not try to
# parse the kernel's own command line, and exit=False keeps unittest from
# calling sys.exit() once the tests have run.
if __name__ == '__main__':
    ut.main(argv=['first-arg-is-ignored'], exit=False)
    
    

  return _unpackb(packed, **kwargs)
  return concat([self.open(f).read() for f in fileids])
.....
----------------------------------------------------------------------
Ran 7 tests in 0.978s

OK


In [20]:
remove_stopwords(['este', 'é', 'um', 'teste'])

['é', 'teste']

In [18]:
print('Este' in stopwords.words('portuguese'))

False
