# Importing

In [85]:
# One-time setup: fetch the NLTK resources and the spaCy Portuguese model
# that the helpers in this notebook rely on.
import nltk
# Stop-word lists, read via stopwords.words('portuguese').
nltk.download('stopwords')
# Data for the RSLP (Portuguese) stemmer.
nltk.download('rslp')
# Punkt tokenizer models (unpacked under tokenizers/, see the output below).
nltk.download('punkt')
# Shell escape: installs the spaCy model and links it so spacy.load('pt') works.
!python -m spacy download pt

[nltk_data] Downloading package stopwords to /home/luke/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /home/luke/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package punkt to /home/luke/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[33mYou are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m

[93m    Linking successful[0m
    /home/luke/anaconda3/envs/text-rec/lib/python3.6/site-packages/pt_core_news_sm
    -->
    /home/luke/anaconda3/envs/text-rec/lib/python3.6/site-packages/spacy/data/pt

    You can now load the model via spacy.load('pt')



In [86]:
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer 
from nltk import tokenize
import spacy
import string, re

# Pre-processing the text

In [87]:
def remove_stopwords(text):
    """Drop Portuguese stop words from a sequence of tokens.

    A token is removed only when it is an exact, case-sensitive match for an
    entry of NLTK's 'portuguese' stop-word list; differently cased or
    punctuated variants are kept.

    Args:
        text: iterable of token strings.

    Returns:
        A new list holding the surviving tokens in their original order.
    """
    # A set gives constant-time membership checks for every token.
    blocked = set(stopwords.words('portuguese'))
    return [token for token in text if token not in blocked]

In [88]:
def remove_punctuation(text):
    """Delete ASCII punctuation characters from every token.

    Every character listed in ``string.punctuation`` is removed wherever it
    occurs inside a token. A token made up only of punctuation becomes an
    empty string and is kept, so the result has as many items as the input.

    Args:
        text: iterable of token strings.

    Returns:
        A new list with the cleaned tokens, in the same order.
    """
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    stripped = []
    for token in text:
        stripped.append(punctuation.sub('', token))
    return stripped

In [99]:
def stemming(text):
    """Stem every non-empty token with NLTK's RSLP stemmer.

    Empty strings (for example those left behind by ``remove_punctuation``)
    are skipped, so the result can be shorter than the input.

    Args:
        text: iterable of token strings.

    Returns:
        A new list with the stem of each non-empty token, in order.
    """
    rslp = RSLPStemmer()
    return [rslp.stem(token) for token in text if token != ""]

In [100]:
def lowercase(text):
    """Return a new list with every token converted to lower case.

    Args:
        text: iterable of token strings.

    Returns:
        List of the lower-cased tokens, in the same order as the input.
    """
    lowered = []
    for token in text:
        lowered.append(token.lower())
    return lowered

In [101]:
def _get_pt_pipeline():
    """Return the spaCy Portuguese pipeline, loading it only on first use."""
    nlp = getattr(_get_pt_pipeline, '_nlp', None)
    if nlp is None:
        # NOTE(review): 'pt' is the shortcut link created by
        # `python -m spacy download pt`; confirm it still resolves if the
        # spaCy version is upgraded.
        nlp = spacy.load('pt')
        # Cache on the function object so repeated calls skip the costly load.
        _get_pt_pipeline._nlp = nlp
    return nlp


def lemmatization(text):
    """Replace each token that spaCy tags as a verb with its lemma.

    Every item of ``text`` is run through the spaCy pipeline on its own and
    only the first resulting spaCy token is inspected. Items whose first
    token is tagged ``VERB`` are replaced by that token's lemma; all other
    items are passed through unchanged.

    Args:
        text: iterable of token strings.

    Returns:
        A new list with one entry per input item, in the same order.
    """
    nlp = _get_pt_pipeline()
    new_text = []
    for word in text:
        doc = nlp(word)
        # An empty string produces an empty Doc; indexing it would raise
        # IndexError, so such items are passed through untouched.
        if len(doc) > 0 and doc[0].pos_ == 'VERB':
            new_text.append(doc[0].lemma_)
        else:
            new_text.append(word)
    return new_text

In [102]:
def tokenize_text(sentences):
    """Split sentences into one flat list of word tokens.

    Each sentence is tokenized with NLTK's ``word_tokenize`` using the
    Portuguese language setting, and the per-sentence results are
    concatenated in order.

    Args:
        sentences: iterable of sentence strings.

    Returns:
        A single list containing the tokens of every sentence.
    """
    # Flatten in one pass; sum(list_of_lists, []) re-copies the accumulated
    # list for every sentence, which is quadratic in the number of tokens.
    return [
        word
        for sent in sentences
        for word in tokenize.word_tokenize(sent, language='portuguese')
    ]
    
    
def preprocess(text):
    """Run the complete cleaning pipeline over a list of tokens.

    The stages are applied in this order, each one receiving the previous
    stage's output: ``lowercase``, ``remove_punctuation``,
    ``remove_stopwords``, ``stemming``, ``lemmatization``.

    Args:
        text: list of token strings (must be exactly a ``list``).

    Returns:
        The list produced by the final stage.

    Raises:
        AssertionError: if ``text`` is not a list.
    """
    assert type(text) == list, "input must be a list"
    stages = (
        lowercase,
        remove_punctuation,
        remove_stopwords,
        stemming,
        lemmatization,
    )
    tokens = text
    for stage in stages:
        tokens = stage(tokens)
    return tokens

In [103]:
# Two sample sentences. The pieces of the first one are joined with no
# separator, so it is a single unbroken string with mixed casing and
# punctuation for the helpers below to work on.
text = ["".join(["Eu,", "qUeRo,", "uM:", "chocoLATe?", "coM/", "Caramelo."]),
        "e um gato gordo, por favor"]
# From here on `text` holds the flat token list rather than the sentences.
text = tokenize_text(text)
print(text)

['Eu', ',', 'qUeRo', ',', 'uM', ':', 'chocoLATe', '?', 'coM/Caramelo', '.', 'e', 'um', 'gato', 'gordo', ',', 'por', 'favor']


In [104]:
# Only the exact lower-case matches ('e', 'um', 'por') disappear in the
# output below; 'Eu' and 'uM' are kept.
print(remove_stopwords(text))

['Eu', ',', 'qUeRo', ',', 'uM', ':', 'chocoLATe', '?', 'coM/Caramelo', '.', 'gato', 'gordo', ',', 'favor']


In [105]:
# Punctuation-only tokens turn into '' and stay in the list (see output).
no_punc = remove_punctuation(text)
print(no_punc)

['Eu', '', 'qUeRo', '', 'uM', '', 'chocoLATe', '', 'coMCaramelo', '', 'e', 'um', 'gato', 'gordo', '', 'por', 'favor']


In [106]:
# stemming() skips the '' entries left by remove_punctuation; the recorded
# stems below also come back lower-cased.
stemming(no_punc)

['eu',
 'quer',
 'um',
 'chocolat',
 'comcaramel',
 'e',
 'um',
 'gat',
 'gord',
 'por',
 'favor']

In [107]:
# Lower-casing changes letters only; punctuation tokens pass through as-is.
lowercase(text)

['eu',
 ',',
 'quero',
 ',',
 'um',
 ':',
 'chocolate',
 '?',
 'com/caramelo',
 '.',
 'e',
 'um',
 'gato',
 'gordo',
 ',',
 'por',
 'favor']

In [108]:
# Applied to the raw mixed-case tokens, the recorded output is identical to
# the input: no item was replaced by a lemma.
lemmatization(text)

['Eu',
 ',',
 'qUeRo',
 ',',
 'uM',
 ':',
 'chocoLATe',
 '?',
 'coM/Caramelo',
 '.',
 'e',
 'um',
 'gato',
 'gordo',
 ',',
 'por',
 'favor']

In [109]:
# Full pipeline (lowercase -> punctuation -> stop words -> stem -> lemma)
# on the tokenized sample.
print(preprocess(text))

['querer', 'chocolat', 'comcaramel', 'gat', 'gord', 'favor']


In [73]:
class Parser:
    """Skeleton class; only ``preprocess`` is implemented so far.

    None of the methods use instance state, so they are declared static and
    can be called either on the class or on an instance.
    """

    @staticmethod
    def extract(url):
        """Placeholder, not implemented yet; currently returns ``None``."""
        pass

    @staticmethod
    def parse(tags_dict):
        """Placeholder, not implemented yet; currently returns ``None``."""
        pass

    @staticmethod
    def get_wdlist(url):
        """Placeholder, not implemented yet; currently returns ``None``."""
        pass

    @staticmethod
    def preprocess(text):
        """Run the cleaning pipeline over a list of tokens.

        Applies, in order, the notebook-level helpers ``lowercase``,
        ``remove_punctuation``, ``remove_stopwords``, ``stemming`` and
        ``lemmatization``.

        Args:
            text: list of token strings (must be exactly a ``list``).

        Returns:
            The list produced by the final stage.

        Raises:
            AssertionError: if ``text`` is not a list.
        """
        assert type(text) == list, "input must be a list"
        text_lower = lowercase(text)
        # Was misspelled `remove_ponctuation`, which raised NameError.
        text_punc = remove_punctuation(text_lower)
        text_stop = remove_stopwords(text_punc)
        text_stem = stemming(text_stop)
        text_lemma = lemmatization(text_stem)
        return text_lemma

In [1]:
import unittest as ut

In [80]:
class TestPreProcessTextMethods(ut.TestCase):
    
    def test_remove_stopwords(self):
        self.assertEqual(remove_stopwords(['este', 'é',  'um',  'teste']), ['é', 'teste'])
        self.assertEqual(remove_stopwords(['Este', 'é', 'UM', 'teste.']), ['Este', 'é', 'UM', 'teste.'])
    
    def test_remove_punctuation(self):
        self.assertEqual(remove_punctuation(['Este', 'é', 'UM', 'teste.']), ['Este', 'é', 'UM', 'teste'])
    
    def test_stemming(self):
        self.assertEqual(stemming(['sabia', 'pedraria', 'casarão', 'ferreiro']), ['sab', 'pedr', 'cas', 'ferr'])
    
    def test_lowercase(self):
        self.assertEqual(lowercase(['TEmoS', 'QUE', 'estAr', 'minÚscULas']), ['temos', 'que', 'estar', 'minúsculas'])
        
    def test_lemmatization(self):
        self.assertEqual(lemmatization(['é', 'quero', 'sei', 'vemos', 'lê']), ['ser', 'querer', 'saber', 'ver', 'ler'])
    
    def test_preprocess(self):
        self.assertEqual(preprocess(['Este', 'é', 'uM.', 'tesTe?', 'das', 'FUNÇÔES', 'ACima,', 'em', 'conjunto%']), 
                         ['ser', 'test', 'funçô', 'acim', 'conjunt'])
        
    def test_preprocess_notlist(self):
        with self.assertRaises(Exception) as context:
            preprocess("este é um teste") 
        self.assertTrue('input must be a list' in str(context.exception))
       
    
# In a standalone script a bare `ut.main()` under this guard would be enough.
# Here an explicit argv is supplied (its only element is a placeholder for the
# program name) and exit=False stops unittest from calling sys.exit() when the
# run finishes.

if __name__ == '__main__':
    ut.main(argv=['first-arg-is-ignored'], exit=False)
    
    

  return _unpackb(packed, **kwargs)
  return concat([self.open(f).read() for f in fileids])
.....
----------------------------------------------------------------------
Ran 7 tests in 0.978s

OK


In [20]:
# Sanity check: the lower-case stop words 'este' and 'um' are removed.
remove_stopwords(['este', 'é', 'um', 'teste'])

['é', 'teste']

In [18]:
# Capitalised 'Este' is not in the stop-word list (prints False), which is
# why remove_stopwords leaves it in place.
print('Este' in stopwords.words('portuguese'))

False
