In [18]:
raw_docs = ["Let's do some excercise today.",
"Are you interested in running, I love it!",
"what about swimming, Yeaah woh fun!"
"These are basic text to learn how cleaning works."]

## Tokenizing text into bags of words

In [19]:

from nltk.tokenize import word_tokenize
tokenized_docs = [word_tokenize(doc) for doc in raw_docs]
print(tokenized_docs)

[['Let', "'s", 'do', 'some', 'excercise', 'today', '.'], ['Are', 'you', 'interested', 'in', 'running', ',', 'I', 'love', 'it', '!'], ['what', 'about', 'swimming', ',', 'Yeaah', 'woh', 'fun', '!', 'These', 'are', 'basic', 'text', 'to', 'learn', 'how', 'cleaning', 'works', '.']]


## Removing punctuation


In [20]:
import re
import string
regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html

tokenized_docs_no_punctuation = []

for review in tokenized_docs:
    new_review = []
    for token in review:
        new_token = regex.sub(u'', token)
        if not new_token == u'':
            new_review.append(new_token)
    
    tokenized_docs_no_punctuation.append(new_review)
    
print(tokenized_docs_no_punctuation)

[['Let', 's', 'do', 'some', 'excercise', 'today'], ['Are', 'you', 'interested', 'in', 'running', 'I', 'love', 'it'], ['what', 'about', 'swimming', 'Yeaah', 'woh', 'fun', 'These', 'are', 'basic', 'text', 'to', 'learn', 'how', 'cleaning', 'works']]


## Cleaning text of stopwords

In [21]:
from nltk.corpus import stopwords

tokenized_docs_no_stopwords = []

for doc in tokenized_docs_no_punctuation:
    new_term_vector = []
    for word in doc:
        if not word in stopwords.words('english'):
            new_term_vector.append(word)
    
    tokenized_docs_no_stopwords.append(new_term_vector)

print(tokenized_docs_no_stopwords)

[['Let', 'excercise', 'today'], ['Are', 'interested', 'running', 'I', 'love'], ['swimming', 'Yeaah', 'woh', 'fun', 'These', 'basic', 'text', 'learn', 'cleaning', 'works']]


## Stemming and Lemmatizing

In [22]:

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
nltk.download('omw-1.4')
porter = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet = WordNetLemmatizer()

preprocessed_docs = []

for doc in tokenized_docs_no_stopwords:
    final_doc = []
    for word in doc:
        final_doc.append(porter.stem(word))
        final_doc.append(snowball.stem(word))
        final_doc.append(wordnet.lemmatize(word))
    
    preprocessed_docs.append(final_doc)

print(preprocessed_docs)

[['let', 'let', 'Let', 'excercis', 'excercis', 'excercise', 'today', 'today', 'today'], ['are', 'are', 'Are', 'interest', 'interest', 'interested', 'run', 'run', 'running', 'i', 'i', 'I', 'love', 'love', 'love'], ['swim', 'swim', 'swimming', 'yeaah', 'yeaah', 'Yeaah', 'woh', 'woh', 'woh', 'fun', 'fun', 'fun', 'these', 'these', 'These', 'basic', 'basic', 'basic', 'text', 'text', 'text', 'learn', 'learn', 'learn', 'clean', 'clean', 'cleaning', 'work', 'work', 'work']]


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/mubinaarastu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
