In [1]:
# Three sample documents that the following cells tokenise and clean.
# The last document ends with a stray " ." which produces an extra "."
# token once the text is tokenised.
raw_docs = [
    "Here are some very simple basic sentences.",
    "I like to eat apple in the breakfast.",
    "Much of this content has never been taught elsewhere, and is drawn from my experience building and shipping many deep learning products. .",
]

In [2]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np



In [3]:
from nltk.tokenize import word_tokenize
# Run NLTK's word tokenizer over every raw document; punctuation marks
# come back as separate tokens.
tokenized_docs = list(map(word_tokenize, raw_docs))
print(tokenized_docs)

[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences', '.'], ['I', 'like', 'to', 'eat', 'apple', 'in', 'the', 'breakfast', '.'], ['Much', 'of', 'this', 'content', 'has', 'never', 'been', 'taught', 'elsewhere', ',', 'and', 'is', 'drawn', 'from', 'my', 'experience', 'building', 'and', 'shipping', 'many', 'deep', 'learning', 'products', '.', '.']]


In [4]:
import re
import string
# Character class matching any single character from string.punctuation
# (see https://docs.python.org/3/library/string.html#string.punctuation).
regex = re.compile('[%s]' % re.escape(string.punctuation))

tokenized_docs_no_punctuation = []

for tokens in tokenized_docs:
    # Delete punctuation characters inside each token ...
    stripped = (regex.sub('', token) for token in tokens)
    # ... and discard tokens that consisted of punctuation only (now empty).
    tokenized_docs_no_punctuation.append([tok for tok in stripped if tok])

print(tokenized_docs_no_punctuation)

[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences'], ['I', 'like', 'to', 'eat', 'apple', 'in', 'the', 'breakfast'], ['Much', 'of', 'this', 'content', 'has', 'never', 'been', 'taught', 'elsewhere', 'and', 'is', 'drawn', 'from', 'my', 'experience', 'building', 'and', 'shipping', 'many', 'deep', 'learning', 'products']]


In [5]:
# Cleaning text of stopwords
from nltk.corpus import stopwords

# Fetch the English stop-word list once and keep it as a set. Calling
# stopwords.words('english') for every token would request the whole list
# again each time and scan it linearly; a set gives constant-time lookups.
english_stopwords = set(stopwords.words('english'))

# Keep only the tokens that are not stop words.
# NOTE(review): the match is exact and case-sensitive, so capitalised tokens
# such as 'Here', 'I' and 'Much' are kept. Lowercase the tokens beforehand
# if they should be filtered as well.
tokenized_docs_no_stopwords = [
    [word for word in doc if word not in english_stopwords]
    for doc in tokenized_docs_no_punctuation
]

print(tokenized_docs_no_stopwords)

[['Here', 'simple', 'basic', 'sentences'], ['I', 'like', 'eat', 'apple', 'breakfast'], ['Much', 'content', 'never', 'taught', 'elsewhere', 'drawn', 'experience', 'building', 'shipping', 'many', 'deep', 'learning', 'products']]


In [6]:

# Stemming and Lemmatizing
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Three interchangeable normalisers; only the Porter stemmer is applied below.
porter = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet = WordNetLemmatizer()

# Stem every remaining token of every document. To compare approaches,
# replace porter.stem with snowball.stem or wordnet.lemmatize.
preprocessed_docs = [
    [porter.stem(token) for token in tokens]
    for tokens in tokenized_docs_no_stopwords
]

print(preprocessed_docs)

[['here', 'simpl', 'basic', 'sentenc'], ['I', 'like', 'eat', 'appl', 'breakfast'], ['much', 'content', 'never', 'taught', 'elsewher', 'drawn', 'experi', 'build', 'ship', 'mani', 'deep', 'learn', 'product']]
