In [46]:
# Sample paragraph that is sentence-tokenized into `documents` further down.
# NOTE(review): the trailing "[8]" looks like a leftover citation marker; it
# ends up as its own tiny "sentence" ('[ 8 ]') in the processed outputs.
corpus = """
Up until the 1980s, most natural language processing systems were based on complex sets of hand-written rules. 
Starting in the late 1980s, however, there was a revolution in natural language processing with the introduction of 
machine learning algorithms for language processing. This was due to both the steady increase in computational power
(see Moore's law) and the gradual lessening of the dominance of Chomskyan theories of linguistics (e.g. transformational grammar),
whose theoretical underpinnings discouraged the sort of corpus linguistics that underlies the machine-learning approach to language processing.[8]
"""

In [47]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [48]:
# Uncomment to inspect the English stop-word list used for filtering below:
# stopwords.words('english')

In [49]:
# Porter stemmer instance reused for every token in the stemming cell below.
stemmer = PorterStemmer()

In [50]:
import nltk

In [51]:
# Split the raw corpus into sentences; each sentence is treated as one document.
documents = nltk.sent_tokenize(corpus)

In [19]:
## Remove stop words from each sentence, then stem the remaining tokens.

In [25]:
# Build the stop-word set once; the original rebuilt it for every single token.
# The membership test is case-sensitive and NLTK's English list is lowercase,
# so capitalised tokens such as "Up" or "This" are kept (same as before).
stop_words = set(stopwords.words('english'))

# Derive the result from the raw corpus instead of mutating `documents` in
# place: the cell is now idempotent (re-running it does not stem already
# stemmed text) and does not depend on which cells ran before it.
documents = [
    ' '.join(
        stemmer.stem(word)  # PorterStemmer also lowercases its output
        for word in nltk.word_tokenize(sentence)
        if word not in stop_words
    )
    for sentence in nltk.sent_tokenize(corpus)
]

In [26]:
# Inspect the sentences after stop-word removal and Porter stemming.
documents

['up 1980 , natur languag process system base complex set hand-written rule .',
 'start late 1980 , howev , revolut natur languag process introduct machin learn algorithm languag process .',
 "thi due steadi increas comput power ( see moor 's law ) gradual lessen domin chomskyan theori linguist ( e.g .",
 'transform grammar ) , whose theoret underpin discourag sort corpu linguist underli machine-learn approach languag process .',
 '[ 8 ]']

In [27]:
from nltk.stem import SnowballStemmer

In [28]:
# Snowball stemmer configured for English, used in the next stemming cell.
snow = SnowballStemmer('english')

In [38]:
# Build the stop-word set once; the original rebuilt it for every single token.
# The membership test is case-sensitive, so capitalised tokens such as "Up"
# or "This" are kept (same as before).
stop_words = set(stopwords.words('english'))

# Start again from the raw corpus. The original loop rewrote `documents` in
# place, so on a top-to-bottom run it would have stemmed the already
# Porter-stemmed text instead of the original sentences; reading from `corpus`
# makes this cell independent of earlier cells and safe to re-run.
documents = [
    ' '.join(
        snow.stem(word)
        for word in nltk.word_tokenize(sentence)
        if word not in stop_words
    )
    for sentence in nltk.sent_tokenize(corpus)
]

In [39]:
# Inspect the sentences after the Snowball stemming pass.
documents

['up 1980s , natur languag process system base complex set hand-written rule .',
 'start late 1980s , howev , revolut natur languag process introduct machin learn algorithm languag process .',
 "this due steadi increas comput power ( see moor 's law ) gradual lessen domin chomskyan theori linguist ( e.g .",
 'transform grammar ) , whose theoret underpin discourag sort corpus linguist under machine-learn approach languag process .',
 '[ 8 ]']

In [52]:
from nltk.stem import WordNetLemmatizer

In [53]:
# WordNet-based lemmatizer instance used in the lemmatization cell below.
lemm = WordNetLemmatizer()

In [54]:
# Build the stop-word set once; the original rebuilt it for every single token.
# The membership test is case-sensitive, so capitalised tokens such as "Up"
# or "This" are kept (same as before).
stop_words = set(stopwords.words('english'))

# Start again from the raw corpus. The original loop rewrote `documents` in
# place, so on a top-to-bottom run it would have lemmatized text that the
# stemming cells had already mangled; reading from `corpus` makes this cell
# independent of earlier cells and safe to re-run.
documents = [
    ' '.join(
        # No POS tag is passed, so the lemmatizer's default part of speech is
        # used; verb forms such as "processing" / "based" come back unchanged.
        lemm.lemmatize(word)
        for word in nltk.word_tokenize(sentence)
        if word not in stop_words
    )
    for sentence in nltk.sent_tokenize(corpus)
]

In [55]:
# Inspect the sentences after the WordNet lemmatization pass.
documents

['Up 1980s , natural language processing system based complex set hand-written rule .',
 'Starting late 1980s , however , revolution natural language processing introduction machine learning algorithm language processing .',
 "This due steady increase computational power ( see Moore 's law ) gradual lessening dominance Chomskyan theory linguistics ( e.g .",
 'transformational grammar ) , whose theoretical underpinnings discouraged sort corpus linguistics underlies machine-learning approach language processing .',
 '[ 8 ]']