In [1]:
import nltk
# Fetch the Punkt tokenizer resource that the sent_tokenize / word_tokenize
# calls below rely on. quiet=True suppresses the downloader's progress log,
# so the cell only displays the boolean returned by the call.
nltk.download('punkt', quiet=True)

True

In [2]:
from nltk import sent_tokenize

# Sample paragraph reused as the input text for the tokenization examples.
text_sample = 'He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish. In the first forty days a boy had been with him. But after forty days without a fish the boy’s parents had told him that the old man was now definitely and finally salao, which is the worst form of unlucky, and the boy had gone at their orders in another boat which caught three good fish the first week. '
# Split the paragraph into a list of sentence strings.
sentences = sent_tokenize(text=text_sample)
# Show the container type and sentence count first, then the sentences themselves.
print(type(sentences), len(sentences))
print(sentences)

<class 'list'> 3
['He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish.', 'In the first forty days a boy had been with him.', 'But after forty days without a fish the boy’s parents had told him that the old man was now definitely and finally salao, which is the worst form of unlucky, and the boy had gone at their orders in another boat which caught three good fish the first week.']


In [3]:
from nltk import word_tokenize

# A single sentence (the first sentence of the sample paragraph) used to
# demonstrate word-level tokenization.
sentence = 'He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish.'
# Break the sentence into tokens; punctuation such as the final '.' becomes
# its own token.
words = word_tokenize(sentence)
# Show the container type and token count first, then the tokens themselves.
print(type(words), len(words))
print(words)

<class 'list'> 27
['He', 'was', 'an', 'old', 'man', 'who', 'fished', 'alone', 'in', 'a', 'skiff', 'in', 'the', 'Gulf', 'Stream', 'and', 'he', 'had', 'gone', 'eighty-four', 'days', 'now', 'without', 'taking', 'a', 'fish', '.']


In [4]:
def tokenize_text(text):
    """Tokenize a document into words, grouped by sentence.

    The text is first split into sentences with ``sent_tokenize``; each
    sentence is then split into word tokens with ``word_tokenize``.

    Args:
        text: The raw document string to tokenize.

    Returns:
        A list with one entry per sentence, each entry being the list of
        word tokens of that sentence, in document order.
    """
    tokens_per_sentence = []
    for single_sentence in sent_tokenize(text):
        tokens_per_sentence.append(word_tokenize(single_sentence))
    return tokens_per_sentence

In [5]:
# Tokenize the whole sample paragraph: the result holds one inner list of
# word tokens per sentence.
word_tokens = tokenize_text(text_sample)
# The outer length is the number of sentences; the nested lists are printed below.
print(type(word_tokens), len(word_tokens))
print(word_tokens)

<class 'list'> 3
[['He', 'was', 'an', 'old', 'man', 'who', 'fished', 'alone', 'in', 'a', 'skiff', 'in', 'the', 'Gulf', 'Stream', 'and', 'he', 'had', 'gone', 'eighty-four', 'days', 'now', 'without', 'taking', 'a', 'fish', '.'], ['In', 'the', 'first', 'forty', 'days', 'a', 'boy', 'had', 'been', 'with', 'him', '.'], ['But', 'after', 'forty', 'days', 'without', 'a', 'fish', 'the', 'boy', '’', 's', 'parents', 'had', 'told', 'him', 'that', 'the', 'old', 'man', 'was', 'now', 'definitely', 'and', 'finally', 'salao', ',', 'which', 'is', 'the', 'worst', 'form', 'of', 'unlucky', ',', 'and', 'the', 'boy', 'had', 'gone', 'at', 'their', 'orders', 'in', 'another', 'boat', 'which', 'caught', 'three', 'good', 'fish', 'the', 'first', 'week', '.']]


In [6]:
from nltk import ngrams

sentence = 'He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish.'
words = word_tokenize(sentence)

# ngrams() yields its tuples lazily, so materialize them into a list for
# printing. The result is stored under its own name (`trigrams`) rather than
# `ngrams`: rebinding `ngrams` would replace the imported function with a
# list and break every later ngrams(...) call in the same kernel.
trigrams = list(ngrams(words, 3))
print(trigrams)

[('He', 'was', 'an'), ('was', 'an', 'old'), ('an', 'old', 'man'), ('old', 'man', 'who'), ('man', 'who', 'fished'), ('who', 'fished', 'alone'), ('fished', 'alone', 'in'), ('alone', 'in', 'a'), ('in', 'a', 'skiff'), ('a', 'skiff', 'in'), ('skiff', 'in', 'the'), ('in', 'the', 'Gulf'), ('the', 'Gulf', 'Stream'), ('Gulf', 'Stream', 'and'), ('Stream', 'and', 'he'), ('and', 'he', 'had'), ('he', 'had', 'gone'), ('had', 'gone', 'eighty-four'), ('gone', 'eighty-four', 'days'), ('eighty-four', 'days', 'now'), ('days', 'now', 'without'), ('now', 'without', 'taking'), ('without', 'taking', 'a'), ('taking', 'a', 'fish'), ('a', 'fish', '.')]


In [7]:
# Fetch the stopword lists that are read through nltk.corpus.stopwords in the
# following cells; quiet=True suppresses the downloader's progress log.
nltk.download('stopwords', quiet=True)

True

In [8]:
# Read the English stopword list once and reuse it for both summaries:
# its total size and a preview of the first 20 entries.
english_stopwords = nltk.corpus.stopwords.words('english')
print('number of english stopwords:', len(english_stopwords))
print(english_stopwords[:20])

number of english stopwords: 179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


In [9]:
stopwords = nltk.corpus.stopwords.words('english')
# Membership is tested once per token, so look words up in a set instead of
# scanning the list each time. `stopwords` itself stays a list.
stopword_set = set(stopwords)

# For every tokenized sentence, lowercase each token and keep it only if it
# is not a stopword. Comprehensions are used so that no loop variable leaks
# into the notebook namespace (a plain `for sentence in ...` loop would
# overwrite the `sentence` string defined in earlier cells).
all_tokens = [
    [token for token in map(str.lower, sentence_tokens) if token not in stopword_set]
    for sentence_tokens in word_tokens
]

print(all_tokens)

[['old', 'man', 'fished', 'alone', 'skiff', 'gulf', 'stream', 'gone', 'eighty-four', 'days', 'without', 'taking', 'fish', '.'], ['first', 'forty', 'days', 'boy', '.'], ['forty', 'days', 'without', 'fish', 'boy', '’', 'parents', 'told', 'old', 'man', 'definitely', 'finally', 'salao', ',', 'worst', 'form', 'unlucky', ',', 'boy', 'gone', 'orders', 'another', 'boat', 'caught', 'three', 'good', 'fish', 'first', 'week', '.']]


In [10]:
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()

# Each tuple groups inflected forms of one word. Every group is stemmed and
# printed on its own line so the stems can be compared side by side.
inflection_groups = (
    ('working', 'works', 'worked'),
    ('amusing', 'amuses', 'amused'),
    ('happier', 'happiest'),
    ('fancier', 'fanciest'),
)
for group in inflection_groups:
    print(*map(stemmer.stem, group))

work work work
amus amus amus
happy happiest
fant fanciest


In [11]:
# Fetch the WordNet data that the WordNetLemmatizer in the next cell looks
# words up in; quiet=True suppresses the downloader's progress log.
nltk.download('wordnet', quiet=True)

True

In [12]:
from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()

# Pairs of (part-of-speech tag, inflected forms): 'v' = verb, 'a' = adjective.
# Each group is lemmatized with its tag and printed on its own line.
lemma_examples = (
    ('v', ('amusing', 'amuses', 'amused')),
    ('a', ('happier', 'happiest')),
    ('a', ('fancier', 'fanciest')),
)
for pos, forms in lemma_examples:
    print(*(lemma.lemmatize(form, pos) for form in forms))

amuse amuse amuse
happy happy
fancy fancy
