In [2]:
import nltk

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize, TreebankWordDetokenizer

In [8]:
# Small three-sentence sample text; the tokenization and POS-tagging cells
# in this notebook all read from it.
corpus = """
    This is a para.
    Today is Friday.
    Is it Yay Yay?
"""

In [5]:
# Split the sample text into a list of sentences.
sent_tokenize(corpus)

['\n    This is a para.', 'Today is Friday.', 'Is it Yay Yay?']

In [6]:
# Split the sample text into word and punctuation tokens.
word_tokenize(corpus)

['This',
 'is',
 'a',
 'para',
 '.',
 'Today',
 'is',
 'Friday',
 '.',
 'Is',
 'it',
 'Yay',
 'Yay',
 '?']

In [10]:
# Rebuild readable sentences from their tokens.
# The detokenizer must be instantiated, and detokenize() expects a list of
# tokens rather than raw text; calling tokenize() on the class with a string
# is what raised the TypeError. Detokenizing one sentence at a time keeps each
# sentence-final period attached to its own sentence.
detokenizer = TreebankWordDetokenizer()
[detokenizer.detokenize(word_tokenize(sentence)) for sentence in sent_tokenize(corpus)]

['This is a para.', 'Today is Friday.', 'Is it Yay Yay?']

## Stemming

In [20]:
# Sample vocabulary fed to every stemmer and to the lemmatizer loop.
words = [
    "eating",
    "eats",
    "eaten",
    "writing",
    "writes",
    "programming",
    "programs",
    "history",
    "congratulations",
]

In [13]:
### Porter Stemmer

In [14]:
from nltk.stem import PorterStemmer

In [15]:
# Porter stemmer instance used by the comparison loop in the next cell.
stemming = PorterStemmer()

In [21]:
# Print every sample word beside its Porter stem, formatted "word __ stem".
for word in words:
    print(f"{word} __ {stemming.stem(word)}")

eating __ eat
eats __ eat
eaten __ eaten
writing __ write
writes __ write
programming __ program
programs __ program
history __ histori
congratulations __ congratul


In [23]:
# Issue with PorterStemmer: some of the stems it produces are not real words, e.g.
#   history         __ histori
#   congratulations __ congratul

### RegexpStemmer Class

In [24]:
from nltk.stem import RegexpStemmer

In [25]:
# Regex-based stemmer: removes a trailing "ing", "s", "e" or "able".
# min=4 is the stemmer's minimum word length; per the NLTK docs, strings
# shorter than this are returned unchanged.
reg_stemmer = RegexpStemmer('ing$|s$|e$|able$', min=4)

In [26]:
# Print every sample word beside its regex-derived stem, formatted "word __ stem".
for word in words:
    print(" __ ".join((word, reg_stemmer.stem(word))))

eating __ eat
eats __ eat
eaten __ eaten
writing __ writ
writes __ write
programming __ programm
programs __ program
history __ history
congratulations __ congratulation


### Snowball Stemmer

In [28]:
from nltk.stem import SnowballStemmer

In [33]:
# Snowball stemmer configured for English.
snowball_stemmer = SnowballStemmer('english')

In [34]:
# Print every sample word beside its Snowball stem, formatted "word __ stem".
for word in words:
    print("{} __ {}".format(word, snowball_stemmer.stem(word)))

eating __ eat
eats __ eat
eaten __ eaten
writing __ write
writes __ write
programming __ program
programs __ program
history __ histori
congratulations __ congratul


## Lemmatization

In [35]:
from nltk.stem import WordNetLemmatizer

In [37]:
# WordNet-backed lemmatizer shared by the lemmatization cells below.
lemma = WordNetLemmatizer()

In [47]:
# Lemmatize every sample word as a verb (pos='v') and print "word __ lemma".
for word in words:
    print(f"{word} __ {lemma.lemmatize(word, pos='v')}")

eating __ eat
eats __ eat
eaten __ eat
writing __ write
writes __ write
programming __ program
programs __ program
history __ history
congratulations __ congratulations


In [42]:
# No pos argument, so the lemmatizer's default part of speech applies
# (noun, per the NLTK docs).
lemma.lemmatize('goes')

'go'

In [43]:
# pos='v': treat the word as a verb.
lemma.lemmatize('going', pos='v')

'go'

In [44]:
# pos='a': treat the word as an adjective; the result below is unchanged.
lemma.lemmatize('going', pos='a')

'going'

In [46]:
# pos='r': treat the word as an adverb; the result below is unchanged.
lemma.lemmatize('going', pos='r')

'going'

## Stopwords

In [48]:
from nltk.stem import PorterStemmer

In [49]:
from nltk.corpus import stopwords

In [50]:
# Display NLTK's English stopword list.
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 ...]

## POS Tag

In [9]:
# Tokenize the sample text, then tag every token with its part of speech.
# NOTE(review): this rebinds `words`, which earlier held the stemming sample
# list, so the stemming/lemmatization cells will see tokens if re-run after
# this cell.
words = nltk.word_tokenize(corpus)
pos_tag = nltk.pos_tag(words)

In [10]:
# Show the (token, tag) pairs produced above.
pos_tag

[('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('para', 'NN'),
 ('.', '.'),
 ('Today', 'NN'),
 ('is', 'VBZ'),
 ('Friday', 'NNP'),
 ('.', '.'),
 ('Is', 'VBZ'),
 ('it', 'PRP'),
 ('Yay', 'NNP'),
 ('Yay', 'NNP'),
 ('?', '.')]