In [1]:
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize

## TOKENIZE

In [2]:
#tokenizing
#word tokenizers and sentence tokenizers
sentence = "Hello Mr. Smith, how are you doing today? I am doing great!"
sent_tokenize(sentence)

['Hello Mr. Smith, how are you doing today?', 'I am doing great!']

In [3]:
for i in word_tokenize(sentence):
    print(i)

Hello
Mr.
Smith
,
how
are
you
doing
today
?
I
am
doing
great
!


## STOPWORDS

In [4]:
from nltk.corpus import stopwords

In [5]:
stop_words = stopwords.words("english")
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [10]:
cleaned = [word for word in word_tokenize(sentence) if word not in stop_words]
cleaned

['Hello', 'Mr.', 'Smith', ',', 'today', '?', 'I', 'great', '!']

## STEMMING

In [11]:
from nltk.stem import PorterStemmer

In [15]:
ps = PorterStemmer()
example = "Hi there! This is Gautham practicing python on the jupyter notebook"
words = word_tokenize(example)
stemmed_words = [ps.stem(word) for word in words]
stemmed_words

['Hi',
 'there',
 '!',
 'thi',
 'is',
 'gautham',
 'practic',
 'python',
 'on',
 'the',
 'jupyt',
 'notebook']

## LEMMATIZING

In [17]:
from nltk.stem import WordNetLemmatizer

In [18]:
l = WordNetLemmatizer()

In [23]:
print(l.lemmatize("runs"))
print(l.lemmatize("cacti"))
print(l.lemmatize("octopi"))
print(l.lemmatize("better",pos='a'))
print(l.lemmatize("men"))
print(l.lemmatize("women"))
print(l.lemmatize("donkeys"))

run
cactus
octopus
good
men
woman
donkey


## OTHERS

In [33]:
from nltk.corpus import movie_reviews
import random

In [28]:
docs = []
for cat in movie_reviews.categories():
    for f_id in movie_reviews.fileids(cat):
        docs.append((list(movie_reviews.words(f_id)),cat))

In [34]:
random.shuffle(docs)

In [37]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

In [38]:
all_words = nltk.FreqDist(all_words)
all_words.most_common(15)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822),
 ('s', 18513),
 ('"', 17612),
 ('it', 16107),
 ('that', 15924),
 ('-', 15595)]

In [39]:
all_words["stupid"]

253

In [42]:
sentence = "Gautham is a very good boy. He can study and run."
nltk.pos_tag(sentence)

[('G', 'VB'),
 ('a', 'DT'),
 ('u', 'JJ'),
 ('t', 'NN'),
 ('h', 'VBD'),
 ('a', 'DT'),
 ('m', 'NN'),
 (' ', 'NN'),
 ('i', 'NN'),
 ('s', 'VBP'),
 (' ', 'PDT'),
 ('a', 'DT'),
 (' ', 'JJ'),
 ('v', 'NN'),
 ('e', 'NN'),
 ('r', 'NN'),
 ('y', 'NN'),
 (' ', 'NNP'),
 ('g', 'NN'),
 ('o', 'NN'),
 ('o', 'NN'),
 ('d', 'NN'),
 (' ', 'NN'),
 ('b', 'NN'),
 ('o', 'NN'),
 ('y', 'NN'),
 ('.', '.'),
 (' ', 'CC'),
 ('H', 'NNP'),
 ('e', 'VBP'),
 (' ', 'NNP'),
 ('c', 'VBP'),
 ('a', 'DT'),
 ('n', 'JJ'),
 (' ', 'NN'),
 ('s', 'NN'),
 ('t', 'NN'),
 ('u', 'JJ'),
 ('d', 'NN'),
 ('y', 'NN'),
 (' ', 'VBZ'),
 ('a', 'DT'),
 ('n', 'JJ'),
 ('d', 'NN'),
 (' ', 'NNP'),
 ('r', 'NN'),
 ('u', 'JJ'),
 ('n', 'NN'),
 ('.', '.')]

In [47]:
from nltk.tokenize import PunktSentenceTokenizer

In [48]:
sent_token = PunktSentenceTokenizer(sentence)

In [49]:
sent_token

<nltk.tokenize.punkt.PunktSentenceTokenizer at 0x2091ff37400>

In [50]:
sentence2 = "Gautham is 21 years old. He lives in Bangalore, the capital of the state of Karntaka"

In [51]:
tokenized = sent_token.tokenize(sentence2)

In [52]:
tokenized

['Gautham is 21 years old.',
 'He lives in Bangalore, the capital of the state of Karntaka']

In [54]:
sent_tokenize(sentence2)

['Gautham is 21 years old.',
 'He lives in Bangalore, the capital of the state of Karntaka']

In [55]:
type(sent_tokenize(sentence2)[0])

str

In [57]:
type(tokenized[0])

str

In [96]:
words = nltk.word_tokenize(sentence)
nltk.pos_tag(words)

[('Gautham', 'NNP'),
 ('is', 'VBZ'),
 ('not', 'RB'),
 ('a', 'DT'),
 ('very', 'RB'),
 ('good', 'JJ'),
 ('boy', 'NN'),
 ('.', '.'),
 ('He', 'PRP'),
 ('does', 'VBZ'),
 ('not', 'RB'),
 ('like', 'JJ'),
 ('sports', 'NNS'),
 ('.', '.')]

In [109]:
sentence = "Gautham is not a very good boy. He does not like sports."
words = nltk.word_tokenize(sentence)
tags = nltk.pos_tag(words)

In [110]:
tags

[('Gautham', 'NNP'),
 ('is', 'VBZ'),
 ('not', 'RB'),
 ('a', 'DT'),
 ('very', 'RB'),
 ('good', 'JJ'),
 ('boy', 'NN'),
 ('.', '.'),
 ('He', 'PRP'),
 ('does', 'VBZ'),
 ('not', 'RB'),
 ('like', 'JJ'),
 ('sports', 'NNS'),
 ('.', '.')]

In [111]:
from nltk.corpus import wordnet

In [112]:
for i in range(len(tags)):
    if(tags[i][0]=="not"):
        f2=0
        for j in range(i+1,len(tags)):
            if(tags[j][1] == 'JJ'):
                f1=0
                for syn in wordnet.synsets(tags[j][0]): 
                    for l in syn.lemmas(): 
                        if l.antonyms(): 
                            words[j] = l.antonyms()[0].name()
                            f1=1
                            break
                    if(f1==1):
                        f2=1
                        break
            if(f2 == 1):
                del(words[words.index('not')])
                break
                        

In [113]:
words

['Gautham',
 'is',
 'a',
 'very',
 'evil',
 'boy',
 '.',
 'He',
 'does',
 'like',
 'dislike',
 '.']

In [103]:
a=['a','a']
a.index('a')

0