# Introduction To Natural Language Processing with NLTK

## Tokenization

In [None]:
from nltk import sent_tokenize, word_tokenize

### 01. Sentence Tokenization

In [None]:
# Three short sentences, so the sentence tokenizer has boundaries to find.
text = (
    "Success is not final. "
    "Failure is not fatal. "
    "It is the courage to continue that counts."
)
print(text)

In [None]:
# Split the paragraph held in `text` into sentences and show the result.
sentence_tokens = sent_tokenize(text)
print(sentence_tokens)

In [None]:
# Print one sentence per line. The loop variable is a notebook-level name,
# so `sentence` stays bound to the last sentence after this cell runs.
for sentence in sentence_tokens:
    print(sentence)

### 02. Word Tokenization

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
# Sample with apostrophes and trailing punctuation, tokenized by the cells below.
sentence = "Let's see how the tokenizer split's this!"

In [None]:
# Tokenize the sample with word_tokenize and show the token list.
word_tokens = word_tokenize(sentence)
print(word_tokens)

In [None]:
from nltk.tokenize import TreebankWordTokenizer, WordPunctTokenizer, WhitespaceTokenizer

In [None]:
# One instance of each tokenizer class; the following cells run all three
# on the same `sentence` so their output can be compared.
tree_tokenizer = TreebankWordTokenizer()
word_punct_tokenizer = WordPunctTokenizer()
white_space_tokenizer = WhitespaceTokenizer()

In [None]:
# TreebankWordTokenizer result (overwrites the previous `word_tokens`).
word_tokens = tree_tokenizer.tokenize(sentence)
print(word_tokens)

In [None]:
# WordPunctTokenizer result (overwrites the previous `word_tokens`).
word_tokens = word_punct_tokenizer.tokenize(sentence)
print(word_tokens)

In [None]:
# WhitespaceTokenizer result (overwrites the previous `word_tokens`).
word_tokens = white_space_tokenizer.tokenize(sentence)
print(word_tokens)

## Stemming and Lemmatization

### 01. Stemming

In [None]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

In [None]:
# Porter stems of three inflections of "lie", printed one per line.
porter_stemmer = PorterStemmer()
print("\n".join(porter_stemmer.stem(form) for form in ('lying', 'lies', 'lied')))

In [None]:
# Lancaster stems of the same three inflections, printed one per line.
lancaster_stemmer = LancasterStemmer()
print("\n".join(lancaster_stemmer.stem(form) for form in ('lying', 'lies', 'lied')))

In [None]:
# Snowball (English) stems of the same three inflections, printed one per line.
snowball_stemmer = SnowballStemmer('english')
print("\n".join(snowball_stemmer.stem(form) for form in ('lying', 'lies', 'lied')))

### 02. Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
# No `pos` is passed, so lemmatize() uses its default part of speech.
# NOTE(review): the NLTK docs give that default as noun — confirm.
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("running"))

In [None]:
def lemmatize(word):
    """Print the WordNet lemma of *word* under each part-of-speech tag.

    One line is printed per tag, in the order verb, noun, adverb,
    adjective, each prefixed with "<name> form: ". Nothing is returned.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    labelled_tags = (
        ("verb", "v"),
        ("noun", "n"),
        ("adverb", "r"),
        ("adjective", "a"),
    )
    for label, tag in labelled_tags:
        print(label + " form: " + wordnet_lemmatizer.lemmatize(word, pos=tag))

In [None]:
# Show how the lemma of "ears" varies with the part-of-speech tag.
lemmatize("ears")

In [None]:
# Same comparison for "running".
lemmatize("running")

### 03. Stemming vs Lemmatization 

In [None]:
# Shared stemmer and lemmatizer used by the comparison cells in this section.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
# Stem three inflections of "deactivate", one result per line.
print("\n".join(
    stemmer.stem(form)
    for form in ("deactivating", "deactivated", "deactivates")
))

In [None]:
# Lemmatize the same three inflections the stemmer cell uses, as verbs,
# so the two outputs can be compared word for word.
print(lemmatizer.lemmatize("deactivating", pos="v"))
print(lemmatizer.lemmatize("deactivated", pos="v"))
print(lemmatizer.lemmatize("deactivates", pos="v"))

In [None]:
# Stem a mixed bag of words, one result per line.
print("\n".join(
    stemmer.stem(sample)
    for sample in ('stones', 'speaking', 'bedroom', 'jokes', 'lisa', 'purple')
))

In [None]:
# Lemmatize the same words (no `pos` given), one result per line.
print("\n".join(
    lemmatizer.lemmatize(sample)
    for sample in ('stones', 'speaking', 'bedroom', 'jokes', 'lisa', 'purple')
))

## POS Tagging

In [None]:
from nltk import pos_tag

In [None]:
# Sentence to be part-of-speech tagged; rebinds the notebook-level `sentence`.
sentence = "The hardest choices require the strongest wills"

In [None]:
# Word-level tokens this time; this rebinds `sentence_tokens`, which held
# the sent_tokenize() result earlier in the notebook.
sentence_tokens = word_tokenize(sentence)
print(sentence_tokens)

In [None]:
# Left as a bare expression so the notebook displays the tagger's return value.
pos_tag(sentence_tokens)

## Chunking

In [None]:
from nltk import RegexpParser

In [None]:
# Example with two determiner + adjectives + noun phrases for the chunker.
sentence = "the big vicious dog barked at the small feeble cat"
# Alternative example with a longer adjective run:
# sentence = "the little yellow hard tight dog barked at the cat"

In [None]:
# Chunk grammar: an NP is an optional determiner, any number of adjectives,
# then a noun. The text after '#' is part of the grammar string itself.
grammar = "NP: {<DT>?<JJ>*<NN>} # NP"

In [None]:
# Build the chunker from `grammar`, then tokenize and POS-tag the sentence.
chunkParser = RegexpParser(grammar)
tagged = pos_tag(word_tokenize(sentence))
# Bare expression so the notebook displays the tagged tokens.
tagged

In [None]:
# Apply the chunk grammar to the tagged tokens.
tree = chunkParser.parse(tagged)

In [None]:
# Print every subtree of the parse result.
# NOTE(review): subtrees() presumably yields the whole tree first, then each
# NP chunk — confirm against the NLTK Tree documentation.
for subtree in tree.subtrees():
    print(subtree)

## Stop Word Removal

In [None]:
from nltk.corpus import stopwords

In [None]:
# Show the full English stop-word list from the NLTK stopwords corpus.
print(stopwords.words('english'))

In [None]:
# Text to filter; rebinds the notebook-level `sentence`.
sentence = (
    "Success is not final. "
    "Failure is not fatal. "
    "It is the courage to continue that counts."
)

In [None]:
# Tokenize the text before filtering; rebinds the notebook-level `word_tokens`.
word_tokens = word_tokenize(sentence)
print(word_tokens)

In [None]:
# Build the stop-word set once: calling stopwords.words() per token rebuilds
# the list every time, and a set makes each membership test O(1).
english_stopwords = set(stopwords.words('english'))
# Compare in lowercase: the NLTK English list is lowercase, so a capitalized
# stop word such as "It" at the start of a sentence would otherwise be kept.
clean_tokens = [
    token for token in word_tokens
    if token.lower() not in english_stopwords
]

In [None]:
# Tokens that remain after stop-word removal (punctuation is not filtered).
print(clean_tokens)

## Named Entity Recognition

In [None]:
from nltk import word_tokenize, pos_tag, ne_chunk

In [None]:
# Sentence mentioning people, companies and a city for the entity chunker.
sentence = "Mark who works at Yahoo and John who works at Google decided to meet at New York City"

In [None]:
# Pipeline: tokenize -> POS-tag -> named-entity chunk, then print the tree.
print(ne_chunk(pos_tag(word_tokenize(sentence))))

## Interface to WordNet

In [None]:
from nltk.corpus import wordnet

In [None]:
# Bare expression so the notebook displays the synsets found for "computer".
wordnet.synsets("computer")

In [None]:
# Look up one synset by its identifier and display its lemma names.
# `syn` is read again by the next cell.
syn = wordnet.synset('computer.n.01')
syn.lemma_names()

In [None]:
# Display the definition of whichever synset `syn` currently refers to.
syn.definition()

In [None]:
# Display the example sentences attached to the synset 'car.n.01'.
wordnet.synset("car.n.01").examples()

In [None]:
# Collect the lemma names of every synset of "large", in synset order.
# Duplicates are kept. A comprehension is used so no loop variable
# overwrites the notebook-level `syn` that earlier cells read.
synonyms = [
    lemma.name()
    for synset in wordnet.synsets('large')
    for lemma in synset.lemmas()
]
print(synonyms)

In [None]:
# Collect the first antonym of each lemma of "large" that has any.
# The loop names are distinct from the notebook-level `syn`, so it is
# not overwritten here.
antonyms = []
for synset in wordnet.synsets("large"):
    for lemma in synset.lemmas():
        # Look the antonyms up once per lemma rather than twice.
        opposites = lemma.antonyms()
        if opposites:
            antonyms.append(opposites[0].name())
print(antonyms)