# Importing Libraries 

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# 1. Tokenizing - Word | Sentence 

1. **Corpus** --> It is defined as collection of documents. It can be thought as just a bunch of text files in a directory, often alongside many other directories of text files.

2. **Corpora** --> Body of text

3. **Lexicon** --> Word and their meanings

4. **Token** --> Each "entity" that is a part of whatever was split up based on rules. For eg: each word is a token

In [2]:
sample_text = """
Mahendra Singh Dhoni, affectionately known as "Captain Cool," is an iconic figure in Indian cricket.
Renowned for his exceptional leadership, agile wicket-keeping skills, and a remarkable ability to finish matches 
with his bat, Dhoni's impact on the sport is profound. He was the first captain to lead India to victory in the 
ICC T20 World Cup and to attain the top ranking in Test cricket. Despite his immense success, Dhoni remained humble 
and down-to-earth, earning respect both on and off the field. His retirement in 2020 marked the end of an era, leaving 
behind a lasting legacy that continues to inspire cricket enthusiasts and fans worldwide, with his famous 
"helicopter shot" and the Chennai Super Kings' successes in the IPL adding to his legendary status.
"""

## Sentence Tokenize 

In [3]:
print(sent_tokenize(sample_text))

['\nMahendra Singh Dhoni, affectionately known as "Captain Cool," is an iconic figure in Indian cricket.', "Renowned for his exceptional leadership, agile wicket-keeping skills, and a remarkable ability to finish matches \nwith his bat, Dhoni's impact on the sport is profound.", 'He was the first captain to lead India to victory in the \nICC T20 World Cup and to attain the top ranking in Test cricket.', 'Despite his immense success, Dhoni remained humble \nand down-to-earth, earning respect both on and off the field.', 'His retirement in 2020 marked the end of an era, leaving \nbehind a lasting legacy that continues to inspire cricket enthusiasts and fans worldwide, with his famous \n"helicopter shot" and the Chennai Super Kings\' successes in the IPL adding to his legendary status.']


## Word Tokenize 

In [4]:
print(word_tokenize(sample_text))

['Mahendra', 'Singh', 'Dhoni', ',', 'affectionately', 'known', 'as', '``', 'Captain', 'Cool', ',', "''", 'is', 'an', 'iconic', 'figure', 'in', 'Indian', 'cricket', '.', 'Renowned', 'for', 'his', 'exceptional', 'leadership', ',', 'agile', 'wicket-keeping', 'skills', ',', 'and', 'a', 'remarkable', 'ability', 'to', 'finish', 'matches', 'with', 'his', 'bat', ',', 'Dhoni', "'s", 'impact', 'on', 'the', 'sport', 'is', 'profound', '.', 'He', 'was', 'the', 'first', 'captain', 'to', 'lead', 'India', 'to', 'victory', 'in', 'the', 'ICC', 'T20', 'World', 'Cup', 'and', 'to', 'attain', 'the', 'top', 'ranking', 'in', 'Test', 'cricket', '.', 'Despite', 'his', 'immense', 'success', ',', 'Dhoni', 'remained', 'humble', 'and', 'down-to-earth', ',', 'earning', 'respect', 'both', 'on', 'and', 'off', 'the', 'field', '.', 'His', 'retirement', 'in', '2020', 'marked', 'the', 'end', 'of', 'an', 'era', ',', 'leaving', 'behind', 'a', 'lasting', 'legacy', 'that', 'continues', 'to', 'inspire', 'cricket', 'enthusiasts

# 2. Stopwords 

Stopwords are the words in any language which does not add much meaning to a sentence. They can be ignored without sacrificing their meaning of the sentence. Ex: 'is','are', 'at'

In [5]:
from nltk.corpus import stopwords

In [6]:
stop_words = set(stopwords.words("english"))
print(stop_words)

{'for', 'into', 'yours', 'him', 'what', 'do', 'theirs', "weren't", 'myself', 'a', 'ourselves', 'on', 'o', 'then', 'an', 'when', 'those', 'any', 'both', 'm', 'as', 'the', "you've", 'through', 'didn', 'be', "shan't", 'same', 'shouldn', 'wouldn', "haven't", 'most', 'had', "mightn't", 'her', 'you', 'she', 'because', "doesn't", 'is', 'too', 'now', 've', 'very', 'if', 'our', 'this', 'between', 'yourself', "wasn't", 'under', "shouldn't", 'until', 'with', 'couldn', 'have', 'did', 'so', 'over', 'all', 'has', 'doing', 'wasn', 'been', 'his', 'it', 'not', 'y', 'by', 'ours', 'about', 'itself', 'ain', 'was', 're', 'shan', "it's", 'off', 'while', 'themselves', 'during', 't', 'don', 'aren', 'only', "hasn't", 'in', 'himself', "you'd", 'their', 'or', 'down', 'but', 'and', 'yourselves', 'my', 'mightn', 'needn', 'own', 'll', 'to', 'some', 'why', 'its', 'hadn', 'd', "aren't", 'can', "should've", 'where', "that'll", 'won', 'which', 'up', 'weren', 'these', "don't", 'other', 'hers', 'against', 'am', 'whom', '

In [7]:
words = word_tokenize(sample_text)

filter_sentence = []

for word in words:
    if word not in stop_words:
        filter_sentence.append(word)
    
print(filter_sentence)

['Mahendra', 'Singh', 'Dhoni', ',', 'affectionately', 'known', '``', 'Captain', 'Cool', ',', "''", 'iconic', 'figure', 'Indian', 'cricket', '.', 'Renowned', 'exceptional', 'leadership', ',', 'agile', 'wicket-keeping', 'skills', ',', 'remarkable', 'ability', 'finish', 'matches', 'bat', ',', 'Dhoni', "'s", 'impact', 'sport', 'profound', '.', 'He', 'first', 'captain', 'lead', 'India', 'victory', 'ICC', 'T20', 'World', 'Cup', 'attain', 'top', 'ranking', 'Test', 'cricket', '.', 'Despite', 'immense', 'success', ',', 'Dhoni', 'remained', 'humble', 'down-to-earth', ',', 'earning', 'respect', 'field', '.', 'His', 'retirement', '2020', 'marked', 'end', 'era', ',', 'leaving', 'behind', 'lasting', 'legacy', 'continues', 'inspire', 'cricket', 'enthusiasts', 'fans', 'worldwide', ',', 'famous', "''", 'helicopter', 'shot', "''", 'Chennai', 'Super', 'Kings', "'", 'successes', 'IPL', 'adding', 'legendary', 'status', '.']


# 3. Stemming 

This is known as sort of normalizing method. Many variations of words carry the same meaning, other than when tense is involved.

The reason why we stem is to shorten the lookup, and normalize sentences.

Eg: I was taking a ride in the car. I was riding in the car

In [8]:
from nltk.stem import PorterStemmer

In [9]:
ps = PorterStemmer()

sam_words = ["python", "pythoner", "pythoning", "phthoned", "pythonly"]

for word in sam_words:
    print(ps.stem(word))

python
python
python
phthone
pythonli


In [10]:
sam = "It is very important to be pythoniy while you are pythoning with python. All pythoners have pythoned poorly atleast once."

words = word_tokenize(sam)

output = []

for word in words:
    output.append(ps.stem(word))

output

['it',
 'is',
 'veri',
 'import',
 'to',
 'be',
 'pythoniy',
 'while',
 'you',
 'are',
 'python',
 'with',
 'python',
 '.',
 'all',
 'python',
 'have',
 'python',
 'poorli',
 'atleast',
 'onc',
 '.']

# 4. Part of speech tagging 

Labeling the words as sentences and nouns, adejectives, verbs etc. Even more impressive, it also labels by tense and more. 

#### POS tag list:

CC coordinating conjunction

CD cardinal digit

DT determiner

EX existential there (like: "there is" ... think of it like "there exists")

FW foreign word

IN preposition/subordinating conjunction

JJ adjective 'big'

JJR adjective, comparative 'bigger'

JJS adjective, superlative 'biggest'
LS list marker 1)
MD modal could, will

NN noun, singular 'desk'

NNS noun plural 'desks'

NNP proper noun, singular 'Harrison'

NNPS proper noun, plural 'Americans'

PDT predeterminer 'all the kids'

POS possessive ending parent\'s

PRP personal pronoun I, he, she

RB adverb very, silently,

RBR adverb, comparative better

RBS adverb, superlative best
RP particle give up

TO to go 'to' the store.

UH interjection errrrrrrrm

VB verb, base form take

VBD verb, past tense took

VBG verb, gerund/present participle taking

VBN verb, past participle taken

VBP verb, sing. present, non-3d take

VBZ verb, 3rd person sing. present takes

WDT wh-determiner which

WP wh-pronoun who, what

WRB wh-abverb where, when

In [11]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [12]:
train_text = "Engineers, as practitioners of engineering, are professionals who invent, design, analyze, build and test machines, complex systems, structures, gadgets and materials to fulfill functional objectives and requirements while considering the limitations imposed by practicality, regulation, safety and cost.[1][2] The word engineer (Latin ingeniator[3]) is derived from the Latin words ingeniare (to create, generate, contrive, devise and ingenium) (cleverness).[4][5] The foundational qualifications of an engineer typically include a four-year bachelor's degree in an engineering discipline, or in some jurisdictions, a master's degree in an engineering discipline plus four to six years of peer-reviewed professional practice (culminating in a project report or thesis) and passage of engineering board examinations.The work of engineers forms the link between scientific discoveries and their subsequent applications to human and business needs and quality of life.[1]"

In [13]:
sample_text = "Neuro-linguistic programming (NLP) is a pseudoscientific approach to communication, personal development, and psychotherapy created by Richard Bandler and John Grinder in California, United States, in the 1970s. NLP's creators claim there is a connection between neurological processes (neuro-), language (linguistic) and behavioral patterns learned through experience (programming), and that these can be changed to achieve specific goals in life.[1][2]: 2  Bandler and Grinder also claim that NLP methodology can model the skills of exceptional people, allowing anyone to acquire those skills.[3]: 5–6 [4] They claim as well that, often in a single session, NLP can treat problems such as phobias, depression, tic disorders, psychosomatic illnesses, near-sightedness,[5] allergy, the common cold,[Note 1] and learning disorders.[7][8] NLP has been adopted by some hypnotherapists and also by companies that run seminars marketed as leadership training to businesses and government agencies.[9][10]There is no scientific evidence supporting the claims made by NLP advocates, and it has been discredited as a pseudoscience.[11][12][13] Scientific reviews state that NLP is based on outdated metaphors of how the brain works that are inconsistent with current neurological theory and contain numerous factual errors. [10][14] Reviews also found that all of the supportive research on NLP contained significant methodological flaws and that there were three times as many studies of a much higher quality that failed to reproduce the extraordinary claims made by Bandler, Grinder, and other NLP practitioners.[12][13]"

In [14]:
custom_sentence_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sentence_tokenizer.tokenize(sample_text)

In [15]:
tokenized

['Neuro-linguistic programming (NLP) is a pseudoscientific approach to communication, personal development, and psychotherapy created by Richard Bandler and John Grinder in California, United States, in the 1970s.',
 "NLP's creators claim there is a connection between neurological processes (neuro-), language (linguistic) and behavioral patterns learned through experience (programming), and that these can be changed to achieve specific goals in life.",
 '[1][2]:\u200a2\u200a Bandler and Grinder also claim that NLP methodology can model the skills of exceptional people, allowing anyone to acquire those skills.',
 '[3]:\u200a5–6\u200a[4] They claim as well that, often in a single session, NLP can treat problems such as phobias, depression, tic disorders, psychosomatic illnesses, near-sightedness,[5] allergy, the common cold,[Note 1] and learning disorders.',
 '[7][8] NLP has been adopted by some hypnotherapists and also by companies that run seminars marketed as leadership training to bu

In [16]:
def process_content():
    try:
        for word in tokenized:
            words = nltk.word_tokenize(word)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))
        
process_content()

[('Neuro-linguistic', 'JJ'), ('programming', 'NN'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('pseudoscientific', 'JJ'), ('approach', 'NN'), ('to', 'TO'), ('communication', 'NN'), (',', ','), ('personal', 'JJ'), ('development', 'NN'), (',', ','), ('and', 'CC'), ('psychotherapy', 'RB'), ('created', 'VBN'), ('by', 'IN'), ('Richard', 'NNP'), ('Bandler', 'NNP'), ('and', 'CC'), ('John', 'NNP'), ('Grinder', 'NNP'), ('in', 'IN'), ('California', 'NNP'), (',', ','), ('United', 'NNP'), ('States', 'NNPS'), (',', ','), ('in', 'IN'), ('the', 'DT'), ('1970s', 'CD'), ('.', '.')]
[('NLP', 'NNP'), ("'s", 'POS'), ('creators', 'NNS'), ('claim', 'VBP'), ('there', 'EX'), ('is', 'VBZ'), ('a', 'DT'), ('connection', 'NN'), ('between', 'IN'), ('neurological', 'JJ'), ('processes', 'NNS'), ('(', '('), ('neuro-', 'JJ'), (')', ')'), (',', ','), ('language', 'NN'), ('(', '('), ('linguistic', 'JJ'), (')', ')'), ('and', 'CC'), ('behavioral', 'JJ'), ('patterns', 'NNS'), ('learned', 'VBD'), (

# 5. Chunking

Group the words in to meaningful chunks. Main goals of chunking is "Noun phrases". These are phrases of one or more words that contain a noun, maybe some descriptive words, maybe a verb, and maybe something like an adverb. The idea is to group nouns with tht words that are relation to them.

In order to chunk, we combine the part of speech tags wth regular expression. Mainly from regular expression, we are going to utilize the following.

### _Identifier_

1. \d - any number
2. \D - anything but a number
3. \s - space
4. \S - anything but a space
5. \w - any letter 
6. \W - anything but a letter
7. . - any character, except for a new line
8. \b - space around whole words.

In [17]:
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}""" 
            
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            print(chunked)
            

    except Exception as e:
        print(str(e))


process_content()

(S
  Neuro-linguistic/JJ
  programming/NN
  (/(
  (Chunk NLP/NNP)
  )/)
  is/VBZ
  a/DT
  pseudoscientific/JJ
  approach/NN
  to/TO
  communication/NN
  ,/,
  personal/JJ
  development/NN
  ,/,
  and/CC
  psychotherapy/RB
  created/VBN
  by/IN
  (Chunk Richard/NNP Bandler/NNP)
  and/CC
  (Chunk John/NNP Grinder/NNP)
  in/IN
  (Chunk California/NNP)
  ,/,
  (Chunk United/NNP)
  States/NNPS
  ,/,
  in/IN
  the/DT
  1970s/CD
  ./.)
(S
  (Chunk NLP/NNP)
  's/POS
  creators/NNS
  claim/VBP
  there/EX
  is/VBZ
  a/DT
  connection/NN
  between/IN
  neurological/JJ
  processes/NNS
  (/(
  neuro-/JJ
  )/)
  ,/,
  language/NN
  (/(
  linguistic/JJ
  )/)
  and/CC
  behavioral/JJ
  patterns/NNS
  learned/VBD
  through/IN
  experience/NN
  (/(
  programming/VBG
  )/)
  ,/,
  and/CC
  that/IN
  these/DT
  can/MD
  be/VB
  changed/VBN
  to/TO
  achieve/VB
  specific/JJ
  goals/NNS
  in/IN
  life/NN
  ./.)
(S
  [/RB
  1/CD
  ]/JJ
  [/$
  2/CD
  ]/NN
  :/:
  2/CD
  (Chunk Bandler/NNP)
  and/CC
  (Chunk

In [18]:
from sklearn import tree
import os
from IPython.display import Image, display
from nltk.draw import TreeWidget
from nltk.draw.util import CanvasFrame
%matplotlib inline

In [19]:
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            #print(tagged)
            
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}""" 
            
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            #print(chunked)
            chunked.draw()

    except Exception as e:
        print(str(e))


process_content()

# 6. Chinking 

It is not like chinking, it is basically a way for you to remove a chunk from a chunk. The chunk that you remove from your chunk is chink.

In [23]:
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""
            
            chunkparser = nltk.RegexpParser(chunkGram)
            chunked = chunkparser.parse(tagged)
            print(chunked)
            
    except Exception as e:
        print(str(e))
            
process_content()

(S
  (Chunk Neuro-linguistic/JJ programming/NN (/( NLP/NNP )/))
  is/VBZ
  a/DT
  (Chunk pseudoscientific/JJ approach/NN)
  to/TO
  (Chunk
    communication/NN
    ,/,
    personal/JJ
    development/NN
    ,/,
    and/CC
    psychotherapy/RB)
  created/VBN
  by/IN
  (Chunk Richard/NNP Bandler/NNP and/CC John/NNP Grinder/NNP)
  in/IN
  (Chunk California/NNP ,/, United/NNP States/NNPS ,/,)
  in/IN
  the/DT
  (Chunk 1970s/CD ./.))
(S
  (Chunk NLP/NNP 's/POS creators/NNS)
  claim/VBP
  (Chunk there/EX)
  is/VBZ
  a/DT
  (Chunk connection/NN)
  between/IN
  (Chunk
    neurological/JJ
    processes/NNS
    (/(
    neuro-/JJ
    )/)
    ,/,
    language/NN
    (/(
    linguistic/JJ
    )/)
    and/CC
    behavioral/JJ
    patterns/NNS)
  learned/VBD
  through/IN
  (Chunk experience/NN (/()
  programming/VBG
  (Chunk )/) ,/, and/CC)
  that/IN
  these/DT
  (Chunk can/MD)
  be/VB
  changed/VBN
  to/TO
  achieve/VB
  (Chunk specific/JJ goals/NNS)
  in/IN
  (Chunk life/NN ./.))
(S
  (Chunk
    [/