# Natural Language Processing Introduction

Here I will be using the NLTK library for Natural Language Processing, credits Sentdex video tutorials on Youtube

In [2]:
!pip install nltk



In [1]:
import nltk

In [None]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


## Preprocessing

### First we will be discussing some methods which are used in Natural Language Processing for Preprocessing of the data

### First Method: Tokenizing

We will be discussing two methods of tokenizing; sentence tokenizing and word tokenizing

In [2]:
from nltk.tokenize import word_tokenize, sent_tokenize

example_text = "Hey man! Liverpool FC is playing tomorrow. Where are you watching it? I have a midterm tomorrow, have to prepare for it."
"""Tokenising the text"""
sentence_tokenized = sent_tokenize(example_text)
word_tokenized = word_tokenize(example_text)

for sentence in sentence_tokenized:
    print(sentence)
for word in word_tokenized:
    print(word)

Hey man!
Liverpool FC is playing tomorrow.
Where are you watching it?
I have a midterm tomorrow, have to prepare for it.
Hey
man
!
Liverpool
FC
is
playing
tomorrow
.
Where
are
you
watching
it
?
I
have
a
midterm
tomorrow
,
have
to
prepare
for
it
.


## Second Method : Removing Stopwords

Stop words are words like "The", "am", "is" and others which appear in the text a lot more than the other words but don't help us in any way to understand the data more or provide any value.

In [3]:
from nltk.corpus import stopwords

In [4]:
stop_words = set(stopwords.words("english"))

filtered_words = []
for word in word_tokenized:
    if word not in stop_words:
        filtered_words.append(word)

print(filtered_words)

['Hey', 'man', '!', 'Liverpool', 'FC', 'playing', 'tomorrow', '.', 'Where', 'watching', '?', 'I', 'midterm', 'tomorrow', ',', 'prepare', '.']


## Third Method: Stemming

Stemming is like, it gives you the root of the word. It removes the "ing" or "es" or "ess" or "s" or other suffixes from the end of the word to give you the stem of the word

In [5]:
from nltk.stem import PorterStemmer

In [6]:
pstem = PorterStemmer()

for word in word_tokenized:
    print(pstem.stem(word))

hey
man
!
liverpool
FC
is
play
tomorrow
.
where
are
you
watch
it
?
I
have
a
midterm
tomorrow
,
have
to
prepar
for
it
.


Like in the above example, you can see "playing" is stemmed to "play" and "watching" to "watch"

Another Example of Stemming

In [7]:
w = ['cats', 'corns','caresses','carriers']

In [8]:
for word in w:
    print(pstem.stem(word))

cat
corn
caress
carrier


## Third Method : Part of Speech Tagging

Part of Speech or POS tagging is also called grammatical tagging, where the words of the text are marked or tagged according to the context and its meaning in accordance to the particular text it is used in

In [9]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [10]:
train_text = state_union.raw("1961-Kennedy.txt")
sample_text = state_union.raw("1962-Kennedy.txt")

In [11]:
"""Training the PunktSentenceTokenizer"""
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

In [12]:
tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [13]:
def processing_content():
    try:
        for sent in tokenized:
            words = nltk.word_tokenize(sent)
            tagged = nltk.pos_tag(words)
            print(tagged)
            
    except Exception as e:
        print(str(e))

processing_content()

[('PRESIDENT', 'NNP'), ('JOHN', 'NNP'), ('F.', 'NNP'), ('KENNEDY', 'NNP'), ("'S", 'POS'), ('ANNUAL', 'NNP'), ('ADDRESS', 'NNP'), ('TO', 'NNP'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('This', 'DT'), ('week', 'NN'), ('we', 'PRP'), ('begin', 'VBP'), ('anew', 'RB'), ('our', 'PRP$'), ('joint', 'NN'), ('and', 'CC'), ('separate', 'JJ'), ('efforts', 'NNS'), ('to', 'TO'), ('build', 'VB'), ('the', 'DT'), ('American', 'JJ'), ('future', 'NN'), ('.', '.')]
[('But', 'CC'), (',', ','), ('sadly', 'RB'), (',', ','), ('we', 'PRP'), ('build', 'VBP'), ('without', 'IN'), ('a', 'DT'), ('man', 'NN'), ('who', 'WP'), ('linked', 'VBD'), ('a', 'DT'), ('long', 'JJ'), ('past', 'NN'), ('with', 'IN'), ('the', 'DT'), ('present', 'JJ'), ('and', 'CC'), ('looked', 'VBD'), ('strongly', 'RB'), ('to', 'TO'), ('the', 'DT'), ('future', 'NN'), ('.', '.')]
[('``', '``'), ('Mister', 

[('To', 'TO'), ('help', 'VB'), ('those', 'DT'), ('least', 'JJS'), ('fortunate', 'NN'), ('of', 'IN'), ('all', 'DT'), (',', ','), ('I', 'PRP'), ('am', 'VBP'), ('recommending', 'VBG'), ('a', 'DT'), ('new', 'JJ'), ('public', 'JJ'), ('welfare', 'NN'), ('program', 'NN'), (',', ','), ('stressing', 'VBG'), ('services', 'NNS'), ('instead', 'RB'), ('of', 'IN'), ('support', 'NN'), (',', ','), ('rehabilitation', 'NN'), ('instead', 'RB'), ('of', 'IN'), ('relief', 'NN'), (',', ','), ('and', 'CC'), ('training', 'VBG'), ('for', 'IN'), ('useful', 'JJ'), ('work', 'NN'), ('instead', 'RB'), ('of', 'IN'), ('prolonged', 'JJ'), ('dependency', 'NN'), ('.', '.')]
[('To', 'TO'), ('relieve', 'VB'), ('the', 'DT'), ('critical', 'JJ'), ('shortage', 'NN'), ('of', 'IN'), ('doctors', 'NNS'), ('and', 'CC'), ('dentists', 'NNS'), ('-', ':'), ('and', 'CC'), ('this', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('matter', 'NN'), ('which', 'WDT'), ('should', 'MD'), ('concern', 'NN'), ('us', 'PRP'), ('all', 'DT'), ('-', ':'), ('and', 

[('We', 'PRP'), ('may', 'MD'), ('not', 'RB'), ('always', 'RB'), ('agree', 'VBP'), ('with', 'IN'), ('every', 'DT'), ('detailed', 'JJ'), ('action', 'NN'), ('taken', 'VBN'), ('by', 'IN'), ('every', 'DT'), ('officer', 'NN'), ('of', 'IN'), ('the', 'DT'), ('United', 'NNP'), ('Nations', 'NNPS'), (',', ','), ('or', 'CC'), ('with', 'IN'), ('every', 'DT'), ('voting', 'NN'), ('majority', 'NN'), ('.', '.')]
[('But', 'CC'), ('as', 'IN'), ('an', 'DT'), ('institution', 'NN'), (',', ','), ('it', 'PRP'), ('should', 'MD'), ('have', 'VB'), ('in', 'IN'), ('the', 'DT'), ('future', 'NN'), (',', ','), ('as', 'IN'), ('it', 'PRP'), ('has', 'VBZ'), ('had', 'VBN'), ('in', 'IN'), ('the', 'DT'), ('past', 'JJ'), ('since', 'IN'), ('its', 'PRP$'), ('inception', 'NN'), (',', ','), ('no', 'DT'), ('stronger', 'JJR'), ('or', 'CC'), ('more', 'RBR'), ('faithful', 'JJ'), ('member', 'NN'), ('than', 'IN'), ('the', 'DT'), ('United', 'NNP'), ('States', 'NNPS'), ('of', 'IN'), ('America', 'NNP'), ('.', '.')]
[('In', 'IN'), ('1961

[('I', 'PRP'), ('believe', 'VBP'), ('such', 'PDT'), ('a', 'DT'), ('resolution', 'NN'), ('can', 'MD'), ('be', 'VB'), ('found', 'VBN'), (',', ','), ('and', 'CC'), ('with', 'IN'), ('it', 'PRP'), ('an', 'DT'), ('improvement', 'NN'), ('in', 'IN'), ('our', 'PRP$'), ('relations', 'NNS'), ('with', 'IN'), ('the', 'DT'), ('Soviet', 'NNP'), ('Union', 'NNP'), (',', ','), ('if', 'IN'), ('only', 'RB'), ('the', 'DT'), ('leaders', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('Kremlin', 'NNP'), ('will', 'MD'), ('recognize', 'VB'), ('the', 'DT'), ('basic', 'JJ'), ('rights', 'NNS'), ('and', 'CC'), ('interests', 'NNS'), ('involved', 'VBN'), (',', ','), ('and', 'CC'), ('the', 'DT'), ('interest', 'NN'), ('of', 'IN'), ('all', 'DT'), ('mankind', 'NN'), ('in', 'IN'), ('peace', 'NN'), ('.', '.')]
[('But', 'CC'), ('the', 'DT'), ('Atlantic', 'NNP'), ('Community', 'NNP'), ('is', 'VBZ'), ('no', 'RB'), ('longer', 'RBR'), ('concerned', 'JJ'), ('with', 'IN'), ('purely', 'RB'), ('military', 'JJ'), ('aims', 'NNS'), ('.', '.')]

The POS tagging gives you a tuple as an output, with the first element in the tuple the word in the text and the second element gives you POS tag of the first element, like whether it is a proposition, or a verb or a noun

## Fourth method: Chunking

In [14]:
def processing_content():
    try:
        for sent in tokenized:
            words = nltk.word_tokenize(sent)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            
            print(chunked)
    except Exception as e:
        print(str(e))

processing_content()

(S
  (Chunk PRESIDENT/NNP JOHN/NNP F./NNP KENNEDY/NNP)
  'S/POS
  (Chunk
    ANNUAL/NNP
    ADDRESS/NNP
    TO/NNP
    A/NNP
    JOINT/NNP
    SESSION/NNP
    OF/NNP
    CONGRESS/NNP
    ON/NNP
    THE/NNP
    STATE/NNP)
  OF/IN
  (Chunk THE/NNP UNION/NNP)
  This/DT
  week/NN
  we/PRP
  begin/VBP
  anew/RB
  our/PRP$
  joint/NN
  and/CC
  separate/JJ
  efforts/NNS
  to/TO
  build/VB
  the/DT
  American/JJ
  future/NN
  ./.)
(S
  But/CC
  ,/,
  sadly/RB
  ,/,
  we/PRP
  build/VBP
  without/IN
  a/DT
  man/NN
  who/WP
  linked/VBD
  a/DT
  long/JJ
  past/NN
  with/IN
  the/DT
  present/JJ
  and/CC
  looked/VBD
  strongly/RB
  to/TO
  the/DT
  future/NN
  ./.)
(S
  ``/``
  (Chunk Mister/NNP Sam/NNP)
  ''/''
  (Chunk Rayburn/NNP)
  is/VBZ
  gone/VBN
  ./.)
(S
  Neither/CC
  this/DT
  (Chunk House/NNP)
  nor/CC
  the/DT
  Nation/NN
  is/VBZ
  the/DT
  same/JJ
  without/IN
  him/PRP
  ./.)
(S
  Members/NNS
  of/IN
  the/DT
  (Chunk Congress/NNP)
  ,/,
  the/DT
  (Chunk Constitution/NNP)
  ma

(S
  This/DT
  administration/NN
  has/VBZ
  shown/VBN
  as/IN
  never/RB
  before/IN
  how/WRB
  much/JJ
  could/MD
  be/VB
  done/VBN
  through/IN
  the/DT
  full/JJ
  use/NN
  of/IN
  (Chunk Executive/NNP)
  powers/NNS
  -/:
  through/IN
  the/DT
  enforcement/NN
  of/IN
  laws/NNS
  already/RB
  passed/VBN
  by/IN
  the/DT
  (Chunk Congress/NNP)
  -/:
  through/IN
  persuasion/NN
  ,/,
  negotiation/NN
  ,/,
  and/CC
  litigation/NN
  ,/,
  to/TO
  secure/VB
  the/DT
  constitutional/JJ
  rights/NNS
  of/IN
  all/DT
  :/:
  the/DT
  right/NN
  to/TO
  vote/VB
  ,/,
  the/DT
  right/NN
  to/TO
  travel/VB
  without/IN
  hindrance/NN
  across/IN
  (Chunk State/NNP)
  lines/NNS
  ,/,
  and/CC
  the/DT
  right/NN
  to/TO
  free/VB
  public/JJ
  education/NN
  ./.)
(S
  I/PRP
  issued/VBD
  last/JJ
  (Chunk March/NNP)
  a/DT
  comprehensive/JJ
  order/NN
  to/TO
  guarantee/VB
  the/DT
  right/NN
  to/TO
  equal/JJ
  employment/NN
  opportunity/NN
  in/IN
  all/DT
  (Chunk Federal/NNP)


  ./.)
(S
  (Chunk OUR/NNP MILITARY/NNP STRENGTH/NNP)
  Our/PRP$
  moral/JJ
  and/CC
  physical/JJ
  strength/NN
  begins/VBZ
  at/IN
  home/NN
  as/IN
  already/RB
  discussed/VBN
  ./.)
(S
  But/CC
  it/PRP
  includes/VBZ
  our/PRP$
  military/JJ
  strength/NN
  as/RB
  well/RB
  ./.)
(S
  So/RB
  long/RB
  as/IN
  fanaticism/NN
  and/CC
  fear/NN
  brood/NN
  over/IN
  the/DT
  affairs/NNS
  of/IN
  men/NNS
  ,/,
  we/PRP
  must/MD
  arm/VB
  to/TO
  deter/VB
  others/NNS
  from/IN
  aggression/NN
  ./.)
(S
  In/IN
  the/DT
  past/JJ
  12/CD
  months/NNS
  our/PRP$
  military/JJ
  posture/NN
  has/VBZ
  steadily/RB
  improved/VBN
  ./.)
(S
  We/PRP
  increased/VBD
  the/DT
  previous/JJ
  defense/NN
  budget/NN
  by/IN
  15/CD
  percent/NN
  -/:
  not/RB
  in/IN
  the/DT
  expectation/NN
  of/IN
  war/NN
  but/CC
  for/IN
  the/DT
  preservation/NN
  of/IN
  peace/NN
  ./.)
(S
  We/PRP
  more/RBR
  than/IN
  doubled/VBD
  our/PRP$
  acquisition/NN
  rate/NN
  of/IN
  (Chunk Polaris/

  ./.)
(S
  A/DT
  (Chunk newly/RB unified/VBN Agency/NNP)
  for/IN
  (Chunk International/NNP Development/NNP)
  is/VBZ
  reorienting/VBG
  our/PRP$
  foreign/JJ
  assistance/NN
  to/TO
  emphasize/VB
  long-term/JJ
  development/NN
  loans/NNS
  instead/RB
  of/IN
  grants/NNS
  ,/,
  more/JJR
  economic/JJ
  aid/NN
  instead/RB
  of/IN
  military/JJ
  ,/,
  individual/JJ
  plans/NNS
  to/TO
  meet/VB
  the/DT
  individual/JJ
  needs/NNS
  of/IN
  the/DT
  nations/NNS
  ,/,
  and/CC
  new/JJ
  standards/NNS
  on/IN
  what/WP
  they/PRP
  must/MD
  do/VB
  to/TO
  marshal/VB
  their/PRP$
  own/JJ
  resources/NNS
  ./.)
(S
  A/DT
  (Chunk newly/RB conceived/VBN Peace/NNP Corps/NNP)
  is/VBZ
  winning/VBG
  friends/NNS
  and/CC
  helping/VBG
  people/NNS
  in/IN
  fourteen/JJ
  countries/NNS
  -/:
  supplying/NN
  trained/JJ
  and/CC
  dedicated/JJ
  young/JJ
  men/NNS
  and/CC
  women/NNS
  ,/,
  to/TO
  give/VB
  these/DT
  new/JJ
  nations/NNS
  a/DT
  hand/NN
  in/IN
  building/VBG


(S
  Our/PRP$
  farm/NN
  surpluses/VBZ
  -/:
  our/PRP$
  balance/NN
  of/IN
  trade/NN
  ,/,
  as/IN
  you/PRP
  all/DT
  know/VBP
  ,/,
  to/TO
  (Chunk Europe/NNP)
  ,/,
  the/DT
  (Chunk Common/NNP Market/NNP)
  ,/,
  in/IN
  farm/NN
  products/NNS
  ,/,
  is/VBZ
  nearly/RB
  three/CD
  or/CC
  four/CD
  to/TO
  one/CD
  in/IN
  our/PRP$
  favor/NN
  ,/,
  amounting/VBG
  to/TO
  one/CD
  of/IN
  the/DT
  best/JJS
  earners/NNS
  of/IN
  dollars/NNS
  in/IN
  our/PRP$
  balance/NN
  of/IN
  payments/NNS
  structure/NN
  ,/,
  and/CC
  without/IN
  entrance/NN
  to/TO
  this/DT
  (Chunk Market/NNP)
  ,/,
  without/IN
  the/DT
  ability/NN
  to/TO
  enter/VB
  it/PRP
  ,/,
  our/PRP$
  farm/NN
  surpluses/NNS
  will/MD
  pile/VB
  up/RP
  in/IN
  the/DT
  (Chunk Middle/NNP West/NNP)
  ,/,
  tobacco/NN
  in/IN
  the/DT
  (Chunk South/NNP)
  ,/,
  and/CC
  other/JJ
  commodities/NNS
  ,/,
  which/WDT
  have/VBP
  gone/VBN
  through/IN
  (Chunk Western/NNP Europe/NNP)
  for/IN
  15/CD

Above you can see how the different words are chunked based on the POS tags, you can also draw these and see how they are chunked using the chunked.draw() method

## Fifth Method : Named Entity Recognition

In [15]:
def processing_content():
    try:
        for sent in tokenized:
            words = nltk.word_tokenize(sent)
            tagged = nltk.pos_tag(words)
            
            named_entity = nltk.ne_chunk(tagged)
            print(named_entity)
            
            
    except Exception as e:
        print(str(e))

processing_content()

(S
  PRESIDENT/NNP
  (PERSON JOHN/NNP F./NNP KENNEDY/NNP)
  'S/POS
  (ORGANIZATION ANNUAL/NNP)
  (ORGANIZATION ADDRESS/NNP)
  TO/NNP
  A/NNP
  (ORGANIZATION JOINT/NNP)
  SESSION/NNP
  OF/NNP
  (ORGANIZATION CONGRESS/NNP)
  ON/NNP
  THE/NNP
  (ORGANIZATION STATE/NNP OF/IN)
  (ORGANIZATION THE/NNP)
  (ORGANIZATION UNION/NNP)
  This/DT
  week/NN
  we/PRP
  begin/VBP
  anew/RB
  our/PRP$
  joint/NN
  and/CC
  separate/JJ
  efforts/NNS
  to/TO
  build/VB
  the/DT
  (GPE American/JJ)
  future/NN
  ./.)
(S
  But/CC
  ,/,
  sadly/RB
  ,/,
  we/PRP
  build/VBP
  without/IN
  a/DT
  man/NN
  who/WP
  linked/VBD
  a/DT
  long/JJ
  past/NN
  with/IN
  the/DT
  present/JJ
  and/CC
  looked/VBD
  strongly/RB
  to/TO
  the/DT
  future/NN
  ./.)
(S
  ``/``
  (PERSON Mister/NNP Sam/NNP)
  ''/''
  (PERSON Rayburn/NNP)
  is/VBZ
  gone/VBN
  ./.)
(S
  Neither/CC
  this/DT
  (ORGANIZATION House/NNP)
  nor/CC
  the/DT
  (ORGANIZATION Nation/NN)
  is/VBZ
  the/DT
  same/JJ
  without/IN
  him/PRP
  ./.)
(S
  

(S
  Within/IN
  this/DT
  same/JJ
  framework/NN
  of/IN
  growth/NN
  and/CC
  wage-price/JJ
  stability/NN
  :/:
  This/DT
  administration/NN
  has/VBZ
  helped/VBN
  keep/VB
  our/PRP$
  economy/NN
  competitive/JJ
  by/IN
  widening/VBG
  the/DT
  access/NN
  of/IN
  small/JJ
  business/NN
  to/TO
  credit/NN
  and/CC
  Government/NNP
  contracts/NNS
  ,/,
  and/CC
  by/IN
  stepping/VBG
  up/RP
  the/DT
  drive/NN
  against/IN
  monopoly/NN
  ,/,
  price-fixing/NN
  ,/,
  and/CC
  racketeering/NN
  ;/:
  We/PRP
  will/MD
  submit/VB
  a/DT
  (ORGANIZATION Federal/NNP Pay/NNP Reform/NNP)
  bill/NN
  aimed/VBN
  at/IN
  giving/VBG
  our/PRP$
  classified/JJ
  ,/,
  postal/JJ
  ,/,
  and/CC
  other/JJ
  employees/NNS
  new/JJ
  pay/NN
  scales/NNS
  more/RBR
  comparable/JJ
  to/TO
  those/DT
  of/IN
  private/JJ
  industry/NN
  ;/:
  We/PRP
  are/VBP
  holding/VBG
  the/DT
  fiscal/JJ
  1962/CD
  budget/NN
  deficit/NN
  far/RB
  below/IN
  the/DT
  level/NN
  incurred/VBN
  after

(S
  As/IN
  we/PRP
  approach/VBP
  the/DT
  100th/JJ
  anniversary/NN
  ,/,
  next/JJ
  January/NNP
  ,/,
  of/IN
  the/DT
  (ORGANIZATION Emancipation/NNP Proclamation/NNP)
  ,/,
  let/VB
  the/DT
  acts/NNS
  of/IN
  every/DT
  branch/NN
  of/IN
  the/DT
  (ORGANIZATION Government/NNP)
  -/:
  and/CC
  every/DT
  citizen/NN
  -/:
  portray/NN
  that/IN
  ``/``
  righteousness/NN
  does/VBZ
  exalt/VB
  a/DT
  nation/NN
  ./.
  ''/'')
(S
  (ORGANIZATION HEALTH/NN)
  (ORGANIZATION AND/CC)
  (ORGANIZATION WELFARE/NNP Finally/NNP)
  ,/,
  a/DT
  strong/JJ
  (GPE America/NNP)
  can/MD
  not/RB
  neglect/VB
  the/DT
  aspirations/NNS
  of/IN
  its/PRP$
  citizens/NNS
  -/:
  the/DT
  welfare/NN
  of/IN
  the/DT
  needy/NN
  ,/,
  the/DT
  health/NN
  care/NN
  of/IN
  the/DT
  elderly/JJ
  ,/,
  the/DT
  education/NN
  of/IN
  the/DT
  young/JJ
  ./.)
(S
  For/IN
  we/PRP
  are/VBP
  not/RB
  developing/VBG
  the/DT
  (ORGANIZATION Nation/NN)
  's/POS
  wealth/NN
  for/IN
  its/PRP$
  ow

(S
  Our/PRP$
  program/NN
  is/VBZ
  to/TO
  open/VB
  to/TO
  all/PDT
  the/DT
  opportunity/NN
  for/IN
  steady/JJ
  and/CC
  productive/JJ
  employment/NN
  ,/,
  to/TO
  remove/VB
  from/IN
  all/PDT
  the/DT
  handicap/NN
  of/IN
  arbitrary/JJ
  or/CC
  irrational/JJ
  exclusion/NN
  ,/,
  to/TO
  offer/VB
  to/TO
  all/PDT
  the/DT
  facilities/NNS
  for/IN
  education/NN
  and/CC
  health/NN
  and/CC
  welfare/NN
  ,/,
  to/TO
  make/VB
  society/NN
  the/DT
  servant/NN
  of/IN
  the/DT
  individual/NN
  and/CC
  the/DT
  individual/JJ
  the/DT
  source/NN
  of/IN
  progress/NN
  ,/,
  and/CC
  thus/RB
  to/TO
  realize/VB
  for/IN
  all/PDT
  the/DT
  full/JJ
  promise/NN
  of/IN
  (GPE American/JJ)
  life/NN
  ./.)
(S
  OUR/NNP
  GOALS/NNP
  ABROAD/NNP
  All/NNP
  of/IN
  these/DT
  efforts/NNS
  at/IN
  home/NN
  give/VBP
  meaning/NN
  to/TO
  our/PRP$
  efforts/NNS
  abroad/RB
  ./.)
(S
  Since/IN
  the/DT
  close/NN
  of/IN
  the/DT
  (ORGANIZATION Second/NNP)
  World/

(S
  And/CC
  those/DT
  who/WP
  preferred/VBD
  or/CC
  predicted/VBD
  its/PRP$
  demise/NN
  ,/,
  envisioning/VBG
  a/DT
  troika/NN
  in/IN
  the/DT
  seat/NN
  of/IN
  (GPE Hammarskjold/NNP)
  -/:
  or/CC
  (ORGANIZATION Red/JJ China/NNP)
  inside/IN
  the/DT
  (ORGANIZATION Assembly/NNP)
  -/:
  have/VBP
  seen/VBN
  instead/RB
  a/DT
  new/JJ
  vigor/NN
  ,/,
  under/IN
  a/DT
  new/JJ
  Secretary/NNP
  General/NNP
  and/CC
  a/DT
  fully/RB
  independent/JJ
  Secretariat/NNP
  ./.)
(S
  In/IN
  making/VBG
  plans/NNS
  for/IN
  a/DT
  new/JJ
  forum/NN
  and/CC
  principles/NNS
  on/IN
  disarmament/NN
  -/:
  for/IN
  peace-keeping/NN
  in/IN
  outer/NN
  space/NN
  -/:
  for/IN
  a/DT
  decade/NN
  of/IN
  development/NN
  effort/NN
  -/:
  the/DT
  (ORGANIZATION U.N/NNP)
  ./.)
(S fulfilled/VBN its/PRP$ Charter/NNP 's/POS lofty/JJ aim/NN ./.)
(S
  Eighteen/JJ
  months/NNS
  ago/IN
  the/DT
  tangled/JJ
  and/CC
  turbulent/JJ
  (PERSON Congo/NNP)
  presented/VBD
  the/DT
 

  ./.)
(S
  THE/DT
  (ORGANIZATION NEW/NNP)
  AND/NNP
  (ORGANIZATION DEVELOPING/NNP)
  NATIONS/NNP
  Our/PRP$
  efforts/NNS
  to/TO
  help/VB
  other/JJ
  new/JJ
  or/CC
  developing/JJ
  nations/NNS
  ,/,
  and/CC
  to/TO
  strengthen/VB
  their/PRP$
  stand/NN
  for/IN
  freedom/NN
  ,/,
  have/VBP
  also/RB
  made/VBN
  progress/NN
  ./.)
(S
  A/DT
  newly/RB
  unified/VBN
  (PERSON Agency/NNP)
  for/IN
  (ORGANIZATION International/NNP Development/NNP)
  is/VBZ
  reorienting/VBG
  our/PRP$
  foreign/JJ
  assistance/NN
  to/TO
  emphasize/VB
  long-term/JJ
  development/NN
  loans/NNS
  instead/RB
  of/IN
  grants/NNS
  ,/,
  more/JJR
  economic/JJ
  aid/NN
  instead/RB
  of/IN
  military/JJ
  ,/,
  individual/JJ
  plans/NNS
  to/TO
  meet/VB
  the/DT
  individual/JJ
  needs/NNS
  of/IN
  the/DT
  nations/NNS
  ,/,
  and/CC
  new/JJ
  standards/NNS
  on/IN
  what/WP
  they/PRP
  must/MD
  do/VB
  to/TO
  marshal/VB
  their/PRP$
  own/JJ
  resources/NNS
  ./.)
(S
  A/DT
  newly/RB
 

(S
  It/PRP
  is/VBZ
  a/DT
  matter/NN
  of/IN
  undramatic/JJ
  daily/JJ
  cooperation/NN
  in/IN
  hundreds/NNS
  of/IN
  workaday/JJ
  tasks/NNS
  :/:
  of/IN
  currencies/NNS
  kept/VBN
  in/IN
  effective/JJ
  relation/NN
  ,/,
  of/IN
  development/NN
  loans/NNS
  meshed/VBN
  together/RB
  ,/,
  of/IN
  standardized/JJ
  weapons/NNS
  ,/,
  and/CC
  concerted/VBD
  diplomatic/JJ
  positions/NNS
  ./.)
(S
  The/DT
  (ORGANIZATION Atlantic/NNP)
  Community/NNP
  grows/NNS
  ,/,
  not/RB
  like/IN
  a/DT
  volcanic/JJ
  mountain/NN
  ,/,
  by/IN
  one/CD
  mighty/NN
  explosion/NN
  ,/,
  but/CC
  like/IN
  a/DT
  coral/JJ
  reef/NN
  ,/,
  from/IN
  the/DT
  accumulating/VBG
  activity/NN
  of/IN
  all/DT
  ./.)
(S
  Thus/RB
  ,/,
  we/PRP
  in/IN
  the/DT
  free/JJ
  world/NN
  are/VBP
  moving/VBG
  steadily/RB
  toward/IN
  unity/NN
  and/CC
  cooperation/NN
  ,/,
  in/IN
  the/DT
  teeth/NN
  of/IN
  that/DT
  old/JJ
  Bolshevik/NNP
  prophecy/NN
  ,/,
  and/CC
  at/IN
  the

(S
  Our/PRP$
  farm/NN
  surpluses/VBZ
  -/:
  our/PRP$
  balance/NN
  of/IN
  trade/NN
  ,/,
  as/IN
  you/PRP
  all/DT
  know/VBP
  ,/,
  to/TO
  (GPE Europe/NNP)
  ,/,
  the/DT
  (ORGANIZATION Common/NNP Market/NNP)
  ,/,
  in/IN
  farm/NN
  products/NNS
  ,/,
  is/VBZ
  nearly/RB
  three/CD
  or/CC
  four/CD
  to/TO
  one/CD
  in/IN
  our/PRP$
  favor/NN
  ,/,
  amounting/VBG
  to/TO
  one/CD
  of/IN
  the/DT
  best/JJS
  earners/NNS
  of/IN
  dollars/NNS
  in/IN
  our/PRP$
  balance/NN
  of/IN
  payments/NNS
  structure/NN
  ,/,
  and/CC
  without/IN
  entrance/NN
  to/TO
  this/DT
  Market/NNP
  ,/,
  without/IN
  the/DT
  ability/NN
  to/TO
  enter/VB
  it/PRP
  ,/,
  our/PRP$
  farm/NN
  surpluses/NNS
  will/MD
  pile/VB
  up/RP
  in/IN
  the/DT
  (GPE Middle/NNP West/NNP)
  ,/,
  tobacco/NN
  in/IN
  the/DT
  (GPE South/NNP)
  ,/,
  and/CC
  other/JJ
  commodities/NNS
  ,/,
  which/WDT
  have/VBP
  gone/VBN
  through/IN
  (LOCATION Western/NNP Europe/NNP)
  for/IN
  15/CD
  y

(S
  WEB/NNP
  RESOURCES/NNP
  >/NNP
  >/NNP
  State/NNP
  of/IN
  the/DT
  (ORGANIZATION Union/NNP Archives/NNP Video/NNP)
  &/CC
  (ORGANIZATION Text/NNP)
  of/IN
  Presidential/NNP
  State/NNP
  of/IN
  the/DT
  (ORGANIZATION Union/NNP Addresses/NNP)
  Executive/NNP
  &/CC
  (ORGANIZATION Cabinet/NNP Links/NNP Links/NNP)
  to/TO
  (GPE Executive/NNP)
  ,/,
  (ORGANIZATION Federal/NNP)
  &/CC
  Cabinet-level/NNP
  Agencies/NNPS
  (ORGANIZATION State/NNP Dinners/NNPS Guest/NNP Lists/NNPS)
  ,/,
  (PERSON Menus/NNP)
  &/CC
  Entertainment/NNP
  from/IN
  (FACILITY White/NNP House/NNP)
  State/NNP
  &/CC
  Official/NNP
  Dinners/NNP
  Reagan/NNP
  Memorial/NNP
  Page/NNP
  Video/NNP
  ,/,
  (ORGANIZATION Textual/NNP Resources/NNPS)
  and/CC
  (PERSON Info/NNP)
  on/IN
  President/NNP
  (PERSON
    Ronald/NNP
    Reagan/NNP
    Campaign/NNP
    Finance/NNP
    Database/NNP
    Search/NNP)
  donations/NNS
  reported/VBD
  to/TO
  the/DT
  (ORGANIZATION Federal/NNP Election/NNP Commission/

Named Entity Recognition helps you information like whether the word in your text is a Person, an organisation, a location and much more. But the issue with NLTK's named entity recognition is that there is a high false positive rate, where a lot of misclassifications will be done and two words which should have been used as a chunk and then classified, are done separately. One of the parameters you can use with NLTK's named entity recognition is use:

        nltk.ne_chunk(tagged, binary = True)

This helps with chunking two words together and then doing their named entity recognition, but you are not able to see the tags assined to them.

## Sixth Method: Lemmatization

Lemmatization is very similar to Stemming, but different!
In this you get the word which shows the real meaning of the word that is being lemmatized. You can think of this as, you take a word, then look up its meaning in the dictionary and then replace it. The replaced word might be the same word or be completely different, but will carry the same meaning

In [16]:
from nltk.stem import WordNetLemmatizer

In [17]:
lemmatizer = WordNetLemmatizer()

In [18]:
print(lemmatizer.lemmatize('better', pos = 'a'))

good


In [19]:
print(lemmatizer.lemmatize('walks'))

walk


### Finding synonyms and antonyms using NLTK

In [27]:
from nltk.corpus import wordnet
"""Finding synonyms of the word 'Good'."""
synonyms = wordnet.synsets("good")
print(synonyms)
"""getting the first one only"""
print(synonyms[0].lemmas()[0].name())
"""getting the description and then example"""
print(synonyms[0].definition())
print(synonyms[0].examples())

[Synset('good.n.01'), Synset('good.n.02'), Synset('good.n.03'), Synset('commodity.n.01'), Synset('good.a.01'), Synset('full.s.06'), Synset('good.a.03'), Synset('estimable.s.02'), Synset('beneficial.s.01'), Synset('good.s.06'), Synset('good.s.07'), Synset('adept.s.01'), Synset('good.s.09'), Synset('dear.s.02'), Synset('dependable.s.04'), Synset('good.s.12'), Synset('good.s.13'), Synset('effective.s.04'), Synset('good.s.15'), Synset('good.s.16'), Synset('good.s.17'), Synset('good.s.18'), Synset('good.s.19'), Synset('good.s.20'), Synset('good.s.21'), Synset('well.r.01'), Synset('thoroughly.r.02')]
good
benefit
['for your own good', "what's the good of worrying?"]


### Finding the similarity of the words used

In NLTK we can compare how similar two words are

In [31]:
word1 = wordnet.synset("apple.n.01")
word2 = wordnet.synset("orange.n.01")
print(word1.wup_similarity(word2))

0.782608695652174


In [32]:
word1 = wordnet.synset("orange.n.01")
word2 = wordnet.synset("lemon.n.01")
print(word1.wup_similarity(word2))

0.75


In [33]:
word1 = wordnet.synset("apple.n.01")
word2 = wordnet.synset("okra.n.01")
print(word1.wup_similarity(word2))

0.8181818181818182


In [35]:
word1 = wordnet.synset("apple.n.01")
word2 = wordnet.synset("pea.n.01")
print(word1.wup_similarity(word2))

0.7058823529411765


Now if you had used something of totally different kind, like car or even chocolate might be, then you would have seen the similarity score go down.

## Text Classification

Now we would be trying to classify the text based on the information we can get from the text itself and the tags associated with it. We cannot do this without having a tagged dataset, as we need the tags to train our classifiers.

In [41]:
import nltk
import random
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)),category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)
all_words = []

for word in movie_reviews.words():
    all_words.append(word.lower())

all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


In [54]:
word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
        
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))        
featuresets = [(find_features(rev),category) for (rev,category) in documents]




Creating our first classifier using the Naive Bayes Classifier of NLTK, the accuracy of this particular classifier is very unstable and therefore is not really reliable, sometimes it might go as high as 81.5%(as in my case), and the very next time you rearrange your data and run it, the accuracy might go around 55 to 60%.

In [55]:
training_set = featuresets[:1800]
test_set = featuresets[1800:]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, test_set))*100)

classifier.show_most_informative_features(15)


Naive Bayes Algo accuracy percent: 81.5
Most Informative Features
               atrocious = True              neg : pos    =      9.6 : 1.0
                  annual = True              pos : neg    =      9.0 : 1.0
                   sucks = True              neg : pos    =      9.0 : 1.0
                 frances = True              pos : neg    =      8.4 : 1.0
           unimaginative = True              neg : pos    =      8.3 : 1.0
             silverstone = True              neg : pos    =      7.6 : 1.0
              schumacher = True              neg : pos    =      7.4 : 1.0
                 idiotic = True              neg : pos    =      6.5 : 1.0
                 singers = True              pos : neg    =      6.3 : 1.0
                 cunning = True              pos : neg    =      6.3 : 1.0
               pregnancy = True              neg : pos    =      6.3 : 1.0
                    mena = True              neg : pos    =      6.3 : 1.0
                  shoddy = True   

In [58]:
training_set = featuresets[:1900]
test_set = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, test_set))*100)

classifier.show_most_informative_features(15)


Naive Bayes Algo accuracy percent: 83.0
Most Informative Features
                   sucks = True              neg : pos    =      9.8 : 1.0
                  annual = True              pos : neg    =      9.0 : 1.0
                 frances = True              pos : neg    =      8.4 : 1.0
           unimaginative = True              neg : pos    =      8.3 : 1.0
             silverstone = True              neg : pos    =      7.6 : 1.0
              schumacher = True              neg : pos    =      7.4 : 1.0
                  regard = True              pos : neg    =      7.0 : 1.0
                    mena = True              neg : pos    =      7.0 : 1.0
                 idiotic = True              neg : pos    =      7.0 : 1.0
                  shoddy = True              neg : pos    =      7.0 : 1.0
                  suvari = True              neg : pos    =      7.0 : 1.0
                 singers = True              pos : neg    =      6.4 : 1.0
                obstacle = True   

NLTK has a wrapper function using which we can call Scikitlearn's classifiers through NLTK only. Here we will be getting all the classifiers we can possibly think of, then build an ensemble out of it.

In [60]:
from nltk.classify.scikitlearn import SklearnClassifier

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
training_set = featuresets[:1800]
test_set = featuresets[1800:]
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, test_set))*100)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(MNB_classifier, test_set))*100)

#GaussianNB_classifier = SklearnClassifier(GaussianNB())
#GaussianNB_classifier.train(training_set)
#print("GaussianNB_classifier accuracy percent:", (nltk.classify.accuracy(GaussianNB_classifier, test_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, test_set))*100)


Original Naive Bayes Algo accuracy percent: 86.5
MNB_classifier Naive Bayes Algo accuracy percent: 84.5
BernoulliNB_classifier accuracy percent: 82.0


In [62]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, test_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, test_set))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, test_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, test_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, test_set))*100)



LogisticRegression_classifier accuracy percent: 84.0
SGDClassifier_classifier accuracy percent: 82.0
SVC_classifier accuracy percent: 77.0
LinearSVC_classifier accuracy percent: 85.0
NuSVC_classifier accuracy percent: 84.5


Here we have started making our ensemble method, where this classifier will be taking votes from each and every classifier and outputing the one with the highest vote count and the related confidence, where the confidence is how many of the classifiers classified it in the same tag as the output.

In [64]:
from nltk.classify import ClassifierI
from statistics import mode

In [67]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
        
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf
    
voted_classifier = VoteClassifier(classifier, MNB_classifier, BernoulliNB_classifier,
                                 LogisticRegression_classifier,
                                 SGDClassifier_classifier,LinearSVC_classifier,
                                 NuSVC_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, test_set))*100)

print("Classification:", voted_classifier.classify(test_set[0][0]),"Confidence %:", voted_classifier.confidence(test_set[0][0]))

voted_classifier accuracy percent: 85.5
Classification: pos Confidence %: 0.7142857142857143


In [69]:
print("Classification:", voted_classifier.classify(test_set[1][0]),"Confidence %:", voted_classifier.confidence(test_set[1][0])*100)

print("Classification:", voted_classifier.classify(test_set[2][0]),"Confidence %:", voted_classifier.confidence(test_set[2][0])*100)
print("Classification:", voted_classifier.classify(test_set[3][0]),"Confidence %:", voted_classifier.confidence(test_set[3][0])*100)
print("Classification:", voted_classifier.classify(test_set[4][0]),"Confidence %:", voted_classifier.confidence(test_set[4][0])*100)
print("Classification:", voted_classifier.classify(test_set[5][0]),"Confidence %:", voted_classifier.confidence(test_set[5][0])*100)

Classification: pos Confidence %: 100.0
Classification: pos Confidence %: 100.0
Classification: neg Confidence %: 100.0
Classification: neg Confidence %: 85.71428571428571
Classification: neg Confidence %: 100.0
