# Today's challenges



*   Named Entity Recognition: NER using spaCy vs NER using NLTK
*   Part-of-speech tagging: POS using spaCy vs POS using NLTK
*   **THE** **REAL** **CHALLENGE**




## POS using spaCy

In [None]:
import spacy

# Load the small English spaCy pipeline once; `nlp` is reused further down
nlp = spacy.load("en_core_web_sm")

# Example sentence to analyse
text = "NASA awarded Elon Musk’s SpaceX a $2.9 billion contract to build the lunar lander."

# Run the pipeline over the whole sentence
doc = nlp(text)

# One line per token: the token followed by its part-of-speech tag (token.pos_)
for token in doc:
  print(token, token.pos_)

NASA PROPN
awarded VERB
Elon PROPN
Musk PROPN
’s PROPN
SpaceX VERB
a DET
$ SYM
2.9 NUM
billion NUM
contract NOUN
to PART
build VERB
the DET
lunar ADJ
lander NOUN
. PUNCT


In [None]:
# Reuse the `doc` produced by the previous cell and print each token next to
# its `lemma_` attribute (e.g. "awarded" -> "award" in the output below).
for token in doc:
  print(token, token.lemma_)

NASA NASA
awarded award
Elon Elon
Musk Musk
’s ’s
SpaceX SpaceX
a a
$ $
2.9 2.9
billion billion
contract contract
to to
build build
the the
lunar lunar
lander lander
. .


## POS using NLTK

In [None]:
# Import nltk in this cell: it is the first cell that uses it, so on a fresh
# kernel (Restart & Run All) the downloads would otherwise raise NameError.
import nltk

# Fetch the NLTK resources used by word_tokenize, pos_tag and ne_chunk below
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
import nltk
from nltk import word_tokenize, pos_tag

# Same sentence as in the spaCy example so both taggers can be compared
text = "NASA awarded Elon Musk’s SpaceX a $2.9 billion contract to build the lunar lander."

# Split the sentence into tokens, then attach a POS tag to every token
tokens = word_tokenize(text)
tag = pos_tag(tokens)

# Each entry of `tag` is a (token, tag) pair; print only the tag, one per line
for _word, pos_label in tag:
  print(pos_label)

NNP
VBD
NNP
NNP
NNP
VBD
NNP
DT
$
CD
CD
NN
TO
VB
DT
NN
NN
.


## NER using spaCy

In [None]:
# Import spaCy and load the English pipeline used for entity recognition.
# This rebinds `nlp` to the same model name that the POS section loaded.
import spacy 

nlp = spacy.load('en_core_web_sm')
# Helper that prints basic information about the named entities of a document
def show_ents(doc):
  """Print one line per named entity found in ``doc``.

  Each line contains, separated by ' - ': the entity text, its start and end
  character offsets, its label, and the description of that label returned
  by ``spacy.explain``. When ``doc.ents`` is empty a fallback message is
  printed instead.
  """
  entities = doc.ents
  if not entities:
    print('No named entities found.')
    return

  for ent in entities:
    fields = (
      ent.text,
      str(ent.start_char),
      str(ent.end_char),
      ent.label_,
      str(spacy.explain(ent.label_)),
    )
    print(' - '.join(fields))


# Run the pipeline on a sample sentence and list the entities it detects
doc1 = nlp("Apple is looking at buying U.K. startup for $1 billion") 

show_ents(doc1)

Apple - 0 - 5 - ORG - Companies, agencies, institutions, etc.
U.K. - 27 - 31 - GPE - Countries, cities, states
$1 billion - 44 - 54 - MONEY - Monetary values, including unit


## NER using NLTK

In [None]:
import nltk
from nltk import word_tokenize, pos_tag

# Make sure the resources for tokenising, tagging and NE chunking are available
for package in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
  nltk.download(package)

text = "NASA awarded Elon Musk’s SpaceX a $2.9 billion contract to build the lunar lander."

# ne_chunk works on POS-tagged tokens, so tokenise and tag first
tokens = word_tokenize(text)
tag = pos_tag(tokens)
print(tag)

# Group the tagged tokens into named-entity chunks and print the resulting tree
ne_tree = nltk.ne_chunk(tag)
print(ne_tree)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[('NASA', 'NNP'), ('awarded', 'VBD'), ('Elon', 'NNP'), ('Musk', 'NNP'), ('’', 'NNP'), ('s', 'VBD'), ('SpaceX', 'NNP'), ('a', 'DT'), ('$', '$'), ('2.9', 'CD'), ('billion', 'CD'), ('contract', 'NN'), ('to', 'TO'), ('build', 'VB'), ('the', 'DT'), ('lunar', 'NN'), ('lander', 'NN'), ('.', '.')]
(S
  (ORGANIZATION NASA/NNP)
  awarded/VBD
  (PERSON Elon/NNP Musk/NNP)
  ’/NNP
  s/VBD
  (ORGANIZATION SpaceX/NNP)
  a/DT
  $/$
  2.9/CD
  billion/

In [None]:
import numpy as np
import pandas as pd

## Reading Data with pandas

In [None]:
# Load the review dataset from a CSV file given by a relative path.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests
# the CSV was saved together with its index; passing index_col=0 would drop
# it. Confirm against the file before changing.
df = pd.read_csv('amazon_rev_pol.csv')

In [None]:
# Preview the first rows to check the column layout ('0', '1', '2')
df.head()

Unnamed: 0,0,1,2
0,2,An inspirational story about this remarkable w...,"Rhonda Gowler Greene's thoroughly ""kid friendl..."
1,2,Deborah Knott books in general,While not a great mystery in general all of th...
2,1,not enough info,because it wasn't quite clear on which garmin ...
3,1,It happens.........once again!,This book starts off with an interesting story...
4,1,Why why why. . .,did they have to mess up a great thing. As the...


In [None]:
# Take the text stored in column '2' of the first row and POS-tag it with NLTK
a = df['2'].iloc[0]
tokens = word_tokenize(a)
tag = pos_tag(tokens)
tag

[('Rhonda', 'NNP'),
 ('Gowler', 'NNP'),
 ('Greene', 'NNP'),
 ("'s", 'POS'),
 ('thoroughly', 'JJ'),
 ('``', '``'),
 ('kid', 'FW'),
 ('friendly', 'RB'),
 ("''", "''"),
 ('text', 'JJ'),
 ('combines', 'NNS'),
 ('perfectly', 'RB'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('joyous', 'JJ'),
 ('illustrations', 'NNS'),
 ('of', 'IN'),
 ('Janet', 'NNP'),
 ('Broxon', 'NNP'),
 ('in', 'IN'),
 ('Sing', 'NNP'),
 ('Praise', 'NNP'),
 (',', ','),
 ('a', 'DT'),
 ('creative', 'JJ'),
 ('and', 'CC'),
 ('informative', 'JJ'),
 ('picturebook', 'NN'),
 ('exploring', 'VBG'),
 ('the', 'DT'),
 ('world', 'NN'),
 ('which', 'WDT'),
 ('God', 'NNP'),
 ('has', 'VBZ'),
 ('provided', 'VBN'),
 ('us', 'PRP'),
 ('.', '.'),
 ('Carrying', 'VBG'),
 ('young', 'JJ'),
 ('readers', 'NNS'),
 ('through', 'IN'),
 ('a', 'DT'),
 ('triumphant', 'JJ'),
 ('collection', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('wonderful', 'JJ'),
 ('creations', 'NNS'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('Lord', 'NNP'),
 (',', ','),
 ('Sing', 'NNP'),
 ('Praise', 'NNP'

In [None]:
# Display column '1' (judging by the output, short review titles — TODO confirm)
df['1']

0        An inspirational story about this remarkable w...
1                           Deborah Knott books in general
2                                          not enough info
3                           It happens.........once again!
4                                         Why why why. . .
                               ...                        
19995                            dime a dozen american dnb
19996                                            humm. . .
19997          What? Are people still buying this baloney?
19998    BUYER BEWARE: Bad product, incompetent manufac...
19999                                                Junk!
Name: 1, Length: 20000, dtype: object

## Constructing the POS-tag sequence for each example in the dataset

In [None]:
# Column '2' (the text that was tokenised in the single-row example above)
# is the input passed to the POS conversion below.
list_review = df['2']

In [None]:
def sentences_to_POS_NLTK(list_of_sentences):
  """Turn every sentence into a string of its NLTK part-of-speech tags.

  Each sentence is tokenised with ``word_tokenize`` and tagged with
  ``pos_tag``. The tags are written in token order, each one followed by a
  single space, so a non-empty result ends with a trailing space and a
  sentence without tokens yields ''.

  Returns a list with one tag string per input sentence, in input order.
  """
  return [
    ''.join(label + ' ' for _word, label in pos_tag(word_tokenize(sentence)))
    for sentence in list_of_sentences
  ]

In [None]:
def sentences_to_POS_spaCy(list_of_sentences, pipeline=None):
  """Turn every sentence into a string of its spaCy part-of-speech tags.

  Mirrors ``sentences_to_POS_NLTK``: the tags of a sentence are written in
  token order, each followed by a single space, and one string is returned
  per input sentence. The tag used is ``token.pos_``, the same attribute
  printed in the spaCy POS example earlier in this notebook.

  Args:
    list_of_sentences: iterable of sentence strings.
    pipeline: object exposing ``pipe(texts)`` that yields one tokenised
      document per text. Defaults to the notebook-level ``nlp`` pipeline.

  Returns:
    list of str, one POS string per input sentence, in input order.
  """
  if pipeline is None:
    # Fall back to the pipeline loaded earlier with spacy.load(...)
    pipeline = nlp

  list_of_POS = []
  # pipe() streams the texts through the pipeline instead of one call per text
  for doc in pipeline.pipe(list_of_sentences):
    list_of_POS.append(''.join(token.pos_ + ' ' for token in doc))
  return list_of_POS

In [None]:
# Build the NLTK POS-tag string for every review. The helper defined in this
# notebook is `sentences_to_POS_NLTK`; no plain `sentences_to_POS` exists, so
# calling that name raises NameError.
list_review_POS_NLTK = sentences_to_POS_NLTK(list_review)

## Splitting Dataset

**hints** (Lesson 2, Lesson 3, Lesson 4)

## Vectorization 

**hints** (Lesson 3 (Tf-Idf), Lesson 4 (other type))

## Models

**hints** (Lesson 2, Lesson 3, Lesson 4)