# Advanced Natural Language Processing

### Extracting Noun Phrases

In [None]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
cd ../gdrive/My Drive/Colab Notebooks/_NLP/_nlp_data

[Errno 2] No such file or directory: '../gdrive/My Drive/Colab Notebooks/_NLP/_nlp_data'
/gdrive/My Drive/Colab Notebooks/_NLP/_nlp_data


In [None]:
#Import libraries
import nltk
nltk.download('punkt')
nltk.download('brown')
from textblob import TextBlob

#Extract noun
blob = TextBlob("John is learning natural language processing in India and focused on Google")
for np in blob.noun_phrases:
    print(np)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


john
natural language processing
india
google


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [None]:
nltk.download('averaged_perceptron_tagger')

text = '''
The titular threat of The Blob has always struck me as the ultimate movie
monster: an insatiably hungry, amoeba-like mass able to penetrate
virtually any safeguard, capable of--as a doomed doctor chillingly
describes it--"assimilating flesh on contact.
Snide comparisons to gelatin be damned, it's a concept with the most
devastating of potential consequences, not unlike the grey goo scenario
proposed by technological theorists fearful of
artificial intelligence run rampant.
'''

blob = TextBlob(text)

for sentence in blob.sentences:
    print(sentence.sentiment.polarity)

0.06000000000000001
-0.34166666666666673


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# blob.tags

### Finding Similarity Between Texts

In [None]:
documents = (
"I like NLP",
"I am exploring NLP",
"I am a beginn er in NLP",
"I want to learn NLP",
"I like advanced NLP")

#Import libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#Compute tfidf
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
tfidf_matrix.shape
#output
(5, 10)

(5, 10)

In [None]:
tfidf_matrix[0:1]

<1x11 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [None]:
#compute similarity for first sentence with rest of the sentences
cosine_similarity(tfidf_matrix[0:1],tfidf_matrix)

array([[1.        , 0.17682765, 0.12305308, 0.13489366, 0.68374784]])

### Phonetic Matching 

In [None]:
!pip install fuzzy
!pip install soundex
!pip install jellyfish

# https://www.informit.com/articles/article.aspx?p=1848528

import fuzzy
import soundex 
import jellyfish

soundex = fuzzy.Soundex(4) 
dmetaphone = fuzzy.DMetaphone(4)
dmetaphone('natural')

# nysiis
#Demaphone 

# soundex('natural') # throwing error - UnicodeDecodeError                        
#soundex('natuaral')

#ob = soundex.Soundex()
#ob.soundex('Test')

#jf = jellyfish.soundex(str('natural line')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fuzzy
  Downloading Fuzzy-1.2.2.tar.gz (14 kB)
Building wheels for collected packages: fuzzy
  Building wheel for fuzzy (setup.py) ... [?25l[?25hdone
  Created wheel for fuzzy: filename=Fuzzy-1.2.2-cp37-cp37m-linux_x86_64.whl size=164038 sha256=f19b1ea93108031b697b463055d8637ecb8c76d8ab384985c2e51fa165eecaac
  Stored in directory: /root/.cache/pip/wheels/c8/52/8a/bb2d05fbf343752a8546682cb5b2d775cc0d1f27f6c43f95dd
Successfully built fuzzy
Installing collected packages: fuzzy
Successfully installed fuzzy-1.2.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting soundex
  Downloading soundex-1.1.3.tar.gz (9.1 kB)
Collecting silpa_common>=0.3
  Downloading silpa_common-0.3.tar.gz (9.4 kB)
Building wheels for collected packages: soundex, silpa-common
  Building wheel for soundex (setup.py) ... [?25l[?25hdone
  Created whe

[b'NTRL', None]

### Tagging Part of Speech

In [None]:
Text  =  "I love NLP and I will learn NLP in 2 month"

# Importing necessary packages and stopwords
import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize 
stop_words = set(stopwords.words('english')) 

# Tokenize the text
tokens = sent_tokenize(text) 

#Generate tagging for all the tokens using loop
for i in tokens: 
    words = nltk.word_tokenize(i) 
    words = [w for w in words if not w in stop_words]  
    #  POS-tagger.  
    tags = nltk.pos_tag(words) 

tags

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


[('Snide', 'NNP'),
 ('comparisons', 'NNS'),
 ('gelatin', 'NN'),
 ('damned', 'VBD'),
 (',', ','),
 ("'s", 'POS'),
 ('concept', 'NN'),
 ('devastating', 'VBG'),
 ('potential', 'JJ'),
 ('consequences', 'NNS'),
 (',', ','),
 ('unlike', 'IN'),
 ('grey', 'JJ'),
 ('goo', 'NN'),
 ('scenario', 'NN'),
 ('proposed', 'VBD'),
 ('technological', 'JJ'),
 ('theorists', 'NNS'),
 ('fearful', 'JJ'),
 ('artificial', 'JJ'),
 ('intelligence', 'NN'),
 ('run', 'NN'),
 ('rampant', 'NN'),
 ('.', '.')]

In [None]:
# from textblob lib import TextBlob method
from textblob import TextBlob
nltk.download('omw-1.4')
  
text = ("Sukanya, Rajib and Naba are my good friends. " +
    "Sukanya is getting married next year. " +
    "Marriage is a big step in one’s life." +
    "It is both exciting and frightening. " + 
    "But friendship is a sacred bond between people." +
    "It is a special kind of love between us. " +
    "Many of you must have tried searching for a friend "+ 
    "but never found the right one.")
  
# create a textblob object
blob_object = TextBlob(text)
  
# Part-of-speech tags can be accessed 
# through the tags property of blob object.'
  
# print word with pos tag.
print(blob_object.tags)

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


[('Sukanya', 'NNP'), ('Rajib', 'NNP'), ('and', 'CC'), ('Naba', 'NNP'), ('are', 'VBP'), ('my', 'PRP$'), ('good', 'JJ'), ('friends', 'NNS'), ('Sukanya', 'NNP'), ('is', 'VBZ'), ('getting', 'VBG'), ('married', 'VBN'), ('next', 'JJ'), ('year', 'NN'), ('Marriage', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('big', 'JJ'), ('step', 'NN'), ('in', 'IN'), ('one', 'CD'), ('’', 'NN'), ('s', 'NN'), ('life.It', 'NN'), ('is', 'VBZ'), ('both', 'DT'), ('exciting', 'VBG'), ('and', 'CC'), ('frightening', 'NN'), ('But', 'CC'), ('friendship', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('sacred', 'JJ'), ('bond', 'NN'), ('between', 'IN'), ('people.It', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('special', 'JJ'), ('kind', 'NN'), ('of', 'IN'), ('love', 'NN'), ('between', 'IN'), ('us', 'PRP'), ('Many', 'JJ'), ('of', 'IN'), ('you', 'PRP'), ('must', 'MD'), ('have', 'VB'), ('tried', 'VBN'), ('searching', 'VBG'), ('for', 'IN'), ('a', 'DT'), ('friend', 'NN'), ('but', 'CC'), ('never', 'RB'), ('found', 'VBD'), ('the', 'DT'), ('right', 'JJ'),

### Extract Entities From Text

In [None]:
sent = "John is studying at Stanford University in California"

#import libraries
import nltk
from nltk import ne_chunk
from nltk import word_tokenize
# nltk.download('maxent_ne_chunker')
# nltk.download('words')


#NER
# ne_chunk(nltk.pos_tag(word_tokenize(sent)), binary=False)

In [None]:
# using spaCy for NER
import spacy
#nlp = spacy.load('en')
nlp = spacy.blank("en")


# Read/create a sentence
doc = nlp(u'Apple is ready to launch new phone worth $10000 in New york time square ')

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

### Extracting Topics From Text
#### Document tagging and clustering 

In [None]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

doc1 = "I am learning NLP, it is very interesting and exciting. it includes machine learning and deep learning" 
doc2 = "My father is a data scientist and he is nlp expert"
doc3 = "My sister has good exposure into android development"

doc_complete = [doc1, doc2, doc3] 
doc_complete

# Install and import libraries

!pip install gensim
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

# Text preprocessing as discussed in chapter 2

stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in doc_complete]  
doc_clean

# Importing gensim
import gensim
from gensim import corpora

# Creating the term dictionary of our corpus, where every unique term is   #assigned an index. 
dictionary = corpora.Dictionary(doc_clean)

# Converting a list of documents (corpus) into Document-Term Matrix using #dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
doc_term_matrix

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 3), (5, 1), (6, 1)],
 [(6, 1), (7, 1), (8, 1), (9, 1), (10, 1)],
 [(11, 1), (12, 1), (13, 1), (14, 1), (15, 1)]]

In [None]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Training LDA model on the document term matrix for 3 topics.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)

# Results
print(ldamodel.print_topics())

[(0, '0.129*"sister" + 0.129*"good" + 0.129*"exposure" + 0.129*"development" + 0.129*"android" + 0.032*"nlp" + 0.032*"father" + 0.032*"data" + 0.032*"expert" + 0.032*"scientist"'), (1, '0.129*"nlp" + 0.129*"father" + 0.129*"data" + 0.129*"scientist" + 0.129*"expert" + 0.032*"exposure" + 0.032*"development" + 0.032*"android" + 0.032*"good" + 0.032*"sister"'), (2, '0.233*"learning" + 0.093*"deep" + 0.093*"includes" + 0.093*"interesting" + 0.093*"machine" + 0.093*"exciting" + 0.093*"nlp" + 0.023*"scientist" + 0.023*"data" + 0.023*"father"')]


### Classifying Text

In [None]:
pwd

'/gdrive/MyDrive/Colab Notebooks/_NLP/_nlp_data'

In [None]:
#Read the data
import pandas as pd

In [None]:
Email_Data = pd.read_csv('spam.csv',sep='\t')
Email_Data.head()

Unnamed: 0,Type,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
#Data undestanding
Email_Data.columns

Index(['Type', 'Message'], dtype='object')

In [None]:
Email_Data = Email_Data.rename(columns={"Type":"Target", "Message":"Email"})

In [None]:
#import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import os
from textblob import TextBlob
from nltk.stem import PorterStemmer
from textblob import Word
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import sklearn.feature_extraction.text as text
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm

#pre processing steps like lower case, stemming and lemmatization 

Email_Data['Email'] = Email_Data['Email'].apply(lambda x: " ".join(x.lower() for x in x.split()))
stop = stopwords.words('english')
Email_Data['Email'] = Email_Data['Email'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
st = PorterStemmer()
Email_Data['Email'] = Email_Data['Email'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
Email_Data['Email'] =Email_Data['Email'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

Email_Data.head()

Unnamed: 0,Target,Email
0,ham,"go jurong point, crazy.. avail bugi n great wo..."
1,ham,ok lar... joke wif u oni...
2,spam,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say earli hor... u c alreadi say...
4,ham,"nah think goe usf, live around though"


In [None]:
#Splitting data into train and validation

train_x, valid_x, train_y, valid_y = model_selection.train_test_split(Email_Data['Email'], Email_Data['Target'])

# TFIDF feature generation for a maximum of 5000 features

encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(Email_Data['Email'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

xtrain_tfidf.data

array([0.56514748, 0.60101303, 0.56514748, ..., 0.23140609, 0.27739829,
       0.30891207])

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    return metrics.accuracy_score(predictions, valid_y)

# Naive Bayes trainig
accuracy = train_model(naive_bayes.MultinomialNB(alpha=0.2), xtrain_tfidf, train_y, xvalid_tfidf)
print ("Accuracy: ", accuracy)


Accuracy:  0.975609756097561


In [None]:
# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("Accuracy: ", accuracy)


Accuracy:  0.8902439024390244


### Carrying Out Sentiment Analysis

In [None]:
review = "I like this phone. screen quality and camera clarity is really good."
review2 = "This tv is not good. Bad quality, no clarity, worst experience"

#import libraries
from textblob import TextBlob

#TextBlob has a pre trained sentiment prediction model
blob = TextBlob(review)
blob.sentiment

Sentiment(polarity=0.7, subjectivity=0.6000000000000001)

In [None]:
#now lets look at the sentiment of review2
blob = TextBlob(review2)
blob.sentiment

Sentiment(polarity=-0.6833333333333332, subjectivity=0.7555555555555555)

### Disambiguating Text

In [None]:
Text1 = 'I went to the bank to deposit my money'
Text2 = 'The river bank was full of dead fishes'

#Install pywsd

!pip install pywsd

#Import functions

from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer
from itertools import chain
from pywsd.lesk import simple_lesk

# Sentences

bank_sents = ['I went to the bank to deposit my money',
'The river bank was full of dead fishes']

# calling the lesk function and printing results for both the sentences

print ("Context-1:", bank_sents[0])
answer = simple_lesk(bank_sents[0],'bank')
print ("Sense:", answer)
print ("Definition : ", answer.definition())


print ("Context-2:", bank_sents[1])
answer = simple_lesk(bank_sents[1],'bank','n')
print ("Sense:", answer)
print ("Definition : ", answer.definition())
