# NLP Project

#### Using the Amazon reviews dataset, perform the following:

##### POS (Part-of-Speech) tagging

In [1]:
# One-time setup: download every NLTK resource this notebook reads.
import nltk
nltk.download("punkt")                       # tokenizer models used by word_tokenize
nltk.download("averaged_perceptron_tagger")  # tagger model used by pos_tag
# The stop-word removal step calls stopwords.words('english'); without this
# download a fresh environment fails there with a LookupError.
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\97150\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\97150\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
#Importing the needed libraries
from nltk import pos_tag,word_tokenize
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
#To remove unwanted warnings
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Load the Amazon reviews workbook (expected next to the notebook) and
# preview the first two rows.
df = pd.read_excel(io="Amazon.xlsx")
df.head(n=2)

Unnamed: 0,id,asins,brand,categories,colors,dateAdded,dateUpdated,dimension,ean,keys,...,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.userCity,reviews.userProvince,reviews.username,sizes,upc,weight
0,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,5.0,https://www.amazon.com/Kindle-Paperwhite-High-...,I initially had trouble deciding between the p...,"Paperwhite voyage, no regrets!",,,Cristina M,,,205 grams
1,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,5.0,https://www.amazon.com/Kindle-Paperwhite-High-...,Allow me to preface this with a little history...,One Simply Could Not Ask For More,,,Ricky,,,205 grams


In [6]:
# Keep only the reviews.text column for the text-processing steps.
# .copy() makes df_review an independent DataFrame: the following cells add
# new columns to it, which on an un-copied slice of df triggers pandas'
# SettingWithCopyWarning.
df_review = df[["reviews.text"]].copy()

In [7]:
# Read a single review in full: row label 0 of the reviews.text column
df_review.loc[0, "reviews.text"]

"I initially had trouble deciding between the paperwhite and the voyage because reviews more or less said the same thing: the paperwhite is great, but if you have spending money, go for the voyage.Fortunately, I had friends who owned each, so I ended up buying the paperwhite on this basis: both models now have 300 ppi, so the 80 dollar jump turns out pricey the voyage's page press isn't always sensitive, and if you are fine with a specific setting, you don't need auto light adjustment).It's been a week and I am loving my paperwhite, no regrets! The touch screen is receptive and easy to use, and I keep the light at a specific setting regardless of the time of day. (In any case, it's not hard to change the setting either, as you'll only be changing the light level at a certain time of day, not every now and then while reading).Also glad that I went for the international shipping option with Amazon. Extra expense, but delivery was on time, with tracking, and I didnt need to worry about cu

In [8]:
# Step 1: store a lower-cased version of each review in a new column
df_review['lower_text'] = df_review.loc[:, 'reviews.text'].str.lower()
df_review.head(n=2)

Unnamed: 0,reviews.text,lower_text
0,I initially had trouble deciding between the p...,i initially had trouble deciding between the p...
1,Allow me to preface this with a little history...,allow me to preface this with a little history...


In [9]:
# Step 2: strip everything except lower-case letters, apostrophes and spaces
# from lower_text.
# regex=True is required: the pattern is a regular-expression character class,
# and pandas >= 2.0 treats the pattern as a literal string by default, which
# would leave the text unchanged.
# NOTE(review): characters are deleted, not replaced by a space, so words
# separated only by punctuation are joined (e.g. "voyage.Fortunately").
df_review['new_text'] = df_review['lower_text'].str.replace("[^a-z' ]", "", regex=True)
df_review['new_text'][0]

"i initially had trouble deciding between the paperwhite and the voyage because reviews more or less said the same thing the paperwhite is great but if you have spending money go for the voyagefortunately i had friends who owned each so i ended up buying the paperwhite on this basis both models now have  ppi so the  dollar jump turns out pricey the voyage's page press isn't always sensitive and if you are fine with a specific setting you don't need auto light adjustmentit's been a week and i am loving my paperwhite no regrets the touch screen is receptive and easy to use and i keep the light at a specific setting regardless of the time of day in any case it's not hard to change the setting either as you'll only be changing the light level at a certain time of day not every now and then while readingalso glad that i went for the international shipping option with amazon extra expense but delivery was on time with tracking and i didnt need to worry about customs which i may have if i use

In [10]:
# STEP 3 - Removing the stopwords

from nltk.corpus import stopwords

# English stop-word list from the NLTK stopwords corpus
stop = stopwords.words(fileids='english')

# stop

In [11]:
# Helper that removes stop words from one review string.
def sw(x):
    """Return ``x`` without the tokens that appear in the global ``stop``.

    ``x`` is split on whitespace, each token is checked for membership in
    ``stop``, and the surviving tokens are re-joined with single spaces.
    """
    kept = []
    for token in x.split():
        if token in stop:
            continue
        kept.append(token)
    return " ".join(kept)

# Run sw over every entry of new_text and keep the result as clean_text
df_review['clean_text'] = df_review.loc[:, 'new_text'].apply(sw)

# Inspect the cleaned version of the first review
df_review.loc[0, 'clean_text']

"initially trouble deciding paperwhite voyage reviews less said thing paperwhite great spending money go voyagefortunately friends owned ended buying paperwhite basis models ppi dollar jump turns pricey voyage's page press always sensitive fine specific setting need auto light adjustmentit's week loving paperwhite regrets touch screen receptive easy use keep light specific setting regardless time day case hard change setting either changing light level certain time day every readingalso glad went international shipping option amazon extra expense delivery time tracking didnt need worry customs may used third party shipping service"

In [12]:
# Step 4: helper that POS-tags a text and keeps only the noun tokens
def nouns(x):
    """Return the tokens of ``x`` tagged 'NN' or 'NNP', joined by spaces.

    ``x`` is tokenized with ``word_tokenize`` and tagged with ``pos_tag``.
    Only tokens whose tag is exactly 'NN' or 'NNP' are kept; every other tag
    (including 'NNS' and 'NNPS') is dropped.
    """
    noun_tags = ("NN", "NNP")
    kept = []
    for word, tag in pos_tag(word_tokenize(x)):
        if tag in noun_tags:
            kept.append(word)
    return " ".join(kept)
    

In [13]:
# Step 5: apply nouns to the clean_text column of df_review and store the
# noun-only text as Final_text

df_review['Final_text'] = df_review.loc[:, 'clean_text'].apply(nouns)

df_review.head(n=2)

Unnamed: 0,reviews.text,lower_text,new_text,clean_text,Final_text
0,I initially had trouble deciding between the p...,i initially had trouble deciding between the p...,i initially had trouble deciding between the p...,initially trouble deciding paperwhite voyage r...,trouble voyage thing paperwhite spending money...
1,Allow me to preface this with a little history...,allow me to preface this with a little history...,allow me to preface this with a little history...,allow preface little history casual reader own...,preface history reader touch series girl serie...


In [14]:
# STEP 6 - Creating the bigram document-term matrix (DTM)

# STEP 6.1 - Create a TF-IDF vectorizer restricted to bigrams.
# min_df=0.001 is a proportion: bigrams that occur in fewer than 0.1% of the
# documents are discarded.

tfidf_vec_bigram = TfidfVectorizer(min_df=0.001, ngram_range=(2,2))

# STEP 6.2 - Learn the vocabulary and build the DTM from the Final_text column
# of df_review in one pass. fit_transform already fits the vectorizer, so a
# separate fit() call beforehand only repeats the same work.

DTM_bigram = tfidf_vec_bigram.fit_transform(df_review['Final_text'])

DTM_bigram

<1597x5015 sparse matrix of type '<class 'numpy.float64'>'
	with 31190 stored elements in Compressed Sparse Row format>

In [15]:
# FINAL STEP - convert the sparse DTM into a DataFrame with one column per bigram

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back to the old name only on releases that
# do not provide it.
bigram_terms = (
    tfidf_vec_bigram.get_feature_names_out()
    if hasattr(tfidf_vec_bigram, "get_feature_names_out")
    else tfidf_vec_bigram.get_feature_names()
)

DTM_BIGRAM_DF = pd.DataFrame(DTM_bigram.toarray(),
                      columns = bigram_terms)

DTM_BIGRAM_DF.head()

Unnamed: 0,aa aa,aa energizer,ability display,ability download,ability filter,ability plug,ability screen,ability storage,ability stream,ability try,...,youbattery life,youi ereader,youtube chance,youtube fire,youtube hdx,youtube hear,youtube video,youtube videos,zen type,zink appmy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Total TF-IDF score of each bigram: sum every column of the DTM, then move
# the bigram labels out of the index into a regular column
word_tfidf_bigram = DTM_BIGRAM_DF.sum(axis=0).reset_index()

# Give the two columns readable names
column_names = {'index': 'TERMS', 0: 'TF_IDF Score'}
word_table_bigram = word_tfidf_bigram.rename(columns=column_names)

# Show the 20 bigrams with the highest total score
word_table_bigram.sort_values(by='TF_IDF Score', ascending=False).head(n=20)

Unnamed: 0,TERMS,TF_IDF Score
1600,fire hd,27.705073
237,apple buds,27.676492
1635,fire tv,26.14969
254,apple tv,16.031785
2234,kindle fire,14.447122
1604,fire hdx,14.238316
344,battery life,13.015562
4547,tv tv,12.888519
4022,star rating,10.845378
4768,voice search,10.363506


### Word similarity using Cosine similarity:

In [17]:
#Import the function
from sklearn.metrics.pairwise import cosine_similarity 

In [19]:
# Build the cosine similarity matrix between bigrams: transposing the DTM
# puts one bigram per row, so the similarity is computed bigram-to-bigram
sim_metrics = cosine_similarity(DTM_BIGRAM_DF.transpose())
sim_metrics

array([[1., 1., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [20]:
# Wrap sim_metrics in a DataFrame labelled by bigram on both axes
sim_df = pd.DataFrame(
    data=sim_metrics,
    index=DTM_BIGRAM_DF.columns,
    columns=DTM_BIGRAM_DF.columns,
)
sim_df.head(n=2)

Unnamed: 0,aa aa,aa energizer,ability display,ability download,ability filter,ability plug,ability screen,ability storage,ability stream,ability try,...,youbattery life,youi ereader,youtube chance,youtube fire,youtube hdx,youtube hear,youtube video,youtube videos,zen type,zink appmy
aa aa,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aa energizer,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### The cosine similarity matrix has now been created

In [24]:
# With the similarity matrix we can look up the top-n most similar terms
# for any given term.

def get_sim_words(input_word,sim_df,n_words):
    """Return the ``n_words`` entries most similar to ``input_word``.

    The ``input_word`` column of ``sim_df`` is sorted by score in descending
    order, the row labelled ``input_word`` itself is removed, and the first
    ``n_words`` remaining rows are returned as a pandas Series (index =
    terms, values = similarity scores).
    """
    ranked = sim_df[input_word].sort_values(ascending=False)
    return ranked.drop(input_word).head(n_words)
    

In [26]:
# Ten bigrams most similar to "amazon sound"
get_sim_words(input_word="amazon sound", sim_df=sim_df, n_words=10)

apple budsread         0.859233
year sound             0.859233
tug stay               0.859233
fall validity          0.859233
rating product         0.859233
midrange musician      0.859233
ignorance apple        0.859233
musician something     0.859233
fall cord              0.859233
something ignorance    0.859233
Name: amazon sound, dtype: float64

In [27]:
# Ten bigrams most similar to "fire hd"
get_sim_words(input_word="fire hd", sim_df=sim_df, n_words=10)

hd fire                 0.672875
hd year                 0.641897
kindle fire             0.599957
hdx fire                0.570118
year hd                 0.555280
fire hdx                0.532366
firstgeneration fire    0.492099
thing software          0.490400
year firstgeneration    0.490400
show book               0.490400
Name: fire hd, dtype: float64

In [28]:
#End