## NLP Project

## Topic Analysis of Review





In [1]:
# importing required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import nltk


In [2]:
#importing data
df= pd.read_csv(r'E:\Simplilearn\Live clases\NLP\Projects\Topic Analysis of Review Data\K8 Reviews v0.2.csv')

In [3]:
#viewing top 5 records
df.head()

Unnamed: 0,sentiment,review
0,1,Good but need updates and improvements
1,0,"Worst mobile i have bought ever, Battery is dr..."
2,1,when I will get my 10% cash back.... its alrea...
3,1,Good
4,0,The worst phone everThey have changed the last...


In [4]:
#info of dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14675 entries, 0 to 14674
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  14675 non-null  int64 
 1   review     14675 non-null  object
dtypes: int64(1), object(1)
memory usage: 229.4+ KB


In [5]:
df.describe()

Unnamed: 0,sentiment
count,14675.0
mean,0.47448
std,0.499365
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [6]:
#columns of dataset
df.columns

Index(['sentiment', 'review'], dtype='object')

### 2.Normalize casings for the review text and extract the text into a list for easier manipulation

In [7]:
reviews_lower = [sent.lower() for sent in df.review.values]
reviews_lower[0]


'good but need updates and improvements'


### 3.Tokenize the reviews using NLTKs word_tokenize function.


In [8]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [9]:
#Tokenize the reviews using NLTKs word_tokenize function.
reviews_token = [nltk.word_tokenize(sent) for sent in reviews_lower]
reviews_token[0]



['good', 'but', 'need', 'updates', 'and', 'improvements']

### 4. Perform parts-of-speech tagging on each sentence using the NLTK POS tagger.

In [10]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [11]:
#Perform parts-of-speech tagging on each sentence using the NLTK POS tagger.
#For a single sentence: 
nltk.pos_tag(reviews_token[0])


[('good', 'JJ'),
 ('but', 'CC'),
 ('need', 'VBP'),
 ('updates', 'NNS'),
 ('and', 'CC'),
 ('improvements', 'NNS')]

In [12]:
 #for all the sentences using list comprehension:
reviews_tagged = [nltk.pos_tag(tokens) for tokens in reviews_token]

reviews_tagged[0]


[('good', 'JJ'),
 ('but', 'CC'),
 ('need', 'VBP'),
 ('updates', 'NNS'),
 ('and', 'CC'),
 ('improvements', 'NNS')]

### 5. For the topic model, we should  want to include only nouns.
**a. First, find out all the POS tags that correspond to nouns.**

**b. Limit the data to only terms with these tags**

finding out the different POS tags using the NLTK help utility.


In [13]:
nltk.download('tagsets')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping help\tagsets.zip.


True

In [14]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [15]:
#The tags we are interested in are NN, NNP, NNS, and NNPS, all tags that begin with ‘NN’.

tagged_tuple = nltk.pos_tag(['great'])
tagged_tuple[0]


('great', 'JJ')

In [16]:
import re
reviews_noun=[]
for sent in reviews_tagged:
    reviews_noun.append([token for token in sent if re.search("NN.*", token[1])])

reviews_noun[0]


[('updates', 'NNS'), ('improvements', 'NNS')]

### 6.Lemmatize
**a. Different forms of the terms needs to be treated as one.**

**b. No need to provide POS tag to lemmatizer for now.**


In [17]:
from nltk import WordNetLemmatizer
lemm = WordNetLemmatizer()


In [18]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [19]:
reviews_lemm=[]
for sent in reviews_noun:
    reviews_lemm.append([lemm.lemmatize(word[0]) for word in sent])    

reviews_lemm[0]


['update', 'improvement']

### 7. Remove stop words and punctuation (if there are any). 

In [20]:
from string import punctuation
from nltk.corpus import stopwords


In [21]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
stop_nltk = stopwords.words("english")
stop_updated = stop_nltk + list(punctuation) + ["..."] + [".."]

In [23]:
#Creating an empty list for the result.
reviews_sw_removed=[]


In [24]:
#Looping over the reviews:
for sent in reviews_lemm:
    reviews_sw_removed.append([term for term in sent if term not in stop_updated])


In [25]:
#Examining one of them: 
reviews_sw_removed[1]


['mobile',
 'battery',
 'hell',
 'backup',
 'hour',
 'us',
 'idle',
 'discharged.this',
 'lie',
 'amazon',
 'lenove',
 'battery',
 'charger',
 'hour']

### 8.Create a topic model using LDA on the cleaned up data with 12 topics.
**a. Print out the top terms for each topic.**

**b. What is the coherence of the model with the c_v metric?**


In [28]:
#Importing the required modules:
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.models import ldamodel
from gensim import corpora

In [29]:
id2word = corpora.Dictionary(reviews_sw_removed)
texts = reviews_sw_removed


In [30]:
#Applying id2word on our reviews data to get the indices, instead of words for each review.
corpus = [id2word.doc2bow(text) for text in texts]
print(corpus[200])


[(36, 1), (143, 1), (314, 1), (415, 1), (416, 1)]


In [31]:
#Building the topic model using LDA, with 12 topics.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=12, 
                                           random_state=42,
                                           passes=10,
                                           per_word_topics=True)


In [32]:
pprint(lda_model.print_topics())

Pretty printing has been turned OFF


In [33]:
#Calculating the coherence of the model using the c_v metric.
coherence_model_lda = CoherenceModel(model=lda_model, texts=reviews_sw_removed, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

print('\nCoherence Score: ', coherence_lda)



Coherence Score:  0.475339388396195



### 9. Analyze the topics through the business lens.
a. Determine which of the topics can be combined.

Looking at the topics and each terms following can be combined 

--Topic 2 and 5 possibly talks about 'pricing'

--Topic 4, 6 and 10 closely talks about 'battery related issues'

--Topic 3 and 11 vaguely talks about 'performance'


### 10. Create topic model using LDA with what you think is the optimal number of topics

8 topics seems to be the right number of topics from the data.
We’ll create a topic model with 8 topics.


In [34]:
# Build LDA model:
lda_model8 = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=8, 
                                           random_state=42,
                                           passes=10,
                                           per_word_topics=True)



In [35]:
#Printing the coherence of the model:
coherence_model_lda = CoherenceModel(model=lda_model8, texts=reviews_sw_removed, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)



Coherence Score:  0.5351527233521374


The coherence is now 0.58 which is a significant increase from 0.53 previously.





### 11. The business should  be able to interpret the topics.
a. Name each of the identified topics.

b. Create a table with the topic name and the top 10 terms in each to present to the business.

In [36]:
x = lda_model8.show_topics(formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

for topic,words in topics_words:
    print(str(topic)+ "::"+ str(words))
print()


0::['mobile', 'charging', 'hour', 'charger', 'charge', 'battery', 'turbo', 'hr', 'card', 'notification']
1::['money', 'waste', 'value', 'screen', 'glass', 'speaker', 'call', 'handset', 'box', 'headphone']
2::['note', 'camera', 'quality', 'k8', 'feature', 'lenovo', 'sound', 'phone', 'music', 'speaker']
3::['phone', 'day', 'issue', 'time', 'battery', 'lenovo', 'month', 'problem', 'service', 'update']
4::['product', 'problem', 'network', 'issue', 'heating', 'amazon', 'sim', 'return', '....', 'delivery']
5::['camera', 'battery', 'phone', 'performance', 'quality', 'backup', '....', 'issue', 'life', 'processor']
6::['price', 'phone', 'range', 'superb', 'device', 'super', 'feature', 'excellent', 'specification', 'k']
7::['charger', 'hai', 'h', 'ho', 'cable', 'bill', 'bhi', 'hi', 'offer', 'ye']

