In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import corpus

import gensim

from tabulate import tabulate

import warnings
warnings.filterwarnings(action='ignore',category=DeprecationWarning)
warnings.filterwarnings(action='ignore',category=FutureWarning)

## 1. Read the .csv file using Pandas. Take a look at the top few records.

In [2]:
df = pd.read_csv(r'C:\Users\jlod9\OneDrive\Desktop\AI__Projects\NLP\K8 Reviews v0.2.csv')

In [3]:
df.head()

Unnamed: 0,sentiment,review
0,1,Good but need updates and improvements
1,0,"Worst mobile i have bought ever, Battery is dr..."
2,1,when I will get my 10% cash back.... its alrea...
3,1,Good
4,0,The worst phone everThey have changed the last...


## 2. Normalize casings for the review text and extract the text into a list for easier manipulation.

In [4]:
df['review']=df['review'].apply(str)

In [5]:
norm_reviews = []
for r in df['review']:
        norm_reviews.append(r.lower())
norm_reviews[:4]

['good but need updates and improvements',
 "worst mobile i have bought ever, battery is draining like hell, backup is only 6 to 7 hours with internet uses, even if i put mobile idle its getting discharged.this is biggest lie from amazon & lenove which is not at all expected, they are making full by saying that battery is 4000mah & booster charger is fake, it takes at least 4 to 5 hours to be fully charged.don't know how lenovo will survive by making full of us.please don;t go for this else you will regret like me.",
 'when i will get my 10% cash back.... its already 15 january..',
 'good']

## 3. Tokenize the reviews using NLTKs word_tokenize function.

In [6]:
from nltk.tokenize import word_tokenize

In [7]:
tokenized = [word_tokenize(i) for i in norm_reviews]
tokenized[:4]

[['good', 'but', 'need', 'updates', 'and', 'improvements'],
 ['worst',
  'mobile',
  'i',
  'have',
  'bought',
  'ever',
  ',',
  'battery',
  'is',
  'draining',
  'like',
  'hell',
  ',',
  'backup',
  'is',
  'only',
  '6',
  'to',
  '7',
  'hours',
  'with',
  'internet',
  'uses',
  ',',
  'even',
  'if',
  'i',
  'put',
  'mobile',
  'idle',
  'its',
  'getting',
  'discharged.this',
  'is',
  'biggest',
  'lie',
  'from',
  'amazon',
  '&',
  'lenove',
  'which',
  'is',
  'not',
  'at',
  'all',
  'expected',
  ',',
  'they',
  'are',
  'making',
  'full',
  'by',
  'saying',
  'that',
  'battery',
  'is',
  '4000mah',
  '&',
  'booster',
  'charger',
  'is',
  'fake',
  ',',
  'it',
  'takes',
  'at',
  'least',
  '4',
  'to',
  '5',
  'hours',
  'to',
  'be',
  'fully',
  'charged.do',
  "n't",
  'know',
  'how',
  'lenovo',
  'will',
  'survive',
  'by',
  'making',
  'full',
  'of',
  'us.please',
  'don',
  ';',
  't',
  'go',
  'for',
  'this',
  'else',
  'you',
  'will

## 4. Perform parts-of-speech tagging on each sentence using the NLTK POS tagger.

In [8]:
pos_tagged = []
for sent in tokenized:
    pos_tagged.append(nltk.pos_tag(sent))
pos_tagged[:4]

[[('good', 'JJ'),
  ('but', 'CC'),
  ('need', 'VBP'),
  ('updates', 'NNS'),
  ('and', 'CC'),
  ('improvements', 'NNS')],
 [('worst', 'JJS'),
  ('mobile', 'NN'),
  ('i', 'NN'),
  ('have', 'VBP'),
  ('bought', 'VBN'),
  ('ever', 'RB'),
  (',', ','),
  ('battery', 'NN'),
  ('is', 'VBZ'),
  ('draining', 'VBG'),
  ('like', 'IN'),
  ('hell', 'NN'),
  (',', ','),
  ('backup', 'NN'),
  ('is', 'VBZ'),
  ('only', 'RB'),
  ('6', 'CD'),
  ('to', 'TO'),
  ('7', 'CD'),
  ('hours', 'NNS'),
  ('with', 'IN'),
  ('internet', 'JJ'),
  ('uses', 'NNS'),
  (',', ','),
  ('even', 'RB'),
  ('if', 'IN'),
  ('i', 'JJ'),
  ('put', 'VBP'),
  ('mobile', 'JJ'),
  ('idle', 'NN'),
  ('its', 'PRP$'),
  ('getting', 'VBG'),
  ('discharged.this', 'NN'),
  ('is', 'VBZ'),
  ('biggest', 'JJS'),
  ('lie', 'NN'),
  ('from', 'IN'),
  ('amazon', 'NN'),
  ('&', 'CC'),
  ('lenove', 'NN'),
  ('which', 'WDT'),
  ('is', 'VBZ'),
  ('not', 'RB'),
  ('at', 'IN'),
  ('all', 'DT'),
  ('expected', 'VBN'),
  (',', ','),
  ('they', 'PRP'),


## 5. For the topic model, we should  want to include only nouns.

###    1. Find out all the POS tags that correspond to nouns.

###    2. Limit the data to only terms with these tags.

In [9]:
tokenized[:4]

[['good', 'but', 'need', 'updates', 'and', 'improvements'],
 ['worst',
  'mobile',
  'i',
  'have',
  'bought',
  'ever',
  ',',
  'battery',
  'is',
  'draining',
  'like',
  'hell',
  ',',
  'backup',
  'is',
  'only',
  '6',
  'to',
  '7',
  'hours',
  'with',
  'internet',
  'uses',
  ',',
  'even',
  'if',
  'i',
  'put',
  'mobile',
  'idle',
  'its',
  'getting',
  'discharged.this',
  'is',
  'biggest',
  'lie',
  'from',
  'amazon',
  '&',
  'lenove',
  'which',
  'is',
  'not',
  'at',
  'all',
  'expected',
  ',',
  'they',
  'are',
  'making',
  'full',
  'by',
  'saying',
  'that',
  'battery',
  'is',
  '4000mah',
  '&',
  'booster',
  'charger',
  'is',
  'fake',
  ',',
  'it',
  'takes',
  'at',
  'least',
  '4',
  'to',
  '5',
  'hours',
  'to',
  'be',
  'fully',
  'charged.do',
  "n't",
  'know',
  'how',
  'lenovo',
  'will',
  'survive',
  'by',
  'making',
  'full',
  'of',
  'us.please',
  'don',
  ';',
  't',
  'go',
  'for',
  'this',
  'else',
  'you',
  'will

In [10]:
pos_tagged

[[('good', 'JJ'),
  ('but', 'CC'),
  ('need', 'VBP'),
  ('updates', 'NNS'),
  ('and', 'CC'),
  ('improvements', 'NNS')],
 [('worst', 'JJS'),
  ('mobile', 'NN'),
  ('i', 'NN'),
  ('have', 'VBP'),
  ('bought', 'VBN'),
  ('ever', 'RB'),
  (',', ','),
  ('battery', 'NN'),
  ('is', 'VBZ'),
  ('draining', 'VBG'),
  ('like', 'IN'),
  ('hell', 'NN'),
  (',', ','),
  ('backup', 'NN'),
  ('is', 'VBZ'),
  ('only', 'RB'),
  ('6', 'CD'),
  ('to', 'TO'),
  ('7', 'CD'),
  ('hours', 'NNS'),
  ('with', 'IN'),
  ('internet', 'JJ'),
  ('uses', 'NNS'),
  (',', ','),
  ('even', 'RB'),
  ('if', 'IN'),
  ('i', 'JJ'),
  ('put', 'VBP'),
  ('mobile', 'JJ'),
  ('idle', 'NN'),
  ('its', 'PRP$'),
  ('getting', 'VBG'),
  ('discharged.this', 'NN'),
  ('is', 'VBZ'),
  ('biggest', 'JJS'),
  ('lie', 'NN'),
  ('from', 'IN'),
  ('amazon', 'NN'),
  ('&', 'CC'),
  ('lenove', 'NN'),
  ('which', 'WDT'),
  ('is', 'VBZ'),
  ('not', 'RB'),
  ('at', 'IN'),
  ('all', 'DT'),
  ('expected', 'VBN'),
  (',', ','),
  ('they', 'PRP'),


In [11]:
pos_nouns = []
for i in range(len(pos_tagged)):
    row = []
    for word, pos in pos_tagged[i]:
        if (pos == 'NNS' or pos == 'NN' or pos == 'NNP' or pos == 'NNPS'):
            row.append(word)
    pos_nouns.append(row)
pos_nouns        

[['updates', 'improvements'],
 ['mobile',
  'i',
  'battery',
  'hell',
  'backup',
  'hours',
  'uses',
  'idle',
  'discharged.this',
  'lie',
  'amazon',
  'lenove',
  'battery',
  'charger',
  'hours',
  'don'],
 ['i', '%', 'cash', 'january..'],
 [],
 ['phone', 'everthey', 'phone', 'problem', 'amazon', 'phone', 'amazon'],
 ['camerawaste', 'money'],
 ['phone', 'reason', 'k8'],
 ['battery', 'level'],
 ['problems',
  'phone',
  'hanging',
  'problems',
  'note',
  'station',
  'ahmedabad',
  'years',
  'phone',
  'lenovo'],
 ['lot', 'glitches', 'thing', 'options'],
 ['wrost'],
 ['phone', 'charger', 'damage', 'months'],
 ['item', 'battery', 'life'],
 ['i',
  'battery',
  'problem',
  'motherboard',
  'problem',
  'months',
  'mobile',
  'life'],
 ['phone', 'slim', 'battry', 'backup', 'screen'],
 ['headset'],
 ['time', 'i'],
 ['product',
  'prize',
  'range',
  'specification',
  'comparison',
  'mobile',
  'range',
  'i',
  'phone',
  'seal',
  'i',
  'credit',
  'card',
  'i',
  '..',

## 6. Lemmatize. 

###    1.Different forms of the terms need to be treated as one.

###    2. No need to provide POS tag to lemmatizer for now.

In [12]:
lemmatizer = WordNetLemmatizer()

In [13]:
lem = []
for i in range(len(pos_nouns)):
    row = []
    for ele in pos_nouns[i]:
        lemm_word = lemmatizer.lemmatize(ele)
        row.append(lemm_word)
    lem.append(row)
lem

[['update', 'improvement'],
 ['mobile',
  'i',
  'battery',
  'hell',
  'backup',
  'hour',
  'us',
  'idle',
  'discharged.this',
  'lie',
  'amazon',
  'lenove',
  'battery',
  'charger',
  'hour',
  'don'],
 ['i', '%', 'cash', 'january..'],
 [],
 ['phone', 'everthey', 'phone', 'problem', 'amazon', 'phone', 'amazon'],
 ['camerawaste', 'money'],
 ['phone', 'reason', 'k8'],
 ['battery', 'level'],
 ['problem',
  'phone',
  'hanging',
  'problem',
  'note',
  'station',
  'ahmedabad',
  'year',
  'phone',
  'lenovo'],
 ['lot', 'glitch', 'thing', 'option'],
 ['wrost'],
 ['phone', 'charger', 'damage', 'month'],
 ['item', 'battery', 'life'],
 ['i',
  'battery',
  'problem',
  'motherboard',
  'problem',
  'month',
  'mobile',
  'life'],
 ['phone', 'slim', 'battry', 'backup', 'screen'],
 ['headset'],
 ['time', 'i'],
 ['product',
  'prize',
  'range',
  'specification',
  'comparison',
  'mobile',
  'range',
  'i',
  'phone',
  'seal',
  'i',
  'credit',
  'card',
  'i',
  '..',
  'deal',
  '

## 7. Remove stopwords and punctuation (if there are any). 

In [14]:
lem_words = []
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
for i in range(len(lem)):
    row = []
    for ele in lem[i]:
        if ele not in stop_words:
            row.append(ele)
    lem_words.append(row)
lem_words

[['update', 'improvement'],
 ['mobile',
  'battery',
  'hell',
  'backup',
  'hour',
  'us',
  'idle',
  'discharged.this',
  'lie',
  'amazon',
  'lenove',
  'battery',
  'charger',
  'hour'],
 ['%', 'cash', 'january..'],
 [],
 ['phone', 'everthey', 'phone', 'problem', 'amazon', 'phone', 'amazon'],
 ['camerawaste', 'money'],
 ['phone', 'reason', 'k8'],
 ['battery', 'level'],
 ['problem',
  'phone',
  'hanging',
  'problem',
  'note',
  'station',
  'ahmedabad',
  'year',
  'phone',
  'lenovo'],
 ['lot', 'glitch', 'thing', 'option'],
 ['wrost'],
 ['phone', 'charger', 'damage', 'month'],
 ['item', 'battery', 'life'],
 ['battery', 'problem', 'motherboard', 'problem', 'month', 'mobile', 'life'],
 ['phone', 'slim', 'battry', 'backup', 'screen'],
 ['headset'],
 ['time'],
 ['product',
  'prize',
  'range',
  'specification',
  'comparison',
  'mobile',
  'range',
  'phone',
  'seal',
  'credit',
  'card',
  '..',
  'deal',
  'amazon..'],
 ['battery', 'solution', 'battery', 'life'],
 ['smartp

In [15]:
lem_final = []
table = str.maketrans('', '', string.punctuation)
for i in range(len(lem_words)):
    row = []
    for w in lem_words[i]:
        row.append(w)
    lem_final.append(row)        

In [16]:
lem_final[:4]

[['update', 'improvement'],
 ['mobile',
  'battery',
  'hell',
  'backup',
  'hour',
  'us',
  'idle',
  'discharged.this',
  'lie',
  'amazon',
  'lenove',
  'battery',
  'charger',
  'hour'],
 ['%', 'cash', 'january..'],
 []]

## 8. Create a topic model using LDA on the cleaned-up data with 12 topics.

###     1. Print out the top terms for each topic.

###     2. What is the coherence of the model with the c_v metric?

In [17]:
dictionary = gensim.corpora.Dictionary(lem_final)

In [18]:
bow_corpus = [dictionary.doc2bow(word) for word in lem_final]
bow_corpus[:4]

[[(0, 1), (1, 1)],
 [(2, 1),
  (3, 1),
  (4, 2),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 2),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1)],
 [(14, 1), (15, 1), (16, 1)],
 []]

In [19]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=bow_corpus,
                                           id2word=dictionary,
                                           num_topics=12, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [20]:
from gensim.models import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda_model, texts=lem_final, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.40848791742022456


## 9. Analyze the topics through the business lens.

###    1. Determine which of the topics can be combined.

In [21]:
for i,topic in lda_model.print_topics(-1,num_words=10):
    print("Topic: {} \nWords: {}".format(i + 1, topic ))

Topic: 1 
Words: 0.201*"quality" + 0.057*"mode" + 0.052*"display" + 0.036*"usage" + 0.028*"depth" + 0.027*"glass" + 0.027*"buy" + 0.024*"sensor" + 0.019*"thanks" + 0.018*"piece"
Topic: 2 
Words: 0.159*"feature" + 0.096*"month" + 0.093*"call" + 0.057*"sound" + 0.053*"delivery" + 0.042*"bit" + 0.039*"handset" + 0.036*"work" + 0.035*"budget" + 0.028*"key"
Topic: 3 
Words: 0.123*"day" + 0.105*"lenovo" + 0.081*"k8" + 0.074*"amazon" + 0.030*"hai" + 0.029*"h" + 0.027*"please" + 0.025*"support" + 0.024*"game" + 0.023*"dolby"
Topic: 4 
Words: 0.284*"battery" + 0.091*"time" + 0.048*"backup" + 0.040*"hour" + 0.035*"processor" + 0.033*"speaker" + 0.030*"charge" + 0.028*"%" + 0.026*".." + 0.024*"use"
Topic: 5 
Words: 0.398*"product" + 0.164*"price" + 0.030*"photo" + 0.026*"box" + 0.024*"button" + 0.022*"flash" + 0.022*"memory" + 0.020*"message" + 0.015*"volume" + 0.013*"point"
Topic: 6 
Words: 0.070*"service" + 0.067*"device" + 0.059*"charger" + 0.043*"sim" + 0.037*"waste" + 0.033*"customer" + 0.03

In [22]:
print("I have combineed the 12 topics into the following 6 topics:")
print("Topics 1, 4, 5")
print("Topics 2, 3, 6")
print("Topics 7")
print("Topics 8")
print("Topics 9, 10, 11")
print("Topics 12")

I have combineed the 12 topics into the following 6 topics:
Topics 1, 4, 5
Topics 2, 3, 6
Topics 7
Topics 8
Topics 9, 10, 11
Topics 12


## 10. Create a topic model using LDA with what you think is the optimal number of topics

###    1. What is the coherence of the model?

In [23]:
def compute_coherence_score(corpus,dictionary,k,a):
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           alpha=a,
                                           passes=10,
                                           random_state=100)
    lda_model_coherence = CoherenceModel(model=lda_model,
                                         texts=lem_final,
                                         dictionary=dictionary,coherence='c_v')
    return lda_model_coherence.get_coherence()

In [24]:
alpha =['symmetric','asymmetric']
opt_num = 0
opt_score = 0
opt_alpha = 'symmetric'
for a in alpha:
    print('Coherence Model where alpha = {}'.format(a))
    print('\n')
    for i in range(6,12):
        score = compute_coherence_score(corpus=bow_corpus,dictionary=dictionary,k=i,a=a)        
        print(f'Coherence score with {i} topics is {score}')     
        if score > opt_score:
            opt_score = score
            opt_num = i
            if a == 'asymmetric':
                opt_alpha = a
    print('\n')


Coherence Model where alpha = symmetric


Coherence score with 6 topics is 0.5098122558499899
Coherence score with 7 topics is 0.5360946256722587
Coherence score with 8 topics is 0.5233852624130158
Coherence score with 9 topics is 0.5302128973544324
Coherence score with 10 topics is 0.5299833629132256
Coherence score with 11 topics is 0.5188960018814192


Coherence Model where alpha = asymmetric


Coherence score with 6 topics is 0.5506177794878443
Coherence score with 7 topics is 0.5108171055659081
Coherence score with 8 topics is 0.5163527877113094
Coherence score with 9 topics is 0.527714324382923
Coherence score with 10 topics is 0.5196283252625122
Coherence score with 11 topics is 0.49262293505748045




In [25]:
print(f'The optimal number of topics is {opt_num} where alpha is {opt_alpha} with a coherence of {opt_score}')

The optimal number of topics is 6 where alpha is asymmetric with a coherence of 0.5506177794878443


## 11. The business should be able to interpret the topics.

###    1. Name each of the identified topics.

###    2. Create a table with the topic name and the top 10 terms in each to present to the business.

In [26]:
table1 = [  
    ['Device Information', 'Topics 1, 4, 5'], 
    ['Customer Service', 'Topics 2, 3, 6'], 
    ['Contract Information', 'Topic 7'],
    ['Quality', 'Topic 8'],
    ['User Experience', 'Topics 9, 10, 11'],
    ['Issues', 'Topics 12']  
         ]

In [27]:
print(tabulate(table1))

--------------------  ----------------
Device Information    Topics 1, 4, 5
Customer Service      Topics 2, 3, 6
Contract Information  Topic 7
Quality               Topic 8
User Experience       Topics 9, 10, 11
Issues                Topics 12
--------------------  ----------------


In [30]:
table2 = [
    ['Device Information','product, quality, battery, price, time, mode, display, backup, hour, process'],
    ['Customer Service', 'feature, day, lenova, month, call, k8, amazon, service, device, charger'],
    ['Contract Information','money, heat, range,value, replacement, mark, internet, specification, function, mi'],
    ['Quality', 'performance, network, speed, superb, clarity, smartphone, bug, people, gb, excellent'],
    ['User Experience', 'camera, mobile, note, option, lot, screen, thing, ram, user, life'],
    ['Issues', 'phone, problem, issue, heating, update, software, everything, app, volta, model']
    
]

In [31]:
print(tabulate(table2))

--------------------  ------------------------------------------------------------------------------------
Device Information    product, quality, battery, price, time, mode, display, backup, hour, process
Customer Service      feature, day, lenova, month, call, k8, amazon, service, device, charger
Contract Information  money, heat, range,value, replacement, mark, internet, specification, function, mi
Quality               performance, network, speed, superb, clarity, smartphone, bug, people, gb, excellent
User Experience       camera, mobile, note, option, lot, screen, thing, ram, user, life
Issues                phone, problem, issue, heating, update, software, everything, app, volta, model
--------------------  ------------------------------------------------------------------------------------
