In [2]:
# import Python packages
import pandas as pd
import numpy as np
from google.colab import files
import matplotlib.pyplot as plt

# regression package
import statsmodels.api as sm

# sentiment analysis packages
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from textblob import TextBlob

# topic modeling packages
import gensim
from gensim import corpora

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
!jupyter nbconvert --to html /content/MA_ind1.ipynb

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr

## **Part 1: Sentiment Analysis**

In [4]:
#Product review data
uploaded = files.upload()
reviews = pd.read_csv('product_reviews.csv')
# take a look at the data
reviews.head(2)

Saving product_reviews.csv to product_reviews.csv


Unnamed: 0,Review_ID,Item_ID,Base_item_ID,Review_date,Reviewer_ID,Real_name,Verified_purchase,Rating,Title,Content,...,Helpful_votes_week15,Helpful_votes_week16,Helpful_votes_week17,Helpful_votes_week18,Helpful_votes_week19,Helpful_votes_week20,Helpful_votes_week21,Helpful_votes_week22,Helpful_votes_week23,Helpful_votes_week24
0,R100E6MT94PK6L,B0051VVOB2,,1/8/2012,A1HGATCAMGXTGF,False,True,5,Love My Kindle Fire!,I love my fire and highly recommend it to anyo...,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5
1,R100HU42LKLLD0,B0057O9O6K,,4/10/2012,A3GGO95QT2PP47,False,True,2,Not the best Tablet or a good buy,The operating system is is an early android. Y...,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1


In [5]:
# keep only the review text (Content) and star ratings
reviews_sample = reviews[['Rating','Content']]
reviews_sample.head()

Unnamed: 0,Rating,Content
0,5,I love my fire and highly recommend it to anyo...
1,2,The operating system is is an early android. Y...
2,2,I have been reading on Kindle since the Kindle...
3,5,I bought the Fire because I wanted access to a...
4,5,I got the Galaxy Tab because I wanted a comput...


Text Processing

In [6]:
# remove punctuation and numbers, lower case the text
def clean(text):
    # replace any non-letters with a space
    text = re.sub('[^A-Za-z]+', ' ', text)
    # lower case the text
    text = text.lower()
    return text

# apply the function clean to each review
reviews_sample['Cleaned Reviews'] = reviews_sample['Content'].apply(clean)
reviews_sample.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_sample['Cleaned Reviews'] = reviews_sample['Content'].apply(clean)


Unnamed: 0,Rating,Content,Cleaned Reviews
0,5,I love my fire and highly recommend it to anyo...,i love my fire and highly recommend it to anyo...
1,2,The operating system is is an early android. Y...,the operating system is is an early android yo...


In [7]:
#filter reviews related to kindle
reviews_sample = reviews_sample[reviews_sample['Cleaned Reviews'].str.contains('kindle')]
reviews_sample.head()

Unnamed: 0,Rating,Content,Cleaned Reviews
1,2,The operating system is is an early android. Y...,the operating system is is an early android yo...
2,2,I have been reading on Kindle since the Kindle...,i have been reading on kindle since the kindle...
3,5,I bought the Fire because I wanted access to a...,i bought the fire because i wanted access to a...
4,5,I got the Galaxy Tab because I wanted a comput...,i got the galaxy tab because i wanted a comput...
9,4,"This is a nice little mini computer, but I am ...",this is a nice little mini computer but i am a...


In [8]:
#screen, customer service, weight, price
sample_screen = reviews_sample[reviews_sample['Cleaned Reviews'].str.contains('screen')]
sample_weight = reviews_sample[reviews_sample['Cleaned Reviews'].str.contains('weight')]
sample_price = reviews_sample[reviews_sample['Cleaned Reviews'].str.contains('price')]

In [9]:
# tokenize, remove stop words, stem
# we use the Porter stemmer, a process for removing suffixes from words in English
ps = PorterStemmer()

def token_stop_stem(text):
    tokens = word_tokenize(text) # tokenize the text
    newlist = []
    new_review = ""
    for word in tokens:
        if word not in set(stopwords.words('english')): # remove stop words
          word_stem = ps.stem(word) # stem each word
          newlist.append(word_stem)
          new_review = new_review + " " + word_stem
    return new_review

## **Question 1: Analyze Polarity**

\

 1. Use the tablet review data to calculate the distribution of sentiment polarities (i.e., positive, neutral, negative) for the following Kindle attributes: screen, customer service, weight, price. Hint: First clean, remove stop words, and stem. \
\
A.What is the ratio of positive reviews to negative reviews for each of these attributes? Hint: Use value_counts from pandas.

In [10]:
sample_screen['Final Reviews'] = sample_screen['Cleaned Reviews'].apply(token_stop_stem)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_screen['Final Reviews'] = sample_screen['Cleaned Reviews'].apply(token_stop_stem)


In [11]:
# we will score the polarity of each review
# polarity ranges from -1 (negative) to 1 (positive)
# under the hood, TextBlob uses a lexicon-based method for scoring
# for details, see https://github.com/sloria/TextBlob/blob/dev/textblob/_text.py
def getPolarityScore(review):
    return TextBlob(review).sentiment.polarity

# function to analyze the reviews
def getPolarity(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'

Screen Reviews

In [12]:
sample_screen['Score'] = sample_screen['Final Reviews'].apply(getPolarityScore)
sample_screen['Polarity'] = sample_screen['Score'].apply(getPolarity)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_screen['Score'] = sample_screen['Final Reviews'].apply(getPolarityScore)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_screen['Polarity'] = sample_screen['Score'].apply(getPolarity)


In [13]:
a = sample_screen[['Polarity']].value_counts()
positive = a[0]
negative = a[1]
ratio = positive/negative
ratio
#positive reviews to negative reviews for screen is 15.793002915451895.

15.793002915451895

Customer Service Reviews

In [14]:
sample_service = reviews_sample[reviews_sample['Cleaned Reviews'].str.contains('customer service')]

In [15]:
sample_service['Final Reviews'] = sample_service['Cleaned Reviews'].apply(token_stop_stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_service['Final Reviews'] = sample_service['Cleaned Reviews'].apply(token_stop_stem)


In [16]:
sample_service['Score'] = sample_service['Final Reviews'].apply(getPolarityScore)
sample_service['Polarity'] = sample_service['Score'].apply(getPolarity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_service['Score'] = sample_service['Final Reviews'].apply(getPolarityScore)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_service['Polarity'] = sample_service['Score'].apply(getPolarity)


In [17]:
b = sample_service[['Polarity']].value_counts()
positive_s = b[0]
negative_s = b[1]
service_ratio = positive_s/negative_s
service_ratio
#positive reviews to negative reviews for customer service is 7.6231884057971016.

7.6231884057971016

Weight Review

In [18]:
sample_weight['Final Reviews'] = sample_weight['Cleaned Reviews'].apply(token_stop_stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_weight['Final Reviews'] = sample_weight['Cleaned Reviews'].apply(token_stop_stem)


In [None]:
sample_weight['Score'] = sample_weight['Final Reviews'].apply(getPolarityScore)
sample_weight['Polarity'] = sample_weight['Score'].apply(getPolarity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_weight['Score'] = sample_weight['Final Reviews'].apply(getPolarityScore)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_weight['Polarity'] = sample_weight['Score'].apply(getPolarity)


In [None]:
c = sample_weight[['Polarity']].value_counts()
positive_w = c[0]
negative_w = c[1]
weight_ratio = positive_w/negative_w
weight_ratio
#positive reviews to negative reviews for weight is 25.514285714285716.

25.514285714285716

Price Review

In [None]:
sample_price['Final Reviews'] = sample_price['Cleaned Reviews'].apply(token_stop_stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_price['Final Reviews'] = sample_price['Cleaned Reviews'].apply(token_stop_stem)


In [None]:
sample_price['Score'] = sample_price['Final Reviews'].apply(getPolarityScore)
sample_price['Polarity'] = sample_price['Score'].apply(getPolarity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_price['Score'] = sample_price['Final Reviews'].apply(getPolarityScore)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_price['Polarity'] = sample_price['Score'].apply(getPolarity)


In [None]:
d = sample_price[['Polarity']].value_counts()
positive_p = d[0]
negative_p = d[1]
price_ratio = positive_p/negative_p
price_ratio
#positive reviews to negative reviews for price is 25.514285714285716.

26.76923076923077

# **Question 2: Topic Modeling - Latent Dirichlet Allocation**

Run Latent Dirichlet Allocation for the Amazon Kindle reviews after cleaning and removing the stop words (do not stem) for 3, 4, and 5, topics (you can use the same hyperparameters and seed (9651) in the text analysis ipynb reviewed in class). \
Give names to the topics based on the top six words of each model.
Do the topics overlap?

In [19]:
#recap: reviews that have the word 'Kindle'
reviews_sample

Unnamed: 0,Rating,Content,Cleaned Reviews
1,2,The operating system is is an early android. Y...,the operating system is is an early android yo...
2,2,I have been reading on Kindle since the Kindle...,i have been reading on kindle since the kindle...
3,5,I bought the Fire because I wanted access to a...,i bought the fire because i wanted access to a...
4,5,I got the Galaxy Tab because I wanted a comput...,i got the galaxy tab because i wanted a comput...
9,4,"This is a nice little mini computer, but I am ...",this is a nice little mini computer but i am a...
...,...,...,...
40730,5,"Oh how I love my Kindle Fire, let me count the...",oh how i love my kindle fire let me count the ...
40734,5,"I can read, work, play, hear music and manny t...",i can read work play hear music and manny thin...
40736,5,I got my Kindle Fire for Christmas and have us...,i got my kindle fire for christmas and have us...
40737,5,Kindle Fire extremely user friendly. Does ever...,kindle fire extremely user friendly does every...


In [21]:
# tokenize, remove stop words, return tokens
def token_stop(text):
    tokens = word_tokenize(text) # tokenize the text
    newlist = []
    for word in tokens:
        if word not in set(stopwords.words('english')): # remove stop words
          newlist.append(word)
    return newlist

In [22]:
reviews_sample['LDA Reviews'] = reviews_sample['Cleaned Reviews'].apply(token_stop)
reviews_sample.head(2)

Unnamed: 0,Rating,Content,Cleaned Reviews,LDA Reviews
1,2,The operating system is is an early android. Y...,the operating system is is an early android yo...,"[operating, system, early, android, cant, use,..."
2,2,I have been reading on Kindle since the Kindle...,i have been reading on kindle since the kindle...,"[reading, kindle, since, kindle, released, pre..."


In [23]:
dict_ = corpora.Dictionary(reviews_sample['LDA Reviews'])
print(dict_)

Dictionary<28518 unique tokens: ['android', 'barns', 'cant', 'com', 'download']...>


In [24]:
# convert list of documents (corpus) into Document Term Matrix using the dictionary
doc_term_matrix = [dict_.doc2bow(i) for i in reviews_sample['LDA Reviews']]

In [None]:
# set training parameters
num_topics = 3
chunksize = 1000
passes = 20
iterations = 400
eval_every = None  # don't evaluate model perplexity, takes too much time

lda = gensim.models.LdaModel

lda_model_3 = lda(doc_term_matrix,
    num_topics=num_topics,
    id2word = dict_,
    iterations=iterations,
    passes=passes,
    eval_every=eval_every,
    random_state=9651
)

In [None]:
lda_model_3.log_perplexity(doc_term_matrix)

-7.4003671271955005

In [None]:
lda_model_3.print_topics(num_words=6)

[(0,
  '0.040*"kindle" + 0.039*"fire" + 0.014*"books" + 0.012*"love" + 0.012*"great" + 0.012*"ipad"'),
 (1,
  '0.038*"kindle" + 0.027*"fire" + 0.017*"amazon" + 0.012*"one" + 0.010*"would" + 0.009*"get"'),
 (2,
  '0.013*"tablet" + 0.010*"device" + 0.010*"android" + 0.009*"app" + 0.009*"apps" + 0.009*"screen"')]

In [25]:
num_topics_four = 4
chunksize = 1000
passes = 20
iterations = 400
eval_every = None  # don't evaluate model perplexity, takes too much time

lda = gensim.models.LdaModel

lda_model_4 = lda(doc_term_matrix,
    num_topics=num_topics_four,
    id2word = dict_,
    iterations=iterations,
    passes=passes,
    eval_every=eval_every,
    random_state=9651
)
lda_model_4.log_perplexity(doc_term_matrix)

-7.404928372868024

In [26]:
lda_model_4.print_topics(num_words=6)

[(0,
  '0.021*"tablet" + 0.019*"screen" + 0.012*"battery" + 0.010*"use" + 0.009*"great" + 0.008*"good"'),
 (1,
  '0.049*"kindle" + 0.045*"fire" + 0.016*"books" + 0.014*"love" + 0.011*"read" + 0.011*"amazon"'),
 (2,
  '0.014*"device" + 0.012*"ipad" + 0.011*"apps" + 0.011*"amazon" + 0.010*"app" + 0.010*"android"'),
 (3,
  '0.031*"kindle" + 0.019*"fire" + 0.018*"amazon" + 0.011*"one" + 0.010*"get" + 0.010*"would"')]

In [27]:
num_topics_five = 5
chunksize = 1000
passes = 20
iterations = 400
eval_every = None  # don't evaluate model perplexity, takes too much time

lda = gensim.models.LdaModel

lda_model_5 = lda(doc_term_matrix,
    num_topics=num_topics_five,
    id2word = dict_,
    iterations=iterations,
    passes=passes,
    eval_every=eval_every,
    random_state=9651
)
lda_model_5.log_perplexity(doc_term_matrix)

-7.421720794404842

In [28]:
lda_model_5.print_topics(num_words=6)

[(0,
  '0.026*"screen" + 0.015*"battery" + 0.012*"use" + 0.011*"reading" + 0.010*"touch" + 0.010*"life"'),
 (1,
  '0.033*"fire" + 0.031*"kindle" + 0.020*"amazon" + 0.012*"books" + 0.010*"device" + 0.010*"like"'),
 (2,
  '0.017*"tablet" + 0.013*"android" + 0.012*"ipad" + 0.012*"apps" + 0.011*"app" + 0.011*"device"'),
 (3,
  '0.033*"kindle" + 0.018*"fire" + 0.015*"amazon" + 0.012*"one" + 0.011*"get" + 0.010*"would"'),
 (4,
  '0.057*"kindle" + 0.048*"fire" + 0.026*"love" + 0.021*"great" + 0.018*"ipad" + 0.013*"use"')]

In [None]:
count = 0
for i in range(5):
    print("doc : ",count,lda_model[doc_term_matrix][i])
    count += 1

doc :  0 [(0, 0.020316448), (1, 0.38897434), (2, 0.5907092)]
doc :  1 [(0, 0.60845625), (2, 0.38908654)]
doc :  2 [(0, 0.7599936), (1, 0.17298336), (2, 0.067023024)]
doc :  3 [(0, 0.7323753), (2, 0.26040417)]
doc :  4 [(0, 0.86940974), (1, 0.016111584), (2, 0.11447866)]


In [None]:
reviews_sample["Content"]

1        The operating system is is an early android. Y...
2        I have been reading on Kindle since the Kindle...
3        I bought the Fire because I wanted access to a...
4        I got the Galaxy Tab because I wanted a comput...
9        This is a nice little mini computer, but I am ...
                               ...                        
40730    Oh how I love my Kindle Fire, let me count the...
40734    I can read, work, play, hear music and manny t...
40736    I got my Kindle Fire for Christmas and have us...
40737    Kindle Fire extremely user friendly. Does ever...
40738    I purchased the Kindle Fire in December for my...
Name: Content, Length: 16381, dtype: object