## Connecting to Data

In [1]:
from pymongo import MongoClient

# Connect the notebook to the MongoDB database - "amazonreviews"
client = MongoClient()
db = client.amazonreviews

In [2]:
# Show one of the documnets in the collection "Books"
list(db.Books.find().limit(1))

[{'_id': ObjectId('5f391bb6bd04e741588262bc'),
  'marketplace': 'US',
  'customer_id': 32715830,
  'review_id': 'R2GANXKDIFZ6OI',
  'product_id': '014241543X',
  'product_parent': 712432151,
  'product_title': 'If I Stay',
  'product_category': 'Books',
  'star_rating': 5,
  'helpful_votes': 0,
  'total_votes': 0,
  'vine': 'N',
  'verified_purchase': 'N',
  'review_headline': 'Five Stars',
  'review_body': 'So beautiful',
  'review_date': '2015-08-31'}]

In [3]:
import pandas as pd

# Create a new dataframe for the Harry Potter by using "product_parent': 667539744"
HarryPotter_cursor = db.Books.find({'product_parent': 667539744 })
HarryPotter_df_raw = pd.DataFrame(list(HarryPotter_cursor))  

## Exploratory Data Analysis

In [4]:
# Show the first five rows in the dataframe "HP_df_raw"
HarryPotter_df_raw.head(5)

Unnamed: 0,_id,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,5f391bb6bd04e74158826435,US,42198815,R1L0NE9TE6EAYU,7020033458,667539744,Harry Potter and the Prisoner of Azkaban (Simp...,Books,5,0,0,N,Y,Five Stars,GREAT THANKS.,2015-08-31
1,5f391bb6bd04e74158827679,US,5328185,RD5V8C95DUZZ7,059035342X,667539744,Harry Potter and the Sorcerer's Stone,Books,5,0,0,N,N,This book is absolutely amazing! It is a favor...,This book is absolutely amazing! It is a favor...,2015-08-31
2,5f391bb6bd04e741588280ad,US,42237878,R3LW2TZQ5FLYGF,545162076,667539744,Harry Potter Paperback Box Set (Books 1-7),Books,5,0,1,N,Y,Five Stars,What's not to love about Harry Potter? Books w...,2015-08-31
3,5f391bb6bd04e741588280fc,US,12175857,R26KVAWWVTNZHF,439136369,667539744,Harry Potter and the Prisoner of Azkaban,Books,4,0,0,N,N,Rowling escalates her game and ups the ante,Prisoner_of_Azkaban_coverDo I need to put a su...,2015-08-31
4,5f391bb7bd04e741588290c6,US,16802733,RWIEHV6WZYGD7,545010225,667539744,Harry Potter and the Deathly Hallows (Book 7),Books,5,0,0,N,Y,Harry Potter... enough said.,Harry Potter... enough said.,2015-08-31


In [5]:
# Get info on "HarryPotter_df_raw"
HarryPotter_df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28440 entries, 0 to 28439
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   _id                28440 non-null  object
 1   marketplace        28440 non-null  object
 2   customer_id        28440 non-null  int64 
 3   review_id          28440 non-null  object
 4   product_id         28440 non-null  object
 5   product_parent     28440 non-null  int64 
 6   product_title      28440 non-null  object
 7   product_category   28440 non-null  object
 8   star_rating        28440 non-null  int64 
 9   helpful_votes      28440 non-null  int64 
 10  total_votes        28440 non-null  int64 
 11  vine               28440 non-null  object
 12  verified_purchase  28440 non-null  object
 13  review_headline    28440 non-null  object
 14  review_body        28440 non-null  object
 15  review_date        28440 non-null  object
dtypes: int64(5), object(11)
memory usage: 3.

In [6]:
# Check for NaN values
HarryPotter_df_raw.isnull().any()

_id                  False
marketplace          False
customer_id          False
review_id            False
product_id           False
product_parent       False
product_title        False
product_category     False
star_rating          False
helpful_votes        False
total_votes          False
vine                 False
verified_purchase    False
review_headline      False
review_body          False
review_date          False
dtype: bool

In [7]:
# Filter columns and delete "HarryPotter_df_raw"
HarryPotter_df = HarryPotter_df_raw.filter(['marketplace','customer_id','review_id','product_id','product_title','roduct_title','star_rating','helpful_votes','total_votes','vine','verified_purchase','review_headline','review_body','review_date'])
del HarryPotter_df_raw

In [8]:
# Revome any duplicates
HarryPotter_df = HarryPotter_df.drop_duplicates(subset=['review_id'])

In [9]:
# Change "Y" and "N" to integers 1 and 0
HarryPotter_df = HarryPotter_df.replace('Y', 1)
HarryPotter_df = HarryPotter_df.replace('N', 0)

In [10]:
import bs4

# Revome HTML
HarryPotter_df['review_body'] = HarryPotter_df['review_body'].apply(lambda x: bs4.BeautifulSoup(x, 'lxml').get_text())

 ### Hyperparameter

In [11]:
# Create sentiment parameter based on star rating
def get_sentiment(value):
    if value > 3:
        return 1
    elif value < 3:
        return -1
    else:
        return 0

HarryPotter_df['star_sentiment'] = HarryPotter_df.star_rating.apply(get_sentiment)

In [12]:
# Find number of review for each sentiment
print(HarryPotter_df['star_sentiment'].value_counts())

 1    25750
 0     1415
-1     1275
Name: star_sentiment, dtype: int64


In [13]:
# Find number of words in review_body
HarryPotter_df["num_words"] = HarryPotter_df["review_body"].apply(lambda x: len(str(x).split()))

In [14]:
# Find number of unique words in review_body
HarryPotter_df["num_unique_words"] = HarryPotter_df["review_body"].apply(lambda x: len(set(str(x).split())))

In [15]:
# Find number of characters in review_body
HarryPotter_df["num_chars"] = HarryPotter_df["review_body"].apply(lambda x: len(str(x)))

In [16]:
import string

# Find number of punctuation marks in review_body
HarryPotter_df["num_punctuations"] = HarryPotter_df['review_body'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )

In [17]:
import numpy as np

# Find average length of the words in review_body
HarryPotter_df["mean_word_len"] = HarryPotter_df["review_body"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

In [18]:
# Find general statistics on hyperparameters
HarryPotter_df.describe()

Unnamed: 0,customer_id,star_rating,helpful_votes,total_votes,vine,verified_purchase,star_sentiment,num_words,num_unique_words,num_chars,num_punctuations,mean_word_len
count,28440.0,28440.0,28440.0,28440.0,28440.0,28440.0,28440.0,28440.0,28440.0,28440.0,28440.0,28440.0
mean,37716650.0,4.621624,1.789768,3.215084,0.0,0.3077,0.860584,130.238432,84.6282,721.107419,22.695464,4.490633
std,14750480.0,0.860609,16.472034,20.485293,0.0,0.46155,0.457875,164.755524,77.379865,943.888726,32.694961,7.702693
min,15584.0,1.0,0.0,0.0,0.0,0.0,-1.0,1.0,1.0,1.0,0.0,1.0
25%,26500010.0,5.0,0.0,0.0,0.0,0.0,1.0,38.0,33.0,202.0,6.0,4.153846
50%,43317180.0,5.0,0.0,1.0,0.0,0.0,1.0,82.0,63.0,439.0,13.0,4.411494
75%,50660110.0,5.0,1.0,2.0,0.0,1.0,1.0,160.0,111.0,879.0,27.0,4.666667
max,53096190.0,5.0,1550.0,1646.0,0.0,1.0,1.0,6556.0,1587.0,38667.0,1508.0,1300.0


In [19]:
# Filter reviews with less than 20 words
# HarryPotter_df = HarryPotter_df[HarryPotter_df.num_words > 20]

### NLP Preprocessing (SpaCy)

In [25]:
# Import needed libraries 
import spacy
import en_core_web_md
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
from collections import Counter

# Load SpaCy model
nlp = en_core_web_md.load()

In [21]:
# Preprocess review_body
HarryPotter_df['review_body'] = HarryPotter_df.review_body.str.lower()
HarryPotter_df['spacy_body'] = list(nlp.pipe(HarryPotter_df.review_body))

In [22]:
# Preprocess review_headline
HarryPotter_df['review_headline'] = HarryPotter_df.review_headline.str.lower()
HarryPotter_df['spacy_headline'] = list(nlp.pipe(HarryPotter_df.review_headline))

In [23]:
# Seperate dataframe by sentiment
positive_reviews = HarryPotter_df[HarryPotter_df.star_sentiment==1]
negative_reviews = HarryPotter_df[HarryPotter_df.star_sentiment==-1]
neutral_reviews = HarryPotter_df[HarryPotter_df.star_sentiment==0]

In [27]:
# Get top adjectives
pos_adj = [token.text for doc in positive_reviews.spacy_body for token in doc if token.pos_=='ADJ']
print(Counter(pos_adj).most_common(10))
neg_adj = [token.text for doc in negative_reviews.spacy_body for token in doc if token.pos_=='ADJ']
print(Counter(neg_adj).most_common(10))

[('great', 8600), ('first', 7996), ('good', 7265), ('more', 6790), ('best', 5891), ('other', 5513), ('many', 5241), ('new', 5211), ('old', 3990), ('little', 3268)]
[('good', 538), ('first', 462), ('other', 426), ('more', 350), ('many', 303), ('bad', 282), ('new', 245), ('great', 228), ('better', 219), ('much', 212)]


In [28]:
# Get top nouns
pos_noun = [token.text for doc in positive_reviews.spacy_body for token in doc if token.pos_=='NOUN']
print(Counter(pos_noun).most_common(10))
neg_noun = [token.text for doc in negative_reviews.spacy_body for token in doc if token.pos_=='NOUN']
print(Counter(neg_noun).most_common(10))

[('book', 53047), ('books', 21413), ('series', 11315), ('story', 7965), ('characters', 7518), ('time', 6380), ('world', 6177), ('year', 4876), ('children', 4730), ('people', 4345)]
[('book', 2954), ('books', 1413), ('series', 559), ('story', 520), ('characters', 490), ('time', 377), ('plot', 358), ('pages', 354), ('character', 346), ('children', 297)]


## Latent Dirichlet Allocation (LDA) Topic Modeling (with SpaCy and Gensim)

In [29]:
# Import needed libraries 
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [30]:
# Function to prepare corpus
def prep_tm_corp(corpus):
    new_corpus=[]
    for doc in corpus:
        doc_list=[]
        for token in doc:
            if token.lemma_ != '-PRON-' and token.is_stop != True and token.is_punct != True:
                doc_list.append(token.lemma_)
        new_corpus.append(doc_list)
    return new_corpus

In [32]:
# Prepare positives reviews
corpus = positive_reviews.spacy_body
prep_corpus = prep_tm_corp(corpus)
words = corpora.Dictionary(prep_corpus)
corpus = [words.doc2bow(doc) for doc in prep_corpus]

In [41]:
# Create LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words,
                                           num_topics=10, 
                                           random_state=2,
                                           update_every=1,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [42]:
# Print topics with key words
lda_model.print_topics()

[(0,
  '0.059*"wow" + 0.058*"recomend" + 0.043*"definately" + 0.042*"reccomend" + 0.021*"mood" + 0.016*"yr" + 0.013*"latin" + 0.013*"un" + 0.012*"weak" + 0.011*"winner"'),
 (1,
  '0.017*"de" + 0.014*"la" + 0.014*"credit" + 0.011*"y" + 0.010*"pen" + 0.008*"en" + 0.008*"que" + 0.006*"al" + 0.006*"dictionary" + 0.006*"et"'),
 (2,
  '0.066*"son" + 0.055*"daughter" + 0.022*"christmas" + 0.020*"english" + 0.020*"gift" + 0.017*"old" + 0.013*"fresh" + 0.013*"confusing" + 0.010*"nephew" + 0.010*"hurry"'),
 (3,
  '0.137*" " + 0.042*"harry" + 0.014*"rowling" + 0.014*"book" + 0.013*"potter" + 0.010*"character" + 0.008*"child" + 0.007*"year" + 0.007*"story" + 0.007*"world"'),
 (4,
  '0.048*"tape" + 0.026*"king" + 0.024*"stage" + 0.022*"rare" + 0.019*"manner" + 0.013*"hungry" + 0.013*"performance" + 0.009*"overly" + 0.009*"narration" + 0.009*"stephen"'),
 (5,
  '0.162*"book" + 0.087*"read" + 0.031*"potter" + 0.028*"good" + 0.020*"love" + 0.019*"great" + 0.019*"series" + 0.018*"think" + 0.018*"like" 