In [36]:
import logging
import pandas as pd
import numpy as np
from numpy import random
import seaborn as sns
import gensim
from gensim import corpora
from gensim.models import LdaModel, LdaMulticore
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
%matplotlib inline
from textblob import TextBlob 
from textblob.classifiers import NaiveBayesClassifier
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc, log_loss
from nltk.sentiment import SentimentIntensityAnalyzer
from random import shuffle
from statistics import mean

In [2]:
data = pd.read_csv('amazon_alexa.csv')
data.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [3]:
data.shape
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            3150 non-null   int64 
 1   date              3150 non-null   object
 2   variation         3150 non-null   object
 3   verified_reviews  3150 non-null   object
 4   feedback          3150 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 123.2+ KB


In [4]:
dataset = data.drop('date', axis = 1)
dataset.head()

Unnamed: 0,rating,variation,verified_reviews,feedback
0,5,Charcoal Fabric,Love my Echo!,1
1,5,Charcoal Fabric,Loved it!,1
2,4,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,Charcoal Fabric,Music,1


### FINDING SENTIMENT AND ADDING IT TO THE DATASET


In [11]:
dataset['feedback'] = np.where(dataset['feedback'].isin([1]), 'pos', 'neg')
dataset.head(3)

Unnamed: 0,rating,variation,verified_reviews,feedback
0,5,Charcoal Fabric,love echo,neg
1,5,Charcoal Fabric,loved,neg
2,4,Walnut Finish,sometimes playing game answer question correct...,neg


In [12]:
dataset['verified_reviews'].apply(lambda x: len(x.split(' '))).sum()

40695

There are 40695 words in the verified_reviews data

### TEXT PRE-PROCESSING

In [40]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
STOPWORDS.update(["everyone", "sometimes", "Echo", "verified_reviews", "dtype", "object", "Alexa", "thing", "dot", "now", "one", "item", "bought", "will", "still", "without"])


def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    #text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text
    
dataset['verified_reviews'] = dataset['verified_reviews'].apply(clean_text)
print (dataset['verified_reviews'])

0                                               love echo
1                                                   loved
2       playing game answer question correctly alexa s...
3       lot fun 4 yr old learns dinosaurs control ligh...
4                                                   music
                              ...                        
3145                                  perfect kids adults
3146    listening music searching locations checking t...
3147    love things running entire home tv lights ther...
3148    complaint sound quality isnt great mostly use ...
3149                                                 good
Name: verified_reviews, Length: 3150, dtype: object


In [18]:
dataset['verified_reviews'].apply(lambda x: len(x.split(' '))).sum()

39361

After cleaning up the data we have 39361 words to work with

### TRANSFORMING REVIEWS INTO A CORPUS (CREATE DICTIONARY) AND FINDING WORD FREQUENCY

In [41]:
# CREATE DICTIONARY TO COUNT THE WORDS
count_dict_alexareviews = {}
for doc in dataset['verified_reviews']:
    for word in doc.split():
        if word in count_dict_alexareviews.keys():
            count_dict_alexareviews[word] +=1
        else:
            count_dict_alexareviews[word] = 1
            
for key, value in sorted(count_dict_alexareviews.items(), key=lambda item: item[1]):
    print("%s: %s" % (key, value))

dinosaurs: 1
categories: 1
grandaughter: 1
2000: 1
miles: 1
duty: 1
boost: 1
definitively: 1
input: 1
select: 1
encyclopedias: 1
greatit: 1
lightswithout: 1
chair: 1
refunds: 1
pauses: 1
pattern: 1
promp: 1
poop: 1
joke#34: 1
naw: 1
dollar: 1
outsmart: 1
youll: 1
foot: 1
performing: 1
6th: 1
divertido: 1
1000: 1
speaknspell: 1
iove: 1
attach: 1
soundgood: 1
fm: 1
native: 1
americans: 1
whose: 1
texas: 1
birth: 1
additionally: 1
marginal: 1
extender: 1
ten: 1
dust: 1
collectors: 1
fixes#34: 1
accomplished: 1
acknowledge: 1
indicated: 1
perfectlyvery: 1
productsand: 1
journey: 1
daytoday: 1
surely: 1
portion: 1
leary: 1
puny: 1
wear: 1
tear: 1
shuts: 1
mint: 1
teenagers: 1
insist: 1
lifechanging: 1
young: 1
simplicity: 1
gazebo: 1
sortof: 1
complaini: 1
timeworks: 1
diagnostics: 1
corrected: 1
roomba: 1
reboot: 1
sentence: 1
holding: 1
action: 1
onehonestly: 1
productonce: 1
itthis: 1
shocked: 1
clone: 1
boat: 1
garage: 1
gf: 1
intermittently: 1
channel: 1
pulsate: 1
pulsed: 1
buddies: 1

invention: 2
tricky: 2
stayed: 2
hmmmm: 2
appreciated: 2
disconnecting: 2
tickled: 2
keeper: 2
rough: 2
patch: 2
sunroom: 2
helful: 2
supported: 2
reached: 2
sprint: 2
bare: 2
minimum: 2
hospital: 2
grands: 2
welcome: 2
serius: 2
sm: 2
alot: 2
fashioned: 2
digitol: 2
vacations: 2
everytime: 2
paranoid: 2
vlan: 2
require: 2
questionable: 2
carful: 2
vetted: 2
askes: 2
thirdparty: 2
handled: 2
lonely: 2
spark: 2
itthe: 2
ifs: 2
ands: 2
buts: 2
minorly: 2
comands: 2
capasity: 2
complained: 2
medium: 2
youve: 2
echostill: 2
doors: 2
barn: 2
horse: 2
healing: 2
continuous: 2
spa: 2
par: 2
nite: 2
boombox: 2
11: 2
thick#34: 2
dated: 2
champ: 2
yeah: 2
itjust: 2
insert: 2
un: 2
equipo: 2
inteligente: 2
estar: 2
conectado: 2
nos: 2
provee: 2
mltiples: 2
funciones: 2
tales: 2
como: 2
tiempo: 2
hora: 2
informacin: 2
cualquier: 2
estudio: 2
ests: 2
realizando: 2
atrs: 2
vez: 2
del: 2
itsuch: 2
forgot: 2
backthe: 2
amaxing: 2
greeting: 2
oven: 2
temps: 2
speakeri: 2
simplified: 2
bandwagon: 2
stru

In [42]:
# REMOVE WORDS THAT OCCUR LESS THAN 10 TIMES
low_value = 10
bad_words = [key for key in count_dict_alexareviews.keys() if count_dict_alexareviews[key] < low_value]

# CREATE A LIST OF LISTS - EACH DOCUMENT IS A STRING BROKEN INTO A LIST OF WORDS
corpus = [doc.split() for doc in dataset['verified_reviews']]
clean_list = []
for document in corpus:
    clean_list.append([word for word in document if word not in bad_words])
clean_list

[['love', 'echo'],
 ['loved'],
 ['playing',
  'answer',
  'question',
  'correctly',
  'alexa',
  'says',
  'got',
  'wrong',
  'answers',
  'like',
  'able',
  'turn',
  'lights',
  'away',
  'home'],
 ['lot',
  'fun',
  '4',
  'old',
  'control',
  'lights',
  'play',
  'games',
  'like',
  'nice',
  'sound',
  'playing',
  'music',
  'well'],
 ['music'],
 ['received',
  'echo',
  'gift',
  'needed',
  'another',
  'bluetooth',
  'something',
  'play',
  'music',
  'easily',
  'found',
  'smart',
  'speaker',
  'cant',
  'wait',
  'see',
  'else'],
 ['cannot',
  'use',
  'many',
  'features',
  'see',
  'use',
  'great',
  'alarm',
  'u',
  'almost',
  'hear',
  'alarm',
  'bedroom',
  'living',
  'room',
  'reason',
  'enough',
  'keep',
  'fun',
  'ask',
  'questions',
  'hear',
  'response',
  'seem',
  'yet'],
 ['think',
  'ive',
  'purchased',
  'im',
  'working',
  'getting',
  'every',
  'room',
  'house',
  'really',
  'like',
  'features',
  'playing',
  'music',
  'echos',


### Let’s create the inputs of model using corpora

In [44]:
corpora_dict = corpora.Dictionary(clean_list)
corpus = [corpora_dict.doc2bow(line) for line in clean_list]