### New reviews analysis

#### 1 Sentiment analysis

#### 1.1 Loading/Creating and preprocessing new reviews

In [0]:
import pandas as pd
import numpy as np

# self-defined (project-local) functions
from Preprocessing.data_preprocess import preprocessing
# NOTE(review): star import -- `vectorize` and `format_topics_sentences`, which are
# called further down without being defined in this notebook, presumably come
# from here; confirm and consider importing them by name.
from Preprocessing.helper_functions import *

# for data preprocessing
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# loading the tokenizer for vectorize function
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a tokenizer.pickle that comes from a trusted source.
import pickle
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# loading the sentiment analysis model
from keras.models import load_model
model = load_model('model-005-0.860694-0.854000.h5')

# loading dictionary for LDA model
from gensim.corpora import Dictionary
dictionary = Dictionary.load_from_text("corpora_dictionary")

# loading the LDA model
from gensim.models import ldamodel
lda_model = ldamodel.LdaModel.load("lda_model")

# gensim corpora utilities (the `corpora` alias is not referenced in the cells
# visible in this notebook)
import gensim.corpora as corpora

# suppress Python warnings for the rest of the session
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.
[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
W0626 15:26:16.319957 140735978423168 deprecation_wrapper.py:119] From /Users/admin/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0626 15:26:22.550916 140735978423168 deprecation_wrapper.py:119] From /Users/admin/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0626 15:26:22.552987 140735978423168 deprecation_wrapper.py:119] From /Users/admin/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0626 15:26:23.085658 140735978423168 deprecation_wrapper.py:119] From /Users/admin/anaconda3/li

In [0]:
# Extra words to drop on top of NLTK's English stop-word list.
# Each word is listed exactly once.
extra_stop_words = [
    'from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know',
    'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see',
    'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even',
    'right', 'line', 'also', 'may', 'take', 'come',
    'like', 'well', 'point', 'much',
]

# Build a fresh list every time the cell runs so re-execution never
# accumulates entries.
stop_words = stopwords.words('english') + extra_stop_words

In [0]:
# Hand-written reviews that none of the models has seen before.
raw_review_strings = (
    "The waitress seemed annoyed and didn't even apologize for bringing wrong order.",
    "I had to wait very long for my food and after all I didn't received what I ordered. What I received tasted terrible and the prices were also quite high.",
    "I was here with my best friend to celebrate her birthday. The food was very tasty and the waitress was nice.",
)

# Each review is wrapped in its own single-element list.
unseen_texts = [[review] for review in raw_review_strings]
print(unseen_texts[0])

["The waitress seemed annoyed and didn't even apologize for bringing wrong order."]


In [0]:
# preprocessing
# Run the project's `preprocessing` helper over the raw reviews with the
# extended stop-word list; the displayed result is one list of tokens per review.
new_reviews = preprocessing(unseen_texts, stop_words)
new_reviews

[['waitress', 'seem', 'annoy', 'apolog', 'bring', 'wrong', 'order'],
 ['wait',
  'long',
  'food',
  'receiv',
  'order',
  'receiv',
  'tast',
  'terribl',
  'price',
  'quit',
  'high'],
 ['best', 'friend', 'celebr', 'birthday', 'food', 'tasti', 'waitress']]

In [0]:
# vectorizing
# Derive the model input directly from the raw texts rather than re-binding
# `new_reviews` from its own previous value: that way this cell gives the same
# result no matter how often it is run, and already-vectorized arrays are never
# fed back into `vectorize`.
# NOTE(review): assumes `preprocessing` does not modify `unseen_texts` -- confirm.
new_reviews = vectorize(preprocessing(unseen_texts, stop_words), tokenizer, max_len=400)
new_reviews[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

#### 1.2 Predicting the polarity

In [0]:
# getting the predictions
# `predict_classes` is deprecated and no longer exists in recent tf.keras
# releases. For a model with one output column per class, the class is the
# index of the largest probability, so argmax over the last axis gives the
# same labels and works on old and new Keras versions alike.
class_probabilities = model.predict(new_reviews)
predicted_polarity = np.argmax(class_probabilities, axis=-1)
predicted_polarity.shape

(3,)

In [0]:
# one row per review, with the raw review string in a single 'text' column
unseen_predicted = pd.DataFrame(unseen_texts, columns=['text'])
unseen_predicted

Unnamed: 0,text
0,The waitress seemed annoyed and didn't even ap...
1,I had to wait very long for my food and after ...
2,I was here with my best friend to celebrate he...


In [0]:
# attach the predicted label to every review;
# the label is the index of the model's output column: 0 = positive, 1 = negative
unseen_predicted = unseen_predicted.assign(polarity=predicted_polarity)
unseen_predicted

Unnamed: 0,text,polarity
0,The waitress seemed annoyed and didn't even ap...,1
1,I had to wait very long for my food and after ...,1
2,I was here with my best friend to celebrate he...,0


#### 2 Topic modeling

#### 2.1 Getting and preprocessing the negative reviews

In [0]:
# keep only the rows whose predicted polarity is 1
is_negative = unseen_predicted['polarity'].eq(1)
negative_reviews = unseen_predicted[is_negative]
negative_reviews

Unnamed: 0,text,polarity
0,The waitress seemed annoyed and didn't even ap...,1
1,I had to wait very long for my food and after ...,1


In [0]:
# Run the `preprocessing` helper, with the same stop-word list as before, over
# the 'text' column of the negative reviews; the displayed result is one token
# list per negative review.
unseen_corpus  = preprocessing(negative_reviews['text'], stop_words)
unseen_corpus

[['waitress', 'seem', 'annoy', 'apolog', 'bring', 'wrong', 'order'],
 ['wait',
  'long',
  'food',
  'receiv',
  'order',
  'receiv',
  'tast',
  'terribl',
  'price',
  'quit',
  'high']]

In [0]:
# convert every token list into a bag-of-words with the loaded gensim dictionary
unseen_bag_of_words = list(map(dictionary.doc2bow, unseen_corpus))
unseen_bag_of_words

[[(23, 1), (56, 1), (89, 1), (262, 1), (374, 1), (588, 1), (930, 1)],
 [(23, 1),
  (97, 1),
  (261, 1),
  (424, 1),
  (453, 1),
  (455, 2),
  (522, 1),
  (620, 1),
  (1201, 1),
  (1265, 1)]]

#### 2.2 Predicting the topics

In [0]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 500

# `negative_reviews` is a filtered slice of the full frame, so its index can
# contain gaps (e.g. rows 0 and 2). Hand the texts over with a fresh 0..n-1
# index so they line up position-by-position with `unseen_bag_of_words`.
# NOTE(review): `format_topics_sentences` is defined outside this notebook; if
# it pairs texts with per-document results by index, a gapped index would
# misalign them -- confirm against the helper. With a gap-free index this
# changes nothing.
negative_texts = negative_reviews['text'].reset_index(drop=True)

df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=unseen_bag_of_words, texts=negative_texts)

# Format: expose the index as a document number and give the columns readable names
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,2.0,0.9119,"one, time, servic, place, us, ask, wait, tabl, back, never",The waitress seemed annoyed and didn't even apologize for bringing wrong order.
1,1,2.0,0.5041,"one, time, servic, place, us, ask, wait, tabl, back, never",I had to wait very long for my food and after all I didn't received what I ordered. What I received tasted terrible and the prices were also quite high.
