# Senimental analysis on one plus 6T Amazon reviews

## Connecting and fetching data from mongoDB

In [1]:
from pymongo import MongoClient
import pandas as pd

client = MongoClient('mongodb://localhost:27017')



In [89]:
exclude_data = {'_id': False, 'ProductDescription': False, 'ProductEnlargeImage': False , }

raw_data = client.amazon.products.find({})

testData = []


In [90]:
for rev in raw_data[0]['MostRecent100Reviews'].values():
    if rev != None:
        testData.append(rev)
        
     
    

In [91]:
CopyTestData = testData

## Model Building

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras.models import Sequential 
from keras.layers import Dense, Dropout,Embedding, Activation,LSTM

from tensorflow.python.keras import models, layers, optimizers
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
import bz2
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import re

%matplotlib inline


In [7]:
def get_labels_and_texts(file):
    labels = []
    texts = []
    for line in bz2.BZ2File(file):
        x = line.decode("utf-8")
        labels.append(int(x[9]) - 1)
        texts.append(x[10:].strip())
    return np.array(labels), texts
train_labels, train_texts = get_labels_and_texts('test.ft.txt.bz2')


## Text preProcessing

### Normalize strings

In [8]:
NON_ALPHANUM = re.compile(r'[\W]')
NON_ASCII = re.compile(r'[^a-z0-1\s]')
def normalize_texts(texts):
    normalized_texts = []
    for text in texts:
        lower = text.lower()
        no_punctuation = NON_ALPHANUM.sub(r' ', lower)
        no_non_ascii = NON_ASCII.sub(r'', no_punctuation)
        normalized_texts.append(no_non_ascii)
    return normalized_texts
        
train_texts = normalize_texts(train_texts)

### Data splitting for training and validation

In [9]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, random_state=1235, test_size=0.2)

### Tokenizer

In [10]:
MAX_FEATURES = 12000
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(train_texts)
train_texts = tokenizer.texts_to_sequences(train_texts)
val_texts = tokenizer.texts_to_sequences(val_texts)

### Padding

In [11]:
MAX_LENGTH = max(len(train_ex) for train_ex in train_texts)
train_texts = pad_sequences(train_texts, maxlen=MAX_LENGTH)
val_texts = pad_sequences(val_texts, maxlen=MAX_LENGTH)



## Model Building using Recurrent Neural Net Model

In [16]:
seq_len = 231
embedding_size = 100

In [17]:
model = Sequential() # Call Sequential to initialize a network
model.add(Embedding(input_dim = MAX_FEATURES, 
                    input_length = seq_len, 
                    output_dim = embedding_size)) # Add an embedding layer which represents each unique token as a vector
model.add(LSTM(10, return_sequences=True)) # Add an LSTM layer
model.add(LSTM(5, return_sequences=False))
model.add(Dense(1, activation='sigmoid')) # Add an ouput layer.

Instructions for updating:
Colocations handled automatically by placer.


### Model Training And validation

In [18]:
from keras.optimizers import Adam
adam = Adam(lr=0.001)

In [19]:
model.compile(optimizer=adam,  # 'Adam' is a variant of gradient descent technique                
              loss='binary_crossentropy', # categorical_crossentropy for multi-class classification
              metrics=['binary_accuracy'])

In [20]:
model.fit(train_texts,train_labels,batch_size=128,epochs=1,validation_data=(val_texts, val_labels), )

Instructions for updating:
Use tf.cast instead.
Train on 320000 samples, validate on 80000 samples
Epoch 1/1


<keras.callbacks.History at 0xb164161780>

## Testing on one plus 6 data

### Pre processing on test data


In [56]:
testData = normalize_texts(testData)

In [57]:
testData = tokenizer.texts_to_sequences(testData)

In [58]:
testData = pad_sequences(testData, maxlen=MAX_LENGTH)

### Prediction on test Data

In [59]:
preds = model.predict(testData)

In [65]:
classesPred = model.predict_classes(testData)

### sentiments in %

In [78]:
unique_elements, counts_elements = np.unique(classesPred, return_counts=True)
    

In [87]:
print('Negative reviews of one plus 6T :',(counts_elements[0]*100)/np.prod(classesPred.shape))
print('Positive reviews of one plus 6T :',(counts_elements[1]*100)/np.prod(classesPred.shape))

Negative reviews of one plus 6T : 24.444444444444443
Positive reviews of one plus 6T : 75.55555555555556


### Data frame of reviews and their predictions

In [71]:
sentiment = []

In [72]:
for s in classesPred:
    
    if s[0] == 1:
        sentiment.append('Positive')
    else:
        sentiment.append('Negative')
        
        

In [94]:
sentimentdf = pd.DataFrame(list(zip(CopyTestData, sentiment)), 
               columns =['Review', 'Sentiment']) 

In [96]:
sentimentdf.head(10)

Unnamed: 0,Review,Sentiment
0,The best!,Positive
1,Good features,Positive
2,Overall good phone,Positive
3,The hope and name OnePlus brand is created in ...,Negative
4,Just like that!! A super smooth phone for ever...,Positive
5,"Front camera, face recognition and battery lif...",Positive
6,Best One Plus phone,Positive
7,"Amazing packing, original product, awesome",Positive
8,Whoever's looking for next gen smartphone.. th...,Negative
9,Good,Positive
