# Assignment project 7 : 
# Amazon reviews analysis. This dataset consists of a few million Amazon customer reviews (input text) and star ratings (output labels) for learning how to train fastText for sentiment analysis. -->

In [1]:
# Loading the required libraries

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
import bz2
import re

from tensorflow.keras import models, layers, optimizers
from sklearn.metrics import f1_score, accuracy_score


In [2]:
# Creating a fucntion to load the text and labels from train and test set

def get_labels_texts(file):
    labels= []
    texts= []
    for line in bz2.BZ2File(file):
        x= line.decode("utf-8")
        labels.append(int(x[9]) - 1)
        texts.append(x[10:].strip())
    return np.array(labels), texts




In [3]:
# use the function to get the labels and texts

train_labels, train_texts= get_labels_texts('train.ft.txt.bz2')
test_labels, test_texts= get_labels_texts('test.ft.txt.bz2')

In [4]:
train_labels[0]    # just seeing what is in train and test 

1

In [5]:
train_texts[0]

'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [6]:
test_labels[5]

0

In [7]:
test_texts[5]

"DVD Player crapped out after one year: I also began having the incorrect disc problems that I've read about on here. The VCR still works, but hte DVD side is useless. I understand that DVD players sometimes just quit on you, but after not even one year? To me that's a sign on bad quality. I'm giving up JVC after this as well. I'm sticking to Sony or giving another brand a shot."

In [8]:
# As said in the question we have millions of review so to make it little easier will t ake only 1000 reviews 

train_labels= train_labels[0:500]
train_texts= train_texts[0:500]

test_labels= test_labels[0:500]
test_texts= test_texts[0:500]

In [9]:
len(train_labels), len(train_texts), len(test_labels)

(500, 500, 500)

# Text preprocessing

In [10]:
non_alphanum= re.compile(r'[\W]')
non_ascii= re.compile(r'[^a-z0-1\s]')

def normalize_texts(texts):
    normalized_texts = []
    for text in texts: 
        lower= text.lower()
        no_punctuation= non_alphanum.sub(r' ', lower)
        no_non_ascii= non_ascii.sub(r'', no_punctuation )
        normalized_texts.append(no_non_ascii)
    return normalized_texts


  # lets use this function for train and test set
train_texts= normalize_texts(train_texts)
test_texts= normalize_texts(test_texts)



In [11]:
train_texts[0]

'stuning even for the non gamer  this sound track was beautiful  it paints the senery in your mind so well i would recomend it even to people who hate vid  game music  i have played the game chrono cross but out of all of the games i have ever played it has the best music  it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras  it would impress anyone who cares to listen    '

# Now we will start with Model

In [12]:
# we will first split the train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test= train_test_split(train_texts, train_labels, train_size=0.80)


In [13]:
# Tokenization is the process of dividing the sentence into a series of tokens. 
# Simply put, whenever there is a space in a sentence, the process of tokenization adds a comma between them so the sentence will be broken down into tokens.
# Each word gets a unique integer value so that my model can process the text for further analysis.

MAX_FEATURES = 12000
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(x_train)
train_texts = tokenizer.texts_to_sequences(x_train)
test_texts = tokenizer.texts_to_sequences(x_test)

In [14]:
# The max() function returns the item with the highest value in an iterable.
# Here, the variable 'MAX_LENGTH' returns the longest text in the training dataset.
MAX_LENGTH = max(len(train_ex) for train_ex in train_texts)

train_texts = pad_sequences(train_texts, maxlen=MAX_LENGTH)
test_texts = pad_sequences(test_texts, maxlen=MAX_LENGTH)

# 'pad_sequences' is used to ensure that all sequences in a list have the same length.
# By default this is done by padding 0 in the beginning of each sequence until each sequence has the same length as the longest sequence.
# Simply put, padding is the method of converting the integer array of variable length into fixed-length when the length is shorter than the max_length. 

# Convolutional Neural Net Model
I'm just using fairly simple models here. This CNN has an embedding with a dimension of 64, 3 convolutional layers with the first two having batch normalization and max pooling and the last with global max pooling. The results are then passed to a dense layer and then the output.

In [15]:
def build_model():
    sequences = layers.Input(shape=(MAX_LENGTH,))
    embedded = layers.Embedding(MAX_FEATURES, 64)(sequences)
    x = layers.Conv1D(64, 3, activation='relu')(embedded)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPool1D(3)(x)
    
    x = layers.Conv1D(64, 5, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPool1D(5)(x)
    
    x = layers.Conv1D(64, 5, activation='relu')(x)
    x = layers.GlobalMaxPool1D()(x)
    x = layers.Flatten()(x)
    
    x = layers.Dense(100, activation='relu')(x)
    
    predictions = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=sequences, outputs=predictions)
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )
    return model
    
model = build_model()

In [16]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 195)]             0         
                                                                 
 embedding (Embedding)       (None, 195, 64)           768000    
                                                                 
 conv1d (Conv1D)             (None, 193, 64)           12352     
                                                                 
 batch_normalization (BatchN  (None, 193, 64)          256       
 ormalization)                                                   
                                                                 
 max_pooling1d (MaxPooling1D  (None, 64, 64)           0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 60, 64)            20544 

In [18]:
#  now evrything is ready, lets fit the model 

model.fit(train_texts, y_train, batch_size= 128, epochs= 3)    #just giving 2,3 epochs to save time.. 


Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x257a042ef40>

In [19]:
# lets evealuate train set 
model.evaluate(train_texts, y_train)



[0.6896663904190063, 0.8974999785423279]

In [20]:
#  we then evaluate the test set 
model.evaluate(test_texts, y_test)                  # here x_test is test_texts 



[0.6932892799377441, 0.5199999809265137]

In [21]:
y_pred= model.predict(test_texts)



In [23]:
print('Accuracy score: {:0.4}'.format(accuracy_score(y_test, 1 * (y_pred > 0.5))))

print('F1 score: {:0.4}'.format(f1_score(y_test, 1 * (y_pred > 0.5))))


Accuracy score: 0.52
F1 score: 0.5636


# Trying RNN model 

In [42]:
def build_rnn_model():
    sequences = layers.Input(shape=(MAX_LENGTH,))
    embedded = layers.Embedding(MAX_FEATURES, 64)(sequences)
    
    x = layers.LSTM(128, return_sequences=True)(embedded)
    x= layers.LSTM(128)(x)
    x= layers.Dense(32, activation='relu')(x)
    x = layers.Dense(100, activation='relu')(x)
    predictions = layers.Dense(1, activation='sigmoid')(x)
    rnn_model = models.Model(inputs=sequences, outputs=predictions)
   
    
    rnn_model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )
    return model

        
rnn_model = build_rnn_model()




In [44]:
rnn_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 195)]             0         
                                                                 
 embedding (Embedding)       (None, 195, 64)           768000    
                                                                 
 conv1d (Conv1D)             (None, 193, 64)           12352     
                                                                 
 batch_normalization (BatchN  (None, 193, 64)          256       
 ormalization)                                                   
                                                                 
 max_pooling1d (MaxPooling1D  (None, 64, 64)           0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 60, 64)            20544 

In [45]:
rnn_model.fit(train_texts, y_train, batch_size= 128, epochs= 3) 


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x257a5197e50>

In [58]:
rnn_model.evaluate(train_texts, y_train)



[0.6902111172676086, 0.5350000262260437]

In [59]:
rnn_model.evaluate(test_texts, y_test)



[0.6923419833183289, 0.550000011920929]

In [57]:
y_pred1= rnn_model.predict(test_texts)
y_pred1[:5]



array([[0.5044147 ],
       [0.50553006],
       [0.5010786 ],
       [0.50521123],
       [0.50689816]], dtype=float32)

In [60]:
preds = rnn_model.predict(test_texts)
print('Accuracy score: {:0.4}'.format(accuracy_score(y_test, 1 * (y_pred1 > 0.5))))
print('F1 score: {:0.4}'.format(f1_score(y_test, 1 * (y_pred1 > 0.5))))


Accuracy score: 0.55
F1 score: 0.7097


# Conclusion: F1 score for RNN model was better than CNN model.. 