In [8]:
#IMPORTS
import pandas as pd
import sklearn
import tensorflow as tf
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity


In [7]:
#Pre-process the documents
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('omw-1.4')
    #Lowercase Converter
i=0
df = pd.read_csv('Reviews.csv')
for row in df.itertuples():
    Document1 = row.Text
    Document1 = Document1.lower()
        #Sentence Splitter
    arr1 = Document1.split()
        #Spelling Corrector - not needed
        #Contraction Expander - not needed
        #Punctuation Remover
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    for i in range(len(arr1)):
        arr1[i] = arr1[i].translate(str.maketrans('', '', punc))
        #Non-alphanumeric Remover - not needed
        #Stopword Remover
    arr1 = [s for s in arr1 if s not in stopwords.words('english')]
        #Emoji Remover - not needed
        #Hashtag Remover - not needed
        #Word Lemmatizer from https://stackoverflow.com/questions/52393591/nltk-lemmatizer-extract-meaningful-words
    lemmatizer = WordNetLemmatizer()
    for i in range(len(arr1)):
        arr1[i] = lemmatizer.lemmatize(arr1[i])
        #Rejoin
    arr1 = ' '.join(arr1)
    df.at[i, 'Text'] = arr1
    i = i + 1
    #print(arr1)
df.to_csv('Processed.csv')


In [10]:
#TOKINER AN DATA SET-UP
pdf = pd.read_csv('Processed.csv')
pdf['Sentiments'] = pdf.Score.apply(lambda x: 0 if x in [1, 2] else 1)

train=pdf.sample(frac=0.8,random_state=200) #random state is a seed value
test=pdf.drop(train.index)

tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train.Text)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(train.Text)
padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=120)
testing_sentences = tokenizer.texts_to_sequences(test.Text)
testing_padded = tf.keras.preprocessing.sequence.pad_sequences(testing_sentences, maxlen=120)

training_labels_final = np.array(train.Sentiments)
testing_labels_final = np.array(test.Sentiments)


In [11]:
#RNN
model1 = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(training_labels_final), 16, input_length=120),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model1.summary())

history1 = model1.fit(padded, training_labels_final, epochs=10, steps_per_epoch=1000, batch_size=64, validation_data=(testing_padded, testing_labels_final))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 120, 16)           7276208   
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 6)                 102       
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 7,276,317
Trainable params: 7,276,317
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10

<keras.callbacks.History at 0x1f64d992e80>

In [12]:
#RNN RESULTS
print("RNN:")
pred = model1.predict(testing_padded)
pred[pred>=0.5]=1
pred[pred<0.5]=0
pred.astype(int)
print(sklearn.metrics.classification_report(y_true=testing_labels_final, y_pred=pred))

RNN:
              precision    recall  f1-score   support

           0       0.80      0.62      0.70     16493
           1       0.94      0.97      0.96     97198

    accuracy                           0.92    113691
   macro avg       0.87      0.80      0.83    113691
weighted avg       0.92      0.92      0.92    113691



In [14]:
#LSTM
model2 = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(training_labels_final), 16, input_length=120),
    tf.keras.layers.LSTM(100),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model2.summary())

history2 = model2.fit(padded, training_labels_final, epochs=10, steps_per_epoch=1000, batch_size=64, validation_data=(testing_padded, testing_labels_final))

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 120, 16)           7276208   
                                                                 
 lstm_1 (LSTM)               (None, 100)               46800     
                                                                 
 dense_4 (Dense)             (None, 6)                 606       
                                                                 
 dense_5 (Dense)             (None, 1)                 7         
                                                                 
Total params: 7,323,621
Trainable params: 7,323,621
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
#LSTM RESULTS
print("LSTM:")
pred = model2.predict(testing_padded)
pred[pred>=0.5]=1
pred[pred<0.5]=0
pred.astype(int)
print(sklearn.metrics.classification_report(y_true=testing_labels_final, y_pred=pred))

LSTM:
              precision    recall  f1-score   support

           0       0.80      0.77      0.78     16493
           1       0.96      0.97      0.96     97198

    accuracy                           0.94    113691
   macro avg       0.88      0.87      0.87    113691
weighted avg       0.94      0.94      0.94    113691

