In [50]:
import random
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
from scipy.stats import linregress

In [51]:
# Load the review dataset; later cells read its 'reviews' (text) and
# 'sentiment' (class label) columns.
df = pd.read_csv('ReviewsEN.csv')

In [52]:
# Tally the reviews per raw sentiment label once, then report each class.
sentiment_counts = df['sentiment'].value_counts()
for raw_label in (-1, 0, 1):
    print(f"{raw_label} :", sentiment_counts[raw_label])

-1 : 870
0 : 639
1 : 1518


In [53]:
# Shift the sentiment labels from {-1, 0, 1} to {0, 1, 2}.
# The order matters: the highest label is moved first so that a freshly
# written value is never picked up by a later replacement.
# NOTE: this cell rewrites df['sentiment'] in place, so running it a second
# time shifts the already-remapped labels again.
for old_label, new_label in ((1, 2), (0, 1), (-1, 0)):
    df['sentiment'] = df['sentiment'].replace([old_label], new_label)

In [54]:
# # Apply lower function
# df['reviews'] = df['reviews'].apply(str.lower)
# print(df)

In [55]:
# Global Variables
# Settings shared by the preprocessing and model cells below.

# Width of each word vector, i.e. the number of columns of the embeddings
# matrix; presumably chosen to match the 100-d GloVe file loaded later.
EMBEDDING_DIM = 100
# Sequence length handed to the padding/truncating helper.
MAXLEN = 16
# Truncation mode handed to the padding/truncating helper.
TRUNCATING = 'post'
# Padding mode handed to the padding/truncating helper.
PADDING = 'post'
# Token handed to the tokenizer for out-of-vocabulary words.
OOV_TOKEN = "<OOV>"
# Number of rows drawn by the random-sampling cell; equal to the dataset
# size, so every row is kept.
MAX_EXAMPLES = len(df)
# Fraction of the examples used for training; the rest is used for validation.
TRAINING_SPLIT = 0.9

In [56]:
import random

# Seed every random number generator the notebook relies on, not only
# Python's `random`: Keras weight initialisation, dropout and batch shuffling
# draw from TensorFlow's generator, so seeding `random` alone leaves training
# results different on every run. (Bit-exact results may still depend on the
# hardware/backend.)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Get the indices of the DataFrame
indices = df.index.tolist()

# Draw MAX_EXAMPLES indices without replacement. MAX_EXAMPLES equals len(df),
# so this is a seeded shuffle of the whole dataset.
selected_indices = random.sample(indices, MAX_EXAMPLES)

# Select the corresponding sentences and labels based on the sampled indices
sentences = df.loc[selected_indices, 'reviews']
labels = df.loc[selected_indices, 'sentiment']

print(f"There are {len(sentences)} sentences and {len(labels)} labels after random sampling\n")


There are 3027 sentences and 3027 labels after random sampling



# Training - Validation Split

In [57]:
def train_val_split(sentences, labels, training_split):
    """Split parallel sequences into a leading training part and a trailing validation part.

    Args:
        sentences: sliceable sequence of texts.
        labels: sliceable sequence of labels, aligned with ``sentences``.
        training_split: fraction of the examples assigned to training.

    Returns:
        Tuple ``(train_sentences, validation_sentences, train_labels, validation_labels)``.
    """
    # Number of leading examples kept for training (truncated to an integer).
    cutoff = int(len(sentences) * training_split)

    train_sentences, validation_sentences = sentences[:cutoff], sentences[cutoff:]
    train_labels, validation_labels = labels[:cutoff], labels[cutoff:]

    return train_sentences, validation_sentences, train_labels, validation_labels

In [58]:
# Split the shuffled data, then report the size of every resulting piece.
train_sentences, val_sentences, train_labels, val_labels = train_val_split(
    sentences, labels, TRAINING_SPLIT
)

split_report = (
    f"There are {len(train_sentences)} sentences for training.\n",
    f"There are {len(train_labels)} labels for training.\n",
    f"There are {len(val_sentences)} sentences for validation.\n",
    f"There are {len(val_labels)} labels for validation.",
)
for report_line in split_report:
    print(report_line)

There are 2724 sentences for training.

There are 2724 labels for training.

There are 303 sentences for validation.

There are 303 labels for validation.


# Tokenization - Sequences, Truncating, and Padding

In [59]:
# GRADED FUNCTION: fit_tokenizer
def fit_tokenizer(train_sentences, oov_token):
    """Create a Keras ``Tokenizer`` and fit it on the training sentences.

    Args:
        train_sentences: texts used to build the vocabulary.
        oov_token: token the tokenizer uses for out-of-vocabulary words.

    Returns:
        The fitted ``Tokenizer``.
    """
    # Use the argument rather than the module-level OOV_TOKEN so the function
    # honours whatever token the caller passes in.
    tokenizer = Tokenizer(oov_token=oov_token)

    # Build the vocabulary from the training sentences
    tokenizer.fit_on_texts(train_sentences)

    return tokenizer

In [60]:
# Fit the tokenizer on the training sentences and inspect the vocabulary
tokenizer = fit_tokenizer(train_sentences, OOV_TOKEN)

word_index = tokenizer.word_index
VOCAB_SIZE = len(word_index)

print(f"Vocabulary contains {VOCAB_SIZE} words\n")
if "<OOV>" in word_index:
    print("<OOV> token included in vocabulary")
else:
    print("<OOV> token NOT included in vocabulary")
print(f"\nindex of word 'i' should be {word_index['i']}")
word_index

Vocabulary contains 3969 words

<OOV> token included in vocabulary

index of word 'i' should be 9


{'<OOV>': 1,
 'the': 2,
 'and': 3,
 'to': 4,
 'is': 5,
 'a': 6,
 'he': 7,
 'you': 8,
 'i': 9,
 'class': 10,
 'of': 11,
 'in': 12,
 'very': 13,
 'professor': 14,
 'but': 15,
 'not': 16,
 'was': 17,
 'are': 18,
 'for': 19,
 'it': 20,
 'she': 21,
 'good': 22,
 'students': 23,
 'his': 24,
 'this': 25,
 'on': 26,
 'with': 27,
 'that': 28,
 'if': 29,
 'course': 30,
 'be': 31,
 'or': 32,
 'have': 33,
 'her': 34,
 'material': 35,
 'easy': 36,
 'as': 37,
 'all': 38,
 'were': 39,
 'teacher': 40,
 'about': 41,
 'so': 42,
 'really': 43,
 'take': 44,
 'will': 45,
 'do': 46,
 'an': 47,
 'great': 48,
 'understand': 49,
 'lecturer': 50,
 'can': 51,
 'learning': 52,
 'him': 53,
 'lectures': 54,
 'at': 55,
 'get': 56,
 'help': 57,
 'helpful': 58,
 'teaching': 59,
 'hard': 60,
 'well': 61,
 'my': 62,
 'had': 63,
 'more': 64,
 'questions': 65,
 'they': 66,
 'work': 67,
 'assignments': 68,
 'just': 69,
 'difficult': 70,
 'like': 71,
 'lot': 72,
 'there': 73,
 'best': 74,
 'your': 75,
 'what': 76,
 'one': 7

In [61]:
def seq_pad_and_trunc(sentences, tokenizer, padding, truncating, maxlen):
    """Convert sentences to integer sequences of a fixed length.

    Args:
        sentences: texts to convert.
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        padding: padding mode forwarded to ``pad_sequences``.
        truncating: truncation mode forwarded to ``pad_sequences``.
        maxlen: target length forwarded to ``pad_sequences``.

    Returns:
        The padded and truncated sequences returned by ``pad_sequences``.
    """
    # Convert sentences to sequences
    sequences = tokenizer.texts_to_sequences(sentences)

    # Forward the arguments instead of the module-level MAXLEN / PADDING /
    # TRUNCATING constants, so callers can actually control these settings.
    pad_trunc_sequences = pad_sequences(
        sequences, maxlen=maxlen, padding=padding, truncating=truncating
    )

    return pad_trunc_sequences

In [62]:
# Apply the same tokenizer and padding settings to both splits.
train_pad_trunc_seq, val_pad_trunc_seq = (
    seq_pad_and_trunc(split_sentences, tokenizer, PADDING, TRUNCATING, MAXLEN)
    for split_sentences in (train_sentences, val_sentences)
)

print(f"Padded and truncated training sequences have shape: {train_pad_trunc_seq.shape}\n")
print(f"Padded and truncated validation sequences have shape: {val_pad_trunc_seq.shape}")

Padded and truncated training sequences have shape: (2724, 16)

Padded and truncated validation sequences have shape: (303, 16)


In [63]:
# Convert both label collections to numpy arrays for model.fit / scikit-learn.
train_labels, val_labels = map(np.array, (train_labels, val_labels))

# Using pre-defined Embeddings

In [64]:
# Define path to file containing the embeddings
GLOVE_FILE = 'glove.6B.100d.txt'

# Initialize an empty embeddings index dictionary
GLOVE_EMBEDDINGS = {}

# Read file and fill GLOVE_EMBEDDINGS with its contents.
# Each non-empty line is parsed as "<word> <coef> <coef> ...".
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's locale default (which can raise UnicodeDecodeError or silently
# mis-decode non-ASCII tokens).
with open(GLOVE_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline at the end of the file)
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        GLOVE_EMBEDDINGS[word] = coefs

# Represent the words in your vocabulary using the embeddings

In [65]:
# One row per tokenizer index plus one extra row (the tokenizer's indices
# start at 1, so row 0 stays all zeros).
EMBEDDINGS_MATRIX = np.zeros((VOCAB_SIZE+1, EMBEDDING_DIM))

# Copy the GloVe vector of every vocabulary word that GloVe knows into its
# row; words without a GloVe vector keep an all-zero row.
for word, i in word_index.items():
    if word in GLOVE_EMBEDDINGS:
        EMBEDDINGS_MATRIX[i] = GLOVE_EMBEDDINGS[word]

# Define a model that does not overfit

Model with 0.001 learning rate

In [66]:
# def create_model(vocab_size, embedding_dim, maxlen, embeddings_matrix):
#     model = tf.keras.Sequential([ 
#         # This is how you need to set the Embedding layer when using pre-trained embeddings
#         tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=maxlen, weights=[embeddings_matrix], trainable=False),
#         tf.keras.layers.Conv1D(32, 5, activation='relu'),
#         tf.keras.layers.GlobalMaxPooling1D(),
#         tf.keras.layers.Dropout(0.2),
#         tf.keras.layers.Dense(32, activation='relu'),
#         tf.keras.layers.Dense(3, activation='softmax'),
#     ])
    
#     model.compile(loss='sparse_categorical_crossentropy',
#                   optimizer='adam',
#                   metrics=['accuracy']) 
#     return model

Model with 0.002 learning rate

In [67]:
from tensorflow.keras import optimizers

def create_model(vocab_size, embedding_dim, maxlen, embeddings_matrix, learning_rate=0.002):
    """Build and compile the Conv1D + bidirectional-GRU sentiment classifier.

    Args:
        vocab_size: number of words in the tokenizer vocabulary; the embedding
            layer is created with ``vocab_size + 1`` rows.
        embedding_dim: width of each embedding vector.
        maxlen: length of the padded input sequences.
        embeddings_matrix: pre-trained embedding weights loaded into the
            (frozen) embedding layer.
        learning_rate: Adam learning rate. Defaults to 0.002, the value that
            was previously hard-coded, so existing calls behave the same;
            pass another value instead of copying the function to try a
            different rate.

    Returns:
        A compiled ``tf.keras.Sequential`` model with a 3-way softmax output,
        trained with sparse categorical cross-entropy and reporting accuracy.
    """
    model = tf.keras.Sequential([
        # Pre-trained embeddings are loaded as weights and kept frozen
        tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=maxlen, weights=[embeddings_matrix], trainable=False),
        tf.keras.layers.Conv1D(64, 3, activation='relu'),
        tf.keras.layers.MaxPooling1D(pool_size=2),
        # The first GRU returns the full sequence so the second one can consume it
        tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(3, activation='softmax'),
    ])
    optimizer = optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model


Model with 0.0025 learning rate

In [68]:
# from tensorflow.keras import optimizers

# def create_model(vocab_size, embedding_dim, maxlen, embeddings_matrix):
#     model = tf.keras.Sequential([
#         tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=maxlen, weights=[embeddings_matrix], trainable=False),
#         tf.keras.layers.Conv1D(64, 3, activation='relu'),
#         tf.keras.layers.MaxPooling1D(pool_size=2),
#         tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True)),
#         tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32)),
#         tf.keras.layers.Dense(64, activation='relu'),
#         tf.keras.layers.Dropout(0.2),
#         tf.keras.layers.Dense(3, activation='softmax'),
#     ])
#     optimizer = optimizers.Adam(learning_rate = 0.0025)
#     model.compile(loss='sparse_categorical_crossentropy',
#                   optimizer=optimizer,
#                   metrics=['accuracy']) 
#     return model


In [69]:
# Instantiate the classifier with the vocabulary size, embedding settings and
# pre-trained embedding weights prepared in the cells above.
model = create_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    maxlen=MAXLEN,
    embeddings_matrix=EMBEDDINGS_MATRIX,
)

In [70]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 16, 100)           397000    
                                                                 
 conv1d_1 (Conv1D)           (None, 14, 64)            19264     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 7, 64)            0         
 1D)                                                             
                                                                 
 bidirectional_2 (Bidirectio  (None, 7, 128)           49920     
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 64)               31104     
 nal)                                                            
                                                      

In [71]:
# Fit the model for 200 epochs, evaluating on the validation split as part of
# the same call, and keep the returned training history.
history = model.fit(
    x=train_pad_trunc_seq,
    y=train_labels,
    epochs=200,
    validation_data=(val_pad_trunc_seq, val_labels),
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [72]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Predict class probabilities for the validation set and take the most
# probable class of every example.
val_predictions = model.predict(val_pad_trunc_seq)
val_predicted_labels = val_predictions.argmax(axis=1)

# Support-weighted precision, recall and F1-score over the classes
precision, recall, f1 = (
    metric(val_labels, val_predicted_labels, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print("Precision: ", precision)
print("Recall: ", recall)
print("F1-score: ", f1)


Precision:  0.7381191607532847
Recall:  0.7392739273927392
F1-score:  0.7385938226649057


Based on the evaluation metrics, the model performs reasonably well on the validation set: the weighted precision, recall, and F1-score are all approximately 0.74.