In [75]:
import os

import numpy as np
import tensorflow as tf
from keras import layers, Input, Model
from keras.callbacks import ModelCheckpoint
from keras.initializers import Constant
from keras.layers import Dense, Dropout, LSTM
from keras.layers import Embedding
from keras.layers import TextVectorization
from keras.models import Sequential
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [76]:
# Paths to the negative / positive review files (one review per line).
X_pathneg = 'rt-polarity.neg'
X_pathpos = 'rt-polarity.pos'

# errors='ignore' drops any bytes that cannot be decoded instead of raising.
with open(X_pathneg, errors='ignore') as file:
    X_listneg = file.readlines()

with open(X_pathpos, errors='ignore') as file:
    X_listpos = file.readlines()

# Negative reviews come first, so the labels are built in the same order:
# 0 = negative, 1 = positive.
X_list = X_listneg + X_listpos
y_list = [0]*len(X_listneg) + [1]*len(X_listpos)

# Strip only the line terminator. The former `classval[:-1]` slice also removed
# the last real character of any line that had no trailing newline (typically
# the final line of a file).
X_list = [classval.rstrip('\n') for classval in X_list]
classes = np.unique(y_list)
# NOTE: despite its name this holds the unique *sentences*, not letters.
unique_letters = np.unique(X_list)

In [77]:
# Width of each word-embedding vector (must match the 100-d GloVe file used later).
embed_dim = 100
# Every sentence is padded / truncated to this many tokens. Kept separate from
# embed_dim: the two were previously the same variable, so changing the
# embedding width would silently have changed the sequence length as well.
max_seq_len = 100
# Upper bound on the vocabulary size learned by the vectorizer.
max_vocab_size = 20500

vectorizer = TextVectorization(max_tokens=max_vocab_size, output_sequence_length=max_seq_len)
# Feed the corpus to adapt() in batches of 128 sentences.
text_ds = tf.data.Dataset.from_tensor_slices(X_list).batch(128)
vectorizer.adapt(text_ds)

In [78]:
# Vocabulary learned by the vectorizer; its size is capped by the vectorizer's max_tokens.
vocab = vectorizer.get_vocabulary()
print(len(vocab))

# Lookup tables in both directions between tokens and their integer ids.
vocab_to_index = {token: position for position, token in enumerate(vocab)}
index_to_vocab = dict(enumerate(vocab))

# 70% of the samples go to training; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_list,
    y_list,
    train_size=7/10,
    random_state=1,
)

# Convert every raw sentence into a fixed-length row of token ids.
X_train = vectorizer(np.array([[sentence] for sentence in X_train])).numpy()
X_test = vectorizer(np.array([[sentence] for sentence in X_test])).numpy()

# One-hot encode the labels, and keep the plain class ids of the test set as well.
y_train = to_categorical(y_train).astype(np.int64)
y_test = to_categorical(y_test).astype(np.int64)
y_test_labels = np.argmax(y_test, axis=1)

20476


In [79]:
# Count the non-zero token ids in each test sentence
# (assumes id 0 is the padding value produced by the vectorizer).
words_per_sen = np.count_nonzero(X_test, axis = 1)
# 1/3 and 2/3 percentiles of the sentence-length distribution.
pertile = np.percentile(words_per_sen, [(1/3*100), (2/3*100)])

# Three disjoint buckets: short <= p33 < medium <= p66 < long.
# The medium bucket previously used `>= pertile[0]`, so sentences whose length
# equalled the lower boundary were counted in both the short and medium groups.
shortind = np.nonzero(words_per_sen <= pertile[0])[0]
mediumind = np.nonzero(np.logical_and(words_per_sen > pertile[0], words_per_sen <= pertile[1]))[0]
longind = np.nonzero(words_per_sen > pertile[1])[0]

# Each list holds [token ids, one-hot labels, integer labels] for its bucket.
shortlist = [X_test[shortind,:], y_test[shortind,:], np.argmax(y_test[shortind,:], axis = 1)]
mediumlist = [X_test[mediumind,:], y_test[mediumind,:], np.argmax(y_test[mediumind,:], axis = 1)]
longlist = [X_test[longind,:], y_test[longind,:], np.argmax(y_test[longind,:], axis = 1)]

In [80]:
# Trainable embedding with one embed_dim-wide vector per vocabulary entry.
# NOTE(review): this single layer instance is passed to every model built in the
# following cells, so its weights appear to be shared between them - confirm
# that this is intended.
embedding_layer = tf.keras.layers.Embedding(len(vocab), embed_dim, trainable=True)
# NOTE(review): not read by any code visible in this notebook; the `models`
# class keeps its own `modmetrics` list.
modmetrics = []

In [81]:
# Builds, trains and evaluates bidirectional recurrent text classifiers.
class models:
  """Train a two-layer bidirectional recurrent classifier and score it.

  Typical use: construct with the training data and an embedding layer, call
  one of `rnn_mod()`, `lstm_mod()` or `gru_mod()` to train, then `get_pred()`
  to evaluate. Those methods return `self`, so calls can be chained.

  Attributes:
    X_train, y_train: training inputs and one-hot labels, stored as given.
    embedding_layer: the Keras layer applied to the integer input sequences.
    modmetrics: list of `[precision, recall]` pairs, one per `get_pred()` call.
    savedmodel: the most recently trained Keras model (None until trained).
    y_pred: class ids predicted by the latest `get_pred()` call (None until then).

  NOTE(review): the embedding layer is used as passed in. Handing the same
  trainable layer to several `models` objects means they all keep training the
  same weights.
  """

  # Hyper-parameters shared by the three architectures.
  HIDDEN_UNITS = 20
  BATCH_SIZE = 128
  EPOCHS = 10

  def __init__(self, xtrain, ytrain, embed_layer):
    """Store the training data and the embedding layer; nothing is trained yet."""
    self.X_train = xtrain
    self.y_train = ytrain
    self.embedding_layer = embed_layer
    self.modmetrics = []
    # Initialised here so that premature calls fail with a clear message
    # instead of an AttributeError.
    self.savedmodel = None
    self.y_pred = None

  def conf_mat(self, y_test):
    """Return the confusion matrix of the latest predictions.

    Args:
      y_test: one-hot labels of the samples passed to the last `get_pred()`.

    Returns:
      The result of `confusion_matrix(true_ids, self.y_pred)`.

    Raises:
      RuntimeError: if `get_pred()` has not been called yet.
    """
    if self.y_pred is None:
      raise RuntimeError("conf_mat() needs predictions; call get_pred() first")
    y_test = np.argmax(y_test, axis = 1)
    return confusion_matrix(y_test, self.y_pred)

  def get_metrics(self,ytest,ypred):
    """Append `[precision, recall]` for the given class ids to `modmetrics`."""
    self.modmetrics.append([precision_score(ytest, ypred), recall_score(ytest, ypred)])
    return self

  def get_pred(self, X_test, y_test):
    """Predict class ids for `X_test` and record precision / recall.

    Args:
      X_test: inputs accepted by the trained model's `predict()`.
      y_test: one-hot labels matching `X_test`.

    Returns:
      self, with `y_pred` updated and one entry appended to `modmetrics`.

    Raises:
      RuntimeError: if no model has been trained yet.
    """
    if self.savedmodel is None:
      raise RuntimeError("get_pred() needs a trained model; call rnn_mod(), lstm_mod() or gru_mod() first")
    modpreds = np.argmax(self.savedmodel.predict(X_test), axis = 1)
    y_true = np.argmax(y_test, axis = 1)
    self.get_metrics(y_true, modpreds)
    self.y_pred = modpreds
    return self

  def _fit_bidirectional(self, recurrent_cls):
    """Build, compile and fit the classifier using `recurrent_cls` cells.

    The architecture is: embedding -> two stacked bidirectional recurrent
    layers -> softmax over the classes. The three public `*_mod` methods differ
    only in the recurrent layer class they pass in.
    """
    n_classes = self.y_train.shape[1]
    int_sequences_input = Input(shape=(None,), dtype="int64")
    embedded_sequences = self.embedding_layer(int_sequences_input)
    # The first recurrent layer returns the full sequence so the second can consume it.
    x = layers.Bidirectional(recurrent_cls(self.HIDDEN_UNITS, return_sequences=True))(embedded_sequences)
    x = layers.Bidirectional(recurrent_cls(self.HIDDEN_UNITS))(x)
    preds = layers.Dense(n_classes, activation="softmax")(x)
    model = Model(int_sequences_input, preds)
    model.summary()

    model.compile(loss="categorical_crossentropy", optimizer="adam")
    model.fit(self.X_train, self.y_train, batch_size=self.BATCH_SIZE, epochs=self.EPOCHS)
    self.savedmodel = model

    return self

  def lstm_mod(self):
    """Train the bidirectional LSTM variant; returns self."""
    return self._fit_bidirectional(layers.LSTM)

  def gru_mod(self):
    """Train the bidirectional GRU variant; returns self."""
    return self._fit_bidirectional(layers.GRU)

  def rnn_mod(self):
    """Train the bidirectional SimpleRNN variant; returns self."""
    return self._fit_bidirectional(layers.SimpleRNN)

In [82]:
# Train each architecture, then score it on the complete test set.
# Both the training method and get_pred() return the object, so the calls chain.
rnn_obj = models(X_train, y_train, embedding_layer).rnn_mod().get_pred(X_test, y_test)
rnn_mets = rnn_obj.modmetrics

lstm_obj = models(X_train, y_train, embedding_layer).lstm_mod().get_pred(X_test, y_test)
lstm_mets = lstm_obj.modmetrics

gru_obj = models(X_train, y_train, embedding_layer).gru_mod().get_pred(X_test, y_test)
gru_mets = gru_obj.modmetrics

# The first (and only) entry of each metrics list is a [precision, recall] pair.
rnn_precision, rnn_recall = rnn_mets[0]
lstm_precision, lstm_recall = lstm_mets[0]
gru_precision, gru_recall = gru_mets[0]

print(f'\nThe RNN model\'s precision is {rnn_precision} and the RNN model\'s recall is {rnn_recall}')
print(f'\nThe LSTM model\'s precision is {lstm_precision} and the LSTM model\'s recall is {lstm_recall}')
print(f'\nThe GRU model\'s precision is {gru_precision} and the GRU model\'s recall is {gru_recall}')

Model: "model_29"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_31 (InputLayer)       [(None, None)]            0         
                                                                 
 embedding_2 (Embedding)     (None, None, 100)         2047600   
                                                                 
 bidirectional_60 (Bidirecti  (None, None, 40)         4840      
 onal)                                                           
                                                                 
 bidirectional_61 (Bidirecti  (None, 40)               2440      
 onal)                                                           
                                                                 
 dense_29 (Dense)            (None, 2)                 82        
                                                                 
Total params: 2,054,962
Trainable params: 2,054,962
Non-tr

In [83]:
# Evaluation order of the length buckets; each bucket is [inputs, one-hot labels, class ids].
length_buckets = (shortlist, mediumlist, longlist)

# Retrain each architecture, then score it once per bucket. Every get_pred()
# call appends one [precision, recall] pair, so each *_mets_txt list ends up
# ordered short, medium, long.
rnn_obj = models(X_train, y_train, embedding_layer).rnn_mod()
for bucket in length_buckets:
    rnn_obj.get_pred(bucket[0], bucket[1])
rnn_mets_txt = rnn_obj.modmetrics

lstm_obj = models(X_train, y_train, embedding_layer).lstm_mod()
for bucket in length_buckets:
    lstm_obj.get_pred(bucket[0], bucket[1])
lstm_mets_txt = lstm_obj.modmetrics

gru_obj = models(X_train, y_train, embedding_layer).gru_mod()
for bucket in length_buckets:
    gru_obj.get_pred(bucket[0], bucket[1])
gru_mets_txt = gru_obj.modmetrics

print(rnn_mets_txt)
print(lstm_mets_txt)
print(gru_mets_txt)

Model: "model_32"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_34 (InputLayer)       [(None, None)]            0         
                                                                 
 embedding_2 (Embedding)     (None, None, 100)         2047600   
                                                                 
 bidirectional_66 (Bidirecti  (None, None, 40)         4840      
 onal)                                                           
                                                                 
 bidirectional_67 (Bidirecti  (None, 40)               2440      
 onal)                                                           
                                                                 
 dense_32 (Dense)            (None, 2)                 82        
                                                                 
Total params: 2,054,962
Trainable params: 2,054,962
Non-tr

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

In [85]:
# Parse the 100-dimensional GloVe file into a {word: vector} lookup.
path_to_glove_file = "glove.6B.100d.txt"
embeddings_index = {}
# The GloVe files are UTF-8; state it explicitly rather than relying on the
# platform default encoding, which can fail or mis-decode on some systems.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<word> <c1> <c2> ...": split the word off once and parse
        # the remainder as float32 coefficients.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

In [None]:
# One embedding row per vocabulary entry.
num_tokens = len(vocab)
word_index = {token: position for position, token in enumerate(vocab)}

# Start from zeros and copy in the pretrained vector of every known token.
embedding_matrix = np.zeros((num_tokens, embed_dim))
hits = 0    # tokens that have a pretrained vector
misses = 0  # tokens that are absent from the pretrained vectors
for token, row in word_index.items():
    pretrained_vector = embeddings_index.get(token)
    if pretrained_vector is None:
        # No pretrained vector: the row keeps its all-zero initialisation.
        misses += 1
        continue
    embedding_matrix[row] = pretrained_vector
    hits += 1

print("Converted %d words (%d misses)" % (hits, misses))

# Frozen embedding layer initialised with the matrix built above.
embedding_layer = Embedding(
    num_tokens,
    embed_dim,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False,
)

In [87]:
# Repeat the full-test-set evaluation for the SimpleRNN and LSTM variants,
# using whatever `embedding_layer` currently refers to.
# Training and get_pred() both return the object, so the calls are chained.
rnn_obj = models(X_train, y_train, embedding_layer).rnn_mod().get_pred(X_test, y_test)
rnn_mets = rnn_obj.modmetrics

lstm_obj = models(X_train, y_train, embedding_layer).lstm_mod().get_pred(X_test, y_test)
lstm_mets = lstm_obj.modmetrics

Model: "model_35"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_37 (InputLayer)       [(None, None)]            0         
                                                                 
 embedding_3 (Embedding)     (None, None, 100)         2047600   
                                                                 
 bidirectional_72 (Bidirecti  (None, None, 40)         4840      
 onal)                                                           
                                                                 
 bidirectional_73 (Bidirecti  (None, 40)               2440      
 onal)                                                           
                                                                 
 dense_35 (Dense)            (None, 2)                 82        
                                                                 
Total params: 2,054,962
Trainable params: 7,362
Non-traina

KeyboardInterrupt: ignored

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrices of the latest predictions stored on each model object.
rnn_mat = rnn_obj.conf_mat(y_test)
lstm_mat = lstm_obj.conf_mat(y_test)

# Each figure is saved BEFORE plt.show(): once shown, the current figure is
# released, so the former savefig-after-show order wrote a blank image.
# The matrices are transposed so that the x axis carries the true label.
# `myax` now holds the Axes itself (it used to hold the return value of .set()).
fig, myax = plt.subplots()
sns.heatmap(rnn_mat.T, square = True, annot = True, fmt = 'd', cbar = False, ax = myax)
myax.set(title='RNN Confusion matrix with GloVE',xlabel='True Label', ylabel='Predicted Label')
fig.savefig('rnn_conf_mat.png')
plt.show()

fig, myax = plt.subplots()
sns.heatmap(lstm_mat.T, square = True, annot = True, fmt = 'd', cbar = False, ax = myax)
myax.set(title='LSTM Confusion matrix with GloVE',xlabel='True Label', ylabel='Predicted Label')
fig.savefig('lstm_conf_mat.png')
plt.show()

print(f'\nThe RNN model\'s precision is {rnn_mets[0][0]} and the RNN model\'s recall is {rnn_mets[0][1]}')
print(f'\nThe LSTM model\'s precision is {lstm_mets[0][0]} and the LSTM model\'s recall is {lstm_mets[0][1]}\n')