In [None]:
import os
from keras.layers import TextVectorization
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
from keras.layers import Embedding
from keras.initializers import Constant
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from keras import layers, Input, Model
from sklearn.metrics import precision_score, recall_score 

In [None]:
# Locations of the two polarity corpora; each file is read as one sentence per line.
X_pathneg = 'rt-polarity.neg'
X_pathpos = 'rt-polarity.pos'

# ISO-8859-1 maps every byte to a character, so decoding cannot fail on stray bytes.
with open(X_pathneg, encoding = "ISO-8859-1") as file:
    X_listneg = file.readlines()

with open(X_pathpos, encoding = "ISO-8859-1") as file:
    X_listpos = file.readlines()

# Negative sentences first (label 0), then positive sentences (label 1).
X_list = X_listneg + X_listpos
y_list = [0]*len(X_listneg) + [1]*len(X_listpos)

# Remove only the line terminator. Slicing with [:-1] would also chop a real
# character off a final line that does not end with a newline.
X_list = [line.rstrip('\n') for line in X_list]
classes = np.unique(y_list)
# NOTE: despite its name this holds the distinct *sentences*, not letters.
unique_letters = np.unique(X_list)

In [None]:
# Width of each word-embedding vector; read later when the Embedding layers are built.
embed_dim = 100
# Number of token ids each sentence is padded or truncated to. This used to be
# taken from embed_dim, which silently coupled two unrelated hyperparameters;
# the value is unchanged, only the name is now independent.
max_seq_len = 100
# Upper bound on the vocabulary size learned by the vectorizer.
max_vocab_size = 20600

vectorizer = TextVectorization(max_tokens=max_vocab_size, output_sequence_length=max_seq_len)
# Feed the corpus to adapt() in batches of 128 sentences.
text_ds = tf.data.Dataset.from_tensor_slices(X_list).batch(128)
vectorizer.adapt(text_ds)

In [None]:
# Token <-> integer-id lookup tables for the adapted vocabulary.
vocab = vectorizer.get_vocabulary()
vocab_to_index = {token: idx for idx, token in enumerate(vocab)}
index_to_vocab = dict(enumerate(vocab))

# 70/30 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_list,
    y_list,
    train_size=7/10,
    random_state=1,
)

# Wrap every sentence in its own one-element row, then map the resulting
# column of strings to integer token ids and pull the result out as NumPy.
train_column = np.array([[sentence] for sentence in X_train])
test_column = np.array([[sentence] for sentence in X_test])
X_train = vectorizer(train_column).numpy()
X_test = vectorizer(test_column).numpy()

# One-hot encode the labels; keep the plain integer test labels alongside.
y_train = to_categorical(y_train).astype(np.int64)
y_test = to_categorical(y_test).astype(np.int64)
y_test_labels = y_test.argmax(axis=1)

In [None]:
# Sentence length = number of non-zero token ids in each test row
# (assumes id 0 is the vectorizer's padding value -- confirm against the Keras docs).
words_per_sen = np.count_nonzero(X_test, axis = 1)
# Lower and upper tercile cut points of the sentence-length distribution.
pertile = np.percentile(words_per_sen, [(1/3*100), (2/3*100)])

# Three mutually exclusive buckets. The medium bucket is strictly above the
# lower cut point; using >= here would place sentences whose length equals
# pertile[0] in both the short and the medium bucket.
shortind = np.nonzero(words_per_sen <= pertile[0])[0]
mediumind = np.nonzero(np.logical_and(words_per_sen > pertile[0], words_per_sen <= pertile[1]))[0]
longind = np.nonzero(words_per_sen > pertile[1])[0]

# Every test sentence must fall into exactly one bucket.
assert len(shortind) + len(mediumind) + len(longind) == len(words_per_sen)

# Each bucket is [token-id rows, one-hot labels, integer labels].
shortlist, mediumlist, longlist = (
    [X_test[idx, :], y_test[idx, :], np.argmax(y_test[idx, :], axis = 1)]
    for idx in (shortind, mediumind, longind)
)

In [None]:
class models:
  """Train one bidirectional recurrent classifier and collect its test metrics.

  The three public trainers (``rnn_mod``, ``lstm_mod``, ``gru_mod``) build the
  same network -- embedding -> Bidirectional(recurrent layer) -> softmax -- and
  differ only in the recurrent layer. Each trainer stores the fitted model in
  ``savedmodel`` and returns ``self`` so calls can be chained.

  Attributes:
    X_train, y_train: training inputs and one-hot training labels, as given.
    embedding_layer: the embedding layer instance placed at the model input.
    modmetrics: list of ``[precision, recall]`` pairs, one per ``get_pred`` call.
    savedmodel: the most recently fitted model, or ``None`` before training.
    y_pred: predicted class ids from the latest ``get_pred`` call, or ``None``.
  """

  # Shared training hyperparameters for every architecture.
  RNN_UNITS = 20
  BATCH_SIZE = 128
  EPOCHS = 2

  def __init__(self, xtrain, ytrain, embed_layer):
    self.X_train = xtrain
    self.y_train = ytrain
    self.embedding_layer = embed_layer
    self.modmetrics = []
    # Defined up front so the attributes always exist, even before training.
    self.savedmodel = None
    self.y_pred = None

  def get_metrics(self,ytest,ypred):
    """Append ``[precision, recall]`` for the given labels to ``modmetrics``; return ``self``."""
    self.modmetrics.append([precision_score(ytest, ypred), recall_score(ytest, ypred)])
    return self

  def get_pred(self, X_test, y_test):
    """Predict class ids for ``X_test`` and record precision/recall against ``y_test``.

    ``y_test`` is reduced with argmax along axis 1 before scoring, as are the
    model outputs. Returns ``self``.

    Raises:
      AttributeError: if no model has been trained on this object yet.
    """
    if self.savedmodel is None:
      raise AttributeError(
          "no trained model available: call rnn_mod(), lstm_mod() or gru_mod() before get_pred()"
      )
    modpreds = np.argmax(self.savedmodel.predict(X_test), axis = 1)
    y_test = np.argmax(y_test, axis = 1)
    self.get_metrics(y_test,modpreds)
    self.y_pred = modpreds
    return self

  def _fit_recurrent(self, recurrent_layer):
    """Build, compile and fit the classifier around ``recurrent_layer``; return ``self``."""
    classes = self.y_train.shape[1]
    int_sequences_input = Input(shape=(None,), dtype="int64")
    embedded_sequences = self.embedding_layer(int_sequences_input)
    x = layers.Bidirectional(recurrent_layer)(embedded_sequences)
    preds = layers.Dense(classes, activation="softmax")(x)
    model = Model(int_sequences_input, preds)

    model.compile(loss="categorical_crossentropy", optimizer="adam")
    model.fit(self.X_train, self.y_train, batch_size=self.BATCH_SIZE, epochs=self.EPOCHS)

    self.savedmodel = model
    return self

  def lstm_mod(self):
    """Train the bidirectional LSTM variant and return ``self``."""
    return self._fit_recurrent(layers.LSTM(self.RNN_UNITS))

  def gru_mod(self):
    """Train the bidirectional GRU variant and return ``self``."""
    return self._fit_recurrent(layers.GRU(self.RNN_UNITS))

  def rnn_mod(self):
    """Train the bidirectional SimpleRNN variant and return ``self``."""
    return self._fit_recurrent(layers.SimpleRNN(self.RNN_UNITS))

In [None]:

# Compare the three architectures on the whole test split. Each one is given
# its own newly created Embedding layer. The trainer and get_pred both return
# the trainer object, so the calls are chained and no temporary object has to
# be kept around and deleted afterwards.

embedding_layer = tf.keras.layers.Embedding(input_dim=len(vocab), output_dim=embed_dim, trainable=True)
rnn_mets1 = models(X_train, y_train, embedding_layer).rnn_mod().get_pred(X_test, y_test).modmetrics

embedding_layer = tf.keras.layers.Embedding(input_dim=len(vocab), output_dim=embed_dim, trainable=True)
lstm_mets1 = models(X_train, y_train, embedding_layer).lstm_mod().get_pred(X_test, y_test).modmetrics

embedding_layer = tf.keras.layers.Embedding(input_dim=len(vocab), output_dim=embed_dim, trainable=True)
gru_mets1 = models(X_train, y_train, embedding_layer).gru_mod().get_pred(X_test, y_test).modmetrics

# Each metrics list holds a single [precision, recall] pair at index 0.
print(f'\nThe RNN model\'s precision is {rnn_mets1[0][0]} and the RNN model\'s recall is {rnn_mets1[0][1]}')
print(f'\nThe LSTM model\'s precision is {lstm_mets1[0][0]} and the LSTM model\'s recall is {lstm_mets1[0][1]}')
print(f'\nThe GRU model\'s precision is {gru_mets1[0][0]} and the GRU model\'s recall is {gru_mets1[0][1]}')

Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2

The RNN model's precision is 0.6339479392624728 and the RNN model's recall is 0.7329153605015674

The LSTM model's precision is 0.7684279191128506 and the LSTM model's recall is 0.7385579937304075

The GRU model's precision is 0.741751269035533 and the GRU model's recall is 0.7329153605015674


In [None]:
# Compare the three architectures separately on short, medium and long test
# sentences. Every model gets its own fresh Embedding layer: reusing one layer
# instance would share (and keep training) the same embedding weights across
# models, and would make the results depend on which cells ran before this one.
length_buckets = (shortlist, mediumlist, longlist)

embedding_layer = tf.keras.layers.Embedding(len(vocab), embed_dim, trainable=True)
rnn_obj2 = models(X_train, y_train, embedding_layer).rnn_mod()
for bucket in length_buckets:
    # bucket[0] holds the token-id rows, bucket[1] the one-hot labels.
    rnn_obj2.get_pred(bucket[0], bucket[1])
rnn_mets_txt = rnn_obj2.modmetrics

embedding_layer = tf.keras.layers.Embedding(len(vocab), embed_dim, trainable=True)
lstm_obj2 = models(X_train, y_train, embedding_layer).lstm_mod()
for bucket in length_buckets:
    lstm_obj2.get_pred(bucket[0], bucket[1])
lstm_mets_txt = lstm_obj2.modmetrics

embedding_layer = tf.keras.layers.Embedding(len(vocab), embed_dim, trainable=True)
gru_obj2 = models(X_train, y_train, embedding_layer).gru_mod()
for bucket in length_buckets:
    gru_obj2.get_pred(bucket[0], bucket[1])
gru_mets_txt = gru_obj2.modmetrics

# Each list holds [precision, recall] pairs in short, medium, long order.
print(rnn_mets_txt)
print(lstm_mets_txt)
print(gru_mets_txt)

Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
[[0.7873684210526316, 0.6824817518248175], [0.77882797731569, 0.65814696485623], [0.6709677419354839, 0.6265060240963856]]
[[0.791015625, 0.7390510948905109], [0.7743589743589744, 0.7236421725239617], [0.7350597609561753, 0.7409638554216867]]
[[0.8320610687022901, 0.5967153284671532], [0.7719298245614035, 0.7028753993610224], [0.6948529411764706, 0.7590361445783133]]
