In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, LSTM, GRU, SpatialDropout1D
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import KFold
import re
import numpy as np
from matplotlib import pyplot as plt


In [None]:
# Reading Dataset

train = pd.read_csv('Corona_NLP_train.csv', engine='python')
test = pd.read_csv('Corona_NLP_test.csv', engine='python')

# Pre-processing Dataset

# Vocabulary cap passed to the Tokenizer (num_words) and read by the Embedding
# layers in the model cells.
# NOTE(review): this name was used but never assigned anywhere in the notebook;
# 2000 is a starting value, not a tuned one -- adjust as needed.
max_features = 2000

# Standalone retweet marker ("RT"/"rt" as a whole word). A word-boundary regex
# is used so that words merely containing "rt" (e.g. "support") are untouched.
retweet_marker = r'\brt\b'
column_names = {'OriginalTweet': 'text', 'Sentiment': 'sentiment'}

# Keep only the tweet text and its label, rename the columns, and strip the
# retweet marker. The previous iterrows() loops assigned into a per-row copy,
# so they never changed the frames; this does the replacement for real.
train = (
    train[['OriginalTweet', 'Sentiment']]
    .rename(columns=column_names)
    .assign(text=lambda df: df['text'].str.replace(retweet_marker, ' ', case=False, regex=True))
)
test = (
    test[['OriginalTweet', 'Sentiment']]
    .rename(columns=column_names)
    .assign(text=lambda df: df['text'].str.replace(retweet_marker, ' ', case=False, regex=True))
)

# Fit the vocabulary on the training tweets only, so that no information from
# the held-out test set leaks into the features.
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(train['text'].values)

# Pad the test sequences to the training width: the models are built with
# input_length = x_train.shape[1], so both matrices must have the same number
# of columns (longer test tweets are truncated by pad_sequences).
x_train = pad_sequences(tokenizer.texts_to_sequences(train['text'].values))
x_test = pad_sequences(tokenizer.texts_to_sequences(test['text'].values),
                       maxlen=x_train.shape[1])

# One-hot encode both label columns against the same ordered class list so the
# column order (and count) is identical for train and test. A test label that
# never occurs in train becomes an all-zero row.
sentiment_classes = sorted(train['sentiment'].dropna().unique())
y_train = pd.get_dummies(
    pd.Categorical(train['sentiment'], categories=sentiment_classes), dtype='float32'
).values
y_test = pd.get_dummies(
    pd.Categorical(test['sentiment'], categories=sentiment_classes), dtype='float32'
).values

In [None]:
# Setting up LSTM Model
embed_dim = 128  # width of each token embedding vector
out_dim = 196    # number of units in the recurrent layer
n_classes = y_train.shape[1]  # one output unit per one-hot label column

lstm_model = Sequential()
lstm_model.add(Embedding(max_features, embed_dim, input_length = x_train.shape[1]))
lstm_model.add(SpatialDropout1D(0.4))
# `out_dim` is the name assigned above; the previous `lstm_out` was never defined.
lstm_model.add(LSTM(out_dim, dropout=0.2, recurrent_dropout=0.2))
# Each tweet has exactly one sentiment class, so the output must be a softmax
# distribution to pair correctly with categorical_crossentropy (independent
# sigmoid outputs do not sum to 1).
lstm_model.add(Dense(n_classes, activation='softmax'))
lstm_model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
lstm_model.summary()

tf.keras.utils.plot_model(lstm_model, show_shapes=True)

In [None]:
# Setting up GRU Model
embed_dim = 128  # width of each token embedding vector
out_dim = 196    # number of units in the recurrent layer
n_classes = y_train.shape[1]  # one output unit per one-hot label column

gru_model = Sequential()
gru_model.add(Embedding(max_features, embed_dim, input_length = x_train.shape[1]))
gru_model.add(SpatialDropout1D(0.4))
# Use an actual GRU layer here: this cell previously added an LSTM (sized by an
# undefined `lstm_out`), which made the "GRU" model a second LSTM.
gru_model.add(GRU(out_dim, dropout=0.2, recurrent_dropout=0.2))
# Single-label multi-class output: softmax pairs with categorical_crossentropy.
gru_model.add(Dense(n_classes, activation='softmax'))
gru_model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
gru_model.summary()

tf.keras.utils.plot_model(gru_model, show_shapes=True)

In [None]:
def evaluate_model(model, kfold, lr, epoch, n_embedding, n_hidden, dropout, note = ''):
    """Run K-fold cross-validation of `model` on the global x_train / y_train.

    For every fold the model is reset to the weights it had on entry, trained
    on that fold's training rows and validated on that fold's held-out rows.

    Args:
        model: a compiled Keras model; it is trained in place, so after the
            call it holds the weights from the last fold.
        kfold: number of folds handed to sklearn's KFold (shuffled split).
        lr, epoch, n_embedding, n_hidden, dropout, note: accepted for
            interface compatibility but not read by this function. The model
            arrives already built and compiled, and the training length stays
            at the fixed 5 epochs this function has always used.
            TODO(review): wire `epoch` (and `lr`) in once every consumer of
            the returned histories handles a variable number of epochs.

    Returns:
        (train_acc, val_acc): two lists with one entry per fold; each entry is
        the per-epoch accuracy list taken from the Keras History object.
    """
    with tf.device('/device:GPU:0'):
        train_acc = []
        val_acc = []
        kf = KFold(kfold, shuffle = True)

        # Make sure the weights exist before snapshotting them, so that every
        # fold can start from the same initial state.
        if not model.built:
            model.build(input_shape=(None, x_train.shape[1]))
        initial_weights = model.get_weights()

        for train_idx, val_idx in kf.split(x_train, y_train):
            # Start each fold from the same weights; otherwise later folds
            # would continue from a model that already saw their held-out rows.
            # NOTE(review): this restores layer weights only; the optimizer's
            # internal state is presumably carried over between folds.
            model.set_weights(initial_weights)

            fold_x, fold_y = x_train[train_idx], y_train[train_idx]
            held_out_x, held_out_y = x_train[val_idx], y_train[val_idx]

            # Training Model (validated on this fold's held-out rows)
            hist = model.fit(fold_x, fold_y, batch_size=32, epochs=5,
                             validation_data=(held_out_x, held_out_y))

            # Testing Model on rows it was not trained on in this fold
            test_results = model.evaluate(held_out_x, held_out_y, verbose=False)
            print(f'Test results - Loss: {test_results[0]} - Accuracy: {100 * test_results[1]}%')

            train_acc.append(hist.history['accuracy'])
            val_acc.append(hist.history['val_accuracy'])

    return train_acc, val_acc


In [None]:
# Settings handed to evaluate_model for the cross-validation run
kfold, epoch = 3, 10
lr = 0.0005
n_embedding, n_hidden = 50, 150
dropout = 0.00

# Network under evaluation: lstm_model here; assign gru_model instead to
# cross-validate the GRU variant.
model = lstm_model

# Per-fold training / validation accuracy histories
train_acc, val_acc = evaluate_model(
    model,
    kfold=kfold,
    lr=lr,
    epoch=epoch,
    n_embedding=n_embedding,
    n_hidden=n_hidden,
    dropout=dropout,
)


In [None]:
def plotKfold(train_acc, val_acc, title, saveName):
    """Plot mean per-epoch accuracy across folds and save the figure as a JPEG.

    Args:
        train_acc: one per-epoch training-accuracy sequence per fold; all
            folds must have the same number of epochs.
        val_acc: one per-epoch validation-accuracy sequence per fold, same
            shape as `train_acc`.
        title: text used as the plot title.
        saveName: output file stem; the figure is written to f'{saveName}.jpg'.
    """
    train_acc_list = np.array(train_acc)
    eval_acc_list = np.array(val_acc)

    # Aggregate over folds (axis 0), leaving one value per epoch.
    train_mean = np.mean(train_acc_list, axis = 0)
    eval_mean = np.mean(eval_acc_list, axis = 0)
    eval_std = np.std(eval_acc_list, axis = 0)

    # Derive the x-axis from the data instead of assuming exactly 5 epochs.
    epochs = list(range(1, len(eval_mean) + 1))

    plt.plot(epochs, train_mean, label='Train', linewidth = 2)
    # Error bars show the spread of validation accuracy between folds.
    plt.errorbar(epochs, eval_mean, eval_std, label='Validation', linewidth = 2)

    # Put the ticks on the plotted epoch numbers (1..N), not on 0..N-1.
    plt.xticks(epochs)
    plt.legend()
    plt.xlabel('Number of Epoch')
    plt.ylabel('Accuracy')
    plt.title(title)

    plt.savefig(f'{saveName}.jpg', format = 'JPEG')

In [None]:
# Draw the cross-validation accuracy curves; the same text serves as the plot
# title and as the stem of the saved image file.
plotKfold(
    train_acc,
    val_acc,
    title='3-Fold Covid Tweets',
    saveName='3-Fold Covid Tweets',
)