# References
https://github.com/ultimate010/crnn 
{Combination of Convolutional and Recurrent Neural Network for Sentiment Analysis of Short Texts}
https://keras.io/examples/imdb_bidirectional_lstm/

# Folder Paths

In [1]:
# When True, the imports cell below mounts Google Drive before anything else runs.
first_time = True

In [2]:
# Project root on the mounted Google Drive; every other path is built from it.
folder_path = '/content/drive/My Drive/University/FYP/Sentiment Analysis/Implementation/'

# Tagged comment corpora (CSV files).
lankadeepa_data_path = f'{folder_path}corpus/new/preprocess_from_isuru/lankadeepa_tagged_comments.csv'
gossip_lanka_data_path = f'{folder_path}corpus/new/preprocess_from_unicode_values/gossip_lanka_tagged_comments.csv'

# Embedding settings; their values are baked into the file names below.
embedding_size = 400
embedding_type = 'word2vec'
embedding_type1 = 'word2vec'
context = 5

# Trained embedding model, its exported keyed vectors, and the pickled matrix.
word_embedding_path = (
    f'{folder_path}word_embedding/{embedding_type}'
    f'/source2_data_from_gosspiLanka_and_lankadeepa/{embedding_size}'
    f'/{embedding_type1}_{embedding_size}_{context}'
)
word_embedding_keydvectors_path = (
    f'{folder_path}word_embedding/{embedding_type}'
    f'/source2_data_from_gosspiLanka_and_lankadeepa/{embedding_size}'
    '/keyed_vectors/keyed.kv'
)
embedding_matrix_path = (
    f'{folder_path}Sentiment Analysis/CNN RNN/embedding_matrix/'
    f'{embedding_type}_lankadeepa_gossiplanka_{embedding_size}_{context}'
)

# Import Statements

In [None]:
# Mount Google Drive at /content/drive, but only when first_time is set.
if first_time:
  from google.colab import drive
  drive.mount('/content/drive')

import collections
import pickle
import re
import random
import sys
import os 
import time

import gensim
from gensim.models.keyedvectors import KeyedVectors
from gensim.models.fasttext import FastText
from gensim.models import word2vec

from sklearn.model_selection import train_test_split,cross_val_score, cross_val_predict, KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix,precision_recall_fscore_support

from __future__ import print_function

import pandas as pd
import numpy as np
from numpy import array
from numpy import asarray
from numpy import zeros
from numpy import cumsum

import keras
from keras.models import Sequential,Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dropout, Activation, Flatten, \
    Embedding, Convolution1D, MaxPooling1D, AveragePooling1D, \
    Input, Dense, merge, Add,TimeDistributed, Bidirectional,SpatialDropout1D
from keras.layers.recurrent import LSTM, GRU, SimpleRNN
from keras.regularizers import l2, l1_l2
from keras.constraints import maxnorm
from keras import callbacks
from keras.utils import generic_utils
from keras.optimizers import Adadelta

import matplotlib.image  as mpimg
import matplotlib.pyplot as plt

# Load Data

In [None]:
# Read both tagged-comment corpora.
# Only the first 9059 rows of the Lankadeepa CSV are kept.
lankadeepa_data = pd.read_csv(lankadeepa_data_path).iloc[:9059]
# The 'Unnamed: 3' column of the Gossip Lanka CSV is dropped right after loading.
gossipLanka_data = pd.read_csv(gossip_lanka_data_path).drop(columns=['Unnamed: 3'])

In [None]:
# Notebook display: column names of the Lankadeepa frame.
lankadeepa_data.columns

In [None]:
# Notebook display: preview of the Lankadeepa frame.
lankadeepa_data

In [None]:
# Stack the two corpora row-wise into one frame.
# NOTE(review): each source keeps its own row index, so index labels repeat in
# all_data; pass ignore_index=True if label-based row lookups are ever needed.
all_data = pd.concat([lankadeepa_data,gossipLanka_data])

In [None]:
# Notebook display: number of rows per distinct label value.
all_data['label'].value_counts()

# preprocess Data

In [None]:
# edit this later 
def text_preprocessing(train_data,test_data):
  """Tokenise comments and binarise labels for a train/test pair.

  Each comment is split on whitespace. A label becomes 1 when it equals
  "POSITIVE" and 0 otherwise. Rows whose comment is not text (anything
  without a ``split`` method, e.g. NaN from an empty CSV cell) are dropped
  together with their label, so texts and labels always stay aligned.

  Args:
    train_data: mapping/DataFrame with 'comment' and 'label' columns.
    test_data: mapping/DataFrame with 'comment' and 'label' columns.

  Returns:
    (comment_texts, comment_labels) where
    comment_texts == [train_token_lists, test_token_lists] and
    comment_labels == [train_labels, test_labels].
  """
  def _split_and_encode(data):
    # Return (token lists, 0/1 labels) for the rows of `data` with a text comment.
    texts = []
    encoded = []
    for comment, label in zip(data['comment'], data['label']):
      try:
        words = comment.split()
      except AttributeError:
        # Not a string: skip the row entirely (comment AND label).
        continue
      texts.append(words)
      encoded.append(1 if label == "POSITIVE" else 0)
    return texts, encoded

  train_text, train_labels = _split_and_encode(train_data)
  test_text, test_labels = _split_and_encode(test_data)

  return [train_text, test_text], [train_labels, test_labels]

# edit this later 
def text_preprocessing_1(data):
  """Tokenise comments and binarise labels of a single data set.

  Each comment is split on whitespace. A label becomes 1 when it equals
  "POSITIVE" and 0 otherwise. Rows whose comment is not text (anything
  without a ``split`` method, e.g. NaN from an empty CSV cell) are dropped
  together with their label, so both returned lists have the same length.

  Args:
    data: mapping/DataFrame with 'comment' and 'label' columns.

  Returns:
    (comments_splitted, labels_encoded): list of token lists and the
    matching list of 0/1 labels.
  """
  comments_splitted = []
  labels_encoded = []

  for comment, label in zip(data['comment'], data['label']):
    try:
      words = comment.split()
    except AttributeError:
      # Not a string: skip the row entirely (comment AND label).
      continue
    comments_splitted.append(words)
    labels_encoded.append(1 if label == "POSITIVE" else 0)

  return comments_splitted,labels_encoded


def text_preprocessing_2(data):
  """Tokenise comments and return them with their (unencoded) labels.

  Each comment is split on whitespace. Rows whose comment is not text
  (anything without a ``split`` method, e.g. NaN from an empty CSV cell) are
  dropped; their labels are dropped as well so comments and labels stay
  aligned.

  Args:
    data: pandas DataFrame with 'comment' and 'label' columns.

  Returns:
    (comments_splitted, labels): list of token lists and the matching labels.
    When no row is dropped, `labels` is the 'label' column itself; otherwise
    it is that column restricted (by position) to the kept rows.
  """
  comments = data['comment']
  labels = data['label']

  comments_splitted = []
  kept_positions = []

  for position, comment in enumerate(comments):
    try:
      words = comment.split()
    except AttributeError:
      # Not a string: skip the row; its label is filtered out below.
      continue
    comments_splitted.append(words)
    kept_positions.append(position)

  if len(kept_positions) != len(comments):
    # Positional selection: index labels may repeat after pd.concat.
    labels = labels.iloc[kept_positions]

  return comments_splitted,labels

In [None]:
# Split every comment of the combined corpus into tokens; the labels come back
# as they are stored in the 'label' column.
comment_texts, comment_labels = text_preprocessing_2(all_data)

# prepare tokenizer

# Fit the Keras Tokenizer on the token lists; t.word_index then maps word -> integer id.
t = Tokenizer()
t.fit_on_texts(comment_texts)
# One more than the number of distinct words (presumably because Tokenizer ids
# start at 1, leaving row 0 of the embedding matrix unused - confirm).
vocab_size = len(t.word_index) + 1
print(vocab_size)

In [None]:
# Replace every token by its integer id from the fitted tokenizer.
encoded_docs = t.texts_to_sequences(comment_texts)
# NOTE(review): this prints the whole encoded corpus, which is large output.
print(encoded_docs)

In [None]:
# Pad every encoded document to the length of the longest one.
max_length = max(map(len, encoded_docs))
padded_docs = pad_sequences(encoded_docs, maxlen=max_length)
print(padded_docs)

# Hand labels and inputs on as NumPy arrays.
comment_labels = np.array(comment_labels)
padded_docs = np.array(padded_docs)

In [None]:

# Hold out 20% of the samples as the test split; random_state=0 fixes the shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(padded_docs, comment_labels, test_size=0.2, random_state=0)
# X_train = np.array(X_train)
# X_test = np.array(X_test)
# y_train = np.array(y_train)
# y_test = np.array(y_test)
# comment_labels = np.array(comment_labels)

# Word Embedding

## Generate Embedding Matrix

In [None]:
def generate_embedding_metrix():
  """Build, pickle and return the embedding matrix for the tokenizer vocabulary.

  Loads the trained embedding model from `word_embedding_path` (FastText when
  `embedding_type` is 'fasttext', Word2Vec otherwise), saves its keyed vectors
  to `word_embedding_keydvectors_path` and reopens them memory-mapped. Row i
  of the result is the vector of the word whose tokenizer id (`t.word_index`)
  is i; words missing from the embedding vocabulary keep an all-zero row.

  Reads the module-level names `embedding_type`, `word_embedding_path`,
  `word_embedding_keydvectors_path`, `vocab_size`, `embedding_size`, `t` and
  `embedding_matrix_path`.

  Returns:
    NumPy array of shape (vocab_size, embedding_size). The same array is
    pickled to `embedding_matrix_path`.
  """
  if embedding_type == 'fasttext':
    word_embedding_model = FastText.load(word_embedding_path)
  else:
    word_embedding_model = word2vec.Word2Vec.load(word_embedding_path)

  # Export only the vectors, then reopen them read-only and memory-mapped.
  word_embedding_model.wv.save(word_embedding_keydvectors_path)
  word_vectors = KeyedVectors.load(word_embedding_keydvectors_path, mmap='r')

  # gensim >= 4 exposes the trained vocabulary as `key_to_index`; older
  # releases expose it as `vocab`. Either one supports `word in ...`.
  known_words = getattr(word_vectors, 'key_to_index', None)
  if known_words is None:
    known_words = word_vectors.vocab

  # create a weight matrix for words in training docs
  embedding_matrix = zeros((vocab_size, embedding_size))
  for word, i in t.word_index.items():
    # Only words in the trained vocabulary get a vector; look each one up
    # directly instead of first copying every vector into a dict.
    if word in known_words:
      embedding_matrix[i] = word_vectors[word]

  with open(embedding_matrix_path, 'wb') as matrix_file:
    pickle.dump(embedding_matrix, matrix_file)
  return embedding_matrix

## Load Embedding Matrix

In [None]:
def load_word_embedding_atrix():
  """Load the embedding matrix pickled at the module-level `embedding_matrix_path`.

  Returns:
    The unpickled object converted to a NumPy array.
  """
  # NOTE: unpickling can execute arbitrary code; only load files this notebook
  # wrote itself.
  with open(embedding_matrix_path, 'rb') as matrix_file:
    return np.array(pickle.load(matrix_file))

# Models

## RNN(LSTM/GRU) model

In [None]:
def build_RNN_model():
    """Build and compile the plain recurrent classifier.

    Architecture: frozen pre-trained embedding -> dropout 0.5 -> recurrent
    layer (class taken from the module-level `RNN`, `rnn_output_size` units)
    -> max-norm constrained ReLU Dense (`hidden_dims` units) -> dropout 0.1
    -> 6-way softmax.

    Reads the module-level names `maxlen`, `max_features`, `embedding_dims`,
    `embedding_matrix`, `RNN`, `rnn_output_size` and `hidden_dims`.

    NOTE(review): the output layer has 6 units and the loss is sparse
    categorical cross-entropy, so labels must be integer class ids below 6 -
    confirm against the label column.

    Returns:
        The compiled Keras Model (its summary is printed as a side effect).
    """
    main_input = Input(shape=(maxlen, ), dtype='int32', name='main_input')
    embedding = Embedding(max_features, embedding_dims,
                          weights=[embedding_matrix], input_length=maxlen,
                          name='embedding', trainable=False)(main_input)

    embedding = Dropout(0.50)(embedding)

    x = RNN(rnn_output_size)(embedding)

    # Keras 2 keyword names (kernel_initializer / kernel_constraint /
    # bias_constraint) replace the Keras 1 spellings init / W_constraint /
    # b_constraint.
    x = Dense(hidden_dims, activation='relu', kernel_initializer='he_normal',
              kernel_constraint=maxnorm(3), bias_constraint=maxnorm(3),
              name='mlp')(x)

    x = Dropout(0.10, name='drop')(x)

    output = Dense(6, kernel_initializer='he_normal',
                   activation='softmax', name='output')(x)

    model = Model(inputs=main_input, outputs=output)
    model.compile(loss={'output': 'sparse_categorical_crossentropy'},
                  optimizer=Adadelta(learning_rate=0.95, epsilon=1e-06),
                  metrics=["accuracy"])
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
    return model

## CNN+RNN(LSTM /GRU) model 

In [None]:
def build_CNN_RNN_model():
    """Build and compile the convolution + recurrent binary classifier.

    Architecture: frozen pre-trained embedding -> dropout 0.5 -> two parallel
    1-D convolutions (kernel sizes 4 and 5, `nb_filter` filters each), each
    followed by max pooling with pool size 2 -> concatenation along axis 1
    -> dropout 0.15 -> recurrent layer (class taken from the module-level
    `RNN`) -> max-norm constrained ReLU Dense -> dropout 0.1 -> one sigmoid
    unit.

    Reads the module-level names `maxlen`, `max_features`, `embedding_dims`,
    `embedding_matrix`, `nb_filter`, `RNN`, `rnn_output_size` and
    `hidden_dims`. The kernel sizes are fixed here; the module-level
    `filter_length` is not used.

    Returns:
        The compiled Keras Model (binary cross-entropy, Adadelta).
    """
    main_input = Input(shape=(maxlen, ), dtype='int32', name='main_input')
    embedding = Embedding(max_features, embedding_dims,
                          weights=[embedding_matrix], input_length=maxlen,
                          name='embedding', trainable=False)(main_input)

    embedding = Dropout(0.50)(embedding)

    # Keras 2 keyword names (filters / kernel_size / padding / strides /
    # pool_size) replace the Keras 1 spellings nb_filter / filter_length /
    # border_mode / subsample_length / pool_length.
    conv4 = Convolution1D(filters=nb_filter,
                          kernel_size=4,
                          padding='valid',
                          activation='relu',
                          strides=1,
                          name='conv4')(embedding)
    maxConv4 = MaxPooling1D(pool_size=2,
                            name='maxConv4')(conv4)

    conv5 = Convolution1D(filters=nb_filter,
                          kernel_size=5,
                          padding='valid',
                          activation='relu',
                          strides=1,
                          name='conv5')(embedding)
    maxConv5 = MaxPooling1D(pool_size=2,
                            name='maxConv5')(conv5)

    x = keras.layers.Concatenate(axis=1)([maxConv4, maxConv5])

    x = Dropout(0.15)(x)

    x = RNN(rnn_output_size)(x)

    x = Dense(hidden_dims, activation='relu', kernel_initializer='he_normal',
              kernel_constraint=maxnorm(3), bias_constraint=maxnorm(3),
              name='mlp')(x)

    x = Dropout(0.10, name='drop')(x)

    output = Dense(1, kernel_initializer='he_normal',
                   activation='sigmoid', name='output')(x)

    model = Model(inputs=main_input, outputs=output)
    model.compile(loss={'output': 'binary_crossentropy'},
                  optimizer=Adadelta(learning_rate=0.95, epsilon=1e-06),
                  metrics=["accuracy"])
    return model

## CNN+BiLSTM

In [None]:
def build_CNN_BiLSTM():
  """Build and compile the convolution + bidirectional-LSTM classifier.

  Architecture: pre-trained embedding -> two parallel 1-D convolutions
  (kernel sizes 4 and 5, `nb_filter` filters each), each followed by max
  pooling with pool size 2 -> concatenation along axis 1 -> dropout 0.15 ->
  bidirectional LSTM (300 units per direction, sequences returned) ->
  time-distributed Dense(300) -> flatten -> Dense(300) -> 2-way softmax.

  Reads the module-level names `maxlen`, `max_features`, `embedding_dims`,
  `embedding_matrix` and `nb_filter`.

  Returns:
    The compiled Keras Model (sparse categorical cross-entropy, Adam).
  """
  inputs = Input(shape=(maxlen,))
  # The embedding width has to equal the column count of embedding_matrix, so
  # it comes from embedding_dims; a literal 300 does not fit the configured
  # embedding size and makes the weight assignment fail.
  embedding = Embedding(max_features, embedding_dims,
                        weights=[embedding_matrix],
                        input_length=maxlen)(inputs)

  # Keras 2 keyword names replace the Keras 1 spellings nb_filter /
  # filter_length / border_mode / subsample_length / pool_length.
  conv4 = Convolution1D(filters=nb_filter,
                        kernel_size=4,
                        padding='valid',
                        activation='relu',
                        strides=1,
                        name='conv4')(embedding)
  maxConv4 = MaxPooling1D(pool_size=2,
                          name='maxConv4')(conv4)

  conv5 = Convolution1D(filters=nb_filter,
                        kernel_size=5,
                        padding='valid',
                        activation='relu',
                        strides=1,
                        name='conv5')(embedding)
  maxConv5 = MaxPooling1D(pool_size=2,
                          name='maxConv5')(conv5)

  x = keras.layers.Concatenate(axis=1)([maxConv4, maxConv5])

  x = Dropout(0.15)(x)

  x = Bidirectional(LSTM(300, return_sequences=True, dropout=0.8),
                    merge_mode='concat')(x)
  x = TimeDistributed(Dense(300, activation='relu'))(x)
  x = Flatten()(x)
  x = Dense(300, activation='relu')(x)
  output = Dense(2, activation='softmax')(x)

  model = Model(inputs, output)
  model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',
                metrics=['accuracy'])
  return model


## BiLSTM 

In [None]:
def build_BiLSTM_1_1():
  """Build and compile a bidirectional-LSTM classifier with a 2-way softmax head.

  Layers: pre-trained embedding -> bidirectional LSTM (300 units per
  direction, sequences returned, dropout `drop_out_value`) ->
  time-distributed Dense(300) -> flatten -> Dense(300) -> Dense(2, softmax).
  Sizes and weights come from the module-level `maxlen`, `max_features`,
  `embedding_dims` and `embedding_matrix`.

  Returns:
    The compiled Keras Model (sparse categorical cross-entropy, Adam).
  """
  token_ids = Input(shape=(maxlen,))
  embedded = Embedding(
      max_features,
      embedding_dims,
      weights=[embedding_matrix],
      input_length=maxlen,
  )(token_ids)

  recurrent = Bidirectional(
      LSTM(300, return_sequences=True, dropout=drop_out_value),
      merge_mode='concat',
  )
  features = recurrent(embedded)
  features = TimeDistributed(Dense(300, activation='relu'))(features)
  features = Flatten()(features)
  features = Dense(300, activation='relu')(features)
  class_probs = Dense(2, activation='softmax')(features)

  network = Model(token_ids, class_probs)
  network.compile(
      loss='sparse_categorical_crossentropy',
      optimizer='adam',
      metrics=['accuracy'],
  )
  return network

# final model
def build_BiLSTM_1_2():
  """Build and compile the L2-regularised bidirectional-LSTM binary classifier.

  Layers: pre-trained embedding -> bidirectional LSTM (300 units per
  direction, sequences returned, dropout `drop_out_value`, L2(0.01) on the
  kernel) -> time-distributed Dense(300) -> flatten -> Dense(1, sigmoid).
  Sizes and weights come from the module-level `maxlen`, `max_features`,
  `embedding_dims` and `embedding_matrix`.

  Returns:
    The compiled Keras Model (binary cross-entropy, Adam).
  """
  token_ids = Input(shape=(maxlen,))
  embedded = Embedding(
      max_features,
      embedding_dims,
      weights=[embedding_matrix],
      input_length=maxlen,
  )(token_ids)

  recurrent = Bidirectional(
      LSTM(
          300,
          return_sequences=True,
          dropout=drop_out_value,
          kernel_regularizer=l2(0.01),
      ),
      merge_mode='concat',
  )
  features = recurrent(embedded)
  features = TimeDistributed(Dense(300, activation='relu'))(features)
  features = Flatten()(features)
  # An extra Dense(300, relu) layer before the output is commented out in this variant.
  probability = Dense(1, activation='sigmoid')(features)

  network = Model(token_ids, probability)
  network.compile(
      loss='binary_crossentropy',
      optimizer='adam',
      metrics=['accuracy'],
  )
  return network

def build_BiLSTM_1_3():
  """Build and compile a stacked (unidirectional) LSTM binary classifier.

  Layers: pre-trained embedding -> LSTM(512) -> LSTM(256), both returning
  sequences with dropout `drop_out_value` and L2(0.01) on the kernel ->
  time-distributed Dense(300) -> flatten -> Dense(1, sigmoid).
  Sizes and weights come from the module-level `maxlen`, `max_features`,
  `embedding_dims` and `embedding_matrix`.

  Returns:
    The compiled Keras Model (binary cross-entropy, Adam).
  """
  token_ids = Input(shape=(maxlen,))
  embedded = Embedding(
      max_features,
      embedding_dims,
      weights=[embedding_matrix],
      input_length=maxlen,
  )(token_ids)

  features = embedded
  # Two stacked LSTM layers, the wider one first.
  for units in (512, 256):
    features = LSTM(
        units,
        return_sequences=True,
        dropout=drop_out_value,
        kernel_regularizer=l2(0.01),
    )(features)
  features = TimeDistributed(Dense(300, activation='relu'))(features)
  features = Flatten()(features)
  # An extra Dense(300, relu) layer before the output is commented out in this variant.
  probability = Dense(1, activation='sigmoid')(features)

  network = Model(token_ids, probability)
  network.compile(
      loss='binary_crossentropy',
      optimizer='adam',
      metrics=['accuracy'],
  )
  return network

# BiLSTM with dropout regularization
def build_BiLSTM_2_1():
  """Build and compile a Sequential bidirectional-LSTM binary classifier.

  Layers: pre-trained embedding -> bidirectional LSTM (300 units per
  direction, dropout `drop_out_value` inside the LSTM) -> Dense(1, sigmoid).
  Sizes and weights come from the module-level `max_features`,
  `embedding_dims`, `maxlen` and `embedding_matrix`.

  Returns:
    The compiled Keras Sequential model (binary cross-entropy, Adam).
  """
  network = Sequential([
      Embedding(
          max_features,
          embedding_dims,
          input_length=maxlen,
          weights=[embedding_matrix],
      ),
      Bidirectional(LSTM(300, dropout=drop_out_value), merge_mode='concat'),
      # A separate Dropout(drop_out_value) layer is commented out in this variant.
      Dense(1, activation='sigmoid'),
  ])
  network.compile(
      optimizer='adam',
      loss='binary_crossentropy',
      metrics=['accuracy'],
  )
  return network

# BiLSTM with l2 regularization
def build_BiLSTM_2_2():
  """Build and compile a Sequential bidirectional-LSTM classifier with L2 penalties.

  Layers: pre-trained embedding -> bidirectional LSTM (300 units per
  direction, L2(0.01) on kernel, recurrent kernel and bias) ->
  Dropout(`drop_out_value`) -> Dense(1, sigmoid).
  Sizes and weights come from the module-level `max_features`,
  `embedding_dims`, `maxlen` and `embedding_matrix`.

  Returns:
    The compiled Keras Sequential model (binary cross-entropy, Adam).
  """
  regularised_lstm = LSTM(
      300,
      kernel_regularizer=l2(0.01),
      recurrent_regularizer=l2(0.01),
      bias_regularizer=l2(0.01),
  )
  network = Sequential([
      Embedding(
          max_features,
          embedding_dims,
          input_length=maxlen,
          weights=[embedding_matrix],
      ),
      Bidirectional(regularised_lstm, merge_mode='concat'),
      Dropout(drop_out_value),
      Dense(1, activation='sigmoid'),
  ])
  network.compile(
      optimizer='adam',
      loss='binary_crossentropy',
      metrics=['accuracy'],
  )
  return network


# Train and Test

In [None]:
def Train_Test_Model(model,X_train, X_test, y_train, y_test):
  """Train `model` one epoch at a time and report accuracy on the test split.

  Runs `nb_epoch` single-epoch `fit` calls (batch size `batch_size`,
  verbosity `verbosity`, all module-level), using the test split as the
  validation data. Whenever the validation accuracy of an epoch is at least
  the best seen so far, the model is re-evaluated on the test split and that
  accuracy is recorded.

  The per-epoch curves are stored in the module-level lists `acc`,
  `val_acc`, `loss` and `val_loss`, which is where Plot_graphs() reads them.

  NOTE(review): validation and test data are the same split here, so the
  "best epoch" choice is made on the test data.

  Args:
    model: compiled Keras model; trained in place.
    X_train, y_train: training inputs and targets.
    X_test, y_test: held-out inputs and targets.

  Returns:
    (y_test, predictions): the targets passed in and `model.predict(X_test)`
    computed after the last epoch.
  """
  # Published at module level on purpose: Plot_graphs() takes no arguments.
  global acc, val_acc, loss, val_loss

  print('Training and Testing...')
  acc, val_acc, loss, val_loss = [], [], [], []
  test_accs = []
  best_val_acc = 0
  best_test_acc = 0

  for j in range(nb_epoch):
    a = time.time()
    his = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    # Keras documents validation_data as a tuple (x_val, y_val).
                    validation_data=(X_test, y_test),
                    shuffle=True,
                    epochs=1, verbose=verbosity)
    acc += his.history['accuracy']
    val_acc += his.history['val_accuracy']
    loss += his.history['loss']
    val_loss += his.history['val_loss']

    if his.history['val_accuracy'][0] >= best_val_acc:
      _, test_acc = model.evaluate(X_test, y_test,
                                   batch_size=batch_size,
                                   verbose=2)
      best_val_acc = his.history['val_accuracy'][0]
      best_test_acc = test_acc
      print('Got best epoch  best val acc is %f test acc is %f' %
            (best_val_acc, best_test_acc))
      if len(test_accs) > 0:
        print('Current avg test acc:', str(np.mean(test_accs)))

    b = time.time()
    cost = b - a
    left = (nb_epoch - j - 1)
    print('One round cost %ds, %d round %ds %dmin left' % (cost, left,
                                                          cost * left,
                                                          cost * left / 60.0))
    test_accs.append(best_test_acc)

  # Only the final predictions are returned, so predict once after training
  # (this also keeps `predictions` defined when nb_epoch is 0).
  predictions = model.predict(X_test, batch_size=batch_size, verbose=0)

  print('Avg test acc:', str(np.mean(test_accs)))
  return y_test,predictions

# Cross Validation

In [None]:
def Do_Cross_Validation(model,X,y):
  """Run shuffled k-fold cross-validation of `model` on (X, y) and print the scores.

  The fold count, batch size, epoch count, verbosity and validation split
  are read from the module-level `folds`, `batch_size`, `nb_epoch`,
  `verbosity` and `validation_split`.

  Args:
    model: compiled Keras model; it is trained in place and ends up holding
      the weights of the last fold.
    X: inputs, indexable with the integer index arrays KFold yields.
    y: targets, indexed the same way.

  Returns:
    None. Per-fold and average loss/accuracy are printed.
  """
  # Define per-fold score containers
  acc_per_fold = []
  loss_per_fold = []

  kfold = KFold(n_splits=folds, shuffle=True)

  # Every fold has to start from the same weights; otherwise a later fold is
  # scored on samples the model was already trained on in an earlier fold.
  initial_weights = model.get_weights()

  for fold_no, (train, test) in enumerate(kfold.split(X, y), start=1):
    model.set_weights(initial_weights)
    # NOTE(review): only the layer weights are reset; optimizer state carries
    # over between folds. Rebuild the model per fold if that matters.

    # Generate a print
    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')

    # Fit data to model
    model.fit(X[train], y[train],
              batch_size=batch_size,
              shuffle=True,
              epochs=nb_epoch,
              verbose=verbosity,
              validation_split=validation_split)

    # Generate generalization metrics
    scores = model.evaluate(X[test], y[test], verbose=0)
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
    acc_per_fold.append(scores[1] * 100)
    loss_per_fold.append(scores[0])

  # == Provide average scores ==
  print('------------------------------------------------------------------------')
  print('Score per fold')
  for i, (fold_loss, fold_acc) in enumerate(zip(loss_per_fold, acc_per_fold)):
    print('------------------------------------------------------------------------')
    print(f'> Fold {i+1} - Loss: {fold_loss} - Accuracy: {fold_acc}%')
  print('------------------------------------------------------------------------')
  print('Average scores for all folds:')
  print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
  print(f'> Loss: {np.mean(loss_per_fold)}')
  print('------------------------------------------------------------------------')

# Plot Graphs

In [None]:
def Plot_graphs():
  """Plot accuracy and loss curves per epoch, one figure each.

  Reads the module-level sequences `acc`, `val_acc`, `loss` and `val_loss`.
  The training curve is drawn in red and the validation curve in blue.
  A new figure is opened after each plot.
  """
  epochs = range(len(acc))  # one x position per recorded epoch

  def _draw(train_curve, val_curve, title, y_label, legend):
    # Draw one train/validation pair, then open a fresh figure for whatever comes next.
    plt.plot(epochs, train_curve, 'r')
    plt.plot(epochs, val_curve, 'b')
    plt.title(title)
    plt.xlabel("Epochs")
    plt.ylabel(y_label)
    plt.legend(legend)
    plt.figure()

  # Training and validation accuracy per epoch
  _draw(acc, val_acc, 'Training and validation accuracy', "Accuracy",
        ["Accuracy", "Validation Accuracy"])

  # Training and validation loss per epoch
  _draw(loss, val_loss, 'Training and validation loss', "Loss",
        ["Loss", "Validation Loss"])

  # Expected Output
  # A chart where the validation loss does not increase sharply!

# Main

In [None]:
# Build the embedding matrix from the trained embedding model (this also pickles
# it to embedding_matrix_path). Use the commented-out loader below instead to
# reuse a matrix that was pickled earlier.
embedding_matrix = generate_embedding_metrix()
# embedding_matrix = load_word_embedding_atrix()

In [None]:
# Hyper-parameters; the model builders and the train/validation helpers read
# these as module-level names.
batch_size = 32 # 64, 128
nb_filter = 200
filter_length = 4 # test with 2,3,4,5 (the CNN builders currently fix their kernel sizes to 4 and 5)
hidden_dims = nb_filter * 2
nb_epoch = 15
RNN = LSTM  # recurrent layer class used by build_RNN_model / build_CNN_RNN_model
rnn_output_size = embedding_size
folds = 3
# Every model's Input layer is sized with maxlen, so it has to equal the width
# the documents were actually padded to. Deriving it from padded_docs avoids a
# shape mismatch at fit time; to try other lengths, change the maxlen passed
# to pad_sequences instead.
maxlen = padded_docs.shape[1]
max_features = embedding_matrix.shape[0] #vocab_size
embedding_dims = embedding_size
drop_out_value = 0.5 #0.8 #0.3
verbosity = 1
validation_split = 0.2

In [None]:
# Build the network to train; any of the build_* functions above can be used here.
model = build_RNN_model()

In [None]:
# Train on the train split; returns the test targets and the model's
# predictions for the test inputs.
labels, predictions = Train_Test_Model(model,X_train, X_test, y_train, y_test)
# Do_Cross_Validation(model,padded_docs,comment_labels)

In [None]:
# precision, recall, fscore, support = precision_recall_fscore_support(labels, predictions)
# print('precision: {}'.format(precision))
# print('recall: {}'.format(recall))
# print('fscore: {}'.format(fscore))
# print('support: {}'.format(support))

In [None]:
# predictions[2]
# y_test[2]