In [None]:
# Vocabulary-window settings used when the IMDB dataset is loaded further down.
m = 300_000  # Number of words in the vocabulary
n = 300      # N most frequent words to skip
k = 30_000   # K least frequent words to skip

In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): `df` is not defined in any cell visible above this one —
# confirm it is created elsewhere before this cell runs.

# First cut: 20% of the texts are held out, the rest is used for training.
train_data, temp_data = train_test_split(
    df['text'],
    test_size=0.2,
    random_state=42,
)

# Second cut: the held-out part is divided again, with 70% of it going to the test set.
dev_data, test_data = train_test_split(
    temp_data,
    test_size=0.7,
    random_state=42,
)

# Report how many samples ended up in each partition.
for caption, partition in (
    ("Train data size:", train_data),
    ("Dev data size:", dev_data),
    ("Test data size:", test_data),
):
    print(caption, len(partition))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

# Load the IMDB reviews as integer sequences, limited to `m - k` words and
# skipping the `n` most frequent ones.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(
    num_words=m-k,
    skip_top=n,
)
word_index = tf.keras.datasets.imdb.get_word_index()

# Ids in the sequences are offset by 3 from the word index; ids 0-2 are
# mapped to the special tokens below.
index2word = {rank + 3: token for token, rank in word_index.items()}
index2word.update({0: '[pad]', 1: '[bos]', 2: '[oov]'})

# Turn every review back into a single space-separated string.
x_train = np.array([' '.join(map(index2word.__getitem__, review)) for review in x_train])
x_test = np.array([' '.join(map(index2word.__getitem__, review)) for review in x_test])

In [None]:
# Total number of whitespace-separated tokens over all training documents.
train_doc_length = sum(len(str(doc).split()) for doc in x_train)

print('\nTraining data average document length =', (train_doc_length / len(x_train)))

In [None]:
# Text-vectorisation settings: vocabulary cap and fixed output length.
VOCAB_SIZE = 100_000
SEQ_MAX_LENGTH = 240

vectorizer = tf.keras.layers.TextVectorization(
    name='vector_text',
    max_tokens=VOCAB_SIZE,
    ngrams=1,
    output_mode='int',
    output_sequence_length=SEQ_MAX_LENGTH,
)

# Build the vocabulary from the training texts, with the op pinned to the CPU.
with tf.device('/CPU:0'):
  vectorizer.adapt(x_train)

In [None]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score, roc_curve, roc_auc_score, auc
from matplotlib import pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from seaborn import heatmap
from pandas import DataFrame
import IPython.display as ipd


def classification_data_nn(estimator, 
                          x_train, y_train,
                          x_test, y_test, 
                          epochs=1, 
                          num_layers=1,
                          emb_size=64,
                          h_size=64,
                          batch_size=64,
                          splits=5):
  """
  Collects learning-curve metrics for a neural-network estimator.

  The training data is cut into `splits` equal parts. For every cumulative
  prefix of those parts a fresh estimator is built, fitted, and scored on
  both the prefix and the full test set.

  Arguments:
    estimator: A class (or factory) accepting `num_layers`, `emb_size` and `h_size`,
      whose instances expose `get_model()` returning an object with `fit` and `predict`.
    x_train, y_train: Training inputs and labels; their length must be divisible by `splits`
      (a requirement of `np.split`).
    x_test, y_test: Test inputs and labels.
    epochs: Number of epochs passed to `fit` for every prefix.
    num_layers, emb_size, h_size: Forwarded to the estimator constructor.
    batch_size: Batch size passed to `fit`.
    splits: Number of equal parts the training data is divided into.
  Returns:
    A dictionary with the estimator name, `splits`, `split_size`, the rounded
    test predictions of the last fit (`test_predictions`), the unrounded ones
    (`test_predictions_cont`), and one list per train/test metric
    (accuracy, precision, recall, f1) with one entry per prefix.
  """
  metrics = {'accuracy': accuracy_score,
             'precision': precision_score,
             'recall': recall_score,
             'f1': f1_score}
  # Same key order as before: train_<metric>, test_<metric> for each metric.
  history = {prefix + '_' + name: [] for name in metrics for prefix in ('train', 'test')}

  split_size = int(len(x_train) / splits)
  x_splits = np.split(x_train, splits)
  y_splits = np.split(y_train, splits)

  for i in range(len(x_splits)):
    # Cumulative training prefix: parts 0..i.
    curr_x = np.concatenate(x_splits[:i + 1], axis=0)
    curr_y = np.concatenate(y_splits[:i + 1], axis=0)

    # A new estimator per prefix so that no weights carry over between fits.
    new_estimator = estimator(num_layers=num_layers, emb_size=emb_size, h_size=h_size)
    model = new_estimator.get_model()
    model.fit(curr_x, curr_y, epochs=epochs, batch_size=batch_size)

    train_pred = np.round(model.predict(curr_x))
    # Keep the raw scores: they are returned as 'test_predictions_cont'.
    test_scores = model.predict(x_test)
    test_pred = np.round(test_scores)

    for name, metric in metrics.items():
      history['train_' + name].append(metric(curr_y, train_pred))
      history['test_' + name].append(metric(y_test, test_pred))

  # `__name__` may be a plain attribute or a method on the estimator instance;
  # resolve it to a string either way so it can be used in plot titles.
  estimator_name = getattr(new_estimator, '__name__', None)
  if callable(estimator_name):
    estimator_name = estimator_name()
  if estimator_name is None:
    estimator_name = type(new_estimator).__name__

  return {'estimator': estimator_name, 
          'splits': splits,
          'split_size': split_size, 
          'test_predictions': test_pred,
          'test_predictions_cont': test_scores,
          **history}

def classification_data(estimator, 
                          x_train, y_train,
                          x_test, y_test,
                          splits = 5):
  """
  Collects learning-curve metrics for an estimator exposing `fit`/`predict`.

  The training data is cut into `splits` equal parts; the estimator is refitted
  on each cumulative prefix of those parts and scored on the prefix and on the
  full test set.

  Arguments:
    estimator: An object with `fit(x, y)` and `predict(x)` methods.
    x_train, y_train: Training inputs and labels (length must be divisible by `splits`).
    x_test, y_test: Test inputs and labels.
    splits: Number of equal parts the training data is divided into.
  Returns:
    A dictionary with the estimator class name, `split_size`, `splits`, the test
    predictions of the last fit, and one list per train/test metric
    (accuracy, precision, recall, f1) with one entry per prefix.
  """
  scorers = (('accuracy', accuracy_score),
             ('precision', precision_score),
             ('recall', recall_score),
             ('f1', f1_score))

  # One train list and one test list per metric, filled prefix by prefix.
  curves = {}
  for metric_name, _ in scorers:
    curves['train_' + metric_name] = []
    curves['test_' + metric_name] = []

  split_size = int(len(x_train) / splits)
  x_chunks = np.split(x_train, splits)
  y_chunks = np.split(y_train, splits)

  curr_x = curr_y = None
  for x_part, y_part in zip(x_chunks, y_chunks):
    # Grow the training prefix by one more chunk.
    if curr_x is None:
      curr_x, curr_y = x_part, y_part
    else:
      curr_x = np.concatenate((curr_x, x_part), axis=0)
      curr_y = np.concatenate((curr_y, y_part), axis=0)

    # Refit on the current prefix, then predict on it and on the test set.
    estimator.fit(curr_x, curr_y)
    train_pred = estimator.predict(curr_x)
    test_pred = estimator.predict(x_test)

    for metric_name, scorer in scorers:
      curves['train_' + metric_name].append(scorer(curr_y, train_pred))
      curves['test_' + metric_name].append(scorer(y_test, test_pred))

  # Everything the plotting/table helpers need.
  summary = {'estimator': estimator.__class__.__name__, 
             'split_size': split_size, 
             'splits': splits,
             'test_predictions': test_pred}
  summary.update(curves)
  return summary
  
def classification_plots(classification_data, full_scale=False):
  """
  Plots the train/test learning curves (accuracy, precision, recall, F1)
  of one classifier on a 2x2 grid.

  Arguments:
    classification_data: Dictionary holding 'estimator', 'split_size', 'splits'
      and the 'train_<metric>' / 'test_<metric>' lists.
    full_scale: When true, the y-axis of every panel is fixed to [0, 1.02].
  Returns:
    The figure containing the four panels.
  """
  split_size = classification_data['split_size']
  splits = classification_data['splits']
  # x positions: cumulative number of training samples after each split.
  sample_counts = list(range(split_size, splits*split_size + split_size, split_size))

  figure, axis = plt.subplots(2, 2, figsize=(6, 6), dpi=100, gridspec_kw={'width_ratios': [1, 1], 'height_ratios': [1, 1]})
  figure.suptitle("Learning Curve for {estimator}".format(estimator=classification_data['estimator']), fontsize=16)

  metric_titles = ['Accuracy', 'Precision', 'Recall', 'F1']
  # `axis.flat` walks the grid row by row, matching the order of the titles.
  for panel, title in zip(axis.flat, metric_titles):
    suffix = title.lower().replace(' ', '_')
    panel.set_title(title)
    panel.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
    if full_scale:
      panel.axis(ymin=0, ymax=1.02)
    panel.plot(sample_counts, classification_data['train_' + suffix], '-', color="#2c8dc9", label="Training")
    panel.plot(sample_counts, classification_data['test_' + suffix], '-', color="#FFAD00", label="Testing")
    panel.grid(alpha = 0.3)

  # A single shared legend, taken from the bottom-right panel.
  handles, legend_labels = axis[1, 1].get_legend_handles_labels()
  figure.legend(handles, legend_labels, loc='lower center', bbox_to_anchor=(0.5, -0.05), ncol=2, fancybox=True, shadow=True)
  figure.tight_layout()

  return figure


def classification_table(classification_data):
  """
  Builds a table of the rounded train/test metrics, one row per training-set size.

  Arguments:
    classification_data: Dictionary holding 'split_size' and the
      'train_<metric>' / 'test_<metric>' lists (accuracy, precision, recall, f1).
  Returns:
    A DataFrame whose columns are the metrics rounded to two decimals and whose
    index is the cumulative number of training samples (split_size, 2*split_size, ...).
  """
  split_size = classification_data['split_size']

  # (column title, key in classification_data), in display order.
  columns = (('Train Accuracy', 'train_accuracy'),
             ('Test Accuracy', 'test_accuracy'),
             ('Precision Train', 'train_precision'),
             ('Precision Test', 'test_precision'),
             ('Recall Train', 'train_recall'),
             ('Recall Test', 'test_recall'),
             ('F1 Train', 'train_f1'),
             ('F1 Test', 'test_f1'))
  data = {title: np.round(classification_data[key], 2) for title, key in columns}

  # The index is derived from the data itself rather than from the notebook-global
  # `x_train`, so the table stays correct whatever data set the metrics came from.
  n_rows = len(classification_data['train_accuracy'])
  index = [split_size * step for step in range(1, n_rows + 1)]

  return DataFrame(data=data, index=index)

def classification_plots_compare(classification_data_x, classification_data_y, full_scale=False):
  """
  Draws the train/test learning curves (accuracy, precision, recall, F1)
  of two classifiers together on a 2x2 grid.

  Arguments:
    classification_data_x: Metrics dictionary of the first classifier; its
      'split_size' and 'splits' define the x positions for all curves.
    classification_data_y: Metrics dictionary of the second classifier.
    full_scale: When true, the y-axis of every panel is fixed to [0, 1.02].
  Returns:
    The figure containing the four panels with both classifiers' curves.
  """
  split_size = classification_data_x['split_size']
  splits = classification_data_x['splits']
  # x positions: cumulative number of training samples after each split.
  sample_counts = list(range(split_size, splits*split_size + split_size, split_size))

  figure, axis = plt.subplots(2, 2, figsize=(6, 6), dpi=100, gridspec_kw={'width_ratios': [1, 1], 'height_ratios': [1, 1]})
  figure.suptitle("Learning Curve Comparison for {estimator} against {estimator_2} ".format(estimator=classification_data_x['estimator'], estimator_2=classification_data_y['estimator']), fontsize=12)

  metric_titles = ['Accuracy', 'Precision', 'Recall', 'F1']
  # Drawing order per panel: second classifier first, then the first classifier.
  curves = (
    (classification_data_y, 'train_', "#AD49C2", "Training {estimator}"),
    (classification_data_y, 'test_', "#7CC249", "Testing {estimator}"),
    (classification_data_x, 'train_', "#2c8dc9", "Training {estimator}"),
    (classification_data_x, 'test_', "#FFAD00", "Testing {estimator}"),
  )

  # `axis.flat` walks the grid row by row, matching the order of the titles.
  for panel, title in zip(axis.flat, metric_titles):
    suffix = title.lower().replace(' ', '_')
    panel.set_title(title)
    panel.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
    if full_scale:
      panel.axis(ymin=0, ymax=1.02)
    for source, prefix, colour, template in curves:
      panel.plot(sample_counts, source[prefix + suffix], '-', color=colour, label=template.format(estimator=source['estimator']))
    panel.grid(alpha = 0.3)

  # A single shared legend, taken from the bottom-right panel.
  handles, legend_labels = axis[1, 1].get_legend_handles_labels()
  figure.legend(handles, legend_labels, loc='lower center', bbox_to_anchor=(0.5, -0.1), ncol=2, fancybox=True, shadow=True)
  figure.tight_layout()
  return figure


def roc_curve_plot(y_pred_cont, name, y_true=None):
  """
  Plots the ROC curve of a classifier together with the no-skill diagonal.

  Arguments:
    y_pred_cont: Continuous prediction scores for the evaluated samples.
    name: Classifier name shown in the plot title.
    y_true: True labels matching `y_pred_cont`. When omitted, the notebook-level
      `y_test` is used, which is what this function always did before.
  Returns:
    The figure containing the ROC curve; the legend shows the area under it.
  """
  if y_true is None:
    # Backward-compatible default: score against the global test labels.
    y_true = y_test

  fpr, tpr, _ = roc_curve(y_true, y_pred_cont)
  roc_auc = auc(fpr, tpr)

  figure, axis = plt.subplots(1, 1, figsize=(6, 6), dpi=100)
  axis.plot(fpr, tpr, color='#2c8dc9', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
  axis.plot([0, 1], [0, 1], color='#FFAD00', lw=2, linestyle='--', label='No Skill')
  axis.set_xlim([0.0, 1.0])
  axis.set_ylim([0.0, 1.05])
  axis.set_xlabel('False Positive Rate')
  axis.set_ylabel('True Positive Rate')
  axis.set_title('Receiver operating characteristic for {estimator}'.format(estimator=name))
  axis.legend(loc="lower right")
  axis.grid(alpha = 0.3)

  return figure

def loss_plot(loss, val_loss, name):
  """
  Plots training and validation loss against the epoch number.

  Arguments:
    loss: Sequence of training-loss values, one per epoch.
    val_loss: Sequence of validation-loss values, plotted over the same epochs.
    name: Estimator name shown in the plot title.
  Returns:
    The figure containing both loss curves.
  """
  figure, axis = plt.subplots(1, 1, figsize=(6, 6), dpi=100)

  # Epochs are numbered from 1.
  epochs = range(1, len(loss)+1)
  for values, colour, caption in ((loss, '#2c8dc9', 'Loss'),
                                  (val_loss, '#FFAD00', 'Validation Loss')):
    axis.plot(epochs, values, color=colour, lw=2, label=caption)

  axis.set_xlabel('Epoch')
  axis.set_ylabel('Loss')
  axis.set_title('Loss over Epochs for {estimator}'.format(estimator=name))
  axis.legend(loc="upper right")
  axis.grid(alpha = 0.3)

  return figure

In [None]:
class bigru_rnn():
    """
    Wrapper that builds and compiles a stacked bidirectional-GRU binary text classifier.

    The network takes raw strings, runs them through the notebook-level
    `vectorizer`, embeds the resulting ids, applies `num_layers` bidirectional
    GRU layers followed by dropout, and ends in a single sigmoid unit.
    """
    
    def __init__(self, num_layers=1, emb_size=64, h_size=64):
        """
        Builds and compiles the Keras model.

        Arguments:
            num_layers: Number of stacked bidirectional GRU layers.
            emb_size: Output dimension of the embedding layer.
            h_size: Number of units of every GRU.
        """
        self.num_layers = num_layers
        self.emb_size = emb_size
        self.h_size = h_size
        
        inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string, name='txt_input')
        x = vectorizer(inputs)
        x = tf.keras.layers.Embedding(input_dim=len(vectorizer.get_vocabulary()),
                                        output_dim=self.emb_size, name='word_embeddings',
                                        mask_zero=True)(x)
        
        last_layer = self.num_layers - 1
        for layer_idx in range(self.num_layers):
            # Every layer except the last returns the full sequence so the
            # next GRU can consume it; the last one returns only its final state.
            gru = tf.keras.layers.GRU(units=self.h_size,
                                      name=f'bigru_cell_{layer_idx}',
                                      return_sequences=(layer_idx != last_layer),
                                      dropout=0.2)
            x = tf.keras.layers.Bidirectional(gru)(x)
        x = tf.keras.layers.Dropout(rate=0.5)(x)
         
        o = tf.keras.layers.Dense(units=1, activation='sigmoid', name='lr')(x)
        
        self.model = tf.keras.models.Model(inputs=inputs, outputs=o, name='biGRU_RNN')
        self.model.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.BinaryCrossentropy())

    def __name__(self):
        """Returns the display name of this estimator. This is a method, so on an instance it must be called."""
        return 'biGRU RNN'
    
    def fit(self, x_train, y_train, epochs=1, batch_size=64, validation_split=0):
        """
        Trains the wrapped model.

        Arguments:
            x_train, y_train: Training inputs and labels.
            epochs: Number of training epochs.
            batch_size: Batch size.
            validation_split: Fraction of the training data held out for validation.
        Returns:
            The value returned by the wrapped model's `fit` (the Keras training
            history), so that the recorded losses can be inspected afterwards.
        """
        return self.model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split)
    
    def get_model(self):
        """Returns the compiled Keras model."""
        return self.model
      
# Build a single-layer instance and show its architecture.
loss_rnn = bigru_rnn(num_layers=1, emb_size=64, h_size=64)
model = loss_rnn.get_model()
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

In [None]:
# Cluster weight layer.
# NOTE(review): this layer is unfinished — see the notes inside `call`.
class CustomLayer(tf.keras.layers.Layer):
    """Keras layer meant to scale its input by a per-cluster weight (work in progress)."""

    def __init__(self):
        super(CustomLayer, self).__init__()
        
    def call(self, input_1, user_id, training=False):
        """
        Multiplies `input_1` by `weight` and returns the product.

        Intended behaviour, as described by the original author: multiply the
        output of the previous layer by a coefficient that depends on the
        cluster of the user who wrote the tweet.

        Arguments:
            input_1: The output of the previous layer.
            user_id: The id of the user that made the tweet; the cluster is
                meant to be determined from it.
            training: Not used in the body.

        Planned lookup, not yet active:
            weight = cluster.get(user_id)
        """
        # NOTE(review): `weight` is never assigned in this method (the lookup above
        # exists only as text), and `user_id` is not used. Unless a module-level
        # `weight` is defined elsewhere, this line raises NameError — TODO implement.
        x = input_1*weight

        
        return x
