In [16]:
# built in modules
import pandas as pd
import numpy as np
from sklearn import metrics 

# Can use this library to reload a specific module if the notebook can't see changes in the imported module
import importlib
from torch import nn
from keras.models import Sequential
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, LSTM, Dropout

import os 
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from utils import stance_classes, category_classes
import random as rnd
from arabert.preprocess import ArabertPreprocessor
from transformers import AutoTokenizer

In [214]:
t = pd.read_csv('./Dataset/classification_train_sample1.csv')
d = pd.read_csv('./Dataset/dev.csv')

In [217]:
X_beforeTokenization = t['text']
yc = t['category']
yc_val = yc.replace(category_classes)
# we need values to be from 0 to 9 instead of from 1 to 10
yc_val -=1

X_beforeTokenization_dev = d['text']
yc_dev = d['category']
yc_val_dev = yc_dev.replace(category_classes)
yc_val_dev -=1


In [219]:
model_name="aubmindlab/bert-base-arabertv02-twitter"
arabert_prep = ArabertPreprocessor(model_name=model_name)
X = X_beforeTokenization.apply(arabert_prep.preprocess)
X_val = X_beforeTokenization_dev.apply(arabert_prep.preprocess)

In [220]:
def tokenizing(x):
        input_ids = []
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        for i in range(len(x)):
        # tokenize each sentence using bert's tokenizer
        # the tokenizer returns a batch encoding, which is derived from a dictionary
        # this dictionary holds the various inputs needed by the model of such tokenizer.
                encoded_sentence = tokenizer.encode_plus(
                        text=x[i],                      # Preprocess sentence
                        add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
                        max_length=64,                  # Max length to truncate/pad # tuned?
                                                        # The model is trained on a sequence length of 64, using max length beyond 64 might result in degraded performance
                        padding='max_length',           # Pad sentence to max length
                        return_attention_mask=True,     # Return attention mask
                        truncation = True)
                input_ids.append(encoded_sentence.get('input_ids'))
        input_ids = np.array(input_ids)
        return input_ids

In [221]:
# refrence token to number
input_ids_train = tokenizing(X)
input_ids_test = tokenizing(X_val)

In [222]:
# all words in the dataset 
lst = [word for x in input_ids_train for word in x]

# unique words
vocab = list(set(lst))
vocab_lengthing = len(vocab)

# map ids of train/test to numbers from 0 to vocab_lengthing in order to be consistent with embedding matrix
for i in range(len(vocab)):
    input_ids_train[input_ids_train==vocab[i]]=i
    input_ids_test[input_ids_test==vocab[i]]=i

# map unknowns (not found in train) in test data to a token id value -> vocab_lengthing
input_ids_test[input_ids_test > vocab_lengthing] = vocab_lengthing 


In [224]:
class LSTMDataset(torch.utils.data.Dataset):

  def __init__(self, x, y):
    
    self.tensor_x = torch.tensor(x)
    self.tensor_y = torch.tensor(y)
    
  def __len__(self):
    return len(self.tensor_x)

  def __getitem__(self, idx):
    return self.tensor_x[idx], self.tensor_y[idx]


In [225]:
dataset_train = LSTMDataset(input_ids_train, yc_val)
dataset_test = LSTMDataset(input_ids_test, yc_val_dev)

In [226]:
class LSTM(nn.Module):
  def __init__(self, vocab_size=35181, embedding_dim=300, hidden_size=50, n_classes=len(list(category_classes.keys()))):

    super(LSTM, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_size,num_layers = 3, batch_first=True)
    self.linear = nn.Linear(hidden_size, n_classes)

  def forward(self, sentences):

    final_output = None
    sentences = self.embedding(sentences)
    sentences, _ = self.lstm(sentences)
    sentences = torch.mean(sentences, 1)
    sentences = self.linear(sentences)
    final_output = sentences

    return final_output

In [227]:
# all words in the dataset 
lst = [word for x in input_ids_train for word in x]

# unique words
vocab = set(lst)
model = LSTM(vocab_size = len(vocab)+2)


In [228]:

def train_lstm(model, train_dataset, val_dataset, criterion, optimizer, classes_names, n_classes=3, batch_size=256, epochs=30):

  # dataloader divides dataset into batches 
  train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
  val_dataloader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=True)
  
  # create a dictionary that holds both train and validation
  dataloader = {
      "train": train_dataloader,
      "val": val_dataloader}

  # GPU configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  for epoch_num in range(epochs):

      for phase in ['train', 'val']:
          if phase == 'train':  # put the model in training mode
              model.train()
          else:
              # put the model in validation mode, in order not to update parameters in dropout.
              model.eval()

          # keep track of training and validation loss    
          total_acc_train = 0
          total_loss_train = 0
          train_labels = []
          train_preds = []
        
          for train_input, train_label in dataloader[phase]:
              # train_input: sentences
              # move data to the device
              train_label = train_label.to(device)
              train_input = train_input.to(device)

              # do the forward pass
              output = model(train_input)
              
              # The docs of the loss function expects:
              # input of shape: (batch_size, n_classes)
              # target of shape: (btach_size)
              # loss calculation
              batch_loss = criterion(output, train_label)

              # append the batch loss to the total_loss_train
              total_loss_train += batch_loss
              
              # calculate the batch accuracy (just add the number of correct predictions)
              # torch.Tensor.item(): Returns the value of this tensor as a standard Python number. This only works for tensors with one element.
              train_pred = torch.argmax(output, dim=-1)

              num_correct_predictions = (train_pred == train_label).sum().item()
              acc = num_correct_predictions
              total_acc_train += acc

              if phase == 'train':
                  # zero your gradients
                  optimizer.zero_grad()
                  # do the backward pass
                  batch_loss.backward()
                  # update the weights with your optimizer
                  optimizer.step()
              
              # move data to cpu then numpy so you can make use of sklearn metric functions
              train_labels += list(train_label.to('cpu').detach().numpy())
              train_preds += list(train_pred.to('cpu').detach().numpy())
              
          # calculate epoch's loss
          # len(train_dataset) will call the __len__ of the LSTMDataset
          # will return the number of sentences in the dataset
          if phase == 'train':
            sentences_count = len(train_dataset)
          else:
            sentences_count = len(val_dataset)

          # Measuring performance
          # calculate epoch's accuracy and loss
          epoch_loss = total_loss_train / sentences_count
          epoch_acc = total_acc_train / sentences_count
          report = metrics.classification_report(train_labels, train_preds, target_names=classes_names, digits=4, output_dict=True)
          
          print(f'Epochs: {epoch_num + 1} | {phase} Loss: {epoch_loss} | {phase} Accuracy: {epoch_acc} | {phase} macro avg persision: {report["macro avg"]}\n')
          
          if epoch_num % 5==0:
            # calculate the classification report each 5 epochs
            report = metrics.classification_report(train_labels, train_preds, target_names=classes_names, digits=4)
            print(f'Classification Report: {report}\n')

In [229]:
# make the criterion cross entropy loss, calculate error
criterion = torch.nn.CrossEntropyLoss()

# create the optimizer (Adam), updates the weights
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

In [230]:
# batch: as bulks, bulk of sentences, each iter, update their weights each iter 
# parameters in the model apply the gradient discend (derivative of loss/weights)
train_lstm(model, dataset_train, dataset_test, criterion, optimizer,list(category_classes.keys()) , n_classes=10, batch_size=256, epochs=20)
 

Epochs: 1 | train Loss: 0.006494984962046146 | train Accuracy: 0.40940265486725663 | train macro avg persision: {'precision': 0.3937346956479821, 'recall': 0.40940265486725663, 'f1-score': 0.3918878446332858, 'support': 36160}

Classification Report:               precision    recall  f1-score   support

   info_news     0.2072    0.0606    0.0937      3616
   celebrity     0.4814    0.5998    0.5341      3616
        plan     0.2589    0.4311    0.3235      3616
    requests     0.4873    0.5354    0.5102      3616
      rumors     0.5015    0.4967    0.4991      3616
      advice     0.5045    0.3844    0.4364      3616
restrictions     0.6163    0.7270    0.6671      3616
    personal     0.2608    0.1822    0.2146      3616
   unrelated     0.3922    0.4923    0.4366      3616
      others     0.2273    0.1845    0.2036      3616

    accuracy                         0.4094     36160
   macro avg     0.3937    0.4094    0.3919     36160
weighted avg     0.3937    0.4094    0.3919  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epochs: 3 | val Loss: 0.006853986997157335 | val Accuracy: 0.382 | val macro avg persision: {'precision': 0.250784252300955, 'recall': 0.274414400710857, 'f1-score': 0.23337982267723953, 'support': 1000}

Epochs: 4 | train Loss: 0.001135127735324204 | train Accuracy: 0.9142699115044248 | train macro avg persision: {'precision': 0.912832880078237, 'recall': 0.9142699115044248, 'f1-score': 0.9111479592796522, 'support': 36160}

Epochs: 4 | val Loss: 0.006690312176942825 | val Accuracy: 0.502 | val macro avg persision: {'precision': 0.33857363839220367, 'recall': 0.3366458972896854, 'f1-score': 0.32625321008361274, 'support': 1000}

Epochs: 5 | train Loss: 0.0008016804931685328 | train Accuracy: 0.9430862831858408 | train macro avg persision: {'precision': 0.9428647883789946, 'recall': 0.9430862831858405, 'f1-score': 0.9419181272132766, 'support': 36160}

Epochs: 5 | val Loss: 0.006829148158431053 | val Accuracy: 0.522 | val macro avg persision: {'precision': 0.3083513789089788, 'recall':

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epochs: 6 | val Loss: 0.006842697970569134 | val Accuracy: 0.563 | val macro avg persision: {'precision': 0.2880470782239698, 'recall': 0.27380071404457157, 'f1-score': 0.27537916206453666, 'support': 1000}

Classification Report:               precision    recall  f1-score   support

   info_news     0.6755    0.6532    0.6642       545
   celebrity     0.7959    0.8069    0.8014       145
        plan     0.1961    0.2439    0.2174        82
    requests     0.2000    0.1000    0.1333        20
      rumors     0.0000    0.0000    0.0000        15
      advice     0.2500    0.1000    0.1429        10
restrictions     0.0000    0.0000    0.0000         2
    personal     0.3759    0.4141    0.3941       128
   unrelated     0.3514    0.3611    0.3562        36
      others     0.0357    0.0588    0.0444        17

    accuracy                         0.5630      1000
   macro avg     0.2880    0.2738    0.2754      1000
weighted avg     0.5675    0.5630    0.5641      1000


Epochs: 7

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epochs: 8 | val Loss: 0.007557963021099567 | val Accuracy: 0.544 | val macro avg persision: {'precision': 0.2679782394525751, 'recall': 0.2523651673641064, 'f1-score': 0.25605826339970217, 'support': 1000}

Epochs: 9 | train Loss: 0.0003873550449497998 | train Accuracy: 0.9749446902654867 | train macro avg persision: {'precision': 0.9750106246381852, 'recall': 0.9749446902654867, 'f1-score': 0.9747940754716635, 'support': 36160}

Epochs: 9 | val Loss: 0.0076604881323874 | val Accuracy: 0.564 | val macro avg persision: {'precision': 0.27878885394265784, 'recall': 0.2619369953307671, 'f1-score': 0.26523450441040974, 'support': 1000}

Epochs: 10 | train Loss: 0.00032908120192587376 | train Accuracy: 0.9785674778761062 | train macro avg persision: {'precision': 0.9786443179793766, 'recall': 0.9785674778761061, 'f1-score': 0.9784459442527036, 'support': 36160}

Epochs: 10 | val Loss: 0.007947646081447601 | val Accuracy: 0.576 | val macro avg persision: {'precision': 0.2969678950885137, 'rec

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epochs: 15 | val Loss: 0.009128120727837086 | val Accuracy: 0.543 | val macro avg persision: {'precision': 0.25969413829231197, 'recall': 0.24462705071184293, 'f1-score': 0.24936070104438718, 'support': 1000}

Epochs: 16 | train Loss: 0.00023354259610641748 | train Accuracy: 0.9838772123893805 | train macro avg persision: {'precision': 0.9839767997231281, 'recall': 0.9838772123893806, 'f1-score': 0.983828481954068, 'support': 36160}

Classification Report:               precision    recall  f1-score   support

   info_news     0.9771    0.9342    0.9552      3616
   celebrity     0.9857    0.9939    0.9898      3616
        plan     0.9787    0.9895    0.9840      3616
    requests     0.9526    0.9903    0.9711      3616
      rumors     0.9931    0.9900    0.9916      3616
      advice     0.9978    1.0000    0.9989      3616
restrictions     0.9997    1.0000    0.9999      3616
    personal     0.9851    0.9674    0.9761      3616
   unrelated     0.9856    0.9842    0.9849      361

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epochs: 17 | val Loss: 0.009295482188463211 | val Accuracy: 0.559 | val macro avg persision: {'precision': 0.2938909539379725, 'recall': 0.25413254189782486, 'f1-score': 0.2620735481780056, 'support': 1000}

Epochs: 18 | train Loss: 0.0002312069700565189 | train Accuracy: 0.9837112831858407 | train macro avg persision: {'precision': 0.9838431591255727, 'recall': 0.9837112831858408, 'f1-score': 0.9836868406769007, 'support': 36160}



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epochs: 18 | val Loss: 0.009379194118082523 | val Accuracy: 0.541 | val macro avg persision: {'precision': 0.295377890278957, 'recall': 0.26935257754784125, 'f1-score': 0.27539045445407195, 'support': 1000}

Epochs: 19 | train Loss: 0.00024391648184973747 | train Accuracy: 0.9826603982300885 | train macro avg persision: {'precision': 0.9827976129614303, 'recall': 0.9826603982300884, 'f1-score': 0.9826249287718343, 'support': 36160}

Epochs: 19 | val Loss: 0.009657694958150387 | val Accuracy: 0.548 | val macro avg persision: {'precision': 0.2996579963146685, 'recall': 0.2622089839413104, 'f1-score': 0.2752394746952012, 'support': 1000}

Epochs: 20 | train Loss: 0.00023179441632237285 | train Accuracy: 0.9838772123893805 | train macro avg persision: {'precision': 0.9840150684363582, 'recall': 0.9838772123893806, 'f1-score': 0.9838495622595348, 'support': 36160}

Epochs: 20 | val Loss: 0.009658045135438442 | val Accuracy: 0.57 | val macro avg persision: {'precision': 0.2736323947656779, '

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
