<a href="https://colab.research.google.com/github/HananeNourMoussa/darija-ner/blob/master/Experiment_2_BERT_RNN_Dense_Layer(Softmax).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting Experiment Parameters:

In [None]:
# Candidate pretrained language models (set `lm` and config_default["lm"] below to one of these):
# UBC-NLP/ARBERT
# UBC-NLP/MARBERT
# bert-base-multilingual-cased
# SI2M-Lab/DarijaBERT
# CAMeL-Lab/bert-base-arabic-camelbert-da
# CAMeL-Lab/bert-base-arabic-camelbert-msa
# CAMeL-Lab/bert-base-arabic-camelbert-mix
# aubmindlab/bert-base-arabertv02

In [None]:
# Default hyperparameters handed to wandb.init() by train(); the training loop reads
# `epochs` back through wandb.config.
# NOTE(review): the model/optimizer setup cell hard-codes matching values (256, 1, 0.3, 0.2,
# 3e-5, 1e-4, 5 epochs) rather than reading them from this dict - keep the two in sync.
config_default = {
    'epochs' : 5,
    'batch_size': 8,
    'lr_bert': 3e-5,   # learning rate for the BERT encoder parameters
    'lr_rest':1e-4,    # learning rate for the recurrent and linear layers
    'hidden_size': 256,
    'num_layers': 1,
    'eps': 1e-8,
    "lm": "aubmindlab/bert-base-arabertv02",
    "p1":0.3,          # passed as hidden_dropout_prob to the BERT config by the model classes
    "p2": 0.2,         # dropout applied to the BERT output before the recurrent layer
    "dataset": "mixedNERcorp"
}

In [None]:
# Sequence length (in subword tokens) every sentence is padded/truncated to.
MAX_LEN = 100
# Hugging Face model id used for both the tokenizer and the BERT encoder.
lm = "aubmindlab/bert-base-arabertv02"
# Batch size of the training and validation DataLoaders.
batch_size = 8
# When True the model classes disable gradients for all BERT parameters.
freeze_bert = False

In [None]:
#GPU: capture the output lines of `nvidia-smi` through the IPython shell escape
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
#an output containing 'failed' is treated as "no GPU attached"
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Apr 20 00:53:21 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
#RAM: report the total memory of the runtime in gigabytes (bytes / 1e9)
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

#anything below 20 GB is reported as a standard (non high-RAM) runtime
if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


# Installations and Imports:

In [None]:
# Install the third-party packages imported below (quiet mode).
# NOTE(review): versions are not pinned, so a re-run may pull different releases.
!pip install tokenizers -q
!pip install transformers -q
!pip install seqeval -q
!pip install torch -q
!pip install wandb -Uq

In [None]:
import pandas as pd
import transformers
from transformers import BertForTokenClassification, AutoTokenizer, AdamW
from transformers import get_linear_schedule_with_warmup
from google.colab import drive
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from transformers import BertTokenizer, BertConfig
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from seqeval.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report
import numpy as np
from tqdm import tqdm, trange
from statistics import mean
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch import LongTensor
import wandb
import os
import random

In [None]:
#ensure deterministic behavior
#The seeds were previously derived from hash() of string literals. Python salts str hashes per
#process (PYTHONHASHSEED), so every kernel restart produced different seeds, and
#`hash(...) % 2**32 - 1` can evaluate to -1, which np.random.seed() rejects.
#A fixed integer seed is repeatable across sessions.
SEED = 42
torch.backends.cudnn.deterministic = True
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
#fall back to the CPU when no CUDA device is visible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [None]:
# Authenticate this runtime with Weights & Biases before any run is started.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mhanane-n-moussa[0m ([33mdarijaner[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

# Data Preprocessing:

In [None]:
#mount Google Drive, read the training corpus and discard the saved index column
drive.mount('/content/gdrive/', force_remount=True)
dataset = pd.read_csv(
    "/content/gdrive/MyDrive/annotated_corpus/ner-corpus-darija/MixedNERcorp_train.csv"
).drop(columns=["Unnamed: 0"])
dataset

Mounted at /content/gdrive/


Unnamed: 0,Sentence,Token,Tag
0,0,Uppsala,B-LOC
1,0,),O
2,0,هيّا,O
3,0,رابع,O
4,0,أكبر,O
...,...,...,...
177821,5957,في,O
177822,5957,مختلف,O
177823,5957,أنحاء,O
177824,5957,المصنع,O


In [None]:
# Load the tokenizer that belongs to the language model selected in `lm`.
tokenizer = AutoTokenizer.from_pretrained(lm)

In [None]:
class SentenceGetter(object):
  """Groups a token-level dataframe into sentences of (token, tag) pairs.

  `data` must have the columns "Sentence", "Token" and "Tag". After construction,
  `sentences` holds one list of (token, tag) tuples per distinct "Sentence" value,
  in the order produced by `groupby`.
  """
  def __init__(self, data):
    #start from the first sentence
    self.n_sent = 1
    self.data = data
    self.empty = False
    #aggregate token and tag
    agg_func = lambda s:[(to, ta) for to, ta in zip (s["Token"].values.tolist(),
                                                     s["Tag"].values.tolist())]
    self.grouped = self.data.groupby("Sentence").apply(agg_func)
    #make list of sentences
    self.sentences = [s for s in self.grouped]
  def get_next(self):
    """Return the sentence numbered `n_sent` and advance the counter, or None if absent.

    The id is looked up first as the string "Sentence: N" and then as the plain
    integer N, so corpora with integer sentence ids work too. Previously only the string
    form was tried and a bare `except` turned every miss (and any other error) into None.
    The counter starts at 1, so a sentence whose id is 0 is never reached by this method.
    """
    for key in ("Sentence: {}".format(self.n_sent), self.n_sent):
      if key in self.grouped.index:
        #move to next sentence
        self.n_sent += 1
        return self.grouped.loc[key]
    return None

In [None]:
# Group the token-level dataframe into per-sentence lists of (token, tag) pairs.
getter = SentenceGetter(dataset)

In [None]:
#keep only the token of every (token, tag) pair, sentence by sentence
sentences = [[token for token, _ in sentence] for sentence in getter.sentences]
sentences[0]

['Uppsala',
 ')',
 'هيّا',
 'رابع',
 'أكبر',
 'مدينة',
 'ف',
 'سّويد',
 'من',
 'بعد',
 'سطوكهولم',
 '،',
 'ݣوتنبورݣ',
 'ؤ',
 'مالمو',
 '.']

In [None]:
#drop every token that is not a string (a nan value in the corpus causes errors later)
sentences_ = [
    [item for item in sentence if type(item) == str]
    for sentence in sentences
]

In [None]:
#getting tags
#Tokens that are not strings (nan) are removed from the sentences, so the tag of such a token
#must be skipped too; otherwise every following tag in that sentence is shifted by one
#position when tokens and tags are zipped together.
labels = [[tag for token, tag in sentence if type(token) == str] for sentence in getter.sentences]
print(labels[0])

['B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'O', 'B-LOC', 'O']


In [None]:
#adding PAD to tags and assigning label numbers
#The tags are sorted so the tag -> index mapping is identical in every session. Iterating a
#set of strings gives a different order per kernel, which would silently mismatch the output
#layer of a checkpoint saved in an earlier session. key=str keeps the sort from failing
#should a non-string value (e.g. nan) appear in the column.
tag_values = sorted(set(dataset["Tag"].values), key=str)
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [None]:
#split every word into subwords and repeat the word's label for each of them
def tokenize_and_preserve_labels(sentence, text_labels):
    """Tokenize `sentence` word by word with the global `tokenizer`.

    Returns a pair (subword_tokens, subword_labels) of equal length: each word's label is
    repeated once per subword the word is broken into. A word that yields no subwords
    contributes nothing to either list.
    """
    subword_tokens, subword_labels = [], []

    for word, label in zip(sentence, text_labels):
        pieces = tokenizer.tokenize(word)
        subword_tokens.extend(pieces)
        # one copy of the word-level label per produced subword
        subword_labels.extend(label for _ in range(len(pieces)))

    return subword_tokens, subword_labels

In [None]:
#tokenize every (sentence, labels) pair; each result is a (subword tokens, subword labels) tuple
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(sentences_, labels)
]

In [None]:
#separate the subword tokens from their labels (`labels` is rebound to the subword-level labels)
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [None]:
#convert text to input ids, then pad (with id 0) or truncate every sentence at the end to MAX_LEN
#NOTE(review): assumes the tokenizer's padding id is 0 - confirm when switching `lm`
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [None]:
#encode the tags as indices and pad/truncate them to MAX_LEN, filling padded positions with PAD
#NOTE(review): the inputs are built with tokenize()/convert_tokens_to_ids(), so no [CLS]/[SEP]
#positions appear to be added here; PAD only marks padding - confirm
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [None]:
#attention mask: 1.0 for every position whose input id is non-zero, 0.0 for the zero-padded ones
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

In [None]:
#80/20 train/validation split; ids, tags and masks are split in one call so they stay aligned
(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks) = train_test_split(input_ids, tags, attention_masks,
                                         random_state=2018, test_size=0.2)

In [None]:
#turn every split into a PyTorch tensor
tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks = (
    torch.tensor(split)
    for split in (tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks)
)

In [None]:
# Training batches are drawn in random order.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the dataset order.
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)

# Model Definitions:

### BERT-BiLSTM-Softmax:

In [None]:
class BERT_BiLSTM_SM(nn.Module):
    """BERT encoder -> dropout -> bidirectional LSTM -> linear layer -> log-softmax.

    Args:
        hidden_size: hidden size of each LSTM direction.
        num_layers: number of stacked LSTM layers.
        p1: `hidden_dropout_prob` of the BERT configuration.
        p2: dropout probability applied to the BERT output.
        num_classes: size of the output layer (defaults to the number of tags incl. PAD).

    The global `lm` selects the pretrained encoder and the global `freeze_bert` decides
    whether the encoder parameters receive gradients.
    """
    def __init__(self, hidden_size, num_layers, p1, p2, num_classes=len(tag2idx)):
        super().__init__()
        config = transformers.BertConfig.from_pretrained(lm, hidden_dropout_prob=p1)
        self.bert = transformers.BertModel.from_pretrained(lm, config = config)
        input_size = self.bert.config.hidden_size
        self.bilstm = nn.LSTM(hidden_size = hidden_size, input_size = input_size, bidirectional = True, num_layers = num_layers, batch_first = True)
        # both directions are concatenated, hence hidden_size*2 input features
        self.fc = nn.Linear(hidden_size*2, num_classes)
        self.drop = nn.Dropout(p2)
        # the encoder is trainable unless freeze_bert is set
        for param in self.bert.parameters():
            param.requires_grad = not freeze_bert
    def forward(self, input_ids, attention_mask = None):
        """Return log-probabilities of shape (batch * seq_len, num_classes)."""
        s = self.bert(input_ids = input_ids, attention_mask=attention_mask)
        s = s['last_hidden_state']
        s = self.drop(s)
        s, _ = self.bilstm(s)
        # flatten batch and sequence dimensions so every row is one token
        s = s.reshape(-1, s.shape[2])
        s = self.fc(s)
        # explicit class dimension; the implicit choice is deprecated and warns on every call
        return F.log_softmax(s, dim=1)

### BERT-BiGRU-Softmax:

In [None]:
class BERT_BiGRU_SM(nn.Module):
    """BERT encoder (no pooling layer) -> dropout -> bidirectional GRU -> linear -> log-softmax.

    Args:
        hidden_size: hidden size of each GRU direction.
        num_layers: number of stacked GRU layers.
        p1: `hidden_dropout_prob` of the BERT configuration.
        p2: dropout probability applied to the BERT output.
        num_classes: size of the output layer (defaults to the number of tags incl. PAD).

    The global `lm` selects the pretrained encoder and the global `freeze_bert` decides
    whether the encoder parameters receive gradients.
    """
    def __init__(self, hidden_size, num_layers, p1, p2, num_classes=len(tag2idx)):
        super().__init__()
        config = transformers.BertConfig.from_pretrained(lm, hidden_dropout_prob=p1)
        self.bert = transformers.BertModel.from_pretrained(lm, config = config, add_pooling_layer = False)
        input_size = self.bert.config.hidden_size
        self.gru = nn.GRU(hidden_size = hidden_size, input_size = input_size, bidirectional = True, num_layers = num_layers, batch_first = True)
        # both directions are concatenated, hence hidden_size*2 input features
        self.fc = nn.Linear(hidden_size*2, num_classes)
        self.drop = nn.Dropout(p2)
        # the encoder is trainable unless freeze_bert is set
        for param in self.bert.parameters():
            param.requires_grad = not freeze_bert
    def forward(self, input_ids, attention_mask = None):
        """Return log-probabilities of shape (batch * seq_len, num_classes)."""
        s = self.bert(input_ids = input_ids, attention_mask=attention_mask)
        s = s['last_hidden_state']
        s = self.drop(s)
        s, _ = self.gru(s)
        # flatten batch and sequence dimensions so every row is one token
        s = s.reshape(-1, s.shape[2])
        s = self.fc(s)
        # explicit class dimension; the implicit choice is deprecated and warns on every call
        return F.log_softmax(s, dim=1)

# Training and Validation:

In [None]:
# Build the model, optimizer and LR schedule from config_default so that the values logged to
# wandb are the ones actually used (they were hard-coded copies before; the numbers are the same).
model = BERT_BiLSTM_SM(hidden_size = config_default['hidden_size'],
                       num_layers = config_default['num_layers'],
                       p1 = config_default['p1'], p2 = config_default['p2'])
# use the detected device instead of .cuda(), which raises on a CPU-only runtime
model.to(device)
# separate learning rates for the pretrained encoder and the freshly initialised layers
optimizer = optim.AdamW([
        {'params': model.bert.parameters(), 'lr': config_default['lr_bert']},
        {'params': model.bilstm.parameters(), 'lr': config_default['lr_rest']},
        {'params': model.fc.parameters(), 'lr': config_default['lr_rest']}
                        ], eps = config_default['eps'])
# the schedule must span exactly the number of optimizer steps the training loop performs
total_steps = len(train_dataloader) * config_default['epochs']
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def get_metrics(true, pred):
  """Score `pred` against `true` with the imported metric functions.

  Returns the tuple (accuracy, f1, precision, recall, classification report), each
  obtained by calling the corresponding function as f(true, pred).
  """
  scorers = (accuracy_score, f1_score, precision_score, recall_score, classification_report)
  return tuple(scorer(true, pred) for scorer in scorers)

#### RNN Code:

In [None]:
def evaluate_model(model, dataset, total_loss = 0):
  """Run `model` over `dataset` without gradients and collect tag sequences.

  Args:
      model: callable returning per-token scores of shape (batch * seq_len, num_classes).
      dataset: iterable of (input_ids, attention_mask, labels) batches with a length.
      total_loss: starting value of the loss accumulator.

  Returns:
      (pred, true, avg_loss): `pred` and `true` are lists with one list of tag strings per
      sentence, restricted to positions whose gold tag is not "PAD"; `avg_loss` is the
      accumulated loss divided by the number of batches.

  Differences from the previous version:
    * sentences are kept as separate sequences instead of being concatenated into a single
      one, so an entity can no longer be merged across a sentence boundary by the scorer;
    * a "PAD" predicted on a real token is reported as "O": the scorer otherwise parses the
      string "PAD" as an entity tag of type "AD", which showed up as a phantom class;
    * the batch shape is taken from `labels` rather than from the global MAX_LEN.
  """
  model.eval()
  pred, true = [], []
  with torch.no_grad():
      for batch in dataset:
        input_ids, input_mask, labels = (t.to(device) for t in batch)
        logits = model(input_ids, input_mask)
        total_loss += criterion(logits, labels.view(-1)).item()
        # restore (batch, seq_len, classes) and take the best class per token
        pred_ids = logits.view(labels.shape[0], labels.shape[1], -1).argmax(dim=2).cpu().numpy()
        label_ids = labels.cpu().numpy()
        for pred_row, label_row in zip(pred_ids, label_ids):
          sent_pred, sent_true = [], []
          for p_i, l_i in zip(pred_row, label_row):
            gold = tag_values[l_i]
            if gold == "PAD":
              continue
            guess = tag_values[p_i]
            sent_pred.append("O" if guess == "PAD" else guess)
            sent_true.append(gold)
          pred.append(sent_pred)
          true.append(sent_true)
  avg_loss = total_loss / len(dataset)
  return pred, true, avg_loss

In [None]:
# Per-epoch history, appended to by train().
training_loss = []
validation_loss = []
tr_accuracies, val_accuracies = [], []
tr_f1s, val_f1s = [], []
tr_precisions, val_precisions = [], []
tr_recalls, val_recalls = [], []
criterion = nn.CrossEntropyLoss().to(device)
def train(config = None):
  """Fine-tune the global `model` and log per-epoch metrics to Weights & Biases.

  Args:
      config: optional hyperparameter dict for the wandb run; `config_default` is used
          when omitted (the argument used to be ignored). Only `epochs` is read back here.

  Each epoch runs one pass over `train_dataloader`, then evaluates on the training and
  validation loaders, prints the metrics, appends them to the module-level history lists
  and logs them with a single wandb.log call so that one wandb step equals one epoch.
  """
  run_config = config_default if config is None else config
  with wandb.init(project="final-results", config=run_config):
    config = wandb.config
    # Train the model
    for epoch in trange(config.epochs, desc = "Epoch"):
        # Training phase
        model.train()
        train_loss = 0
        for batch in train_dataloader:
            model.zero_grad()
            input_ids, input_mask, labels = (t.to(device) for t in batch)
            logits = model(input_ids, input_mask)
            loss = criterion(logits, labels.view(-1))
            loss.backward()
            train_loss += loss.item()
            #torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
            optimizer.step()
            scheduler.step()
        avg_train_loss = train_loss / len(train_dataloader)
        print("Average train loss: {}".format(avg_train_loss))
        training_loss.append(avg_train_loss)
        #performance on training
        tr_pred, tr_true, _ = evaluate_model(model, train_dataloader)
        tr_acc, tr_f1, tr_prec, tr_recall, tr_report = get_metrics(tr_true, tr_pred)
        print("Training Accuracy: {}".format(tr_acc))
        tr_accuracies.append(tr_acc)
        print("Training F1-Score: {}".format(tr_f1))
        tr_f1s.append(tr_f1)
        print("Training Precision: {}".format(tr_prec))
        tr_precisions.append(tr_prec)
        print("Training Recall: {}".format(tr_recall))
        tr_recalls.append(tr_recall)
        print("Training Classification Report:\n {}".format(tr_report))
        print()
        # performance on validation
        val_pred, val_true, avg_val_loss = evaluate_model(model, valid_dataloader)
        validation_loss.append(avg_val_loss)
        val_acc, val_f1, val_prec, val_recall, val_report = get_metrics(val_true, val_pred)
        print("Validation Accuracy: {}".format(val_acc))
        val_accuracies.append(val_acc)
        print("Validation F1-Score: {}".format(val_f1))
        val_f1s.append(val_f1)
        print("Validation Precision: {}".format(val_prec))
        val_precisions.append(val_prec)
        print("Validation Recall: {}".format(val_recall))
        val_recalls.append(val_recall)
        print("Validation Classification Report:\n {}".format(val_report))
        # gap between training and validation F1
        overfitting = abs(tr_f1-val_f1)
        # one log call per epoch keeps all metrics of an epoch on the same wandb step
        wandb.log({
            'train_loss': avg_train_loss,
            'train_acc': tr_acc,
            'train_f1': tr_f1,
            'train_prec': tr_prec,
            'train_recall': tr_recall,
            'validation_loss': avg_val_loss,
            'val_acc': val_acc,
            'val_f1': val_f1,
            'val_prec': val_prec,
            'val_recall': val_recall,
            'overfitting': overfitting,
            "epoch": epoch,
        })
        print()
  #model.save(os.path.join(wandb.run.dir, "model.h5"))
#wandb.agent(sweep_id, train)
train()

  return F.log_softmax(s)


Average train loss: 0.20523667425517267


  _warn_prf(average, modifier, msg_start, len(result))


Training Accuracy: 0.9117629037792265
Training F1-Score: 0.4201214963231376
Training Precision: 0.45670811771066655
Training Recall: 0.3889619786870149
Training Classification Report:
               precision    recall  f1-score   support

          AD       0.00      0.00      0.00         0
         LOC       0.55      0.70      0.62      6509
        MISC       0.00      0.00      0.00      3239
         ORG       0.00      0.00      0.00      2014
         PER       0.29      0.39      0.34      3440

   micro avg       0.46      0.39      0.42     15202
   macro avg       0.17      0.22      0.19     15202
weighted avg       0.30      0.39      0.34     15202




  _warn_prf(average, modifier, msg_start, len(result))
Epoch:  20%|██        | 1/5 [02:32<10:08, 152.01s/it]

Validation Accuracy: 0.910935192582371
Validation F1-Score: 0.4011291460832745
Validation Precision: 0.4396658415841584
Validation Recall: 0.36880352971710356
Validation Classification Report:
               precision    recall  f1-score   support

          AD       0.00      0.00      0.00         0
         LOC       0.53      0.70      0.60      1509
        MISC       0.00      0.00      0.00       833
         ORG       0.00      0.00      0.00       564
         PER       0.30      0.38      0.34       947

   micro avg       0.44      0.37      0.40      3853
   macro avg       0.17      0.22      0.19      3853
weighted avg       0.28      0.37      0.32      3853


Average train loss: 0.09099750991773185
Training Accuracy: 0.9460198029508358
Training F1-Score: 0.614332892998679
Training Precision: 0.6168589998673564
Training Recall: 0.6118273911327456
Training Classification Report:
               precision    recall  f1-score   support

          AD       0.00      0.00     

Epoch:  40%|████      | 2/5 [05:03<07:35, 151.92s/it]

Validation Accuracy: 0.9379766141331978
Validation F1-Score: 0.5709019091507571
Validation Precision: 0.5793693212185996
Validation Recall: 0.5626784323903452
Validation Classification Report:
               precision    recall  f1-score   support

          AD       0.00      0.00      0.00         0
         LOC       0.63      0.80      0.71      1509
        MISC       0.45      0.30      0.36       833
         ORG       0.47      0.49      0.48       564
         PER       0.62      0.46      0.53       947

   micro avg       0.58      0.56      0.57      3853
   macro avg       0.44      0.41      0.41      3853
weighted avg       0.57      0.56      0.55      3853


Average train loss: 0.06596324190453645
Training Accuracy: 0.9547693740502916
Training F1-Score: 0.6961379488504262
Training Precision: 0.6836864138018521
Training Recall: 0.7090514405999211
Training Classification Report:
               precision    recall  f1-score   support

          AD       0.00      0.00    

Epoch:  60%|██████    | 3/5 [07:37<05:05, 152.80s/it]

Validation Accuracy: 0.9452635146585324
Validation F1-Score: 0.6483854433623784
Validation Precision: 0.6403442166540116
Validation Recall: 0.6566311964702829
Validation Classification Report:
               precision    recall  f1-score   support

          AD       0.00      0.00      0.00         0
         LOC       0.72      0.83      0.77      1509
        MISC       0.51      0.52      0.52       833
         ORG       0.61      0.56      0.59       564
         PER       0.63      0.55      0.59       947

   micro avg       0.64      0.66      0.65      3853
   macro avg       0.50      0.49      0.49      3853
weighted avg       0.64      0.66      0.64      3853


Average train loss: 0.05364660121580186
Training Accuracy: 0.9609639233370914
Training F1-Score: 0.7734927690477681
Training Precision: 0.737876254180602
Training Recall: 0.8127220102618077
Training Classification Report:
               precision    recall  f1-score   support

          AD       0.00      0.00     

Epoch:  80%|████████  | 4/5 [10:10<02:32, 152.90s/it]

Validation Accuracy: 0.9479022925896338
Validation F1-Score: 0.7142153239714216
Validation Precision: 0.6797186400937867
Validation Recall: 0.7524007267064625
Validation Classification Report:
               precision    recall  f1-score   support

          AD       0.00      0.00      0.00         0
         LOC       0.75      0.86      0.80      1509
        MISC       0.50      0.59      0.54       833
         ORG       0.58      0.66      0.61       564
         PER       0.82      0.78      0.80       947

   micro avg       0.68      0.75      0.71      3853
   macro avg       0.53      0.58      0.55      3853
weighted avg       0.69      0.75      0.72      3853


Average train loss: 0.04641156950684252
Training Accuracy: 0.9680224008627029
Training F1-Score: 0.8092706376747
Training Precision: 0.7841322721944599
Training Recall: 0.8360742007630575
Training Classification Report:
               precision    recall  f1-score   support

          AD       0.00      0.00      0

Epoch: 100%|██████████| 5/5 [12:43<00:00, 152.72s/it]

Validation Accuracy: 0.9549471033965188
Validation F1-Score: 0.7455413212760612
Validation Precision: 0.7223168654173765
Validation Recall: 0.7703088502465611
Validation Classification Report:
               precision    recall  f1-score   support

          AD       0.00      0.00      0.00         0
         LOC       0.80      0.86      0.83      1509
        MISC       0.56      0.59      0.58       833
         ORG       0.62      0.70      0.66       564
         PER       0.82      0.82      0.82       947

   micro avg       0.72      0.77      0.75      3853
   macro avg       0.56      0.60      0.58      3853
weighted avg       0.73      0.77      0.75      3853







0,1
epoch,▁▁▁▁▁▁▁▁▃▃▃▃▃▃▃▃▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆████████
overfitting,▁▅▆▇█
train_acc,▁▅▆▇█
train_f1,▁▄▆▇█
train_loss,█▃▂▁▁
train_prec,▁▄▆▇█
train_recall,▁▄▆██
val_acc,▁▅▆▇█
val_f1,▁▄▆▇█
val_prec,▁▄▆▇█

0,1
epoch,4.0
overfitting,0.06373
train_acc,0.96802
train_f1,0.80927
train_loss,0.04641
train_prec,0.78413
train_recall,0.83607
val_acc,0.95495
val_f1,0.74554
val_prec,0.72232


# Evaluation on Test Set:

### On DarNERcorp_test:

In [None]:
#torch.save(model.state_dict(), '/content/gdrive/MyDrive/Capstone/CAMeL_MIX_GRU_EX2')

In [None]:
#model = BERT_BiLSTM_SM(hidden_size = 256, num_layers = 1, p1 = 0.3, p2 = 0.2)
#model.load_state_dict(torch.load('/content/gdrive/MyDrive/Capstone/AraBERT_LSTM_EX2', map_location=device))

In [None]:
# Load the DarNER test split, join all of its tokens into one space-separated string and
# encode that string in a single call. The resulting id list is far longer than the model's
# 512-position limit (hence the tokenizer warning) and is consumed in 512-id slices later.
darner_test = pd.read_csv("/content/gdrive/MyDrive/annotated_corpus/ner-corpus-darija/DarNERcorp_test.csv")
test_darner_set = ' '.join(darner_test["Token"])
tokenized_sentence = tokenizer.encode(test_darner_set)

Token indices sequence length is longer than the specified maximum sequence length for this model (17957 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Move the model to the detected device; .cuda() raises on a CPU-only runtime.
model.to(device)

BERT_BiLSTM_SM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.3, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [None]:
new_labels = []
new_tokens = []
# Dropout must be off for inference; without this a model freshly loaded from a checkpoint
# would still be in training mode.
model.eval()
# Walk the encoded text in windows of at most 512 ids. Stepping with range() never produces
# an empty final window, which the old n+1 loop did when the length was a multiple of 512.
for start in range(0, len(tokenized_sentence), 512):
  batch = tokenized_sentence[start:start+512]
  input_ids = torch.tensor([batch]).to(device)
  with torch.no_grad():
      logits = model(input_ids)
  # the model returns one row of class scores per token
  label_indices = np.argmax(logits.to('cpu').numpy(), axis=1)
  # join bpe split tokens
  tokens = tokenizer.convert_ids_to_tokens(batch)
  for token, label_idx in zip(tokens, label_indices):
      if token.startswith("##"):
          #adding subword to previous word (its label stays that of the first subword)
          new_tokens[-1] = new_tokens[-1] + token[2:]
      else:
          new_labels.append(tag_values[label_idx])
          new_tokens.append(token)

  return F.log_softmax(s)


In [None]:
# Drop the first and last predicted labels (presumably the special tokens added by
# tokenizer.encode - confirm) and wrap predictions and gold tags as single sequences.
dar_predictions = [new_labels[1:-1]]
dar_tags = [darner_test['Tag'].tolist()]

In [None]:
# Score the DarNER predictions; printed order is accuracy, precision, recall, F1, report.
dar_acc, dar_f1, dar_prec, dar_recall, dar_report = get_metrics(dar_tags, dar_predictions)
print(dar_acc,'\n', dar_prec,'\n', dar_recall,'\n', dar_f1,'\n', dar_report)

  _warn_prf(average, modifier, msg_start, len(result))


0.9313353566009105 
 0.6228448275862069 
 0.6784037558685446 
 0.6494382022471911 
               precision    recall  f1-score   support

          AD       0.00      0.00      0.00         0
         LOC       0.73      0.83      0.78       539
        MISC       0.55      0.51      0.53       395
         ORG       0.50      0.61      0.55       190
         PER       0.56      0.66      0.61       154

   micro avg       0.62      0.68      0.65      1278
   macro avg       0.47      0.52      0.49      1278
weighted avg       0.62      0.68      0.65      1278



### On mixedNERcorp_test:

In [None]:
import unicodedata as ud
def remove_punc(s):
  """Strip Unicode punctuation characters from `s`.

  Single-character (or empty) inputs and the literal '--' are returned unchanged; for
  anything else every character whose Unicode category starts with 'P' is removed.
  """
  if len(s) <= 1 or s == '--':
    return s
  kept = [ch for ch in s if ud.category(ch)[0] != 'P']
  return ''.join(kept)

In [None]:
# Load the MixedNER test split.
mixner_test = pd.read_csv("/content/gdrive/MyDrive/annotated_corpus/ner-corpus-darija/MixedNERcorp_test.csv")

In [None]:
# Strip punctuation from every test token (see remove_punc for the exceptions it keeps).
tokens = [remove_punc(token) for token in mixner_test["Token"]]

In [None]:
# Join the cleaned tokens into one space-separated string and encode it in a single call.
test_mixner_set = ' '.join(tokens)
tokenized_sentence = tokenizer.encode(test_mixner_set)

In [None]:
new_labels = []
new_tokens = []
# Dropout must be off for inference; without this a model freshly loaded from a checkpoint
# would still be in training mode.
model.eval()
# Walk the encoded text in windows of at most 512 ids. Stepping with range() never produces
# an empty final window, which the old n+1 loop did when the length was a multiple of 512.
for start in range(0, len(tokenized_sentence), 512):
  batch = tokenized_sentence[start:start+512]
  input_ids = torch.tensor([batch]).to(device)
  with torch.no_grad():
      logits = model(input_ids)
  # the model returns one row of class scores per token
  label_indices = np.argmax(logits.to('cpu').numpy(), axis=1)
  # join bpe split tokens
  tokens = tokenizer.convert_ids_to_tokens(batch)
  for token, label_idx in zip(tokens, label_indices):
      if token.startswith("##"):
          #adding subword to previous word (its label stays that of the first subword)
          new_tokens[-1] = new_tokens[-1] + token[2:]
      else:
          new_labels.append(tag_values[label_idx])
          new_tokens.append(token)

  return F.log_softmax(s)


In [None]:
# Drop the first and last predicted labels (presumably the special tokens added by
# tokenizer.encode - confirm) and wrap predictions and gold tags as single sequences.
mix_predictions = [new_labels[1:-1]]
mix_tags = [mixner_test['Tag'].tolist()]

In [None]:
# Score the MixedNER predictions; printed order is accuracy, precision, recall, F1, report.
mix_acc, mix_f1, mix_prec, mix_recall, mix_report = get_metrics(mix_tags, mix_predictions)
print(mix_acc,'\n', mix_prec,'\n', mix_recall,'\n', mix_f1,'\n', mix_report)

  _warn_prf(average, modifier, msg_start, len(result))


0.9443804336440766 
 0.6647325933400605 
 0.7395453269716531 
 0.7001461405606484 
               precision    recall  f1-score   support

          AD       0.00      0.00      0.00         0
         LOC       0.76      0.84      0.80      1215
        MISC       0.50      0.51      0.51       638
         ORG       0.59      0.68      0.63       649
         PER       0.75      0.80      0.77      1061

   micro avg       0.66      0.74      0.70      3563
   macro avg       0.52      0.57      0.54      3563
weighted avg       0.68      0.74      0.71      3563

