<a href="https://colab.research.google.com/github/HaifaCLG/Arabizi/blob/main/langdetect_lstm_bert_crf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torchtext==0.4.0
import torch
import torch.nn as nn
import numpy as np
import os
import pandas as pd
from torchtext import data
from torch.utils.data import DataLoader
!pip install transformers
from transformers import BertTokenizer,BertModel
from torch.utils.data.sampler import SubsetRandomSampler
!pip install seqeval
from seqeval.metrics import accuracy_score


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.4.0
  Downloading torchtext-0.4.0-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.6 MB/s 
Installing collected packages: torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.13.1
    Uninstalling torchtext-0.13.1:
      Successfully uninstalled torchtext-0.13.1
Successfully installed torchtext-0.4.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 32.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 75.4 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x

Mount Google Drive in Colab to access the annotated file and the other required project files.

In [None]:
# Mount Google Drive at /content/gdrive (remounting if it is already mounted).
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)
import sys
from pathlib import Path
# Project folder on Drive; it is also put on sys.path.
base=Path('/content/gdrive/MyDrive/Arabizi-project')
sys.path.append(str(base))
print(base)
# IPython shell escape: copy the whole project folder into the current working
# directory. Later cells read it from /content/Arabizi-project, so this
# presumably runs with /content as the working directory (verify).
!cp -r "{base}" .

Mounted at /content/gdrive
/content/gdrive/MyDrive/Arabizi-project


The following cell contains the code that prepares the data: tokenizing the sentences, padding them to equal length within each batch,
mapping words and tags to numbers,
and building dictionaries from tags to numbers and vice versa.


In [None]:

# Word-piece tokenizer of multilingual uncased BERT, shared by the whole notebook.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-uncased',
    do_lower_case=True,
    add_special_tokens=True,
)
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

def get_dict_map_label(data):
    """Build the tag <-> index lookup tables for the label vocabulary.

    Args:
        data: table whose ``'categ'`` column holds one tag per row
            (anything supporting ``data['categ'].to_list()``).

    Returns:
        ``(tag_to_index, index_to_tag)``. The markers ``PAD``, ``CLS`` and
        ``SEP`` always get indices 0, 1 and 2; the tags found in the data
        follow in sorted order.
    """
    special_tags = ["PAD", "CLS", "SEP"]
    # A set of strings iterates in a different order on every interpreter run,
    # so the data tags are sorted to keep the indices reproducible. Tags that
    # collide with a special marker are dropped to avoid duplicate entries.
    data_tags = sorted(set(data['categ'].to_list()) - set(special_tags), key=str)
    vocab = special_tags + data_tags
    index_to_tag = dict(enumerate(vocab))
    tag_to_index = {tag: index for index, tag in index_to_tag.items()}
    return tag_to_index, index_to_tag

def tokenize_and_preserve_labels(sentence, text_labels):
    """Split every word into word-pieces and repeat its label for each piece.

    Args:
        sentence: iterable of words.
        text_labels: iterable of labels, parallel to ``sentence``.

    Returns:
        ``(pieces, piece_labels)``: two flat lists of equal length. A word for
        which the tokenizer yields no piece contributes nothing to either list.
    """
    pieces = []
    piece_labels = []
    for word, label in zip(sentence, text_labels):
        word_pieces = tokenizer.tokenize(word)
        pieces += word_pieces
        # One copy of the word's label per sub-word keeps both lists aligned.
        piece_labels += [label] * len(word_pieces)
    return pieces, piece_labels

class Dataset(data.Dataset):
    """Sentence-level dataset of BERT word-piece ids and aligned tag indices.

    ``data`` is expected to provide ``token`` and ``categ`` columns whose cells
    are parallel lists (the words of one sentence and their tags). Samples are
    stored sorted by ascending number of word-pieces. The class relies on the
    notebook-level ``tokenizer``, ``tag_to_index`` and
    ``tokenize_and_preserve_labels``.
    """

    def __init__(self,data):
        # Only these attributes are set; the torchtext base initialiser is not called.
        self.data=data
        self.srcs,self.tgs=self.tokenize_srcs_tgs()

    def split_to_sentences(self,data):
        """Return (sentences, tags) with the CLS/SEP markers added around each one."""
        # Materialise each column once: calling .tolist() again for every
        # sentence made this loop quadratic in the number of sentences.
        token_lists = data["token"].tolist()
        tag_lists = data["categ"].tolist()
        result_sens = [['[CLS]'] + tokens + ['[SEP]'] for tokens in token_lists]
        result_tags = [['CLS'] + tags + ['SEP'] for tags in tag_lists]
        return result_sens, result_tags

    def tokenize_srcs_tgs(self):
        """Convert all sentences to word-piece ids and tag indices, sorted by length."""
        srcs, tgs = self.split_to_sentences(self.data)
        samples = []
        for sent, labs in zip(srcs, tgs):
            pieces, piece_labels = tokenize_and_preserve_labels(sent, labs)
            piece_ids = tokenizer.convert_tokens_to_ids(pieces)
            label_ids = [tag_to_index[w] for w in piece_labels]
            samples.append((piece_ids, label_ids))
        # Stable sort by number of word-pieces (shortest sentences first).
        samples.sort(key=lambda pair: len(pair[0]))
        inputs = [pair[0] for pair in samples]
        targets = [pair[1] for pair in samples]
        return inputs, targets

    def __len__(self):
        return len(self.srcs)

    def __getitem__(self, index):
        """Return the (word-piece ids, tag indices) pair of one sentence as LongTensors."""
        return torch.LongTensor(self.srcs[index]), torch.LongTensor(self.tgs[index])



import csv
# na_filter=False keeps words such as "NA" as plain strings instead of nulls.
labeled_df = pd.read_csv("/content/Arabizi-project/words_annotated.csv",dtype=str,na_filter=False)

# Collapse the per-word rows into one row per sentence whose "token" and
# "categ" cells are lists. The columns are selected with a list (double
# brackets): selecting them with a bare tuple, as in ["token","categ"], is
# deprecated in pandas 1.x and rejected by pandas 2.x.
labeled_data=labeled_df.groupby(["sen_id","sen_num"],as_index=False)[["token","categ"]].agg(lambda x: list(x))
tag_to_index,index_to_tag=get_dict_map_label(labeled_df)

# collate_fn pads every batch to the longest sequence in that batch
BATCH_SIZE=32
def collate_fn(batch):
    """Turn a list of (sequence, target) tensor pairs into two padded tensors.

    A batch with fewer than ``BATCH_SIZE`` samples is first topped up with
    dummy sentences made only of the CLS and SEP markers, so the returned
    tensors always have ``BATCH_SIZE`` rows (or more, if a larger batch is
    passed). Sequences are padded with the tokenizer's ``[PAD]`` id and
    targets with the ``PAD`` tag index.
    """
    cls_id = tokenizer.convert_tokens_to_ids('[CLS]')
    sep_id = tokenizer.convert_tokens_to_ids('[SEP]')
    pad_id = tokenizer.convert_tokens_to_ids('[PAD]')

    inputs = [sample[0] for sample in batch]
    targets = [sample[1] for sample in batch]
    # Top up a short batch with dummy "[CLS] [SEP]" sentences.
    for _ in range(len(inputs), BATCH_SIZE):
        inputs.append(torch.tensor([cls_id, sep_id]))
        targets.append(torch.tensor([tag_to_index['CLS'], tag_to_index['SEP']]))

    padded_inputs = torch.nn.utils.rnn.pad_sequence(
        inputs, batch_first=True, padding_value=pad_id)
    padded_targets = torch.nn.utils.rnn.pad_sequence(
        targets, batch_first=True, padding_value=tag_to_index['PAD'])
    return padded_inputs, padded_targets



Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

cuda




Defining the model and training 

In [None]:
!pip install pytorch-crf

#define parameters
# Size of the per-token feature vector given to the LSTM: the model
# concatenates four BERT hidden layers of width 768.
embed_size =768*4
# Hidden units of the LSTM in each direction.
hidden_size = 400
# Number of stacked LSTM layers.
num_layers =2
# Number of distinct tags, including the PAD/CLS/SEP markers.
output_dim=len(tag_to_index)


from torchcrf import CRF

class Model(nn.Module):
    """Frozen multilingual BERT features -> BiLSTM -> linear layer -> CRF tagger.

    BERT is run in evaluation mode without gradients and only supplies
    features; the LSTM, the linear layer and the CRF are the parts that
    receive gradients.

    Args:
        embedding_dim: size of the per-token feature vector fed to the LSTM
            (four concatenated BERT hidden layers).
        h_dim: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        out_dim: number of tags scored by the linear layer and the CRF.
    """

    def __init__(self, embedding_dim, h_dim, num_layers,out_dim):
        super().__init__()
        self.num_layers = num_layers
        self.h_dim = h_dim
        self.bert = BertModel.from_pretrained(
            'bert-base-multilingual-uncased', output_hidden_states=True)
        # Sizes come from the constructor arguments; the notebook-level
        # globals were used here before, silently ignoring the arguments.
        self.crf = CRF(out_dim, batch_first=True)

        # Define LSTM layer
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=h_dim,
            num_layers=num_layers,
            dropout=0.5,
            batch_first=True,
            bidirectional=True
        )
        # Define fc layer (2 * h_dim because the LSTM is bidirectional)
        self.fc = nn.Linear(2 * h_dim, out_dim)

    def forward(self, x, prev_state,segments_ids,tags=None):
        """Tag a batch of word-piece id sequences.

        Args:
            x: word-piece ids, one row per sentence; id 0 is treated as padding.
            prev_state: ``(h, c)`` initial LSTM state.
            segments_ids: BERT token-type ids, same shape as ``x``.
            tags: gold tag indices, same shape as ``x``; when given, the CRF
                loss is computed.

        Returns:
            ``(best_tags, crf_loss, state)``: the decoded tag sequences (one
            list per sentence, padding excluded), the negated mean CRF
            log-likelihood (``None`` without ``tags``) and the final LSTM state.
        """
        # No-op after the first call; keeps every sub-module on the same device.
        self.to(device)
        # True for real tokens, False for padding.
        mask = (x != 0)
        # BERT only provides features: evaluation mode, no gradients.
        self.bert.eval()
        with torch.no_grad():
            # Keyword arguments matter: the second positional parameter of
            # BertModel.forward is the attention mask, so passing segments_ids
            # positionally made BERT attend to padding and left the token
            # types at their default.
            # NOTE(review): features now depend on segments_ids and on the
            # padding mask, so models must be retrained after this change.
            outputs = self.bert(x, attention_mask=mask.long(), token_type_ids=segments_ids)
            hidden_states = outputs[2]
        # Concatenate the last four hidden layers along the feature axis, in
        # the order last, second-to-last, ...: [batch x seq_len x 4*768].
        token_vecs_cat = torch.cat(
            (hidden_states[-1], hidden_states[-2], hidden_states[-3], hidden_states[-4]),
            dim=-1,
        )
        output, state = self.lstm(token_vecs_cat, prev_state)
        output = self.fc(output)
        best_tags = self.crf.decode(output, mask=mask)
        crf_loss = -self.crf(output, mask=mask, tags=tags, reduction='mean') if tags is not None else None
        return best_tags, crf_loss, state

    def init_state(self, batch_size):
        """Return a zero ``(h, c)`` LSTM state for ``batch_size`` sentences."""
        # 2 * num_layers: one state per layer and direction.
        shape = (2 * self.num_layers, batch_size, self.h_dim)
        return (torch.zeros(*shape).to(device),
                torch.zeros(*shape).to(device))
    

def train(model, optimizer, scheduler , dataloader, max_epochs=10):
    """Train ``model`` for ``max_epochs`` epochs and print the mean loss of each.

    Args:
        model: tagger exposing ``init_state(batch_size)`` and returning
            ``(predictions, loss, lstm_state)`` when called.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        dataloader: yields ``(X, y)`` batches of ``BATCH_SIZE`` rows.
        max_epochs: number of passes over ``dataloader``.
    """
    clip_grad = 0.5

    for epoch_idx in range(max_epochs):
        model.train()
        state_h, state_c = model.init_state(BATCH_SIZE)
        train_loss = 0.0
        num_batches = 0
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            # The LSTM state is carried from batch to batch; detaching stops
            # back-propagation from reaching earlier batches.
            state_h = state_h.detach()
            state_c = state_c.detach()
            # Mark each of the sentence tokens as belonging to sentence "1".
            segments_ids = torch.ones_like(X)
            optimizer.zero_grad()
            # Forward pass
            _, loss, (state_h, state_c) = model(X, (state_h, state_c), segments_ids, y)
            train_loss += loss.item()
            num_batches += 1
            # Backward pass
            loss.backward()
            # Prevent large gradients
            if clip_grad > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
            # Update parameters, then the learning rate
            optimizer.step()
            scheduler.step()

        # Average over the batches actually seen. The previous divisor,
        # len(dataloader) - 1, overstated the loss and divided by zero for a
        # single-batch loader.
        mean_loss = train_loss / max(num_batches, 1)
        print(f"Epoch #{epoch_idx}, train loss={mean_loss:.4f}")


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


Calculating the 10-fold cross validation 

In [None]:

# Number of training epochs; also used to size the learning-rate schedule.
epochs =4
import numpy
# join bpe split tokens
def join_bpe(token_ids,label_tags):
    """Keep only the labels whose word-piece is not a "##" continuation piece.

    Args:
        token_ids: word-piece ids.
        label_tags: labels parallel to ``token_ids``.

    Returns:
        The labels of all pieces that do not start with "##", in order.
    """
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    return [
        label
        for token, label in zip(tokens, label_tags)
        if not token.startswith("##")
    ]
from transformers import get_linear_schedule_with_warmup
from sklearn import metrics
from sklearn.model_selection import KFold
# Reuse the size collate_fn pads every batch to. The LSTM state created from
# this value must match the batch dimension, so the two must never diverge.
batch_size=BATCH_SIZE
# The 6 category names, in tag order ('0' .. '5'); the trailing spaces pad
# the names to equal width for the printed report.
categ_names = ['Arabizi', 'English', 'French ', 'Arabic ','Trigger', 'Other  ']
labeled_dataset=Dataset(labeled_data)
def k_fold_cross_validation(k=10):
    """Run k-fold cross-validation of the tagger and print the averaged scores.

    For each fold a fresh model is trained for ``epochs`` epochs on the
    training split and evaluated on the held-out split. Per-category
    precision/recall/F1, the micro/macro/weighted averages and the accuracy
    are averaged over the folds; supports are summed over the folds. Only the
    first word-piece of every word is scored.

    Args:
        k: number of folds.
    """
    n_categ = len(categ_names)
    support = np.zeros(n_categ, dtype=int)
    precision_score_arr = np.zeros(n_categ)
    recall_score_arr = np.zeros(n_categ)
    f1_score_arr = np.zeros(n_categ)
    # Running sums of (precision, recall, f1) for each averaging mode.
    averaged = {mode: np.zeros(3) for mode in ('micro', 'macro', 'weighted')}
    acc_score = 0
    special_tags = ("PAD", "CLS", "SEP")

    def is_word_label(label_idx):
        # Marker and padding positions are not scored.
        return index_to_tag[label_idx] not in special_tags

    kf = KFold(n_splits=k, shuffle=True)
    for train_idx, test_idx in kf.split(np.arange(len(labeled_dataset))):
        train_sample = torch.utils.data.SubsetRandomSampler(train_idx)
        dev_sample = torch.utils.data.SubsetRandomSampler(test_idx)
        train_dl = DataLoader(labeled_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, sampler=train_sample)
        dev_dl = DataLoader(labeled_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, sampler=dev_sample)
        model = Model(embed_size, hidden_size, num_layers, output_dim)
        optimizer = torch.optim.Adam(model.parameters())
        total_steps = len(train_dl) * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )
        train(model, optimizer, scheduler, train_dl, max_epochs=epochs)

        model.eval()
        predictions, true_labels, x_total = [], [], []
        # collate_fn pads every batch to BATCH_SIZE rows, so the state uses it too.
        state_h, state_c = model.init_state(BATCH_SIZE)
        for X, y in dev_dl:
            X = X.to(device)
            y = y.to(device)
            state_h = state_h.detach()
            state_c = state_c.detach()
            # Mark each of the sentence tokens as belonging to sentence "1".
            segments_ids = torch.ones_like(X)
            with torch.no_grad():
                y_pred, _, (state_h, state_c) = model(X, (state_h, state_c), segments_ids, y)
            # y_pred holds one list per sentence with padding excluded, so the
            # lists differ in length. They are kept as lists: wrapping such
            # ragged data in numpy.array raises on NumPy >= 1.24.
            predictions.extend(y_pred)
            true_labels.extend(y.to('cpu').numpy())
            x_total.extend(X.to('cpu').numpy())

        # Flatten to word-piece level, dropping PAD/CLS/SEP positions.
        x_total = [tok for toks, labs in zip(x_total, true_labels)
                   for tok, lab in zip(toks, labs) if is_word_label(lab)]
        pred_tags = [index_to_tag[p_i] for p, l in zip(predictions, true_labels)
                     for p_i, l_i in zip(p, l) if is_word_label(l_i)]
        valid_tags = [index_to_tag[l_i] for l in true_labels
                      for l_i in l if is_word_label(l_i)]

        # Keep one label per word (the one of its first word-piece).
        y_pred_f = join_bpe(x_total, pred_tags)
        y_test_f = join_bpe(x_total, valid_tags)

        p, r, f1, s = metrics.precision_recall_fscore_support(
            y_test_f, y_pred_f, labels=['0', '1', '2', '3', '4', '5'], average=None)
        precision_score_arr += p[:n_categ]
        recall_score_arr += r[:n_categ]
        f1_score_arr += f1[:n_categ]
        support += s[:n_categ]
        acc_score += metrics.accuracy_score(y_test_f, y_pred_f)
        for mode in averaged:
            p, r, f1, _ = metrics.precision_recall_fscore_support(y_test_f, y_pred_f, average=mode)
            averaged[mode] += np.array([p, r, f1])

    total_supp = support.sum()
    precision_micro, recall_micro, f1_micro = averaged['micro'] / k
    precision_macro, recall_macro, f1_macro = averaged['macro'] / k
    precision_weighted, recall_weighted, f1_weighted = averaged['weighted'] / k

    print("---- {0} fold cross validation of the model----".format(k))
    print(acc_score/k)
    print('Category     precision    recall      f1-score      support')
    for i in range(0, n_categ):
        precision_score = precision_score_arr[i]/k
        recall_score = recall_score_arr[i]/k
        f1_score = f1_score_arr[i]/k
        print(' {0}  :  {1:.2f}         {2:.2f}      {3:.2f}        {4}  '.format(categ_names[i],precision_score,recall_score,f1_score,support[i]))
    print('micro avg  : {0:.2f}         {1:.2f}      {2:.2f}        {3} '.format(precision_micro,recall_micro,f1_micro,total_supp))
    print('macro avg  : {0:.2f}         {1:.2f}      {2:.2f}        {3} '.format(precision_macro,recall_macro,f1_macro,total_supp))
    print('weighted avg:{0:.2f}         {1:.2f}      {2:.2f}        {3} '.format(precision_weighted,recall_weighted,f1_weighted,total_supp))

# Run the 10-fold cross-validation; this trains one fresh model per fold.
k_fold_cross_validation(k=10)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch #0, train loss=7.6519
Epoch #1, train loss=3.2749
Epoch #2, train loss=2.6312
Epoch #3, train loss=2.0244


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequence

Epoch #0, train loss=7.5056
Epoch #1, train loss=3.2886
Epoch #2, train loss=2.4861
Epoch #3, train loss=1.8937


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch #0, train loss=7.6390
Epoch #1, train loss=3.3114
Epoch #2, train loss=2.5828
Epoch #3, train loss=1.9920


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch #0, train loss=7.4791
Epoch #1, train loss=3.2577
Epoch #2, train loss=2.5151
Epoch #3, train loss=1.9266


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch #0, train loss=8.0114
Epoch #1, train loss=3.3504
Epoch #2, train loss=2.4815
Epoch #3, train loss=1.9419


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch #0, train loss=7.3652
Epoch #1, train loss=3.2422
Epoch #2, train loss=2.4943
Epoch #3, train loss=1.9076


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch #0, train loss=8.3144
Epoch #1, train loss=3.2627
Epoch #2, train loss=2.4861
Epoch #3, train loss=1.8918


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch #0, train loss=8.0997
Epoch #1, train loss=3.2992
Epoch #2, train loss=2.5162
Epoch #3, train loss=1.9346


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch #0, train loss=8.1814
Epoch #1, train loss=3.3488
Epoch #2, train loss=2.5194
Epoch #3, train loss=1.9140


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch #0, train loss=8.4024
Epoch #1, train loss=3.2952
Epoch #2, train loss=2.5005
Epoch #3, train loss=1.8891




---- 10 fold cross validation of the model----
0.9527438006206136
Category     precision    recall      f1-score      support
 Arabizi  :  0.91         0.95      0.93        4869  
 English  :  0.97         0.98      0.97        16938  
 French   :  0.64         0.44      0.50        167  
 Arabic   :  0.98         0.99      0.98        2680  
 Trigger  :  0.76         0.66      0.71        1406  
 Other    :  0.97         0.94      0.95        4385  
micro avg  : 0.95         0.95      0.95        30445 
macro avg  : 0.87         0.83      0.84        30445 
weighted avg:0.95         0.95      0.95        30445 


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Calculating Accuracy and Classification metrics

Predicting the tags of a given sentence using a trained model

In [None]:
def predict(sentence,model):
    """Predict one tag per whitespace-separated word of ``sentence``.

    Each word is tokenized on its own and the tag predicted for its first
    word-piece is taken as the tag of the word. The result therefore always
    lines up with ``sentence.split()``, also for words that BERT splits on
    punctuation. Previously such words produced several labels and shifted
    the labels of every following word.

    Args:
        sentence: raw text; words are separated by whitespace.
        model: trained tagger, called as in the training loop.

    Returns:
        A list of tag strings, one per word of ``sentence.split()``.
    """
    model.eval()
    pieces = ['[CLS]']
    first_piece_positions = []
    for word in sentence.split():
        # A word for which the tokenizer yields nothing still needs a slot.
        word_pieces = tokenizer.tokenize(word) or ['[UNK]']
        first_piece_positions.append(len(pieces))
        pieces.extend(word_pieces)
    pieces.append('[SEP]')

    sen_ids = torch.tensor([tokenizer.convert_tokens_to_ids(pieces)]).to(device)
    state_h, state_c = model.init_state(1)
    # Mark each of the sentence tokens as belonging to sentence "1".
    segments_ids = torch.ones_like(sen_ids)
    with torch.no_grad():
        y_pred, _, _ = model(sen_ids, (state_h, state_c), segments_ids, tags=None)

    label_indices = y_pred[0]
    return [index_to_tag[label_indices[pos]] for pos in first_piece_positions]
# Train one model on the whole labelled dataset (no held-out split); the
# resulting `model` is the one used by the prediction cells below.
labeled_dataset=Dataset(labeled_data)
model=Model(embed_size,hidden_size,num_layers,output_dim)
# No sampler is passed, so batches follow the dataset's own order.
train_dl = DataLoader(labeled_dataset,batch_size=BATCH_SIZE,collate_fn=collate_fn)
optimizer = torch.optim.Adam(model.parameters())
# The scheduler is stepped once per batch, hence batches-per-epoch * epochs steps.
total_steps = len(train_dl) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
train(model, optimizer,scheduler, train_dl, max_epochs=epochs)


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch #0, train loss=5.5540
Epoch #1, train loss=3.1767
Epoch #2, train loss=2.1493
Epoch #3, train loss=1.4927


In [None]:
# Quick manual check of the trained model on a few example sentences; each
# call prints the list of predicted tags for that sentence.
model.eval()
print(predict("imagine law tele3na maabaad fel team ",model))
print(predict("Lava traveled 19km and stopped outside Medina .",model))
print(predict("kinda wanted a 4lifer but yall 4everyone",model))
#print(predict("the face today in Jerusalem ",model))
#print(predict("sho had eleshe hogwarts",model))

['1', '1', '0', '0', '0', '1']
['1', '1', '5', '1', '1', '1', '4', '5']
['1', '1', '1', '0', '1', '0', '0']


The following code selects new random sentences, predicts their tags using our trained model, and saves the results to a file.

In [None]:
import os
import random
import re
def preprocess_sentence(s):
    """Normalise a raw sentence so that whitespace splitting yields clean tokens.

    Steps, in order: runs of three or more identical word characters are
    shortened to two; a space is inserted wherever an Arabic letter touches a
    Latin letter or digit; slashes, Arabic question marks, Arabic commas,
    hyphens and runs of dots are surrounded by spaces; runs of spaces are
    collapsed to one. Leading or trailing spaces are not stripped.

    Args:
        s: the raw sentence.

    Returns:
        The normalised sentence.
    """
    # Shorten repeated characters ("soooo" -> "soo").
    s = re.sub(r"(\w)\1{2,}", r"\1\1", s)
    # Separate Arabic script from Latin letters/digits, in both directions.
    s = re.sub(r"([ء-ي])([a-zA-Z0-9])", r"\1 \2", s)
    s = re.sub(r"([a-zA-Z0-9])([ء-ي])", r"\1 \2", s)
    # Detach punctuation into standalone tokens.
    for mark in ("/", "؟", "،", "-"):
        s = s.replace(mark, " " + mark + " ")
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    s = re.sub(r"(\.+)", r" \1 ", s)
    # Collapse the runs of spaces introduced above.
    return re.sub(r" +", " ", s)

def select_random_sentences(dir_name,out_file,sample_num):
    """Append ``sample_num`` random data lines of every file in ``dir_name`` to ``out_file``.

    The first line of each input file is treated as a header and is never
    sampled. Files with fewer than ``sample_num`` data lines are skipped.
    ``out_file`` is opened in append mode, so it is created if missing and
    earlier content is kept.

    Args:
        dir_name: directory holding the input files.
        out_file: path of the file the sampled lines are appended to.
        sample_num: number of lines to draw from each input file.
    """
    for file_name in os.listdir(dir_name):
        in_path = os.path.join(dir_name, file_name)
        with open(in_path, 'r', encoding="utf-8") as f, open(out_file, 'a', encoding="utf-8", newline='') as out:
            # Skip the header before checking the size. Checking the full
            # line count let a file with exactly sample_num lines through
            # and made random.sample raise ValueError.
            candidates = f.readlines()[1:]
            if len(candidates) < sample_num:
                continue
            random_choice = random.sample(candidates, sample_num)
            # A final line without a newline would otherwise be glued to the
            # next line written to out_file.
            out.writelines(
                line if line.endswith('\n') else line + '\n'
                for line in random_choice
            )


# this function chooses new random sentences, a sample_num from each file in dir_name
# these sentences are new, i.e aren't already in the csv_file
# note that out_file contains all the sentences without annotation
#the function prints the final number of the new sentences and annotate only these sentences which will be saved in the annotated file
#both files will be downloaded
from google.colab import files
def choose_new_random_sentences(csv_file, dir_name, out_file, sample_num, clf):
    """Sample new sentences, tag them with ``clf`` and download both result files.

    ``sample_num`` random lines from every file in ``dir_name`` are appended
    to ``out_file``. Each row of ``out_file`` whose (column 1, column 2) pair
    does not already occur in ``csv_file`` is pre-processed and tagged. Only
    sentences whose prediction contains the tags '0', '1' and '4' (Arabizi,
    English and Trigger) are written, one word per row, to
    ``<out_file without .csv>_annotated.csv``. Finally both files are
    downloaded and the number of annotated sentences is printed.

    Args:
        csv_file: already annotated CSV used to filter out known sentences.
        dir_name: directory with the files to sample from.
        out_file: CSV receiving the sampled, un-annotated rows.
        sample_num: number of rows sampled from each file.
        clf: trained model passed on to ``predict``.
    """
    with open(csv_file, 'r', encoding="utf-8", newline='') as f1:
        # A set makes the membership test per sampled row O(1). Rows too
        # short to carry both key columns (e.g. blank lines) are ignored.
        posts = {row[1] + ',' + row[2] for row in csv.reader(f1) if len(row) > 2}
    select_random_sentences(dir_name, out_file, sample_num)
    annotated_file = out_file.split('.csv')[0] + '_annotated.csv'
    required_tags = {'0', '1', '4'}
    num_of_sentences = 0
    with open(out_file, 'r', encoding="utf-8", newline='') as f, open(annotated_file, 'w', encoding="utf-8", newline='') as out:
        reader = csv.reader(f)
        writer = csv.writer(out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for line in reader:
            # Blank or truncated rows cannot be annotated; skip them instead
            # of failing with IndexError.
            if len(line) < 5:
                continue
            if line[1] + ',' + line[2] in posts:
                continue
            preproccessed_sen = preprocess_sentence(line[4])
            prediction = predict(preproccessed_sen, clf)
            # Keep only the sentences that contain Arabizi, English and a trigger.
            if not required_tags.issubset(prediction):
                continue
            for w, pred in zip(preproccessed_sen.split(), prediction):
                writer.writerow([line[0], line[1], line[3], w, pred])
            num_of_sentences += 1
    files.download(out_file)
    files.download(annotated_file)
    # Number of annotated sentences (not the number of sentences in out_file).
    print(num_of_sentences)

#choose_new_random_sentences('/content/Arabizi-project/words_annotated.csv','/content/Arabizi-project/reddit_subreddits','/content/Arabizi-project/random_trigger_reddit.csv',20,model)
#choose_new_random_sentences('/content/Arabizi-project/words_annotated.csv','/content/Arabizi-project/all_users_tokenized','/content/Arabizi-project/random_trigger_CS_twitter.csv',10,model)
