In [23]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
!git lfs install

Error: Failed to call git rev-parse --git-dir --show-toplevel: "fatal: not a git repository (or any of the parent directories): .git\n"
Git LFS initialized.


In [25]:
!git clone https://huggingface.co/NepBERTa/NepBERTa

fatal: destination path 'NepBERTa' already exists and is not an empty directory.


In [26]:
from transformers import BertModel
model = BertModel.from_pretrained('./NepBERTa',from_tf=True)

All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


In [27]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

In [28]:
df = pd.read_csv('./improved.csv',index_col = False)

In [29]:
df = df.loc[:,~df.columns.str.match("Unnamed")]

In [30]:
df.head()

Unnamed: 0,label,sentence
0,1,गुठी विधेक ल्याएर ठमेल मा राज गुठि को जग्गा म...
1,1,दले ले देश सकेछन सबै बेचे र खान सुरू गरेछन अब...
2,1,नेपाल को ससकृती ध्वस्त पार्ने योजना हो यो !
3,1,मठ मन्दिर गुम्बा का जग्गा हरु मा भुमाफिया को ...
4,1,नेपाल का कल कर्खाना र नदि नाला बेची सके अब मठ...


In [31]:
text = df.sentence.values

In [32]:
labels = df.label.values

In [33]:
vocab_file_dir = './NepBERTa/' 

In [34]:
tokenizer = BertTokenizer.from_pretrained(vocab_file_dir,
                                        strip_accents=False,
                                         clean_text=False )

In [35]:
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 32,
                        truncation=True,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids']) 
  attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)



In [36]:
val_ratio = 0.2 
batch_size = 16

In [37]:
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels
)

In [38]:
train_set = TensorDataset(token_id[train_idx],
                          attention_masks[train_idx],
                          labels[train_idx])
val_set = TensorDataset(token_id[val_idx],
                        attention_masks[val_idx],
                        labels[val_idx])

In [39]:
#Dataloader 
train_dataloader = DataLoader(
                  train_set,
                  sampler =RandomSampler(train_set),
                  batch_size = batch_size
)

In [40]:
validation_dataloader = DataLoader(
                val_set,
                sampler =SequentialSampler(val_set),
                batch_size = batch_size
)

In [41]:
def b_tp(preds, labels):
  '''Returns True Positives (TP): count of correct predictions of actual class 1'''
  return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_fp(preds, labels):
  '''Returns False Positives (FP): count of wrong predictions of actual class 1'''
  return sum([preds != labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_tn(preds, labels):
  '''Returns True Negatives (TN): count of correct predictions of actual class 0'''
  return sum([preds == labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_fn(preds, labels):
  '''Returns False Negatives (FN): count of wrong predictions of actual class 0'''
  return sum([preds != labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_metrics(preds, labels):
  '''
  Returns the following metrics:
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)
  '''
  preds = np.argmax(preds, axis = 1).flatten()
  labels = labels.flatten()
  tp = b_tp(preds, labels)
  tn = b_tn(preds, labels)
  fp = b_fp(preds, labels)
  fn = b_fn(preds, labels)
  b_accuracy = (tp + tn) / len(labels)
  b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
  b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
  b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
  b_f1 = 2 * (b_precision * b_recall) / (b_precision + b_recall)
  return b_accuracy, b_precision, b_recall, b_specificity,b_f1

In [42]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
epochs = 2


In [43]:
# Load the BertForSequenceClassification model
model = BertForSequenceClassification.from_pretrained(
    vocab_file_dir,
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
    from_tf= True
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08
                              )

# Run on GPU
model.cuda()

All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [44]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 2

for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    train_accuracy = []
    train_precision = []
    train_recall = []
    train_specificity = []
    train_f1 = []
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        logits = train_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        b_accuracy, b_precision, b_recall, b_specificity,b_f1 = b_metrics(logits, label_ids)
        train_accuracy.append(b_accuracy)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision != 'nan': train_precision.append(b_precision)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall != 'nan': train_recall.append(b_recall)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity != 'nan': train_specificity.append(b_specificity)
        if b_f1 != 'nan': train_f1.append(b_specificity)
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables 
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []
    val_f1 = []
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy, b_precision, b_recall, b_specificity,b_f1 = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision != 'nan': val_precision.append(b_precision)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall != 'nan': val_recall.append(b_recall)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity != 'nan': val_specificity.append(b_specificity)
        if b_f1 != 'nan': val_f1.append(b_specificity)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Train Accuracy: {:.4f}'.format(sum(train_accuracy)/len(train_accuracy)))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')
    print('\t - Training Precision: {:.4f}'.format(sum(train_precision)/len(train_precision)) if len(train_precision)>0 else '\t - Training Precision: NaN')
    print('\t - Training Recall: {:.4f}'.format(sum(train_recall)/len(train_recall)) if len(train_recall)>0 else '\t - Training Recall: NaN')
    print('\t - Training Specificity: {:.4f}\n'.format(sum(train_specificity)/len(train_specificity)) if len(train_specificity)>0 else '\t - Training Specificity: NaN')
    print('\t - Train F1: {:.4f}\n'.format(sum(train_f1)/len(train_f1)) if len(train_f1)>0 else '\t - Train F1: NaN')
    print('\t - Validation F1: {:.4f}\n'.format(sum(val_f1)/len(val_f1)) if len(val_f1)>0 else '\t - Validation F1: NaN')

Epoch:  50%|█████     | 1/2 [00:25<00:25, 25.28s/it]


	 - Train loss: 0.3994
	 - Train Accuracy: 0.8277
	 - Validation Accuracy: 0.8379
	 - Validation Precision: 0.8308
	 - Validation Recall: 0.9286
	 - Validation Specificity: 0.7037

	 - Training Precision: 0.8502
	 - Training Recall: 0.8922
	 - Training Specificity: 0.7397

	 - Train F1: 0.7397

	 - Validation F1: 0.7037



Epoch: 100%|██████████| 2/2 [00:51<00:00, 25.63s/it]


	 - Train loss: 0.2977
	 - Train Accuracy: 0.8809
	 - Validation Accuracy: 0.8216
	 - Validation Precision: 0.9018
	 - Validation Recall: 0.7857
	 - Validation Specificity: 0.8664

	 - Training Precision: 0.8926
	 - Training Recall: 0.9182
	 - Training Specificity: 0.8199

	 - Train F1: 0.8199

	 - Validation F1: 0.8664






In [45]:
new_sentence = 'धोति रहेछ जस्तो छ यो वेले  '

# We need Token IDs and Attention Mask for inference on the new sentence
test_ids = []
test_attention_mask = []

# Apply the tokenizer
encoding = preprocessing(new_sentence, tokenizer)

# Extract IDs and Attention Mask
test_ids.append(encoding['input_ids'])
test_attention_mask.append(encoding['attention_mask'])
test_ids = torch.cat(test_ids, dim = 0)
test_attention_mask = torch.cat(test_attention_mask, dim = 0)

# Forward pass, calculate logit predictions
with torch.no_grad():
  output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))

prediction = 'Hate' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Not Hate'

print('Input Sentence: ', new_sentence)
print('Predicted Class: ', prediction)

Input Sentence:  धोति रहेछ जस्तो छ यो वेले  
Predicted Class:  Hate




In [46]:
torch.save(model.state_dict(), "nepali_bert.pt")

In [48]:
 !zip -r model.zip nepali_bert.pt

  adding: nepali_bert.pt (deflated 7%)


In [49]:
!ls


improved.csv  model.zip  nepali_bert.pt  NepBERTa  sample_data
