In [52]:
!pip install --upgrade transformers==4.43.3 huggingface_hub==0.24.6 --quiet

In [2]:
!pip install bnlp_toolkit --quiet
!pip install git+https://github.com/csebuetnlp/normalizer --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
normalizer 0.0.1 requires emoji==1.4.2, but you have emoji 1.7.0 which is incompatible.
normalizer 0.0.1 requires ftfy==6.0.3, but you have ftfy 6.2.0 which is incompatible.[0m[31m
[0m  Preparing metadata (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bnlp-toolkit 4.0.3 requires emoji==1.7.0, but you have emoji 1.4.2 which is incompatible.
bnlp-toolkit 4.0.3 requires ftfy==6.2.0, but you have ftfy 6.0.3 which is incompatible.[0m[31m
[0m

In [3]:
import transformers
import huggingface_hub

print("Transformers version:", transformers.__version__)
print("Hugging Face Hub version:", huggingface_hub.__version__)

Transformers version: 4.43.3
Hugging Face Hub version: 0.24.6


In [4]:
!pip install evaluate seqeval --quiet

In [None]:
# !rm -rf seqeval evaluate metrics __pycache__

In [5]:
import pyarrow, datasets, evaluate, transformers, seqeval
print(f"pyarrow: {pyarrow.__version__}")
print(f"datasets: {datasets.__version__}")
print(f"evaluate: {evaluate.__version__}")
print(f"transformers: {transformers.__version__}")
import importlib.metadata
print(f"seqeval: {importlib.metadata.version('seqeval')}")

2025-10-14 22:26:54.056781: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760480814.083652  153330 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760480814.091551  153330 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pyarrow: 21.0.0
datasets: 4.1.1
evaluate: 0.4.6
transformers: 4.43.3
seqeval: 1.2.2


In [6]:
import evaluate

metric = evaluate.load("seqeval")
print("Seqeval metric loaded successfully")

Seqeval metric loaded successfully


In [7]:
import os
import random
import pandas as pd
import numpy as np

from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import  DataLoader, Dataset
from torch.optim import AdamW

import transformers 
from transformers import AutoModel, AutoConfig, AutoTokenizer
%env TOKENIZERS_PARALLELISM=true

from bnlp import BasicTokenizer
from normalizer import normalize

import warnings
warnings.filterwarnings("ignore")

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

env: TOKENIZERS_PARALLELISM=true


In [8]:
def process_label(label):    
    name_label=["B-PER", "I-PER"]
    new_name_label= ["B-NAME", "I-NAME"]
    labellist= []
    for i in label:
        if i in name_label:
            labellist.append(new_name_label[name_label.index(i)])
        else:
            labellist.append('O')      
    return labellist



def readfile(filename):
    file = open(filename, encoding="utf-8")
    sentences = []
    sentence = []
    labels= []
    label= []
    for line in file:
        if len(line)==0 or line.startswith('-DOCSTART') or line[0]=="\n" or line[1]=="\n":
            if len(sentence) > 0:
                sentence= " ".join(sentence)
                sentences.append(sentence)
                labels.append(label)
                sentence = []
                label= []
            continue
        
        splits = line.split('\t')
        if (splits[-1]=='\n'):
            continue
        sentence.append(splits[0])
        label.append(splits[-1].split("\n")[0]) # remove extra "\n"

    if len(sentence) >0: # for last sentence
        sentence= " ".join(sentence)
        sentences.append(sentence)
        labels.append(label)
        sentence = []
        label= []
    
    file.close()

    df= pd.DataFrame(columns=["sentences", "labels"])
    df['sentences']= sentences
    df['labels']= labels
    df['labels']= df['labels'].apply(lambda x: process_label(x))
    df['sentences']= df['sentences'].apply(lambda x: normalize(x))
    
    return df

In [9]:
train_data_path = "/kaggle/input/bengali-name-recognition/dataset/train_data.txt"
test_data_path = "/kaggle/input/bengali-name-recognition/dataset/test_data.txt"

In [10]:
train_dataset = readfile(train_data_path)
test_dataset = readfile(test_data_path)

In [11]:
print(train_dataset.shape, test_dataset.shape)
train_dataset.head(), test_dataset.head()

(4612, 2) (1950, 2)


(                                           sentences  \
 0  কী কারণে তাঁদের মধ্যে ঝামেলা হয়েছে , তা জানি ...   
 1  আশঙ্কাজনক অবস্থায় উপজেলা স্বাস্থ্য কমপ্লেক্সে...   
 2  খুলনার দিঘলিয়া উপজেলার বারাকপুর মধ্যপাড়ায় গ...   
 3  পুলিশের পিটুনিতে কেসমত আলীর মৃত্যু হয়েছে বলে ...   
 4  তবে পুলিশ বলছে , ওই ব্যক্তি পুলিশ দেখে পড়ে গে...   
 
                                               labels  
 0  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  
 1                  [O, O, O, O, O, O, O, O, O, O, O]  
 2  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  
 3        [O, O, B-NAME, I-NAME, O, O, O, O, O, O, O]  
 4         [O, O, O, O, O, O, O, O, O, O, O, O, O, O]  ,
                                            sentences  \
 0  উন্নয়নের বিস্ময় বাংলাদেশ বাংলাদেশের অগ্রগতি ...   
 1  অর্থনীতি ও আর্থসামাজিক বেশির ভাগ সূচকে বাংলাদে...   
 2    নিম্ন আয়ের দেশগুলোকে ছাড়িয়েছে তো অনেক আগেই ।   
 3  আন্তর্জাতিক মুদ্রা তহবিল ( আইএমএফ ) গত সপ্তাহে...   
 4  সবাইকে অন্তর্ভুক্ত করে প্রবৃদ্

In [12]:
def check_common_entry(df1, df2):
    b_tokenizer= BasicTokenizer()
    list1= [tuple(b_tokenizer.tokenize(txt)) for txt in df1['sentences']]
    list2= [tuple(b_tokenizer.tokenize(txt)) for txt in df2['sentences']]
    common= set(list1).intersection(set(list2))
    return common


def remove_common_entries(df, common):
    b_tokenizer= BasicTokenizer()
    # Pre-tokenize all df sentences
    df_tokens = df['sentences'].apply(lambda x: tuple(b_tokenizer.tokenize(x)))
    # Convert common to set for fast lookup
    common_set = set(common)
    # Mask for rows to keep (sentence tokens not in common_set)
    mask = df_tokens.apply(lambda x: x not in common_set)
    df_filtered = df[mask].reset_index(drop=True)
    
    removed_count = len(df) - len(df_filtered)
    print(f"Total {removed_count} rows removed from the dataset.")
    return df_filtered


def remove_erroneous_entries(df):
    temp_df= df.copy()
    b_tokenizer= BasicTokenizer()
    temp_df['len_labels']= temp_df['labels'].apply(lambda x: len(x))
    temp_df['len_words']= temp_df['sentences'].apply(lambda x: len(b_tokenizer.tokenize(x)))
    
    error_=[]
    for i in range(len(temp_df)):
        if temp_df['len_labels'][i] != temp_df['len_words'][i]:
            error_.append(i)
    print(f"{len(error_)} no of data was detected as erroneous and discarded.")
    df= df.drop(error_).reset_index(drop= True)

    return df

In [13]:
common= check_common_entry(train_dataset, test_dataset) 
print(f"No of common sentences between Train and Test dataset: {len(common)}")

No of common sentences between Train and Test dataset: 81


In [14]:
test_dataset_= remove_common_entries(test_dataset, common)
print(test_dataset.shape, test_dataset_.shape)

Total 92 rows removed from the dataset.
(1950, 2) (1858, 2)


In [15]:
train_data = remove_erroneous_entries(train_dataset)
test_data = remove_erroneous_entries(test_dataset_)

print([train_dataset.shape,train_data.shape], [test_dataset_.shape,test_data.shape])

995 no of data was detected as erroneous and discarded.
417 no of data was detected as erroneous and discarded.
[(4612, 2), (3617, 2)] [(1858, 2), (1441, 2)]


In [16]:
train_data['name_tag']= train_data['labels'].apply(lambda x: 1 if x.count("B-NAME")>0 else 0)
train_data.head()

Unnamed: 0,sentences,labels,name_tag
0,আশঙ্কাজনক অবস্থায় উপজেলা স্বাস্থ্য কমপ্লেক্সে...,"[O, O, O, O, O, O, O, O, O, O, O]",0
1,খুলনার দিঘলিয়া উপজেলার বারাকপুর মধ্যপাড়ায় গ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1
2,পুলিশের পিটুনিতে কেসমত আলীর মৃত্যু হয়েছে বলে ...,"[O, O, B-NAME, I-NAME, O, O, O, O, O, O, O]",1
3,"তবে পুলিশ বলছে , ওই ব্যক্তি পুলিশ দেখে পড়ে গে...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]",0
4,"কেসমতের পরিবার , এলাকাবাসী ও পুলিশ সূত্রে জানা...","[B-NAME, O, O, O, O, O, O, O, O, O, O, O, O, O...",1


In [17]:
print(f"With name token: {sum(train_data['name_tag'])}")
print(f"Without name token: {len(train_data)-sum(train_data['name_tag'])}")

With name token: 826
Without name token: 2791


In [None]:
# test_data['name_tag']= test_data['labels'].apply(lambda x: 1 if x.count("B-NAME")>0 else 0)
# test_data.head()

In [None]:
# print(f"With name token: {sum(test_data['name_tag'])}")
# print(f"Without name token: {len(test_data)-sum(test_data['name_tag'])}")

In [51]:
def downsampling(df):
    random.seed(42)
    index_0= df[df['name_tag']==0].index   #indexes without name entity
    index_1= df[df['name_tag']==1].index   #indexes with name entity
    index_n= None
    if len(index_0) > len(index_1):
        index = [i for i in index_0]
        index_n= random.sample(index, k= len(index_0) - len(index_1))
    if index_n is not None:
        df= df.drop(index_n).reset_index(drop= True)
    return df


def upsampling(df, upsample_size=1.0):
    random.seed(42)
    df['name_tag'] = df['labels'].apply(lambda x: 1 if x.count("B-NAME") > 0 else 0)
    index_0 = df[df['name_tag'] == 0].index
    index_1 = df[df['name_tag'] == 1].index

    if len(index_0) > len(index_1):
        n_diff = len(index_0) - len(index_1)
        k = int(n_diff * upsample_size)
        index_add = random.choices(index_1, k=k)
    elif len(index_1) > len(index_0):
        n_diff = len(index_1) - len(index_0)
        k = int(n_diff * upsample_size)
        index_add = random.choices(index_0, k=k)
    else:
        index_add = []

    if index_add:
        df = pd.concat([df, df.loc[index_add]]).reset_index(drop=True)
    
    return df

In [50]:
class CONFIG:
    train= True 
    debug= False 
    seed= 42
    n_folds= 3
    num_epochs= 50
    label_names=['O', 'B-NAME', 'I-NAME']
    num_labels= len(label_names)
    model_name= "celloscopeai/celloscope-28000-ner-banglabert-finetuned"  #"csebuetnlp/banglabert"  #"csebuetnlp/banglabert_large" 
    model_checkpoint= "/kaggle/working/best_model_0.bin"
    max_length= 126
    
    do_normalize= True
    do_downsampling= False
    do_upsampling= False
    upsample_size= 1
    train_batch_size= 8
    valid_batch_size= 16
    test_batch_size= 16
    num_workers= 2
    device= torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    
    gradient_accumulation_steps= 1
    learning_rate= 2e-5
    weight_decay= 1e-2
    scheduler= "CosineAnnealingWarmRestarts" #"CosineAnnealingLR" #"linear"
    T_max= 500
    T_0= 500
    min_lr= 1e-7
    
    eps = 1e-6
    betas= [0.9, 0.999]
    
if CONFIG.debug:
    CONFIG.n_folds= 2
    CONFIG.num_epochs=2
    CONFIG.dataset_size= 300

In [None]:
# remove_erroneous_entries(test_data)

In [20]:
if CONFIG.do_upsampling:
    # up_dataset= upsampling(train_data, upsample_size= CONFIG.upsample_size)
    dataset= upsampling(train_data, upsample_size= CONFIG.upsample_size)
if CONFIG.do_downsampling:
    # down_dataset= downsampling(train_data)
    dataset= downsampling(train_data)

# dataset=train_data.copy()

In [131]:
print(f"Dataset shape: \nupsampled: {up_dataset.shape}, downsampled: {down_dataset.shape}, original train: {train_dataset.shape}")

print("\n.......upsampled data.......")
print(f"With name token : {sum(up_dataset['name_tag'])}")
print(f"Without name token: {len(up_dataset)-sum(up_dataset['name_tag'])}")

print("\n.......downsampled data.......")
print(f"With name token: {sum(down_dataset['name_tag'])}")
print(f"Without name token: {len(down_dataset)-sum(down_dataset['name_tag'])}")

Dataset shape: 
upsampled: (3618, 3), downsampled: (1652, 3), original train: (4612, 2)

.......upsampled data.......
With name token : 827
Without name token: 2791

.......downsampled data.......
With name token: 826
Without name token: 826


In [21]:
if CONFIG.debug:
    data= dataset[['sentences', 'labels']][: CONFIG.dataset_size]
else:
    data= dataset[['sentences', 'labels']]

data['name_tag']= data['labels'].apply(lambda x: 1 if x.count("B-NAME")>0 else 0)

In [22]:
print(data.shape)
data.head()

(5582, 3)


Unnamed: 0,sentences,labels,name_tag
0,আশঙ্কাজনক অবস্থায় উপজেলা স্বাস্থ্য কমপ্লেক্সে...,"[O, O, O, O, O, O, O, O, O, O, O]",0
1,খুলনার দিঘলিয়া উপজেলার বারাকপুর মধ্যপাড়ায় গ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1
2,পুলিশের পিটুনিতে কেসমত আলীর মৃত্যু হয়েছে বলে ...,"[O, O, B-NAME, I-NAME, O, O, O, O, O, O, O]",1
3,"তবে পুলিশ বলছে , ওই ব্যক্তি পুলিশ দেখে পড়ে গে...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]",0
4,"কেসমতের পরিবার , এলাকাবাসী ও পুলিশ সূত্রে জানা...","[B-NAME, O, O, O, O, O, O, O, O, O, O, O, O, O...",1


In [23]:
print(f"With name token: {sum(data['name_tag'])}")
print(f"Without name token: {len(data)-sum(data['name_tag'])}")

With name token: 2791
Without name token: 2791


In [24]:
# checking every entry has exact number of labels corresponding to its words 
b_tokenizer= BasicTokenizer()
data['len_labels']= data['labels'].apply(lambda x: len(x))
data['len_words']= data['sentences'].apply(lambda x: len(b_tokenizer.tokenize(x)))

In [25]:
data.head()

Unnamed: 0,sentences,labels,name_tag,len_labels,len_words
0,আশঙ্কাজনক অবস্থায় উপজেলা স্বাস্থ্য কমপ্লেক্সে...,"[O, O, O, O, O, O, O, O, O, O, O]",0,11,11
1,খুলনার দিঘলিয়া উপজেলার বারাকপুর মধ্যপাড়ায় গ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1,27,27
2,পুলিশের পিটুনিতে কেসমত আলীর মৃত্যু হয়েছে বলে ...,"[O, O, B-NAME, I-NAME, O, O, O, O, O, O, O]",1,11,11
3,"তবে পুলিশ বলছে , ওই ব্যক্তি পুলিশ দেখে পড়ে গে...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]",0,14,14
4,"কেসমতের পরিবার , এলাকাবাসী ও পুলিশ সূত্রে জানা...","[B-NAME, O, O, O, O, O, O, O, O, O, O, O, O, O...",1,28,28


In [26]:
data_ = data.copy()

skf= StratifiedKFold(n_splits= CONFIG.n_folds, random_state= CONFIG.seed, shuffle= True)

for fold, (train_index, val_index) in enumerate(skf.split(X= data_, y= data_['name_tag'])):
    data_.loc[val_index, 'fold']= int(fold)
    
data_['fold']= data_['fold'].astype(int)

data_= data_[['sentences', 'labels', 'fold']]

print(data_.groupby('fold').size())

fold
0    1861
1    1861
2    1860
dtype: int64


In [27]:
data_.head()

Unnamed: 0,sentences,labels,fold
0,আশঙ্কাজনক অবস্থায় উপজেলা স্বাস্থ্য কমপ্লেক্সে...,"[O, O, O, O, O, O, O, O, O, O, O]",2
1,খুলনার দিঘলিয়া উপজেলার বারাকপুর মধ্যপাড়ায় গ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0
2,পুলিশের পিটুনিতে কেসমত আলীর মৃত্যু হয়েছে বলে ...,"[O, O, B-NAME, I-NAME, O, O, O, O, O, O, O]",1
3,"তবে পুলিশ বলছে , ওই ব্যক্তি পুলিশ দেখে পড়ে গে...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]",2
4,"কেসমতের পরিবার , এলাকাবাসী ও পুলিশ সূত্রে জানা...","[B-NAME, O, O, O, O, O, O, O, O, O, O, O, O, O...",0


In [28]:
label_names=['O', 'B-NAME', 'I-NAME']
id2label= {}
label2id= {}
for i, label in enumerate(label_names):
    id2label[i]= label
    label2id[label] = i

display(id2label)
display(label2id)

{0: 'O', 1: 'B-NAME', 2: 'I-NAME'}

{'O': 0, 'B-NAME': 1, 'I-NAME': 2}

In [29]:
tokenizer = AutoTokenizer.from_pretrained(CONFIG.model_name)

In [30]:
def align_labels_with_tokens(tokens, labels):
    new_labels = []
    word_ids = tokens.word_ids()
    previous_word_id = None
    
    for word_id in word_ids:
        if word_id is None or word_id >= len(labels):
            # Special token or truncated word: ignore
            label = -100
        elif word_id != previous_word_id:
            label = label2id[labels[word_id]]
        else:
            # Repeated subword token
            label = -100
        previous_word_id = word_id
        new_labels.append(label)
    
    return new_labels

In [31]:
class CustomDataset(Dataset):
    def __init__(self, df, tokenizer, cfg):
        self.df= df
        self.cfg= cfg
        self.tokenizer= tokenizer
        self.max_length= self.cfg.max_length
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text= self.df.sentences[index]
        labels= self.df.labels[index]
        inputs= self.tokenizer(text, truncation= True, max_length= self.max_length, padding= True)
        new_labels= align_labels_with_tokens(inputs, labels)
        
        return {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'targets': new_labels            
        }

In [32]:
class Collate:
    
    def __init__(self, tokenizer):
        self.tokenizer= tokenizer
    
    def __call__(self, batch):
        output= dict()
        output['input_ids'] = [sample['input_ids'] for sample in batch]
        output['attention_mask'] = [sample['attention_mask'] for sample in batch]
        output['targets'] = [sample['targets'] for sample in batch]
        
        batch_max= max([len(ids) for ids in output['input_ids']])
        
        # dynamic padding
        if self.tokenizer.padding_side == 'right':
            output['input_ids'] = [ids + (batch_max - len(ids))*[self.tokenizer.pad_token_id] for ids in output['input_ids']]
            output['attention_mask']= [mask + (batch_max - len(mask))*[0] for mask in output['attention_mask']]
            output['targets']= [target + (batch_max - len(target))*[-100] for target in output['targets']]
        else:
            output['input_ids'] = [(batch_max - len(ids))*[self.tokenizer.pad_token_id] + ids for ids in output['input_ids']]
            output['attention_mask']= [(batch_max - len(mask))*[0] + mask for mask in output['attention_mask']]
            output['targets']= [(batch_max - len(target))*[-100] + target for target in output['targets']]
        
        output['input_ids'] = torch.tensor(output['input_ids'], dtype= torch.long)
        output['attention_mask'] = torch.tensor(output['attention_mask'], dtype= torch.long)
        output['targets'] = torch.tensor(output['targets'], dtype=torch.long)
        
        return output
    

collate_fn= Collate(tokenizer)

In [33]:
def prepare_loader(df, tokenizer,fold, cfg):
    df_train= df[df.fold != fold].reset_index(drop= True) 
    df_valid= df[df.fold == fold].reset_index(drop= True)
    valid_labels = df_valid['labels'].values
    
    # converting dataFrame to dataset.
    train_dataset= CustomDataset(df_train, tokenizer, cfg)
    valid_dataset= CustomDataset(df_valid, tokenizer, cfg)
    
    train_loader= DataLoader(train_dataset, 
                             batch_size= cfg.train_batch_size, 
                             collate_fn= collate_fn, 
                             num_workers= cfg.num_workers, 
                             shuffle= True, 
                             pin_memory= True,
                             drop_last= False, )
    
    valid_loader= DataLoader(valid_dataset, 
                            batch_size= cfg.valid_batch_size,
                            collate_fn= collate_fn, 
                            num_workers= cfg.num_workers,
                            shuffle= False,
                            pin_memory= True, 
                            drop_last= False,
                            )
    
    return train_loader, valid_loader

In [34]:
class NER_MODEL(nn.Module):
    def __init__(self, model_name= None, cfg= CONFIG):
        super(NER_MODEL, self).__init__()
        self.cfg= cfg
        self.num_labels= self.cfg.num_labels
        if model_name != None:
            self.model_name= model_name
        else:
            self.model_name= self.cfg.model_name
            
        self.model_config= AutoConfig.from_pretrained(self.model_name, output_hidden_states= True)
        self.model= AutoModel.from_pretrained(self.model_name, config= self.model_config)
        
        self.dropout= nn.Dropout(p= 0.2)
        self.linear= nn.Linear(self.model_config.hidden_size, self.num_labels)


    
    def forward(self, input_ids, attention_mask, targets= None):
        
        outputs= self.model(input_ids,
                            attention_mask= attention_mask)
        
        sequence_output= outputs[0]

        entity_logits= self.dropout(sequence_output)
        # entity_logits= sequence_output #self.dropout(sequence_output)
        entity_logits= self.linear(entity_logits)
        
        return entity_logits

In [49]:
import evaluate
metric = evaluate.load("seqeval")
print("Seqeval metric loaded successfully")

def compute_metrics(logits, labels):
    predictions = np.argmax(logits.detach().cpu().numpy(), axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[ label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    
    return all_metrics['overall_f1']


def token_loss_fn(logits, labels, attention_mask= None):
    loss_fn= nn.CrossEntropyLoss(ignore_index= -100) 
    num_labels= CONFIG.num_labels
    
    if attention_mask is not None:
        mask= attention_mask.view(-1) == 1   # mask for keeping the effective part
        active_logits= logits.view(-1, num_labels)[mask]
        active_labels= labels.view(-1)[mask]
        entity_loss= loss_fn(active_logits, active_labels)
    else:
        entity_loss= loss_fn(logits.view(-1, num_labels), labels.view(-1))
    
    return entity_loss

Seqeval metric loaded successfully


In [36]:
def fetch_scheduler(optimizer):
    if CONFIG.scheduler == "CosineAnnealingLR":
        scheduler= lr_scheduler.CosineAnnealingLR(optimizer, T_max= CONFIG.T_max, eta_min= CONFIG.min_lr)
    elif CONFIG.scheduler == "CosineAnnealingWarmRestarts":
        scheduler= lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0= CONFIG.T_0, eta_min= CONFIG.min_lr)
    elif CONFIG.scheduler== "linear":
        scheduler= lr_scheduler.LinearLR(optimizer, start_factor= 0.01, end_factor= 1.0, total_iters= 100)
    elif CONFIG.scheduler == None:
        return None

    return scheduler

In [37]:
def train_one_epoch(model, dataloader, optimizer, scheduler, fold, epoch, device= CONFIG.device):
    model.train()

    dataset_size= 0
    running_loss= 0.0
    score= []
    
    progress_bar= tqdm(enumerate(dataloader), total= len(dataloader))
    steps= len(dataloader)
    for step, data in progress_bar:
        ids= data['input_ids'].to(device, dtype= torch.long)
        masks= data['attention_mask'].to(device, dtype= torch.long)
        targets= data['targets'].to(device, dtype= torch.long)
        
        batch_size= ids.size(0)
        outputs= model(ids, masks)
        loss= token_loss_fn(outputs, targets, attention_mask= masks)
        f1_score= compute_metrics(logits= outputs, labels= targets)
        score.append(f1_score)
        if CONFIG.gradient_accumulation_steps > 1:
            loss= loss/ CONFIG.gradient_accumulation_steps
        
        loss.backward()
        ## Gradient Accumulation
        if (step + 1) % CONFIG.gradient_accumulation_steps == 0 or step == steps:
            optimizer.step() 
            optimizer.zero_grad()
            
            if scheduler is not None:
                scheduler.step()
        
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        epoch_loss= running_loss/ dataset_size
        epoch_f1_score= np.mean(score)
        
        progress_bar.set_postfix(Epoch= epoch,
                                 Train_loss= epoch_loss,
                                 F1_Score= epoch_f1_score,
                                 LR= optimizer.param_groups[0]['lr'])
    
    return epoch_loss, epoch_f1_score 

In [38]:
def valid_one_epoch(model, dataloader, epoch, device= CONFIG.device):
    model.eval()
    
    dataset_size= 0
    running_loss= 0.0
    score= []
    
    progress_bar= tqdm(enumerate(dataloader), total= len(dataloader))
    steps= len(dataloader)
    
    for step, data in progress_bar:
        ids= data['input_ids'].to(device, dtype= torch.long)
        masks= data['attention_mask'].to(device, dtype= torch.long)
        targets= data['targets'].to(device, dtype= torch.long)
        
        batch_size= ids.size(0)
        
        with torch.no_grad():
            outputs= model(ids, masks)
            loss= token_loss_fn(outputs, targets, attention_mask= masks)
            f1_score= compute_metrics(logits= outputs, labels= targets)
        
        score.append(f1_score)
        
        if CONFIG.gradient_accumulation_steps > 1:
            loss= loss/ CONFIG.gradient_accumulation_steps
        
        
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        epoch_loss= running_loss/ dataset_size
        epoch_f1_score= np.mean(score)
        
        progress_bar.set_postfix(Epoch= epoch,
                                 Valid_loss= epoch_loss,
                                 Valid_F1_Score= epoch_f1_score)
        
    return epoch_loss, epoch_f1_score 

In [39]:
import time
from collections import defaultdict
def training_loop(model, optimizer, scheduler, fold, num_epochs= CONFIG.num_epochs, patience= 3):
    
    start= time.time()
    best_loss= np.inf
    best_score= 0
    trigger_times= 0
    history= defaultdict(list)
    
    for epoch in range(1, num_epochs+1):
        # ---- Training Phase ----
        train_epoch_loss, train_f1_score= train_one_epoch(model, train_loader, optimizer, scheduler, fold, epoch, CONFIG.device)
        # ---- Validation Phase ----
        valid_epoch_loss, valid_f1_score = valid_one_epoch(model, valid_loader, epoch, CONFIG.device)

        # ---- Track metrics ----
        history['train_loss'].append(train_epoch_loss)
        history['valid_loss'].append(valid_epoch_loss)
        history['train_f1_score'].append(train_f1_score)
        history['valid_f1_score'].append(valid_f1_score)
        
        # ---- Model checkpointing ----
        if  valid_f1_score >= best_score: 
            trigger_times= 0
            print(f"Validation Score Improved {best_score:.4f} ---> {valid_f1_score:.4f}")
            best_score= valid_f1_score
            
            path= f"best_model_{fold}.bin"
            torch.save(model.state_dict(), path)
            print(f"Model saved to {path}")
        else:
            trigger_times += 1
            if trigger_times >= patience:
                print("Early Stoping. \n")
                break
                
    time_elapsed= time.time() - start
    print(f"\nTraining complete in {time_elapsed // 3600:.0f}h {(time_elapsed % 3600) // 60:.0f}m {(time_elapsed % 60):.0f}s")

    
    return history, valid_epoch_loss, best_score

### Run Training

In [40]:
if CONFIG.train:
    fold_scores= []
    for fold in range(CONFIG.n_folds):
        torch.cuda.empty_cache()
        print(f"====== Fold: {fold} ======")

        train_loader, valid_loader = prepare_loader(data_, tokenizer, fold= fold, cfg= CONFIG)

        model= NER_MODEL(cfg= CONFIG)
        model.to(device= CONFIG.device)

        optimizer= AdamW(model.parameters(), lr= CONFIG.learning_rate, weight_decay= CONFIG.weight_decay, eps= CONFIG.eps, betas= CONFIG.betas)
        scheduler= fetch_scheduler(optimizer)

        history, epoch_loss, f1_score= training_loop(model, optimizer, scheduler, fold, CONFIG.num_epochs, patience= 5)#epoch_loss

        print("\n\n")
        print(f"Fold [{fold}] avg loss: {epoch_loss}\n")
        print(f"Fold [{fold}] avg score: {f1_score}\n")
        fold_scores.append(f1_score)
        
        if fold < CONFIG.n_folds-1:
            del model
        del train_loader, valid_loader

    print(f"====== ====== ====== ======")
    print(f"Overall score: {np.mean(np.mean(fold_scores, axis= 0))}")
    print(f"====== ====== ====== ======")



  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.0000 ---> 0.8338
Model saved to best_model_0.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8338 ---> 0.8410
Model saved to best_model_0.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8410 ---> 0.8472
Model saved to best_model_0.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8472 ---> 0.8535
Model saved to best_model_0.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8535 ---> 0.8540
Model saved to best_model_0.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8540 ---> 0.8643
Model saved to best_model_0.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8643 ---> 0.8774
Model saved to best_model_0.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8774 ---> 0.8818
Model saved to best_model_0.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Early Stoping. 


Training complete in 0h 16m 23s



Fold [0] avg loss: 0.01152227494580967

Fold [0] avg score: 0.8818110802050969



  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.0000 ---> 0.7985
Model saved to best_model_1.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.7985 ---> 0.8063
Model saved to best_model_1.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8063 ---> 0.8176
Model saved to best_model_1.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8176 ---> 0.8247
Model saved to best_model_1.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8247 ---> 0.8329
Model saved to best_model_1.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8329 ---> 0.8330
Model saved to best_model_1.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8330 ---> 0.8340
Model saved to best_model_1.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8340 ---> 0.8361
Model saved to best_model_1.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8361 ---> 0.8368
Model saved to best_model_1.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8368 ---> 0.8414
Model saved to best_model_1.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Early Stoping. 


Training complete in 0h 24m 9s



Fold [1] avg loss: 0.01669692062667999

Fold [1] avg score: 0.8413742553524662



  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.0000 ---> 0.7967
Model saved to best_model_2.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.7967 ---> 0.7970
Model saved to best_model_2.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.7970 ---> 0.8125
Model saved to best_model_2.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8125 ---> 0.8134
Model saved to best_model_2.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8134 ---> 0.8334
Model saved to best_model_2.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8334 ---> 0.8400
Model saved to best_model_2.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8400 ---> 0.8414
Model saved to best_model_2.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8414 ---> 0.8459
Model saved to best_model_2.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Validation Score Improved 0.8459 ---> 0.8503
Model saved to best_model_2.bin


  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/466 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

Early Stoping. 


Training complete in 0h 20m 52s



Fold [2] avg loss: 0.025874755348359618

Fold [2] avg score: 0.8502798885380481

Overall score: 0.8578217413652037


## Testing

In [41]:
from tqdm.auto import tqdm
def testing_loop(model, dataloader, device= CONFIG.device):
    model.eval()
    
    score= []
    
    progress_bar= tqdm(enumerate(dataloader), total= len(dataloader))
    steps= len(dataloader)
    
    for step, data in progress_bar:
        ids= data['input_ids'].to(device, dtype= torch.long)
        masks= data['attention_mask'].to(device, dtype= torch.long)
        targets= data['targets'].to(device, dtype= torch.long)
        
        batch_size= ids.size(0)
        
        with torch.no_grad():
            outputs= model(ids, masks)
            f1_score= compute_metrics(logits= outputs, labels= targets)
        
        score.append(f1_score)
        
        f1_score= np.mean(score)
        
        progress_bar.set_postfix(test_F1_Score= f1_score,)
    
    print(f"====== ====== ====== ======")
    print(f"Overall f1_score: {np.mean(np.mean(f1_score))}")
    print(f"====== ====== ====== ======")

    return f1_score 

In [42]:
test_dataset= CustomDataset(test_data, tokenizer, CONFIG)
test_loader= DataLoader(test_dataset, 
                        batch_size= CONFIG.test_batch_size,
                        collate_fn= collate_fn, 
                        num_workers= CONFIG.num_workers,
                        shuffle= False,
                        pin_memory= True, 
                        drop_last= False,
                        )

In [43]:
model_paths= [
    "/kaggle/working/best_model_0.bin",
    "/kaggle/working/best_model_1.bin",
    "/kaggle/working/best_model_2.bin"
    ]

In [44]:
model= NER_MODEL(cfg= CONFIG)
for i,model_path in enumerate(model_paths):
    model.load_state_dict(torch.load(model_path, map_location= CONFIG.device))
    model.to(CONFIG.device)
    print(f"Run testing for model {CONFIG.model_name}")
    f1_score= testing_loop(model, test_loader, device= CONFIG.device)
    print(f"Model {i} result: {f1_score}")

Run testing for model celloscopeai/celloscope-28000-ner-banglabert-finetuned


  0%|          | 0/91 [00:00<?, ?it/s]

Overall f1_score: 0.8357228470488637
Model 0 result: 0.8357228470488637
Run testing for model celloscopeai/celloscope-28000-ner-banglabert-finetuned


  0%|          | 0/91 [00:00<?, ?it/s]

Overall f1_score: 0.8369379458726484
Model 1 result: 0.8369379458726484
Run testing for model celloscopeai/celloscope-28000-ner-banglabert-finetuned


  0%|          | 0/91 [00:00<?, ?it/s]

Overall f1_score: 0.8349410558910031
Model 2 result: 0.8349410558910031


## End to End Inference

In [45]:
def prediction(text, model, tokenizer, cfg= CONFIG):
    inputs= tokenizer.encode_plus(text, padding=True, truncation=True, return_tensors="pt")
    outputs= model(inputs['input_ids'].to(cfg.device), inputs['attention_mask'].to(cfg.device))
    outputs= outputs.detach().cpu().numpy().argmax(axis= -1)[0, 1:-1]
    return outputs

def inference_fn(text, model_name= None, model_checkpoint= None, cfg= CONFIG):
    # loading model and weights
    if model_name is not None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model= NER_MODEL(model_name= model_name, cfg= cfg)
    else:
        tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
        model= NER_MODEL(cfg= CONFIG)
        
    if model_checkpoint != None:
        model.load_state_dict(torch.load(model_checkpoint, map_location=  cfg.device))
    else:
        model.load_state_dict(torch.load(cfg.model_checkpoint, map_location=  cfg.device))
    
    model.to(cfg.device)
    ## processing inputs
    outputs=[]
    if type(text) == str:
        text= normalize(text)
        output= prediction(text, model, tokenizer, cfg)
        outputs.append(output)
    elif type(text)== list:
        for txt in text:
            txt= normalize(txt)
            output= prediction(txt, model, tokenizer, cfg)
            outputs.append(output)      
    else:
        outputs= None
        print("Please give input in string format or list of strings")
    
    return outputs

In [46]:
def extract_spans(prediction):
    span_indices = [i for i, v in enumerate(prediction) if v != 0 ]
    span_list= []
    span= []
    # span_indices
    for i in range(len(span_indices)):
        if i == 0 or span_indices[i] != span_indices[i-1]+1:
            if span:
                span_list.append(span)

            span= [span_indices[i]]

        else:
            span.append(span_indices[i])
    if span:
        span_list.append(span)
    
    return span_list

def extract_names(text, span_list, tokenizer):
    name_list= []
    if len(span_list) > 0:
        for span in span_list:
            tokens= tokenizer(text)['input_ids'][1:-1][span[0]:span[-1]+1]
            name= normalize(tokenizer.decode(tokens))
            name_list.append(name)
        return name_list
    else:
        return None

def show_names(texts, predictions, tokenizer):
    if type(texts)== str:
        texts= [texts]
    for text, pred in zip(texts, predictions):
        span_list= extract_spans(pred)
        name_list= extract_names(text, span_list, tokenizer)
        print(f"Given Text: {text} \nExtracted Names: {name_list}")
    

In [47]:
model_path= model_paths[0]
texts=  "শিক্ষা মন্ত্রণালয়ের দায়িত্বশীল একটি সূত্রে জানা যায়, মুহাম্মদ আজাদ খানের বিষয়ে শিক্ষা মন্ত্রণালয়ের নীতিনির্ধারকেরা সন্তুষ্ট ছিলেন না।" 
texts= ["অভিনেত্রী মন্দিরা চক্রবর্তীকে পূজার চার সাজে সাজানোর সময় ডিজাইনার হিসেবে আরাম, স্বকীয়তা আর স্বাচ্ছন্দ্যকে প্রাধান্য দিয়েছি বেশি। প্রতিটি সাজেই ছিল টেকসই নকশার ছোঁয়া।",
       "শিল্প মন্ত্রণালয়ের সচিব মো. আব্দুর রহিম বলেন, পবিত্র রমজান মাস এলেই ব্যবসায়ীদের মধ্যে বেশি মুনাফা করার প্রবণতা তৈরি হয়।",
       "শিক্ষা মন্ত্রণালয়ের দায়িত্বশীল একটি সূত্রে জানা যায়, মুহাম্মদ আজাদ খানের বিষয়ে শিক্ষা মন্ত্রণালয়ের নীতিনির্ধারকেরা সন্তুষ্ট ছিলেন না।"
      ]

outputs= inference_fn(texts, model_checkpoint= model_path, cfg= CONFIG)
show_names(texts, outputs, tokenizer)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Given Text: অভিনেত্রী মন্দিরা চক্রবর্তীকে পূজার চার সাজে সাজানোর সময় ডিজাইনার হিসেবে আরাম, স্বকীয়তা আর স্বাচ্ছন্দ্যকে প্রাধান্য দিয়েছি বেশি। প্রতিটি সাজেই ছিল টেকসই নকশার ছোঁয়া। 
Extracted Names: ['মন্দিরা চক্রবর্তীকে']
Given Text: শিল্প মন্ত্রণালয়ের সচিব মো. আব্দুর রহিম বলেন, পবিত্র রমজান মাস এলেই ব্যবসায়ীদের মধ্যে বেশি মুনাফা করার প্রবণতা তৈরি হয়। 
Extracted Names: ['মো. আব্দুর রহিম']
Given Text: শিক্ষা মন্ত্রণালয়ের দায়িত্বশীল একটি সূত্রে জানা যায়, মুহাম্মদ আজাদ খানের বিষয়ে শিক্ষা মন্ত্রণালয়ের নীতিনির্ধারকেরা সন্তুষ্ট ছিলেন না। 
Extracted Names: ['মুহাম্মদ আজাদ খানের']
