In [None]:
!pip install -r requirements.txt

In [None]:
from tqdm import tqdm
import torch, gc, os
import numpy as np
from seqeval.metrics import f1_score
from seqeval.metrics import recall_score, precision_score

class NERTrainer():
    """Train/validate loop for a token-classification model with checkpointing.

    The model is called as ``model(**batch)`` and its output must expose
    ``.loss`` and ``.logits``; checkpoints are written with
    ``model.save_pretrained``.  Calling the trainer instance runs the loop.

    Args:
        model: Model to train.
        train_dataloader: Iterable of batches (dicts containing ``'labels'``).
        valid_dataloader: Iterable of validation batches, same format.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Optional LR scheduler stepped once per training batch
            (``None`` disables it).
        num_epochs: Maximum number of epochs.
        id2label: Mapping from integer class id to label string.
        output_dir: Directory under which checkpoints are written.
        patience: Number of non-improving epochs tolerated before stopping.
        fold: Fold identifier used in log lines and checkpoint names.
        target_score: Training stops as soon as the validation score exceeds
            this value (defaults to the previously hard-coded 0.95).
    """

    def __init__(self,
                 model,
                 train_dataloader,
                 valid_dataloader,
                 optimizer,
                 scheduler,
                 num_epochs,
                 id2label,
                 output_dir,
                 patience,
                 fold,
                 target_score=0.95):
        self.model = model
        self.train_dataloader = train_dataloader
        self.valid_dataloader = valid_dataloader
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.num_epochs = num_epochs
        self.output_dir = output_dir
        self.id2label = id2label
        self.patience = patience
        self.fold = fold
        self.target_score = target_score

    def clear_memories(self):
        """Release cached CUDA memory and run the Python garbage collector."""
        torch.cuda.empty_cache()
        gc.collect()

    def _decode(self, logits, labels):
        """Convert logits/label-id tensors to label-string sequences.

        Positions whose gold label is -100 are dropped from both outputs.

        Returns:
            Tuple ``(true_labels, true_predictions)`` of lists of lists of
            label strings, one inner list per sequence.
        """
        predictions = logits.argmax(-1).numpy()
        label_rows = labels.numpy()
        true_labels = [[self.id2label[l] for l in row if l != -100]
                       for row in label_rows]
        true_predictions = [[self.id2label[p] for (p, l) in zip(pred_row, row) if l != -100]
                            for pred_row, row in zip(predictions, label_rows)]
        return true_labels, true_predictions

    @staticmethod
    def _f_beta(precision, recall, beta=5):
        """Return the F-beta score, or 0.0 when precision and recall are both 0.

        Without the guard the denominator ``beta^2 * precision + recall`` is
        zero for a batch with no correct entity, which raises
        ``ZeroDivisionError`` (or yields NaN) and corrupts the running mean.
        """
        beta_sq = beta * beta
        denominator = beta_sq * precision + recall
        if denominator == 0:
            return 0.0
        return (1 + beta_sq) * recall * precision / denominator

    def compute_metrics(self, logits, labels):
        """Return the macro-averaged seqeval F1 for a batch (CPU tensors)."""
        true_labels, true_predictions = self._decode(logits, labels)
        score = f1_score(y_true=true_labels, y_pred=true_predictions, average='macro')
        return score

    def compute_f5_score(self, logits, labels):
        """Return the F5 score (recall weighted 25x over precision) for a batch.

        Args:
            logits: CPU tensor of class scores; argmax is taken over the last axis.
            labels: CPU tensor of gold label ids, with -100 marking ignored positions.
        """
        true_labels, true_predictions = self._decode(logits, labels)
        recall = recall_score(true_labels, true_predictions)
        precision = precision_score(true_labels, true_predictions)
        return self._f_beta(precision, recall, beta=5)

    def train_one_epoch(self, epoch):
        """Run one training pass and return ``(mean_loss, mean_batch_f5)``."""
        self.model.train()

        running_loss = 0
        progress_bar = tqdm(enumerate(self.train_dataloader), total=len(self.train_dataloader))
        scores = []

        for step, data in progress_bar:
            out = self.model(**data)
            loss = out.loss
            logits = out.logits

            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()

            scores.append(self.compute_f5_score(logits=logits.detach().cpu(),
                                                labels=data['labels'].detach().cpu()))

            # The scheduler is stepped per batch, not per epoch.
            if self.scheduler is not None:
                self.scheduler.step()

            running_loss += loss.item()
            epoch_loss = running_loss / (step + 1)

            # Displayed under the key "F1", but the value is the running mean
            # of the per-batch F5 scores.
            score = sum(scores) / len(scores)
            progress_bar.set_postfix(Epoch=epoch,
                                     TrainingLoss=epoch_loss,
                                     F1=score
                                     )

        # Drop references to the last batch's tensors before emptying the cache.
        del out, loss, logits
        self.clear_memories()
        return epoch_loss, score

    def valid_one_epoch(self, epoch):
        """Run one validation pass and return ``(mean_loss, mean_batch_f5)``."""
        self.model.eval()

        running_loss = 0
        progress_bar = tqdm(enumerate(self.valid_dataloader), total=len(self.valid_dataloader))
        scores = []
        for index, data in progress_bar:
            with torch.no_grad():
                out = self.model(**data)
            loss = out.loss
            logits = out.logits

            running_loss += loss.item()
            epoch_loss = running_loss / (index + 1)
            scores.append(self.compute_f5_score(logits=logits.detach().cpu(),
                                                labels=data['labels'].detach().cpu()))

            score = sum(scores) / len(scores)
            progress_bar.set_postfix(Epoch=epoch,
                                     ValidationLoss=epoch_loss,
                                     F1=score
                                     )

        del out, loss, logits
        self.clear_memories()
        return epoch_loss, score

    def _save_model(self, directory):
        """Create ``directory`` (including missing parents) and save the model there."""
        # makedirs handles nested output paths such as "Models/Run"; the
        # previous os.mkdir calls failed when the parent did not exist.
        os.makedirs(directory, exist_ok=True)
        self.model.save_pretrained(save_directory=directory)

    def __call__(self):
        """Run the full training loop with checkpointing and early stopping.

        Per epoch, exactly one of three things happens:
          * the validation score beats the best so far -> save the "F1"
            checkpoint (and stop if it exceeds ``target_score``);
          * otherwise the validation loss beats the best tracked loss ->
            save the "loss" checkpoint;
          * otherwise the early-stop counter is incremented, and training
            stops once it reaches ``patience``.

        Note that the best loss is only updated on epochs where the score did
        not improve, and the early-stop counter is never reset.  After the
        loop the current (last-epoch) weights are saved to ``Fold-<fold>``.
        """
        print('\n')
        prev_best_loss = np.inf
        best_score = -np.inf
        model_output_dir = self.output_dir

        early_break_count = 0
        for epoch in range(self.num_epochs):
            training_loss, training_score = self.train_one_epoch(epoch=epoch)

            validation_loss, validation_score = self.valid_one_epoch(epoch=epoch)

            print('='*170 + '\n')
            print(f'Fold- {self.fold}, epoch- {epoch}')
            print(f'Training Loss for epoch: {epoch} is {training_loss}, F1 Score is: {training_score}')
            print(f'Validation Loss for epoch: {epoch} is {validation_loss}, F1 Score is: {validation_score}')

            if validation_score > best_score:
                print(f'F1 Score improved from {best_score} --> {validation_score}')
                best_score = validation_score

                checkpoint_dir = os.path.join(model_output_dir, f'Checkpoint-Fold-{self.fold}-F1')
                self._save_model(checkpoint_dir)
                print(f"Model Saved at {checkpoint_dir}")

                if validation_score > self.target_score:
                    break

            elif validation_loss < prev_best_loss:
                print(f'Loss improved from {prev_best_loss} --> {validation_loss}')
                prev_best_loss = validation_loss

                checkpoint_dir = os.path.join(model_output_dir, f'Checkpoints-Fold-{self.fold}-loss')
                self._save_model(checkpoint_dir)
                print(f"Model Saved at {checkpoint_dir}")

            else:
                early_break_count += 1
                print(f'Early break is at {early_break_count}, will stop training at {self.patience}')
                if early_break_count >= self.patience:
                    print(f'Early Stopping')
                    break

            print('\n' + '='*170)

        print(f'Training over with best loss: {prev_best_loss} and best F1: {best_score}')

        fold_dir = os.path.join(model_output_dir, f'Fold-{self.fold}')
        self._save_model(fold_dir)

        print(f'Model saved at {model_output_dir}')
        print('='*170)

In [None]:
import re, torch
from torch.utils.data import Dataset
from banglanlptoolkit import BnNLPNormalizer

def create_id_label_conversion(unique_labels):
    """Build the id<->label lookup tables for a sequence of labels.

    Ids are assigned by position in ``unique_labels``.

    Returns:
        Tuple ``(id2label, label2id, num_labels)`` where ``num_labels`` is
        the number of entries in ``id2label``.
    """
    id2label = dict(enumerate(unique_labels))
    # Iterating in id order means a repeated label keeps its last position.
    label2id = {name: int(position) for position, name in id2label.items()}
    return id2label, label2id, len(id2label)

class preprocess():
    """Clean a token/label sequence before tokenization.

    Calling the instance applies, in order: character stripping, optional
    lower-casing, stopword blanking, and removal of emptied tokens (together
    with their labels).

    Args:
        label2id: Optional mapping from label string to integer id.
        return_target_ids: If true and ``label2id`` is given, labels of the
            kept tokens are converted to ids (only when ``strip`` is true,
            because the conversion happens while stripping).
        stopword_dict: Collection of stopwords; defaults to an empty list.
        stopword_remove: Blank out stopwords whose label is ``'O'``.
        punct_remove: Strip punctuation and the listed characters.
        to_lower: Lower-case tokens after character stripping.
        strip: Drop tokens that became empty strings.
    """

    # Patterns are compiled once instead of on every token.
    _NON_WORD_RE = re.compile(r'[^\w\s]')
    # NOTE(review): this character class contains what looks like a plain
    # Latin "i"; if so, every "i" is deleted from every token -- confirm
    # whether a look-alike character was intended.
    _FOREIGN_CHARS_RE = re.compile(r'[|iœОабвгдезиклмнопрстуцчщь。いくけしたてなを一不业中丰为了产人们任优会使便保內其力务務区口可各后吸售回國地場孔家富専己市引心懂成手技抗护挽捷措撬改教文断新施晦普晰服术来果業標正洋流海涩清源漁焕然物現畅留発的目真瞳研碑磨社科章続练细育自艺节行表見见論质资距进遗重鑽門间难]')
    _ACCENTED_CHARS_RE = re.compile(r'[Éàáãçéêíя–—，’]')
    _UNDERSCORE_RE = re.compile(r'[_]')

    def __init__(self,
                 label2id = None,
                 return_target_ids = True,
                 stopword_dict=None,
                 stopword_remove = True,
                 punct_remove = True,
                 to_lower = True,
                 strip = True):

        self.to_lower = to_lower
        self.return_ids = return_target_ids
        self.punct_remove = punct_remove
        self.strip = strip
        # A fresh list per instance; the old `[]` default was one shared object.
        self.stopword_dict = [] if stopword_dict is None else stopword_dict
        self.stopword_remove = stopword_remove
        self.label2id = label2id

    def remove_punctuations(self, text, label):
        """Return ``text`` with punctuation and the listed characters removed.

        ``label`` is accepted for signature symmetry but is not used.
        When ``punct_remove`` is false the text is returned unchanged.
        """
        if not self.punct_remove:
            return text
        text = self._NON_WORD_RE.sub('', text.strip())
        text = self._FOREIGN_CHARS_RE.sub('', text.strip())
        text = self._ACCENTED_CHARS_RE.sub('', text.strip())
        return self._UNDERSCORE_RE.sub('', text.strip())

    def remove_stopwords(self, text, label):
        """Return ``''`` for a stopword labelled ``'O'``; otherwise ``text``."""
        if not self.stopword_remove:
            return text
        if text in self.stopword_dict and label == 'O':
            return ''
        return text

    def strip_empty_strings(self, tokens, labels):
        """Drop empty tokens and their labels, converting labels to ids if configured.

        When ``strip`` is false both inputs are returned untouched (labels
        are then NOT converted to ids).
        """
        if not self.strip:
            return tokens, labels

        convert = self.return_ids and self.label2id is not None
        new_tokens = []
        new_labels = []
        for token, label in zip(tokens, labels):
            if token == '':
                continue
            new_tokens.append(token)
            new_labels.append(self.label2id[label] if convert else label)
        return new_tokens, new_labels

    def __call__(self, tokens, labels):
        """Clean ``tokens`` and return the surviving ``(tokens, labels)``.

        Raises:
            ValueError: If the resulting token and label lists differ in length.
        """
        tokens = [self.remove_punctuations(text, label) for text, label in zip(tokens, labels)]
        if self.to_lower:
            tokens = [text.lower() for text in tokens]

        tokens = [self.remove_stopwords(text, label) for text, label in zip(tokens, labels)]
        tokens, labels = self.strip_empty_strings(tokens, labels)
        if len(tokens) != len(labels):
            raise ValueError(f'The length of tokens are {len(tokens)} while the length of labels are {len(labels)}')
        return tokens, labels
    
class tokenize_data():
    """Tokenize a word sequence and expand word-level labels to sub-tokens.

    Args:
        tokenizer: Callable tokenizer whose result supports ``word_ids()``
            and the keys ``'input_ids'`` / ``'attention_mask'``.
        add_special_tokens: Forwarded to the tokenizer.
        max_length: Forwarded to the tokenizer.
        padding: Forwarded to the tokenizer.
        truncation: Forwarded to the tokenizer.
        ignore_index: Label assigned to positions without a source word
            (special or padding tokens).  Defaults to -100, the value this
            file's collator pads labels with and its metric code filters out;
            the previous hard-coded -101 would have been treated as a real
            class id.
    """

    def __init__(self,
                 tokenizer,
                 add_special_tokens=False,
                 max_length=None,
                 padding=False,
                 truncation=None,
                 ignore_index=-100):

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.padding = padding
        self.truncation = truncation
        self.add_special_tokens = add_special_tokens
        self.ignore_index = ignore_index

    def align_labels_to_ids(self, labels, word_id_list):
        """Return one label per sub-token.

        Every sub-token inherits the label of the word it came from;
        positions whose word id is ``None`` get ``ignore_index``.
        """
        return [self.ignore_index if word_id is None else labels[word_id]
                for word_id in word_id_list]

    def __call__(self, tokens, labels):
        """Tokenize ``tokens`` and return ``(input_ids, attention_mask, labels)``.

        Raises:
            ValueError: If the aligned labels and ``input_ids`` differ in length.
        """
        # NOTE(review): the words are joined with spaces, so word ids come
        # from the tokenizer's own word splitting; this presumably matches
        # the input tokens only when no token contains whitespace -- confirm.
        tokenized_inputs = self.tokenizer(' '.join(tokens),
                                          add_special_tokens=self.add_special_tokens,
                                          truncation=self.truncation,
                                          padding=self.padding,
                                          max_length=self.max_length)
        word_id_list = tokenized_inputs.word_ids()
        labels = self.align_labels_to_ids(labels, word_id_list)
        if len(tokenized_inputs['input_ids']) != len(labels):
            raise ValueError(f"The length of tokenized inputs are {len(tokenized_inputs['input_ids'])} while the length of labels are {len(labels)}")
        return tokenized_inputs['input_ids'], tokenized_inputs['attention_mask'], labels
    
class CustomDataCollator:
    """Pad a list of samples to a common length and stack them into tensors.

    Each sample is a dict with list-valued ``'input_ids'``,
    ``'attention_mask'`` and ``'labels'``.  Padding is added on the side
    given by ``tokenizer.padding_side``.
    """

    def __init__(self, tokenizer, device):
        self.tokenizer = tokenizer
        self.device = device

    def __call__(self, batch):
        """Return a dict of ``torch.long`` tensors created on ``self.device``."""
        # Fill value per field: pad token id, 0 for the mask, -100 for labels.
        fill_values = (
            ("input_ids", self.tokenizer.pad_token_id),
            ("attention_mask", 0),
            ("labels", -100),
        )
        # Every field is padded to the longest input_ids in the batch.
        target_len = max(len(sample["input_ids"]) for sample in batch)
        pad_on_right = self.tokenizer.padding_side == "right"

        collated = {}
        for key, fill in fill_values:
            rows = []
            for sample in batch:
                sequence = sample[key]
                filler = (target_len - len(sequence)) * [fill]
                rows.append(sequence + filler if pad_on_right else filler + sequence)
            collated[key] = torch.tensor(rows, dtype=torch.long, device=self.device)
        return collated
    
class NERDataset(Dataset):
    """Dataset that cleans and tokenizes one example per lookup.

    ``data`` must expose ``tokens`` and ``labels`` attributes that can be
    indexed by position and have a length.  Cleaning and tokenization are
    done lazily in ``__getitem__``.
    """

    def __init__(self,
                 data,
                 label2id,
                 max_length,
                 tokenizer,
                 stopwords
                 ):
        self.label2id = label2id
        self.max_length = max_length
        self.tokens = data.tokens
        self.labels = data.labels
        # Cleaning uses the preprocess defaults; tokenization always truncates
        # to max_length.
        self.preprocess_fn = preprocess(stopword_dict=stopwords, label2id=self.label2id)
        self.tokenizer_fn = tokenize_data(tokenizer=tokenizer, truncation=True, max_length=self.max_length)

    def __len__(self):
        """Number of examples."""
        return len(self.tokens)

    def __getitem__(self, index):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``labels`` for one example."""
        raw_tokens = self.tokens[index]
        raw_labels = self.labels[index]
        clean_tokens, clean_labels = self.preprocess_fn(raw_tokens, raw_labels)
        input_ids, attention_mask, aligned_labels = self.tokenizer_fn(clean_tokens, clean_labels)
        sample = {}
        sample['input_ids'] = input_ids
        sample['attention_mask'] = attention_mask
        sample['labels'] = aligned_labels
        return sample

In [None]:
import pandas as pd
import json, torch, gc, os, glob
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertForTokenClassification, AutoModelForTokenClassification
from torch.utils.data import DataLoader
from data_utils import create_id_label_conversion, CustomDataCollator, NERDataset
from torch.optim import AdamW, lr_scheduler
from train_utils import NERTrainer

class CONFIG:
    """Static settings for the training run (read as class attributes)."""

    # --- run mode -----------------------------------------------------
    train_debug = False
    # Debug mode writes to a scratch directory with a short, small run.
    if train_debug:
        output_dir, num_epochs, max_length = 'Test', 3, 128
    else:
        output_dir, num_epochs, max_length = 'Models/Train_Deberta_V3', 100, 1024

    # One seed per fold; alternative used previously: [0, 42, 43, 50]
    seeds = [42]

    # --- inputs -------------------------------------------------------
    # Each entry is either a JSON file or a directory of files.
    dataset_path = ['DATA/processed_train.json', 'External Data']
    stopword_dir = 'json_folder/stopwords.json'
    # Alternative checkpoint used previously: 'dslim/bert-base-NER'
    model_path = "microsoft/deberta-v3-base"

    # --- hardware / batching -------------------------------------------
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda:0'
    train_batch_size = 2
    valid_batch_size = 2

    # --- optimisation ---------------------------------------------------
    learning_rate = 2e-5
    weight_decay = 0.1
    T_max = 500
    # NOTE(review): min_lr equals learning_rate, so a scheduler annealing
    # between the two has a zero-width range -- confirm this is intended.
    min_lr = learning_rate
    patience = 9
    
# Load the English stopword list (the JSON file must contain an "english" key).
with open(CONFIG.stopword_dir, 'r', encoding='utf-8') as f:
    stopwords = json.load(f)['english']

# Load the `tokens` / `labels` columns from every configured JSON source.
# Each entry of CONFIG.dataset_path may be a single file or a directory.
frames = []
for data_path in CONFIG.dataset_path:
    if os.path.isfile(data_path):
        files = [data_path]
    elif os.path.isdir(data_path):
        # Sorted so that row order -- and therefore label-id assignment and
        # the seeded train/validation split -- is the same on every machine.
        files = sorted(glob.glob(data_path + '/*'))
    else:
        # Missing paths are skipped, as before.
        files = []
    for file in files:
        # Parse each file once and keep only the two columns that are used.
        frames.append(pd.read_json(file)[['tokens', 'labels']])

if not frames:
    raise FileNotFoundError(f'No dataset files found in {CONFIG.dataset_path}')

# Single concat instead of growing the frame inside the loop.
data = pd.concat(frames, ignore_index=True)

# Calculate id2label and label2id using unique labels from dataframe
unique_labels = pd.DataFrame.explode(data,column='labels').labels.unique()
id2label, label2id, num_labels = create_id_label_conversion(unique_labels)

# Train one model per seed. Each "fold" is an independent seeded random
# 80/20 train/validation split of the full dataset.
for fold, seed in enumerate(CONFIG.seeds):
    print('\n'+'+'*170)
    print(f'Training Starting for Fold {fold}')
    # Seeded random split; indices are reset so rows can be addressed 0..n-1
    train_data, valid_data = train_test_split(data, test_size=0.2, random_state=seed)
    train_data.reset_index(drop=True,inplace=True)
    valid_data.reset_index(drop=True,inplace=True)

    # Define and initialize necessary modules
    # (tokenizer and model are re-created for every fold)
    tokenizer = AutoTokenizer.from_pretrained(CONFIG.model_path)    
    data_collator_fn = CustomDataCollator(tokenizer=tokenizer, device=CONFIG.device)
    model = AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path=CONFIG.model_path,
                                    id2label = id2label,
                                    label2id = label2id,
                                    num_labels = num_labels,
                                    ignore_mismatched_sizes = True
                                    ).to(CONFIG.device)

    # Initialize Dataset and DataLoader
    # (only the training loader shuffles; the collator pads each batch)
    train_dataset = NERDataset(train_data, label2id=label2id, max_length=CONFIG.max_length, tokenizer=tokenizer, stopwords=stopwords)
    train_dataloader = DataLoader(train_dataset, batch_size=CONFIG.train_batch_size, collate_fn=data_collator_fn, shuffle=True, pin_memory=False)

    valid_dataset = NERDataset(valid_data, label2id=label2id, max_length=CONFIG.max_length, tokenizer=tokenizer, stopwords=stopwords)
    valid_dataloader = DataLoader(valid_dataset, batch_size=CONFIG.valid_batch_size, collate_fn=data_collator_fn, shuffle=False, pin_memory=False)

    print('\n'+'='*170)
    print(f'Training will start with {len(train_dataset)} training datapoints and {len(valid_dataset)} validation datapoints.')
    print(f'Batch size being used for training is {CONFIG.train_batch_size} and maximum length for datapoints are {CONFIG.max_length}')
    print(f'The unique label number is {num_labels}, unique labels are {id2label.values()}')
    print('='*170)

    # Initiate Optimizer and Scheduler
    # NOTE(review): eta_min is CONFIG.min_lr, which is set equal to
    # CONFIG.learning_rate -- confirm the flat schedule is intended.
    optimizer= AdamW(model.parameters(), lr= CONFIG.learning_rate, weight_decay= CONFIG.weight_decay)
    scheduler= lr_scheduler.CosineAnnealingLR(optimizer, T_max= CONFIG.T_max, eta_min= CONFIG.min_lr)

    # Training Loop: build the trainer and run it immediately
    NERTrainer(model=model,
            train_dataloader=train_dataloader,
            valid_dataloader=valid_dataloader,
            num_epochs=CONFIG.num_epochs,
            id2label = id2label,
            optimizer=optimizer,
            scheduler=scheduler,
            output_dir=CONFIG.output_dir,
            patience = CONFIG.patience,
            fold=fold)()
    
    # Drop this fold's objects and free cached GPU memory before the next fold
    del model, optimizer, scheduler, train_dataset, valid_dataset, train_dataloader, valid_dataloader
    torch.cuda.empty_cache()
    gc.collect()
    print(f'Fold {fold}, finished training.')
    print('+'*170)
