In [2]:
import numpy as np # linear algebra
import pandas as pd 


import os
# Recursively list every file under the local 'input' directory, one path per line.
for root, _subdirs, files in os.walk('input'):
    for name in files:
        print(os.path.join(root, name))


In [3]:
! pip install pytorch-crf seqeval



In [None]:
import torch
import torch.nn as nn
from torchcrf import CRF
from tqdm import tqdm
import transformers

import pandas as pd
import numpy as np
import joblib

from sklearn import preprocessing
from sklearn import model_selection

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from datasets import load_dataset
from seqeval.metrics import classification_report as seq_classification_report, accuracy_score as seq_accuracy_score, f1_score as seq_f1_score


In [None]:
class Config:
    """Hyper-parameters, paths and the shared tokenizer used throughout the notebook."""

    def __init__(self):
        # Pretrained checkpoint to start from and the file the weights are saved to.
        self.BASE_MODEL_PATH = "bert-large-cased"
        self.MODEL_PATH = "./model.pt"

        # Fixed sequence length, batch sizes and number of epochs.
        self.MAX_LEN = 128
        self.TRAIN_BATCH_SIZE = 16
        self.VALID_BATCH_SIZE = 8
        self.EPOCHS = 15

        # The checkpoint is cased, so the tokenizer must keep the original casing.
        tokenizer_cls = transformers.BertTokenizer
        self.TOKENIZER = tokenizer_cls.from_pretrained(
            self.BASE_MODEL_PATH,
            do_lower_case=False,
        )


# Single configuration instance shared by the dataset, model and training cells.
config = Config()

In [None]:
class EntityDataset:
    """Map-style dataset that turns (words, word-level tag ids) into fixed-length tensors.

    Each item is a dict of `torch.long` tensors of length `config.MAX_LEN`:

    * ``ids``            - word-piece ids wrapped in the tokenizer's [CLS]/[SEP] ids and padded.
    * ``mask``           - 1 on real word pieces, 0 on [CLS], [SEP] and padding.
    * ``mask_crf``       - 1 on [CLS], word pieces and [SEP]; 0 on padding.
    * ``token_type_ids`` - all zeros.
    * ``target_tag``     - ``[0] + word-level tags + [0]``, padded with 0.
    * ``valid_mask``     - 1 on [CLS], [SEP] and the first piece of every word, else 0.
    """

    def __init__(self, texts, tags):
        # Every sentence needs exactly one tag sequence.
        assert len(texts) == len(tags)
        self.texts = texts
        self.tags = tags

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize sentence `item` and build its padded input/label tensors."""
        tokenizer = config.TOKENIZER
        max_len = config.MAX_LEN
        words = self.texts[item]
        word_tags = self.tags[item]

        pieces = []
        valid_mask = []
        for word in words:
            word_pieces = tokenizer.tokenize(word)
            if not word_pieces:
                # A word that tokenizes to nothing would add a 1 to `valid_mask`
                # without adding any id, desynchronising the two lists (and the
                # word/tag alignment). Keep one placeholder piece for it.
                word_pieces = [tokenizer.unk_token]
            # Only the first piece of a word is "valid" (carries the word's tag).
            valid_mask.extend([1] + [0] * (len(word_pieces) - 1))
            pieces.extend(word_pieces)

        # Leave room for the two special tokens.
        body_len = max_len - 2
        ids = tokenizer.convert_tokens_to_ids(pieces)[:body_len]
        valid_mask = valid_mask[:body_len]
        target_tag = list(word_tags[:body_len])

        mask = [1] * len(ids)
        mask_crf = [1] * len(ids)

        # Wrap in [CLS] ... [SEP]; ids come from the tokenizer instead of literals.
        ids = [tokenizer.cls_token_id] + ids + [tokenizer.sep_token_id]
        target_tag = [0] + target_tag + [0]
        valid_mask = [1] + valid_mask + [1]
        # NOTE(review): `mask` is 0 on [CLS]/[SEP]. The model passes it to BERT as
        # the attention mask and also uses it to drop those positions from the
        # metrics - confirm that hiding the special tokens from attention is intended.
        mask = [0] + mask + [0]
        mask_crf = [1] + mask_crf + [1]

        token_type_ids = [0] * max_len

        # Pad the piece-level sequences up to max_len.
        padding_len = max_len - len(ids)
        ids = ids + [tokenizer.pad_token_id] * padding_len
        mask = mask + [0] * padding_len
        mask_crf = mask_crf + [0] * padding_len
        valid_mask = valid_mask + [0] * padding_len

        # NOTE(review): `target_tag` stays word-level while `ids` is piece-level, so
        # it has its own padding length. The model feeds it to the CRF loss next to
        # piece-level emissions - verify that this positional pairing is intended.
        target_tag = target_tag + [0] * (max_len - len(target_tag))

        assert len(ids) == max_len
        assert len(mask) == max_len
        assert len(mask_crf) == max_len
        assert len(token_type_ids) == max_len
        assert len(target_tag) == max_len
        assert len(valid_mask) == max_len

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "mask_crf": torch.tensor(mask_crf, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "target_tag": torch.tensor(target_tag, dtype=torch.long),
            "valid_mask": torch.tensor(valid_mask, dtype=torch.long)
        }


In [None]:
# NOTE: defining this class rebinds the module-level name `CRF`, which the import
# cell of this notebook bound to `torchcrf.CRF`. Code that expects the torchcrf
# API (`batch_first=`, `mask=`, `.decode()`) must not rely on the bare name `CRF`.
class CRF(nn.Module):
    """Minimal linear-chain CRF for a single, unbatched sequence.

    The only parameter is `transitions`, where `transitions[i, j]` is the score
    of moving from tag `i` to tag `j`. There are no start/end transition scores.
    """

    def __init__(self, num_tags):
        super(CRF, self).__init__()
        self.num_tags = num_tags

        # Transition scores, where transitions[i, j] is the score of transitioning from tag i to tag j.
        self.transitions = nn.Parameter(torch.randn(num_tags, num_tags))

    def forward(self, emissions, tags):
        """
        Calculate the negative log likelihood of a sequence of tags given emission scores.

        Args:
            emissions (torch.Tensor): Emission score for each tag at each timestep, shape (seq_len, num_tags).
            tags (torch.LongTensor): Sequence of tags, shape (seq_len,).

        Returns:
            torch.Tensor: Negative log likelihood, a scalar.

        Raises:
            ValueError: If `emissions` contains no timesteps.
        """
        seq_len = emissions.size(0)
        if seq_len == 0:
            raise ValueError("emissions must contain at least one timestep")

        # Score of the provided path: its emissions plus the transitions it takes.
        steps = torch.arange(seq_len, device=emissions.device)
        gold_score = emissions[steps, tags].sum() + self.transitions[tags[:-1], tags[1:]].sum()

        # Forward algorithm: alpha[j] is the log-sum-exp of the scores of all
        # partial paths that end in tag j at the current timestep.
        alpha = emissions[0]
        for t in range(1, seq_len):
            # alpha.unsqueeze(1) + transitions -> [previous tag i, current tag j]
            alpha = torch.logsumexp(alpha.unsqueeze(1) + self.transitions, dim=0) + emissions[t]

        # Log of the partition function (sum over every possible tag sequence).
        log_partition = torch.logsumexp(alpha, dim=0)

        return log_partition - gold_score

    def viterbi_decode(self, emissions):
        """
        Viterbi decoding to find the most likely sequence of tags given emission scores.

        Args:
            emissions (torch.Tensor): Emission score for each tag at each timestep, shape (seq_len, num_tags).

        Returns:
            torch.Tensor: Most likely sequence of tags, shape (seq_len,).

        Raises:
            ValueError: If `emissions` contains no timesteps.
        """
        seq_len = emissions.size(0)
        if seq_len == 0:
            raise ValueError("emissions must contain at least one timestep")

        # Decoding only needs values, not gradients.
        with torch.no_grad():
            # best_scores[j]: score of the best partial path ending in tag j.
            best_scores = emissions[0]
            backpointers = []

            for t in range(1, seq_len):
                # candidates[i, j]: best path ending in i, then i -> j, then emit j.
                candidates = best_scores.unsqueeze(1) + self.transitions + emissions[t].unsqueeze(0)
                best_scores, best_prev = candidates.max(dim=0)
                backpointers.append(best_prev)

            # Follow the back-pointers from the best final tag to the start.
            last_tag = int(torch.argmax(best_scores))
            path = [last_tag]
            for best_prev in reversed(backpointers):
                last_tag = int(best_prev[last_tag])
                path.append(last_tag)

        path.reverse()
        return torch.tensor(path)

# Example usage: smoke-test the CRF class defined in this cell on random data.
# The values below are unseeded, so the printed numbers differ between runs.
num_tags = 5
crf = CRF(num_tags)

# Dummy emissions (10 timesteps x num_tags scores) and an arbitrary tag sequence
# of the same length, for demonstration purposes only.
emissions = torch.randn(10, num_tags)
tags = torch.tensor([2, 1, 4, 3, 2, 0, 1, 4, 3, 2])

# Calling the module runs CRF.forward, i.e. the negative log likelihood of `tags`.
neg_log_likelihood = crf(emissions, tags)
print("Negative Log Likelihood:", neg_log_likelihood.item())

# Viterbi decoding: highest-scoring tag sequence for the same emissions.
best_path = crf.viterbi_decode(emissions)
print("Viterbi Decoding - Best Path:", best_path.numpy())


In [None]:
from tqdm import tqdm
from sklearn.metrics import f1_score
from collections import defaultdict

def train_fn(data_loader, model, optimizer, device, scheduler):
    """Run one training epoch and return ``(average loss, entity-level F1)``.

    Args:
        data_loader: Iterable of batches; each batch is a dict of tensors that is
            forwarded to the model as keyword arguments.
        model: Module whose call returns ``(tag, target_tag, mask, loss)``.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to.
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        tuple: mean of the per-batch losses and the seqeval F1 score computed
        over the label sequences collected during the epoch.
    """
    # Index -> label map; loop-invariant, so it is built once.
    id_to_label = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}

    model.train()
    final_loss = 0
    y_true_accumulator = []
    y_pred_accumulator = []

    for data in tqdm(data_loader, total=len(data_loader)):
        # Move data to the specified device
        for k, v in data.items():
            data[k] = v.to(device)

        optimizer.zero_grad()

        tag, target_tag, mask, loss = model(**data)

        # Masked-out positions are multiplied to 0 and therefore mapped to 'O'.
        # Each batch is flattened into one label sequence.
        true_ids = torch.flatten(mask * target_tag).detach().cpu().tolist()
        pred_ids = torch.flatten(mask * tag).detach().cpu().tolist()
        y_true_accumulator.append([id_to_label[i] for i in true_ids])
        y_pred_accumulator.append([id_to_label[i] for i in pred_ids])

        loss.backward()
        optimizer.step()
        scheduler.step()

        final_loss += loss.item()

    avg_loss = final_loss / len(data_loader)
    # seqeval's F1 works on lists of label sequences; sklearn's `f1_score`
    # (what the bare name resolves to in this notebook) rejects that input.
    f1 = seq_f1_score(y_true_accumulator, y_pred_accumulator)

    return avg_loss, f1

def eval_fn(data_loader, model, device):
    """Evaluate the model on `data_loader` and return ``(average loss, entity-level F1)``.

    Args:
        data_loader: Iterable of batches; each batch is a dict of tensors that is
            forwarded to the model as keyword arguments.
        model: Module whose call returns ``(tag, target_tag, mask, loss)``.
        device: Device every batch tensor is moved to.

    Returns:
        tuple: mean of the per-batch losses and the seqeval F1 score.
    """
    # Index -> label map; loop-invariant, so it is built once.
    id_to_label = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}

    model.eval()
    final_loss = 0
    y_true_accumulator = []
    y_pred_accumulator = []

    # No parameter update happens here, so skip building the autograd graph.
    with torch.no_grad():
        for data in tqdm(data_loader, total=len(data_loader)):
            # Move data to the specified device
            for k, v in data.items():
                data[k] = v.to(device)

            tag, target_tag, mask, loss = model(**data)

            # Masked-out positions are multiplied to 0 and therefore mapped to 'O'.
            true_ids = torch.flatten(mask * target_tag).detach().cpu().tolist()
            pred_ids = torch.flatten(mask * tag).detach().cpu().tolist()
            y_true_accumulator.append([id_to_label[i] for i in true_ids])
            y_pred_accumulator.append([id_to_label[i] for i in pred_ids])

            final_loss += loss.item()

    avg_loss = final_loss / len(data_loader)
    # seqeval's F1 works on lists of label sequences; sklearn's `f1_score`
    # (what the bare name resolves to in this notebook) rejects that input.
    f1 = seq_f1_score(y_true_accumulator, y_pred_accumulator)

    return avg_loss, f1


In [None]:

class EntityModel(nn.Module):
    """BERT encoder followed by a linear emission layer and a torchcrf CRF."""

    def __init__(self, num_tag):
        """Build the encoder, the emission layer and the CRF for `num_tag` tags."""
        super(EntityModel, self).__init__()
        # Imported here under an alias because this notebook also defines its own
        # class called `CRF`, which rebinds the module-level name and does not
        # accept `batch_first=` or provide `.decode()`.
        from torchcrf import CRF as TorchCRF

        self.num_tag = num_tag
        # Load pre-trained BERT model; return_dict=False makes it return a tuple.
        self.bert = transformers.BertModel.from_pretrained(config.BASE_MODEL_PATH, return_dict=False)
        # Emission layer sized from the checkpoint instead of a hard-coded width.
        self.out_tag = nn.Linear(self.bert.config.hidden_size, self.num_tag)
        # Initialize bias for the 'O' tag (index 0) so it starts out favoured.
        self.out_tag.bias.data[0] = 6
        # Conditional Random Field (CRF) layer operating on (batch, seq, tags).
        self.crf = TorchCRF(self.num_tag, batch_first=True)

    def forward(self, ids, mask, mask_crf, token_type_ids, target_tag, valid_mask):
        """Compute the CRF loss and the decoded tags for one batch.

        Args:
            ids, mask, token_type_ids: Inputs forwarded to BERT (`mask` is used
                as its attention mask).
            mask_crf: Positions the CRF should score/decode (non-zero = keep).
            target_tag: Gold tag ids passed to the CRF likelihood.
            valid_mask: Marks which positions `valid_sequence_output` keeps.

        Returns:
            tuple: ``(tag, target_tag, mask, loss)`` where `tag` and `mask` are the
            outputs of `valid_sequence_output` and `loss` is the negated mean
            CRF log-likelihood.
        """
        # BERT forward pass; the first tuple element is the per-token output.
        o1, _ = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        emissions = self.out_tag(o1)

        crf_mask = mask_crf.bool()
        log_likelihood = self.crf(emissions, target_tag, mask=crf_mask, reduction='mean')
        # Decoding yields plain Python lists of tag ids, so it needs no gradients.
        with torch.no_grad():
            sequence_of_tags = self.crf.decode(emissions, mask=crf_mask)

        # Collapse the decoded tags to the positions flagged by `valid_mask`.
        tag, mask = valid_sequence_output(sequence_of_tags, valid_mask, mask)

        # Calculate loss as negative log-likelihood
        loss = -1 * log_likelihood

        return tag, target_tag, mask, loss


In [None]:
def valid_sequence_output(sequence_output, valid_mask, attention_mask): # convert token back to word
    """Left-pack the decoded tags found at positions where `valid_mask` is 1.

    Args:
        sequence_output: One decoded tag sequence per batch row; rows may have
            different lengths.
        valid_mask: 2-D tensor (batch, max_len); 1 marks positions to keep.
        attention_mask: 2-D tensor (batch, max_len); a kept tag is multiplied by
            this value, so positions with attention 0 end up as tag 0.

    Returns:
        tuple: ``(valid_output, valid_attention_mask)``, both `torch.long` tensors
        of shape (batch, max_len) on `valid_mask`'s device. Row `i` holds the kept
        tags (respectively their attention values) packed to the left, followed
        by zeros.
    """
    batch_size = len(sequence_output)
    max_len = valid_mask.shape[1]

    # Read both masks once instead of calling `.item()` for every element.
    valid_rows = valid_mask.tolist()
    attention_rows = attention_mask.tolist()

    packed_tags = [[0] * max_len for _ in range(batch_size)]
    packed_mask = [[0] * max_len for _ in range(batch_size)]

    for i in range(batch_size):
        next_slot = 0
        # Iterate over THIS row's decoded length; using the first row's length
        # for the whole batch would silently drop the tail of longer sentences.
        for j, tag_id in enumerate(sequence_output[i]):
            if valid_rows[i][j] == 1:
                packed_tags[i][next_slot] = int(tag_id) * attention_rows[i][j]
                packed_mask[i][next_slot] = attention_rows[i][j]
                next_slot += 1

    # Build the results on the inputs' device rather than guessing from CUDA availability.
    device = valid_mask.device
    valid_output = torch.tensor(packed_tags, dtype=torch.long, device=device).reshape(batch_size, max_len)
    valid_attention_mask = torch.tensor(packed_mask, dtype=torch.long, device=device).reshape(batch_size, max_len)

    return valid_output, valid_attention_mask

In [None]:
# Load the CoNLL-2003 dataset
dataset = load_dataset("conll2003")

# Create EntityDataset instances for training and validation
train_dataset = EntityDataset(texts=dataset['train']['tokens'], tags=dataset['train']['ner_tags'])
valid_dataset = EntityDataset(texts=dataset['validation']['tokens'], tags=dataset['validation']['ner_tags'])

# Create DataLoader instances; only the training data needs to be shuffled.
train_data_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)
valid_data_loader = torch.utils.data.DataLoader(valid_dataset, shuffle=False, batch_size=config.VALID_BATCH_SIZE, num_workers=1)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the tagset and create the EntityModel
tagset = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
model = EntityModel(num_tag=len(tagset))
model.to(device)

# Per-module learning rates: a small one for the pretrained encoder, larger ones
# for the freshly initialised emission layer and CRF.
optimizer_parameters = [
    {"params": model.bert.parameters(), "lr": 5e-5},
    {"params": model.out_tag.parameters(), "lr": 1e-3},
    {"params": model.crf.parameters(), "lr": 1e-3},
]

# The scheduler is stepped once per batch, so count batches (including the final
# partial one) rather than dividing the row count by the batch size.
num_train_steps = len(train_data_loader) * config.EPOCHS

# Create AdamW optimizer and linear schedule with warmup over the first 10% of steps
optimizer = AdamW(optimizer_parameters, weight_decay=0.01)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=int(num_train_steps * 0.1), num_training_steps=num_train_steps
)

# Initialize best_loss for model saving
best_loss = np.inf

# Training loop
for epoch in range(config.EPOCHS):
    # Train the model
    train_loss, train_f1 = train_fn(train_data_loader, model, optimizer, device, scheduler)

    # Evaluate the model on the validation set
    val_loss, val_f1 = eval_fn(valid_data_loader, model, device)

    # Print training and validation metrics
    print(f"Train loss = {train_loss} Valid loss = {val_loss} ")
    print(f"Train f1_score = {train_f1} Valid f1_score = {val_f1} ")

    # Keep only the checkpoint with the lowest validation loss seen so far
    if val_loss < best_loss:
        torch.save(model.state_dict(), config.MODEL_PATH)
        best_loss = val_loss


In [None]:
# Training F1 of the last epoch run by the training loop.
print(train_f1)

In [None]:
def test_fn(data_loader,model, device):
    model.eval()
    final_loss = 0
    y_true_accumulator = []
    y_pred_accumulator = []
    for data in tqdm(data_loader,total=len(data_loader)):
        for k,v in data.items():
            data[k] = v.to(device)
        tag, target_tag, mask, loss = model(**data)
#         y_true_accumulator.append(torch.flatten(mask*target_tag).detach().cpu().tolist())
#         y_pred_accumulator.append(torch.flatten(mask*tag).detach().cpu().tolist()) 
        map_dict = {0 : 'O', 1 : 'B-PER', 2 : 'I-PER', 3 : 'B-ORG', 4 : 'I-ORG', 5 : 'B-LOC', 6 : 'I-LOC', 7 : 'B-MISC', 8 : 'I-MISC'}
        mapped_target = [map_dict[item] for item in torch.flatten(mask*target_tag).detach().cpu().tolist()]
        mapped_output = [map_dict[item] for item in torch.flatten(mask*tag).detach().cpu().tolist()]
        y_true_accumulator.append(mapped_target)
        y_pred_accumulator.append(mapped_output)   
        final_loss += loss.item()
        
    print(classification_report(y_true_accumulator, y_pred_accumulator, digits=2))
    return final_loss/len(data_loader), f1_score(y_true_accumulator, y_pred_accumulator), accuracy_score(y_true_accumulator, y_pred_accumulator)

In [None]:
def test_fn2(data_loader,model, device):
    model.eval()
    final_loss = 0
    y_true_accumulator = []
    y_pred_accumulator = []
    
    cnt = 0
    for data in tqdm(data_loader,total=len(data_loader)):
        for k,v in data.items():
            data[k] = v.to(device)
        tag, target_tag, mask, loss = model(**data)
#         y_true_accumulator.append(torch.flatten(mask*target_tag).detach().cpu().tolist())
#         y_pred_accumulator.append(torch.flatten(mask*tag).detach().cpu().tolist()) 
        map_dict = {0 : 'O', 1 : 'B-PER', 2 : 'I-PER', 3 : 'B-ORG', 4 : 'I-ORG', 5 : 'B-LOC', 6 : 'I-LOC', 7 : 'B-MISC', 8 : 'I-MISC'}
        mapped_target = [map_dict[item] for item in torch.flatten(mask*target_tag).detach().cpu().tolist()]
        mapped_output = [map_dict[item] for item in torch.flatten(mask*tag).detach().cpu().tolist()]
        y_true_accumulator.append(mapped_target)
        y_pred_accumulator.append(mapped_output)   
        final_loss += loss.item()
        break
    print(y_true_accumulator)
    print(y_pred_accumulator)
        
    print(classification_report(y_true_accumulator, y_pred_accumulator, digits=2))
    return final_loss/len(data_loader), f1_score(y_true_accumulator, y_pred_accumulator), accuracy_score(y_true_accumulator, y_pred_accumulator)

In [None]:
def inference(model, sentence):
    """Unfinished stub: split `sentence` on single spaces and return None.

    `model` is accepted but not used, and the resulting word list is discarded.
    """
    words = sentence.split(' ')

In [None]:
# Build the test split and score the current in-memory model on it.
test_dataset = EntityDataset(
    texts=dataset['test']['tokens'],
    tags=dataset['test']['ner_tags'],
)
test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=config.VALID_BATCH_SIZE,
    num_workers=1,
)

# The returned average loss is not needed here.
_, test_f1, test_acc = test_fn(test_data_loader, model, device)
print(f'F1 score on test set: {test_f1}\nAccuracy score on test set: {test_acc}')


In [None]:
# Debug run: test_fn2 stops after the first batch, so these scores describe
# that single batch only, not the whole test set.
_, test_f12, test_acc2 = test_fn2(test_data_loader, model, device)
print(f'F1 score on test set: {test_f12}\nAccuracy score on test set: {test_acc2}')

In [None]:
# Show the tokens of the first test example.
dataset['test']['tokens'][0]

In [None]:
# Show the tag ids of the first test example.
dataset['test']['ner_tags'][0]