# Initialize paths to datasets

In [1]:
train_path_dataset = '/kaggle/input/github/Assignment_3/train.jsonl'
test_path_dataset = '/kaggle/input/github/Assignment_3/test.jsonl'
ners_path = '/kaggle/input/github/Assignment_3/ners.txt'

# Install and import libraries 

In [2]:
!pip install -q peft transformers datasets evaluate seqeval

In [3]:
import warnings

warnings.filterwarnings("ignore")

In [4]:
import json
import torch
from datasets import Dataset

from nltk.data import load
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt

In [5]:
#Load all entities and give them ids

def load_entities():
    tag_list = []
    
    with open(ners_path, "r", encoding='UTF-8') as ners_file:

        tags = [t.strip() for t in ners_file.read().split('\n')]
        tags = tags[:-1]
        
        tag_to_id = {}
        id_to_tag = {}
        tag_list.append('O')
        for idx, tag in enumerate(tags):
            tag_to_id['B-' + tag] = idx * 4 + 1
            tag_to_id['I-' + tag] = idx * 4 + 2
            tag_to_id['E-' + tag] = idx * 4 + 3
            tag_to_id['S-' + tag] = idx * 4 + 4
            
            id_to_tag[idx * 4 + 1] = 'B-' + tag
            id_to_tag[idx * 4 + 2] = 'I-' + tag
            id_to_tag[idx * 4 + 3] = 'E-' + tag
            id_to_tag[idx * 4 + 4] = 'S-' + tag
            
            tag_list.append('B-' + tag)
            tag_list.append('I-' + tag)
            tag_list.append('E-' + tag)
            tag_list.append('S-' + tag)
        tag_to_id['O'] = 0
        id_to_tag[0] = 'O'

        tag_to_id
    
    return tag_list, tag_to_id, id_to_tag

tag_list, tag_to_id, id_to_tag = load_entities()

In [6]:
import pandas as pd
from tqdm import tqdm

class IOBESFlatRuNNEDataset:
    def __init__(self, tokenizer, json_file_path):
        """
        Initialize the dataset with a tokenizer and the path to the JSONL file.
        
        Args:
        - tokenizer: A tokenizer object (e.g., from Hugging Face's transformers library).
        - json_file_path: Path to the JSONL file containing the data.
        """
        self.tokenizer = tokenizer
        self.json_file_path = json_file_path

    def _filter_flat_ners(self, entities):
        """
        Filter out nested named entities from the list of entities.
        
        Args:
        - entities: List of named entity tuples (start, end, label).
        
        Returns:
        - List of flat named entity tuples.
        """
        flat_entities = []

        for entity in entities:
            start, end, _ = entity
            if not any(start >= e[0] and end <= e[1] for e in entities if e != entity):
                flat_entities.append(entity)

        return flat_entities

    def _tokenize_sentences(self, flat_data):
        """
        Tokenize the sentences in the dataset and prepare them for training.
        
        Args:
        - flat_data: DataFrame containing the flat entities and sentences.
        
        Returns:
        - Updated DataFrame with tokenized data.
        """
        token_indices = []
        tokens = []
        offsets = []
        attention_masks = []

        for _, row in tqdm(flat_data.iterrows(), desc="Tokenizing sentences"):
            context = row["sentences"]
            encoding = self.tokenizer.encode_plus(context, truncation=True, add_special_tokens=False, return_offsets_mapping=True)
            offsets.append(encoding.offset_mapping)
            tokens.append(encoding.input_ids)
            attention_masks.append(encoding.attention_mask)

            offset_to_token_start = {}
            offset_to_token_end = {}
            for token_idx, (start, end) in enumerate(offsets[-1]):
                if start == end == 0:
                    continue
                offset_to_token_start[start] = token_idx
                offset_to_token_end[end - 1] = token_idx

            valid_ners = []
            for start, end, label in row["flat_ners"]:
                try:
                    new_start = offset_to_token_start[start]
                    new_end = offset_to_token_end[end]
                except KeyError:
                    continue
                valid_ners.append([new_start, new_end, label])

            token_indices.append(valid_ners)

        flat_data['token_indices'] = token_indices
        flat_data['input_ids'] = tokens
        flat_data['offset_mapping'] = offsets
        flat_data['attention_mask'] = attention_masks

        return flat_data

    def _convert_to_iobes_labels(self, flat_data):
        """
        Convert the token indices to IOBES format labels.
        
        Args:
        - flat_data: DataFrame containing token indices.
        
        Returns:
        - Updated DataFrame with IOBES format labels.
        """
        labeled_data = []
        for _, row in tqdm(flat_data.iterrows(), desc="Converting to IOBES labels"):
            labels = []

            for e in row["token_indices"]:
                labels.append((e[0], e[1], e[2]))

            label_seq = ["O"] * len(row['input_ids'])

            for start, end, tag in labels:
                if start == end:
                    label_seq[start] = 'S-' + tag
                else:
                    label_seq[start] = 'B-' + tag
                    label_seq[end] = 'E-' + tag
                    for i in range(start + 1, end):
                        label_seq[i] = 'I-' + tag

            labeled_data.append([tag_to_id.get(label) for label in label_seq])

        flat_data['labels'] = labeled_data

        return flat_data
    
    def _show_entity_changes(self, json_data):
        """
        Calculate the percentage change in the number of named entities after preprocessing.
        
        Args:
        - entities: List of original named entities.
        - flat_entities: List of flat named entities after preprocessing.
        
        Prints:
        - Mean percentage change per row.
        """
        entities = json_data['ners'].tolist()
        flat_entities = json_data['flat_ners'].tolist()
        
        changes = [(len(entities[i]) - len(flat_entities[i])) / len(entities[i]) * 100 for i in range(len(entities))]
        
        ner_changes = sum(changes) / len(changes) if len(changes) > 0 else 0
        
        print(f"Mean percentage change in number of named entities: {ner_changes:.2f}%")
        

    def get_dataset(self):
        """
        Load and preprocess the dataset.
        
        Returns:
        - DataFrame containing the tokenized dataset with IOBES format labels.
        """
        print("Loading data...")
        json_data = pd.read_json(self.json_file_path, lines=True)
        json_data = json_data[json_data.columns[::-1]]

        print("Getting flat entities...")
        flat_ners = [self._filter_flat_ners(row["ners"]) for _, row in tqdm(json_data.iterrows(), desc="Filtering flat entities")]
        json_data['flat_ners'] = flat_ners

        json_data = self._tokenize_sentences(json_data)
        json_data = self._convert_to_iobes_labels(json_data)
        
        for _, row in tqdm(json_data.iterrows(), desc="Checking for errors"):
            assert len(row['input_ids']) == len(row['attention_mask']) 
            assert len(row['attention_mask']) == len(row['offset_mapping'])
            assert len(row['offset_mapping']) == len(row['labels'])
        
        self._show_entity_changes(json_data)
        
        return json_data[['input_ids', 'attention_mask', 'offset_mapping', 'labels']]

# Preprocess data

In [7]:
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    BertForTokenClassification
)

model_checkpoint = "bert-base-multilingual-cased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

2024-04-28 10:24:45.954978: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-28 10:24:45.955132: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-28 10:24:46.222500: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [8]:
df = IOBESFlatRuNNEDataset(tokenizer, train_path_dataset).get_dataset()

Loading data...
Getting flat entities...


Filtering flat entities: 519it [00:00, 1065.35it/s]
Tokenizing sentences: 519it [00:01, 389.69it/s]
Converting to IOBES labels: 519it [00:00, 5388.33it/s]
Checking for errors: 519it [00:00, 11965.94it/s]

Mean percentage change in number of named entities: 20.93%





In [9]:
df.head(1)

Unnamed: 0,input_ids,attention_mask,offset_mapping,labels
0,"[95931, 34936, 543, 48645, 16882, 33933, 11977...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 2), (2, 6), (7, 8), (8, 11), (11, 15), (1...","[9, 11, 33, 34, 35, 41, 42, 42, 42, 42, 42, 42..."


# Get ready for training

In [10]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate
import torch
import numpy as np
import pandas as pd
from datasets import Dataset

In [11]:
dataset = Dataset.from_pandas(df)

splitted_dataset  = dataset.train_test_split(test_size=0.2)
splitted_dataset['train']

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'labels'],
    num_rows: 415
})

In [12]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [13]:
model = BertForTokenClassification.from_pretrained(model_checkpoint, id2label=id_to_tag, label2id=tag_to_id,
                                                   num_labels = 29 * 4 + 1, return_dict = False)
seqeval = evaluate.load("seqeval")

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [14]:
def compute_metrics(p, label_list = tag_list):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label)]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label)]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [15]:
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS, inference_mode=False, 
    r=16, lora_alpha=32, lora_dropout=0.1, bias="all",
    target_modules=['guery', 'key', 'value', 'dense']
)

In [16]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 2,551,413 || all params: 179,802,090 || trainable%: 1.4190118702179713


In [20]:
output_dir = "trained_weigths"

lr = 1e-3
batch_size = 16
num_epochs = 10

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to=None,
    metric_for_best_model='eval_f1',
    log_level='critical',
    seed=12345
)

In [21]:
import wandb
import os

wandb.init(mode="disabled")
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
torch.cuda.empty_cache() 

In [23]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=splitted_dataset["train"],
    eval_dataset=splitted_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

{'eval_loss': 0.4461127817630768, 'eval_precision': 0.4587609274040289, 'eval_recall': 0.24058202112816424, 'eval_f1': 0.31563807531380755, 'eval_accuracy': 0.7041954627403846, 'eval_runtime': 4.6368, 'eval_samples_per_second': 22.429, 'eval_steps_per_second': 0.863, 'epoch': 1.0}
{'eval_loss': 0.37188318371772766, 'eval_precision': 0.5243726564753389, 'eval_recall': 0.24157863264899343, 'eval_f1': 0.33077098021378215, 'eval_accuracy': 0.7232008713942307, 'eval_runtime': 4.4974, 'eval_samples_per_second': 23.124, 'eval_steps_per_second': 0.889, 'epoch': 2.0}
{'eval_loss': 0.3952161371707916, 'eval_precision': 0.5141061452513966, 'eval_recall': 0.24456846721148096, 'eval_f1': 0.3314573859799198, 'eval_accuracy': 0.7223745492788461, 'eval_runtime': 4.4837, 'eval_samples_per_second': 23.195, 'eval_steps_per_second': 0.892, 'epoch': 3.0}
{'eval_loss': 0.4153476059436798, 'eval_precision': 0.5188507358093903, 'eval_recall': 0.24596372334064182, 'eval_f1': 0.3337239700712161, 'eval_accuracy'

TrainOutput(global_step=130, training_loss=0.1657801994910607, metrics={'train_runtime': 264.7249, 'train_samples_per_second': 15.677, 'train_steps_per_second': 0.491, 'train_loss': 0.1657801994910607, 'epoch': 10.0})

# Prediction

In [24]:
# Read JSONL file
json_data = pd.read_json(test_path_dataset, lines=True)
json_data = json_data[json_data.columns[::-1]]
test_data = json_data.rename(columns={"senences": "sentences"})

# Display the DataFrame
test_data.head(1)

Unnamed: 0,id,sentences
0,584,Владелец «Бирмингема» получил шесть лет тюрьмы...


In [25]:
def normalize_preds(preds):
    norm_preds = []
    start_e = 0
    end_e = 0
    for entity, word, start, end in preds:
        if entity[0] == 'S':
            norm_preds.append([start, end-1, entity[2:]])
        if entity[0] == 'B':
            start_e = start
        if entity[0] == 'E':
            end_e = end-1
            norm_preds.append([start_e, end_e, entity[2:]])
            
    return norm_preds

In [26]:
def predict(X_test, model, tokenizer):
    y_pred = []
    for text in tqdm(X_test["sentences"]):

        predictions = [[p['entity'], p['word'], p['start'], p['end']] for p in model(text)]

        norm_preds = normalize_preds(predictions)
        
        y_pred.append(norm_preds)

    return y_pred

In [27]:
from tqdm import tqdm
from transformers import pipeline

classifier = pipeline("ner", model=model, tokenizer=tokenizer)
y_preds = predict(test_data, classifier, tokenizer)

100%|██████████| 65/65 [00:36<00:00,  1.78it/s]


In [28]:
test_data['ners'] = y_preds
test_data.head(3)

Unnamed: 0,id,sentences,ners
0,584,Владелец «Бирмингема» получил шесть лет тюрьмы...,"[[10, 19, ORGANIZATION], [30, 38, DATE], [30, ..."
1,585,Акция протеста на Майдане Независимости объявл...,"[[0, 4, EVENT], [0, 13, EVENT], [18, 38, FACIL..."
2,586,Фольксваген может перейти под контроль Порше \...,"[[0, 10, ORGANIZATION], [39, 43, ORGANIZATION]..."


In [29]:
# Save results to a JSONL file
submission_df = test_data[['id', 'ners']]
submission_df.to_json('test.jsonl', lines=True, orient='records', force_ascii=False)

In [30]:
import zipfile
zipfile.ZipFile('test.zip', mode='w').write("test.jsonl")