# Data conversion

## Install tokenizer
For Russian text we will use the **razdel** tokenizer.

In [1]:
# Pin the version that the recorded run installed so reruns stay reproducible;
# %pip targets the environment of the running kernel.
%pip install razdel==0.5.0

Collecting razdel
  Downloading razdel-0.5.0-py3-none-any.whl (21 kB)
Installing collected packages: razdel
Successfully installed razdel-0.5.0


In [2]:
from razdel import tokenize

# Sanity check: tokenize one Russian sentence and display the resulting substrings.
sample_sentence = 'Проверка токенизатора на русском языке, да-да, Иван Иванович.'
tokens = [substring for substring in tokenize(sample_sentence)]
tokens

[Substring(0, 8, 'Проверка'),
 Substring(9, 21, 'токенизатора'),
 Substring(22, 24, 'на'),
 Substring(25, 32, 'русском'),
 Substring(33, 38, 'языке'),
 Substring(38, 39, ','),
 Substring(40, 45, 'да-да'),
 Substring(45, 46, ','),
 Substring(47, 51, 'Иван'),
 Substring(52, 60, 'Иванович'),
 Substring(60, 61, '.')]

## Download dataset
I uploaded the train and validation splits to Google Drive, so we will use *gdown* to get them; the test split is downloaded from the Hugging Face Hub with *wget*.

In [3]:
# URLs are quoted so the shell never interprets `?` as a glob character.
!gdown --fuzzy 'https://drive.google.com/file/d/1Fe7yaIFt59iCq0ILI88HU7Ej9XE6S3bu/view?usp=sharing'
# -O overwrites test.jsonl on a rerun; plain wget would save test.jsonl.1 and keep the old file.
!wget -O test.jsonl 'https://huggingface.co/datasets/iluvvatar/RuNNE/raw/main/data/test.jsonl'
!gdown --fuzzy 'https://drive.google.com/file/d/1kDWIe5KSZdB4ipiNNBRvT9t-FW2ZkkeW/view?usp=sharing'

Downloading...
From: https://drive.google.com/uc?id=1Fe7yaIFt59iCq0ILI88HU7Ej9XE6S3bu
To: /content/train.jsonl
  0% 0.00/7.31M [00:00<?, ?B/s]100% 7.31M/7.31M [00:00<00:00, 265MB/s]
--2024-04-18 10:54:36--  https://huggingface.co/datasets/iluvvatar/RuNNE/raw/main/data/test.jsonl
Resolving huggingface.co (huggingface.co)... 18.172.134.24, 18.172.134.4, 18.172.134.124, ...
Connecting to huggingface.co (huggingface.co)|18.172.134.24|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 403934 (394K) [text/plain]
Saving to: ‘test.jsonl’


2024-04-18 10:54:36 (9.00 MB/s) - ‘test.jsonl’ saved [403934/403934]

Downloading...
From: https://drive.google.com/uc?id=1kDWIe5KSZdB4ipiNNBRvT9t-FW2ZkkeW
To: /content/validation.jsonl
100% 1.77M/1.77M [00:00<00:00, 182MB/s]


In [4]:
from razdel import tokenize, sentenize
import json

def read_training_dataset(file_name: str) -> None:
    """
    Convert a .jsonl dataset into a token-per-line .txt file with nested IOB tags.

    Reads ``{file_name}.jsonl`` and writes ``{file_name}.txt``. Every output line is
    ``TOKEN TAG1 TAG2 ...`` (space separated, ``O`` when the token belongs to no
    entity) and a blank line is written after each sentence.

    The output file is rewritten from scratch on every call, so rerunning the
    conversion does not duplicate its content.

    Parameters:
        file_name (str): Path to the dataset file without the ``.jsonl`` extension.
    """
    # The encoding is explicit because the data is Cyrillic and the platform default
    # is not guaranteed to be UTF-8. The output is opened once, in "w" mode.
    with open(f"{file_name}.jsonl", encoding="utf-8") as dataset_file, \
            open(f"{file_name}.txt", "w", encoding="utf-8") as txt_file:
        # Go through each line (one JSON record per line)
        for line in dataset_file:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file)
            if not line.strip():
                continue

            # Load content
            content = json.loads(line)
            # Get token data and the end offset of every sentence
            text = content["text"]
            tokens = list(tokenize(text))
            end_positions = [sentence.stop for sentence in sentenize(text)]

            # Get NERs data: each entity is a "START END TYPE" string
            nested_ners = [[] for _ in tokens]
            content_entities = [entity.split(" ") for entity in content["entities"]]
            # Key is start - end (negative length), so longer spans are tagged first
            content_entities.sort(key=lambda x: (int(x[0]) - int(x[1])))

            # For each NER
            for entity in content_entities:
                start, end, entity_type = entity
                start, end = int(start), int(end)
                begin_placed = False
                # Tag every token that lies completely inside the entity span
                for i, token in enumerate(tokens):
                    if token.start >= start and token.stop <= end:
                        if begin_placed:
                            # Not the first token of the span: mark it as continue (I)
                            nested_ners[i].append(f"I-{entity_type}")
                        else:
                            # First token of the span: mark it as begin (B)
                            begin_placed = True
                            nested_ners[i].append(f"B-{entity_type}")

            # Write the processed record
            current_end = 0
            ends_sentence = False
            for token, nner in zip(tokens, nested_ners):
                txt_file.write(f"{token.text} {'O' if len(nner) == 0 else ' '.join(nner)}\n")
                # The bounds check prevents an IndexError if tokens remain after the
                # last sentence boundary has already been consumed.
                ends_sentence = (
                    current_end < len(end_positions)
                    and token.stop >= end_positions[current_end]
                )
                if ends_sentence:
                    current_end += 1
                    txt_file.write("\n")
            # Keep records separated even when the last token did not close a sentence
            if tokens and not ends_sentence:
                txt_file.write("\n")

In [5]:
# Convert every split of the dataset (.jsonl -> .txt)
for split_name in ("train", "test", "validation"):
    read_training_dataset(split_name)

## Prepare data for training
Prepare the data for the ArabiNER model.

In [6]:
# Clone only when the checkout is missing, so rerunning the cell does not fail.
!test -d ArabicNER || git clone https://github.com/SinaLab/ArabicNER

Cloning into 'ArabicNER'...
remote: Enumerating objects: 595, done.[K
remote: Counting objects: 100% (29/29), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 595 (delta 15), reused 12 (delta 8), pack-reused 566[K
Receiving objects: 100% (595/595), 284.95 KiB | 3.65 MiB/s, done.
Resolving deltas: 100% (355/355), done.


In [7]:
# -p keeps directory creation idempotent: no error when the cell is rerun.
!mkdir -p ArabicNER/RuNNE
!mkdir -p ArabicNER/output
# Copy the three converted splits into the repository's data folder.
!cp train.txt test.txt validation.txt ArabicNER/RuNNE/

In [8]:
! echo $PYTHONPATH

# Add ArabicNER to PYTHONPATH so that processes started from the notebook
# (e.g. `!python ...`) can import the code inside the cloned repository.
import os

# PYTHONPATH may be unset (plain `+=` would raise KeyError) and the cell may be rerun,
# so start from the current value and append the entry only once.
python_path_entries = [entry for entry in os.environ.get('PYTHONPATH', '').split(':') if entry]
if 'ArabicNER' not in python_path_entries:
    python_path_entries.append('ArabicNER')
os.environ['PYTHONPATH'] = ':'.join(python_path_entries)

! echo $PYTHONPATH

/env/python
/env/python:ArabicNER


In [9]:
# Pin the version that the recorded run installed so reruns stay reproducible;
# %pip targets the environment of the running kernel.
%pip install seqeval==1.2.2

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━[0m [32m30.7/43.6 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m792.3 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=5689678e653f07ded4361ce70aeed35f4c15224b081699ffa0829beabc0deee1
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


# Train ArabiNER

In [None]:
# Launch the repository's training script on the RuNNE files copied to ArabicNER/RuNNE.
# Settings passed below: NestedTagsDataset with max_seq_len 512, BertNestedTrainer for
# 15 epochs, BertNestedTagger with bert_model DeepPavlov/rubert-base-cased and dropout 0.1,
# AdamW with lr 1e-4, batch size 16. Results are written to ArabicNER/results.
!python ArabicNER/arabiner/bin/train.py \
    --output_path ArabicNER/results \
    --train_path ArabicNER//RuNNE/train.txt \
    --val_path ArabicNER//RuNNE/validation.txt \
    --test_path ArabicNER//RuNNE/test.txt \
    --batch_size 16 \
    --data_config '{"fn":"arabiner.data.datasets.NestedTagsDataset","kwargs":{"max_seq_len":512}}' \
    --trainer_config '{"fn":"arabiner.trainers.BertNestedTrainer","kwargs":{"max_epochs":15}}' \
    --network_config '{"fn":"arabiner.nn.BertNestedTagger","kwargs":{"dropout":0.1,"bert_model":"DeepPavlov/rubert-base-cased"}}' \
    --optimizer '{"fn":"torch.optim.AdamW","kwargs":{"lr":0.0001}}'

2024-04-14 20:54:33.220520: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-14 20:54:33.220573: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-14 20:54:33.221880: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-14 20:54:33.228786: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Logging to ArabicNER/results/train.log
INFO	__main__	

## Save checkpoint to Google Drive
(Optional action)

In [10]:
from google.colab import drive
# Mount Google Drive, forcing a remount if it is already mounted.
gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point, force_remount=True)

Mounted at /content/gdrive


In [None]:
# Pack the ArabicNER/results directory (recursively) into third_attempt.zip.
!zip -r third_attempt.zip ArabicNER/results

  adding: ArabicNER/results/ (stored 0%)
  adding: ArabicNER/results/tag_vocab.pkl (deflated 86%)
  adding: ArabicNER/results/predictions.txt (deflated 93%)
  adding: ArabicNER/results/tensorboard/ (stored 0%)
  adding: ArabicNER/results/tensorboard/Metrics_test_micro_f1/ (stored 0%)
  adding: ArabicNER/results/tensorboard/Metrics_test_micro_f1/events.out.tfevents.1713128392.65cd2fa4555d.3088.7 (deflated 33%)
  adding: ArabicNER/results/tensorboard/Metrics_test_precision/ (stored 0%)
  adding: ArabicNER/results/tensorboard/Metrics_test_precision/events.out.tfevents.1713128392.65cd2fa4555d.3088.8 (deflated 33%)
  adding: ArabicNER/results/tensorboard/Metrics_test_recall/ (stored 0%)
  adding: ArabicNER/results/tensorboard/Metrics_test_recall/events.out.tfevents.1713128392.65cd2fa4555d.3088.9 (deflated 33%)
  adding: ArabicNER/results/tensorboard/Loss_test_loss/ (stored 0%)
  adding: ArabicNER/results/tensorboard/Loss_test_loss/events.out.tfevents.1713128392.65cd2fa4555d.3088.3 (deflated

In [None]:
# Create the target folder first: if it were missing, cp would create a *file* named
# PMLDL, and the "Load checkpoint" cell that reads PMLDL/third_attempt.zip would fail.
!mkdir -p '/content/gdrive/MyDrive/Innopolis/PMLDL'
!cp third_attempt.zip '/content/gdrive/MyDrive/Innopolis/PMLDL/'

## Load checkpoint from Google Drive
(If you want to skip training)

In [13]:
!cp '/content/gdrive/MyDrive/Innopolis/PMLDL/third_attempt.zip' third_attempt.zip
# -p: do not fail when ArabicNER/results already exists (after training or on a rerun).
!mkdir -p ArabicNER/results
# -o: overwrite without asking; an interactive prompt would stall the notebook on a rerun.
!unzip -o third_attempt.zip -d ArabicNER/results
# Paths inside the archive start with ArabicNER/results/, so move the nested copy up
# into ArabicNER/results and drop the leftover nested directory.
!cp -r ArabicNER/results/ArabicNER/results/. ArabicNER/results
!rm -r ArabicNER/results/ArabicNER

Archive:  third_attempt.zip
   creating: ArabicNER/results/ArabicNER/results/
  inflating: ArabicNER/results/ArabicNER/results/tag_vocab.pkl  
  inflating: ArabicNER/results/ArabicNER/results/predictions.txt  
   creating: ArabicNER/results/ArabicNER/results/tensorboard/
   creating: ArabicNER/results/ArabicNER/results/tensorboard/Metrics_test_micro_f1/
  inflating: ArabicNER/results/ArabicNER/results/tensorboard/Metrics_test_micro_f1/events.out.tfevents.1713128392.65cd2fa4555d.3088.7  
   creating: ArabicNER/results/ArabicNER/results/tensorboard/Metrics_test_precision/
  inflating: ArabicNER/results/ArabicNER/results/tensorboard/Metrics_test_precision/events.out.tfevents.1713128392.65cd2fa4555d.3088.8  
   creating: ArabicNER/results/ArabicNER/results/tensorboard/Metrics_test_recall/
  inflating: ArabicNER/results/ArabicNER/results/tensorboard/Metrics_test_recall/events.out.tfevents.1713128392.65cd2fa4555d.3088.9  
   creating: ArabicNER/results/ArabicNER/results/tensorboard/Loss_test

# Model inference

In [14]:
import sys

# Make the cloned repository importable from this kernel. The membership check keeps
# sys.path free of duplicate entries when the cell is rerun.
if 'ArabicNER' not in sys.path:
    sys.path.append('ArabicNER')

In [16]:
from typing import List, Tuple
from arabiner.data.datasets import Token
from torchtext.vocab import vocab
from torchtext.vocab import Vocab
from collections import Counter


def custom_text2segments(text: str) -> Tuple[List[List[Token]], Vocab, List[Tuple[int, int]]]:
    """
    Tokenize a text and wrap it as a one-segment dataset, with its vocabulary and token spans.

    Parameters:
        text (str): Text to process.
    Return:
        (List[List[Token]]): Dataset holding a single segment with every token of the
            text; each token is created with the gold tag ["O"].
        (Vocab): Vocabulary built from the token texts, with the special entry "UNK".
        (List[Tuple[int, int]]): (start, stop) offsets of each token, in token order.
    """
    # Tokenize text
    substrings = list(tokenize(text))
    # The whole text becomes one segment of Token objects
    segment = [Token(text=substring.text, gold_tag=["O"]) for substring in substrings]
    # Offsets reported by the tokenizer for every token
    spans = [(substring.start, substring.stop) for substring in substrings]

    # Generate the vocabulary from the token texts of the segment
    token_counts = Counter(token.text for token in segment)
    segment_vocab = vocab(token_counts, specials=["UNK"])
    return [segment], segment_vocab, spans

In [17]:
from collections import namedtuple
from arabiner.utils.helpers import load_checkpoint
from arabiner.utils.data import get_dataloaders, text2segments
from tqdm import tqdm

# Load the tagger, its tag vocabulary and the training configuration from the results folder
checkpoint_dir = "ArabicNER/results"
tagger, tag_vocab, train_config = load_checkpoint(checkpoint_dir)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:

def get_model_results(input_text: str) -> List[Tuple[str, List[str], Tuple[int, int]]]:
    """
    Apply NER annotation to the provided text.

    Uses the module-level ``tagger``, ``tag_vocab`` and ``train_config`` objects.

    Parameters:
        input_text (str): The text for NER annotation.
    Return:
        (List[Tuple[str, List[str], Tuple[int, int]]]): One entry per token: the token
            text, the list of predicted tags for that token, and the token's
            (start, stop) span in ``input_text``.
    """
    # Convert text to a tagger dataset and index the tokens
    dataset, token_vocab, spans = custom_text2segments(input_text)

    # Bundle both vocabularies. The local is named `vocabs` so that it does not shadow
    # the torchtext `vocab` factory imported elsewhere in the notebook.
    vocab_bundle = namedtuple("Vocab", ["tags", "tokens"])
    vocabs = vocab_bundle(tokens=token_vocab, tags=tag_vocab)

    # From the datasets generate the dataloaders
    dataloader = get_dataloaders(
        (dataset,),
        vocabs,
        train_config.data_config,
        batch_size=32,
        shuffle=(False,),
    )[0]

    # Perform inference on the text and get back the tagged segments
    segments = tagger.infer(dataloader)

    # Collect the results. The span index runs across all returned segments, so spans
    # stay aligned with tokens even if more than one segment comes back.
    inference_result = []
    token_index = 0
    for segment in segments:
        for token in segment:
            predicted_tags = [t['tag'] for t in token.pred_tag]
            inference_result.append((token.text, predicted_tags, spans[token_index]))
            token_index += 1
    return inference_result

In [19]:
# Try the tagger on a sample Russian news text and display the per-token predictions.
sample_news_text = "Сэмюэл Л. Джексон вернется к образу Ника Фьюри в «Капитане Марвел» Сэмюэл Лерой Джексон Сэмюэл Лерой Джексон исполнит роль Ника Фьюри в «Капитане Марвел». Он сыграет в фильме вместе с Бри Ларсон в готовящемся проекте Marvel Studios. Джексон подписал контракт с Marvel Studios на 9 фильмов, 7 уже вышло. Актёр в этом амплуа предстал ещё в двух эпизодах сериала «Агенты „Щ.И.Т.“» и дал свой голос в трёх видеоигр. Съёмки новой ленты стартуют в феврале 2018 года. Релиз намечен на 7 марта 2019 года."
get_model_results(sample_news_text)

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

[('Сэмюэл',
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-PERSON',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O'],
  (0, 6)),
 ('Л',
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'I-PERSON',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O'],
  (7, 8)),
 ('.',
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'I-PERSON',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O'],
  (8, 9)),
 ('Джексон',
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'I-PERSON',
   'O',
   'O',
 

In [20]:
import json

def _collect_entity_spans(tagged_tokens):
    """Merge per-token IOB tags into [start, end, type] spans, one tag layer at a time."""
    ners = []
    # One slot per tag layer: the span currently open in that layer, or None.
    # The list grows to the number of layers actually predicted instead of assuming a
    # fixed count, so a model with a different number of layers does not raise IndexError.
    open_spans = []
    for _, layers, (start_pos, stop_pos) in tagged_tokens:
        # Token spans are end-exclusive; the output uses the index of the last character
        end_pos = stop_pos - 1
        if len(layers) > len(open_spans):
            open_spans.extend([None] * (len(layers) - len(open_spans)))
        for i, layer in enumerate(layers):
            if layer == 'O':
                # Outside any entity: close the span open in this layer, if any
                if open_spans[i] is not None:
                    ners.append(open_spans[i])
                open_spans[i] = None
            elif layer[:2] == 'B-':
                # A new entity begins: close the previous span and open a new one
                if open_spans[i] is not None:
                    ners.append(open_spans[i])
                open_spans[i] = [start_pos, end_pos, layer[2:]]
            elif layer[:2] == "I-":
                if open_spans[i] is None:
                    # Continuation without a begin tag: treat it as the start of a span
                    open_spans[i] = [start_pos, end_pos, layer[2:]]
                else:
                    open_spans[i][1] = end_pos
    # Flush the spans that are still open after the last token
    ners.extend(span for span in open_spans if span is not None)
    return ners


def inference_dataset(file_path: str) -> None:
    """
    Do inference on a dataset and save the result in submit file format.

    Every non-blank line of the input is a JSON object with an ``"id"`` and the text
    under ``"senences"``. The key is read with exactly that spelling because the
    recorded run of this notebook completed with it; ``"sentences"`` is accepted as a
    fallback. One JSON object per record is written to ``result.jsonl``:
    ``{"id": ..., "ners": [[start, end, type], ...]}`` where ``end`` is the offset of
    the last character of the entity.

    Parameters:
        file_path (str): Path to dataset file.
    """
    # Read all records up front so that tqdm can show the total; skip blank lines
    with open(file_path, encoding="utf-8") as dataset_file:
        lines = [line for line in dataset_file if line.strip()]

    # Write into resulting file
    with open("result.jsonl", "w", encoding="utf-8") as result_file:
        for line in tqdm(lines):
            # Read content: sentence and id of sentence
            content = json.loads(line)
            sentence = content["senences"] if "senences" in content else content["sentences"]
            sentence_id = content["id"]

            # Get tags (NER) for sentence and merge them into entity spans
            ners = _collect_entity_spans(get_model_results(sentence))

            # Save formatted data
            new_content = {"id": sentence_id, "ners": ners}
            result_file.write(json.dumps(new_content) + "\n")


## Predict tags for test set

In [21]:
# Upload dev.jsonl to the notebook's working directory (/content) before running this cell.
inference_dataset("dev.jsonl")

100%|██████████| 65/65 [00:27<00:00,  2.39it/s]
