# Загружаем данные

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import evaluate
from corus import load_ne5

In [2]:
# Location of the Collection5 corpus, handed to the corus `load_ne5` reader.
# NOTE(review): the name `dir` shadows the Python builtin `dir()` for the rest
# of the session - consider renaming once no other cell depends on it.
dir = '../data/Collection5/'
records = load_ne5(dir)

In [3]:
df = pd.DataFrame(records)

In [4]:
df.rename(columns={0: 'id', 1: 'text', 2: 'ner'}, inplace=True)

In [5]:
df = df[['text', 'ner']]

In [6]:
df.head(4)

Unnamed: 0,text,ner
0,–ñ–∏—Ä–∏–Ω–æ–≤—Å–∫–∏–π –ø—Ä–µ–¥–ª–∞–≥–∞–µ—Ç –æ–±–º–µ–Ω—è—Ç—å —Å –°–®–ê –°–Ω–æ—É–¥–µ–Ω–∞...,"[Ne5Span(index='T1', type='PER', start=0, stop..."
1,–î.–ú–µ–¥–≤–µ–¥–µ–≤ –Ω–∞–∑–Ω–∞—á–∏–ª —Ä—è–¥ –≥–ª–∞–≤ —Ä–µ–≥–∏–æ–Ω–∞–ª—å–Ω—ã—Ö –ú–í–î\...,"[Ne5Span(index='T1', type='PER', start=0, stop..."
2,–°–ú–ò: –í.–°—É—Ä–∫–æ–≤—É –Ω–∞–¥–æ–µ–ª–æ —Ä–∞–±–æ—Ç–∞—Ç—å –≤ –∞–¥–º–∏–Ω–∏—Å—Ç—Ä–∞—Ü–∏...,"[Ne5Span(index='T1', type='MEDIA', start=0, st..."
3,–î.–ú–µ–¥–≤–µ–¥–µ–≤ –æ—Å–≤–æ–±–æ–¥–∏–ª –æ—Ç –¥–æ–ª–∂–Ω–æ—Å—Ç–∏ –µ—â–µ 10 –≥–µ–Ω–µ—Ä...,"[Ne5Span(index='T1', type='PER', start=0, stop..."


# train test split и подготовка данных

In [7]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [8]:
# Collect every distinct entity type that occurs in the annotations.
entity_types = {span.type for spans in df['ner'] for span in spans}

# Tag inventory: "O" (token outside any entity) comes first, followed by a
# B-/I- pair per entity type in sorted order, so ids run 0, (1, 2), (3, 4), ...
tag_names = ["O"]
for entity_type in sorted(entity_types):
    tag_names += [f"B-{entity_type}", f"I-{entity_type}"]

id2label = dict(enumerate(tag_names))
label2id = {tag: idx for idx, tag in id2label.items()}

print(f"–í—Å–µ–≥–æ {len(entity_types)} —Ç–∏–ø–æ–≤ —Å—É—â–Ω–æ—Å—Ç–µ–π: {sorted(entity_types)}")
print(f"–í—Å–µ–≥–æ {len(id2label)} –º–µ—Ç–æ–∫: {id2label}")

–í—Å–µ–≥–æ 5 —Ç–∏–ø–æ–≤ —Å—É—â–Ω–æ—Å—Ç–µ–π: ['GEOPOLIT', 'LOC', 'MEDIA', 'ORG', 'PER']
–í—Å–µ–≥–æ 11 –º–µ—Ç–æ–∫: {0: 'O', 1: 'B-GEOPOLIT', 2: 'I-GEOPOLIT', 3: 'B-LOC', 4: 'I-LOC', 5: 'B-MEDIA', 6: 'I-MEDIA', 7: 'B-ORG', 8: 'I-ORG', 9: 'B-PER', 10: 'I-PER'}


In [9]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

In [10]:
# Convert a text and its character-level NER spans into token-level features.
def preprocess_data(examples):
    """Tokenize one record and align its entity spans with the tokens.

    Args:
        examples: Mapping with a "text" string and a "ner" iterable of spans;
            every span is read through its ``type``, ``start`` and ``stop``
            attributes.

    Returns:
        The tokenizer output with "offset_mapping" removed and a "labels"
        list added: one label id per token, -100 for tokens that cover no
        characters.
    """
    raw_text = examples["text"]
    entity_spans = examples["ner"]

    # Character-level tags; everything starts out as "O".
    char_tags = ["O"] * len(raw_text)
    for entity in entity_spans:
        kind = entity.type
        first_char, stop_char = entity.start, entity.stop

        # First character opens the entity, the remaining ones continue it.
        char_tags[first_char] = f"B-{kind}"
        for pos in range(first_char + 1, stop_char):
            char_tags[pos] = f"I-{kind}"

    encoded = tokenizer(raw_text, truncation=True, return_offsets_mapping=True)

    token_labels = []
    for tok_start, tok_end in encoded.offset_mapping:
        # (0, 0) offsets mark special tokens ([CLS], [SEP], ...); they and any
        # token whose character slice is empty get -100 so they are ignored.
        covered = None if tok_start == tok_end == 0 else char_tags[tok_start:tok_end]
        if not covered:
            token_labels.append(-100)
            continue

        # Priority: the first B- tag inside the token, otherwise the first
        # I- tag, otherwise "O".
        chosen = (
            next((tag for tag in covered if tag.startswith("B-")), None)
            or next((tag for tag in covered if tag.startswith("I-")), None)
            or "O"
        )
        token_labels.append(label2id[chosen])

    # The offsets are only needed for the alignment above.
    encoded.pop("offset_mapping")
    encoded["labels"] = token_labels
    return encoded

In [11]:
label2id

{'O': 0,
 'B-GEOPOLIT': 1,
 'I-GEOPOLIT': 2,
 'B-LOC': 3,
 'I-LOC': 4,
 'B-MEDIA': 5,
 'I-MEDIA': 6,
 'B-ORG': 7,
 'I-ORG': 8,
 'B-PER': 9,
 'I-PER': 10}

In [12]:
# Run the NER preprocessing over every row of the train and test frames.
processed_train_data = [preprocess_data(row) for _, row in train_df.iterrows()]
processed_test_data = [preprocess_data(row) for _, row in test_df.iterrows()]

# Only these features are carried over into the Hugging Face datasets.
feature_columns = ("input_ids", "attention_mask", "labels")

train_dataset = Dataset.from_dict({
    column: [item[column] for item in processed_train_data]
    for column in feature_columns
})

test_dataset = Dataset.from_dict({
    column: [item[column] for item in processed_test_data]
    for column in feature_columns
})

# Дообучение модели

In [13]:
# –ó–∞–≥—Ä—É–∑–∫–∞ –ø—Ä–µ–¥–æ–±—É—á–µ–Ω–Ω–æ–π –º–æ–¥–µ–ª–∏
model = AutoModelForTokenClassification.from_pretrained(
    "cointegrated/rubert-tiny2", 
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# seqeval metric object, loaded on first use and then shared by every
# compute_metrics call instead of being re-loaded on each evaluation pass.
_seqeval_metric = None


# Metric callback handed to the Trainer.
def compute_metrics(eval_preds):
    """Compute seqeval precision/recall/F1/accuracy for token predictions.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The arg-max over the last
            axis of ``logits`` is taken as the predicted label id; positions
            whose reference label is -100 are dropped from both sequences.

    Returns:
        A dict with "precision", "recall", "f1" and "accuracy", taken from
        the ``overall_*`` entries of the seqeval result.
    """
    global _seqeval_metric
    if _seqeval_metric is None:
        _seqeval_metric = evaluate.load("seqeval")

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Drop ignored positions and map the remaining ids back to tag names.
    true_labels = []
    true_predictions = []
    for prediction, label in zip(predictions, labels):
        kept = [(p, l) for p, l in zip(prediction, label) if l != -100]
        true_labels.append([id2label[l] for _, l in kept])
        true_predictions.append([id2label[p] for p, _ in kept])

    results = _seqeval_metric.compute(
        predictions=true_predictions, references=true_labels
    )
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [19]:
# –û—Ü–µ–Ω–∫–∞ –º–µ—Ç—Ä–∏–∫ –±–µ–∑ –¥–æ–æ–±—É—á–µ–Ω–∏—è
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir='../data/models',
    per_device_eval_batch_size=8,
    no_cuda=not torch.cuda.is_available()
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


print("–ú–µ—Ç—Ä–∏–∫–∏ –¥–æ –¥–æ–æ–±—É—á–µ–Ω–∏—è:")
pre_training_metrics = trainer.evaluate(test_dataset)
pre_training_metrics

–ú–µ—Ç—Ä–∏–∫–∏ –¥–æ –¥–æ–æ–±—É—á–µ–Ω–∏—è:




{'eval_loss': 2.3831734657287598,
 'eval_model_preparation_time': 0.0004,
 'eval_precision': 0.006539280014186913,
 'eval_recall': 0.057071000193461015,
 'eval_f1': 0.01173405461307452,
 'eval_accuracy': 0.12865108868826342,
 'eval_runtime': 4.9607,
 'eval_samples_per_second': 40.317,
 'eval_steps_per_second': 5.04}

In [21]:
training_args = TrainingArguments(
    output_dir='../data/models/rubert-tiny2-ner',
    eval_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    no_cuda=not torch.cuda.is_available()
)



In [22]:
# Fine-tune the token-classification model on the NER training set and
# evaluate on the test set according to `training_args`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    # `tokenizer=` is deprecated for Trainer in recent transformers releases
    # (this call emitted a warning with it); `processing_class` replaces it.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.588897,0.465635,0.250339,0.325617,0.843717
2,No log,0.423745,0.441952,0.369704,0.402612,0.868262
3,No log,0.38683,0.472085,0.44496,0.458122,0.885422


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=300, training_loss=0.6780633036295572, metrics={'train_runtime': 488.9889, 'train_samples_per_second': 4.908, 'train_steps_per_second': 0.614, 'total_flos': 22374206344752.0, 'train_loss': 0.6780633036295572, 'epoch': 3.0})

In [25]:
# –û—Ü–µ–Ω–∫–∞ –º–µ—Ç—Ä–∏–∫ –ø–æ—Å–ª–µ –¥–æ–æ–±—É—á–µ–Ω–∏—è
post_training_metrics = trainer.evaluate(test_dataset)
post_training_metrics

{'eval_loss': 0.3868299722671509,
 'eval_precision': 0.4720853858784893,
 'eval_recall': 0.444960340491391,
 'eval_f1': 0.45812170102579425,
 'eval_accuracy': 0.8854221986192247,
 'eval_runtime': 4.3761,
 'eval_samples_per_second': 45.703,
 'eval_steps_per_second': 5.713,
 'epoch': 3.0}

# MLM дообучение

In [67]:
# Number of tokens in every block produced by group_texts.
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized sequences and cut it into fixed blocks.

    Args:
        examples: Mapping from column name to a list of per-example
            sequences (e.g. "input_ids", "attention_mask").

    Returns:
        A dict with the same columns, each holding consecutive chunks of
        ``block_size`` items, plus a "labels" column that is a shallow copy
        of the "input_ids" chunk list. When the concatenated length is at
        least ``block_size`` the trailing remainder is dropped; a shorter
        batch is returned as a single short chunk.
    """
    # Local import keeps the cell self-contained.
    from itertools import chain

    # Flatten every column in linear time; `sum(lists, [])` re-copies the
    # accumulated list on each addition and is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # Drop the small remainder so that all blocks have exactly block_size
    # items; padding could be used instead if the model supported it.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [68]:
def preprocess_function(examples):
    """Tokenize the "text" field of ``examples`` with the shared tokenizer.

    No truncation or padding options are passed, so the tokenizer's defaults
    apply.
    """
    text = examples["text"]
    return tokenizer(text)

In [69]:
# Tokenize every row of the train and test frames for the MLM objective.
processed_train_data_mlm = [preprocess_function(row) for _, row in train_df.iterrows()]
processed_test_data_mlm = [preprocess_function(row) for _, row in test_df.iterrows()]

# Only these features are carried over into the Hugging Face datasets.
mlm_columns = ("input_ids", "attention_mask")

train_dataset_mlm = Dataset.from_dict({
    column: [item[column] for item in processed_train_data_mlm]
    for column in mlm_columns
})

test_dataset_mlm = Dataset.from_dict({
    column: [item[column] for item in processed_test_data_mlm]
    for column in mlm_columns
})

Token indices sequence length is longer than the specified maximum sequence length for this model (2308 > 2048). Running this sequence through the model will result in indexing errors


In [70]:
train_dataset_mlm_grouped = train_dataset_mlm.map(group_texts, batched=True, num_proc=4)
test_dataset_mlm_grouped = test_dataset_mlm.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/800 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/200 [00:00<?, ? examples/s]

In [76]:
from transformers import DataCollatorForLanguageModeling

tokenizer.add_special_tokens({'pad_token': '[PAD]'})
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [77]:
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained(
    "cointegrated/rubert-tiny2", 
)

In [78]:
training_args = TrainingArguments(
    output_dir="../data/models/rubert-tiny2-mlm",
    eval_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    no_cuda=not torch.cuda.is_available()
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_mlm_grouped,
    eval_dataset=test_dataset_mlm_grouped,
    data_collator=data_collator,
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,3.191179
2,No log,3.124142
3,No log,3.100579


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].


TrainOutput(global_step=372, training_loss=3.4454778855846775, metrics={'train_runtime': 317.2932, 'train_samples_per_second': 18.683, 'train_steps_per_second': 1.172, 'total_flos': 11310209703936.0, 'train_loss': 3.4454778855846775, 'epoch': 3.0})

In [79]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 23.54


# NER дообучение

In [81]:
model = AutoModelForTokenClassification.from_pretrained(
    '../data/models/rubert-tiny2-mlm/checkpoint-372', 
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ../data/models/rubert-tiny2-mlm/checkpoint-372 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [85]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

In [86]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [87]:
training_args = TrainingArguments(
    output_dir='../data/models/rubert-tiny2-ner-v2',
    eval_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    no_cuda=not torch.cuda.is_available()
)

In [88]:
# Fine-tune the MLM-adapted checkpoint on the NER training set and evaluate
# on the test set according to `training_args`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    # `tokenizer=` is deprecated for Trainer in recent transformers releases
    # (this call emitted a warning with it); `processing_class` replaces it.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.576352,0.459727,0.260592,0.332634,0.843949
2,No log,0.413024,0.442426,0.390985,0.415118,0.876958
3,No log,0.375955,0.491532,0.471658,0.48139,0.895031


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=300, training_loss=0.6769447835286458, metrics={'train_runtime': 487.8083, 'train_samples_per_second': 4.92, 'train_steps_per_second': 0.615, 'total_flos': 22374206344752.0, 'train_loss': 0.6769447835286458, 'epoch': 3.0})

- По сравнению с простым дообучением на задачу NER получили более высокие метрики на каждой эпохе обучения.

# Обучение с доп. разметкой

In [21]:
lenta_10000_labeled = pd.read_csv('../data/lenta_10000_labeled.csv')

In [22]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

In [23]:
def prepare_ner_data(text, entities):
    """Tokenize ``text`` and tag every token with a BILOU-style string label.

    Args:
        text: Raw text to tokenize.
        entities: Iterable of mappings with 'start' and 'end' character
            positions and an 'entity' label of the form ``X-TYPE``; labels
            without a '-' are skipped.

    Returns:
        A dict with "input_ids", an all-ones "attention_mask" of the same
        length, and "labels": one string per token, either "O" or one of
        U-/B-/I-/L- followed by the entity type.
    """
    encoded = tokenizer(text, return_offsets_mapping=True, padding=False, truncation=True)
    token_ids = encoded["input_ids"]

    # Character position -> index of the token covering it. Zero-width
    # offsets produce an empty range and therefore add no entries.
    char_to_token = {
        pos: tok_idx
        for tok_idx, (tok_start, tok_end) in enumerate(encoded["offset_mapping"])
        for pos in range(tok_start, tok_end)
    }

    # Every token starts as "O" (outside any entity).
    # NOTE(review): tokens with zero-width offsets (presumably the special
    # tokens) also keep "O" rather than an ignore value - confirm intended.
    tags = ["O"] * len(token_ids)

    for entity in entities:
        span_start = entity['start']
        span_end = entity['end']
        raw_label = entity['entity']

        # Tokens touched by the span, in order and without duplicates.
        touched = list(dict.fromkeys(
            char_to_token[pos]
            for pos in range(span_start, span_end)
            if pos in char_to_token
        ))
        if not touched:
            continue

        # Only labels shaped like X-TYPE carry an entity type.
        if '-' not in raw_label:
            continue
        entity_type = raw_label.split('-', 1)[1]

        # Single-token entity -> U; otherwise B ... I ... L.
        if len(touched) == 1:
            tags[touched[0]] = f"U-{entity_type}"
            continue

        head, *inner, tail = touched
        tags[head] = f"B-{entity_type}"
        for tok_idx in inner:
            tags[tok_idx] = f"I-{entity_type}"
        tags[tail] = f"L-{entity_type}"

    return {
        "input_ids": token_ids,
        "attention_mask": [1] * len(token_ids),
        "labels": tags
    }

In [24]:
import ast

# Turn the string form of a list of dicts back into Python objects.
def parse_entities(entities_str):
    """Parse ``entities_str`` with ``ast.literal_eval`` when it is a string.

    Any non-string value is returned unchanged.
    """
    return ast.literal_eval(entities_str) if isinstance(entities_str, str) else entities_str

In [25]:
lenta_train_data = []

In [26]:
# Convert every row of the lenta frame into token-level NER features.
for idx, row in lenta_10000_labeled.iterrows():
    try:
        # 'text_entities' is parsed into Python objects first when it is
        # stored as a string.
        text_data = prepare_ner_data(row['text'], parse_entities(row['text_entities']))        
        lenta_train_data.append(text_data)
            
        # Report progress whenever the row index is a multiple of 100.
        if idx % 100 == 0:
            print(f"–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ {idx} —Å—Ç—Ä–æ–∫ –∏–∑ {len(lenta_10000_labeled)}")
            
    except Exception as e:
        # Best-effort processing: a row that fails is reported and skipped,
        # so lenta_train_data can end up shorter than the input frame.
        print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –æ–±—Ä–∞–±–æ—Ç–∫–µ —Å—Ç—Ä–æ–∫–∏ {idx}: {e}")
        continue

–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 0 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 100 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 200 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 300 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 400 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 500 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 600 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 700 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 800 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 900 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 1000 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 1100 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 1200 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 1300 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 1400 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 1500 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 1600 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 1700 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 1800 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 1900 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ 2000 —Å—Ç—Ä–æ–∫ –∏–∑ 10000
–û–±

In [27]:
from datasets import Dataset

lenta_train_dataset = Dataset.from_dict({
    'input_ids': [x['input_ids'] for x in lenta_train_data],
    'attention_mask': [x['attention_mask'] for x in lenta_train_data],
    'labels': [x['labels'] for x in lenta_train_data]
})

In [28]:
def convert_lenta_labels_to_train_format(example):
    """Re-encode the BILOU string labels of ``example`` as training label ids.

    Entity types are collapsed onto LOC / PER, positions U and B become B-,
    positions L and I become I-, and anything unmapped becomes "O" (0).

    Args:
        example: Mapping whose 'labels' entry is a list of "O" / ``X-TYPE``
            strings.

    Returns:
        The same ``example`` object with 'labels' replaced by a list of ints.
    """
    # lenta entity type -> entity type of the training data.
    lenta_to_train_type = {
        # Geographic objects (houses included) are treated as locations.
        **dict.fromkeys(
            ('CITY', 'COUNTRY', 'DISTRICT', 'REGION', 'STREET', 'HOUSE'), 'LOC'
        ),
        # All name parts are treated as persons.
        **dict.fromkeys(('FIRST_NAME', 'LAST_NAME', 'MIDDLE_NAME'), 'PER'),
    }

    # Label ids of the training dataset, in id order.
    train_tags = [
        'O',
        'B-GEOPOLIT', 'I-GEOPOLIT',
        'B-LOC', 'I-LOC',
        'B-MEDIA', 'I-MEDIA',
        'B-ORG', 'I-ORG',
        'B-PER', 'I-PER',
    ]
    train_label2id = {tag: tag_id for tag_id, tag in enumerate(train_tags)}
    outside_id = train_label2id['O']

    # BILOU position -> BIO prefix.
    bilou_to_bio = {'B': 'B', 'U': 'B', 'I': 'I', 'L': 'I'}

    def encode(label):
        if label == 'O':
            return outside_id

        position, entity_type = label.split('-', 1)

        # Entity types without a counterpart count as non-entities.
        mapped_type = lenta_to_train_type.get(entity_type)
        if not mapped_type:
            return outside_id

        prefix = bilou_to_bio.get(position)
        if prefix is None:
            return outside_id

        return train_label2id.get(f'{prefix}-{mapped_type}', outside_id)

    example['labels'] = [encode(label) for label in example['labels']]
    return example

- Оставляем только LOC и PER, т.к. других сущностей, похожих на сущности нашего целевого датасета, нет.

In [29]:
lenta_train_dataset = lenta_train_dataset.map(convert_lenta_labels_to_train_format)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# –û—Å—Ç–∞–≤–ª—è–µ–º —Ç–æ–ª—å–∫–æ 3000 –ø—Ä–∏–º–µ—Ä–æ–≤ –∏–∑ –¥–∞—Ç–∞—Å–µ—Ç–∞
lenta_train_dataset = lenta_train_dataset.select(range(3000))

In [31]:
from datasets import concatenate_datasets

# –û–±—ä–µ–¥–∏–Ω–µ–Ω–∏–µ —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã—Ö –¥–∞—Ç–∞—Å–µ—Ç–æ–≤
combined_train_dataset = concatenate_datasets([lenta_train_dataset, train_dataset])

In [33]:
from transformers import AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained(
    "cointegrated/rubert-tiny2", 
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir='../data/models/rubert-tiny2-ner-v3',
    eval_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    no_cuda=not torch.cuda.is_available(),
    dataloader_num_workers=8,
    dataloader_pin_memory=True,
    dataloader_prefetch_factor=2    
)

# Fine-tune on the combined (lenta + original) training set and evaluate on
# the original test set according to `training_args`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=combined_train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    # `tokenizer=` is deprecated for Trainer in recent transformers releases
    # (this call emitted a warning with it); `processing_class` replaces it.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avo

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.457694,0.072017,0.104856,0.085388,0.847202
2,0.368800,0.393128,0.115288,0.160186,0.134078,0.860811
3,0.185100,0.365139,0.16824,0.230992,0.194684,0.871996


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=1425, training_loss=0.24291555839672424, metrics={'train_runtime': 1466.7503, 'train_samples_per_second': 7.772, 'train_steps_per_second': 0.972, 'total_flos': 76468786882368.0, 'train_loss': 0.24291555839672424, 'epoch': 3.0})

- Этот эксперимент менее удачный: возможно, повлияло качество разметки синтетических данных или то, что с изначальным датасетом пересекаются только два NER-класса.

- Подход с дообучением на задачу MLM и последующим дообучением на задачу NER дал более качественный результат. Скорее всего, при MLM модель начинает лучше улавливать взаимосвязи между словами в нашем корпусе, что позволяет ей показывать хорошие результаты при извлечении именованных сущностей.