In [1]:
! pip install spacy transformers torch pandas



## Data Loading

In [2]:
import json

def load_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line))
    return data

file_path = '../train.jsonl'
data = load_data(file_path)
print("Loaded", len(data), "documents")

Loaded 519 documents


## Data Processing

In [3]:
# Extract Labels

def extract_labels(data):
    labels_set = set()
    for item in data:
        entities = item['ners']
        for _, _, label in entities:
            labels_set.add(label)
    return list(labels_set)

labels = extract_labels(data)

all_labels = extract_labels(data)
print(f"No. of unique labels: {len(all_labels)}")
print("Unique labels found:", all_labels)

No. of unique labels: 29
Unique labels found: ['RELIGION', 'PENALTY', 'PROFESSION', 'PERSON', 'NATIONALITY', 'IDEOLOGY', 'EVENT', 'FAMILY', 'ORDINAL', 'AGE', 'CITY', 'DISTRICT', 'PRODUCT', 'LANGUAGE', 'COUNTRY', 'CRIME', 'STATE_OR_PROVINCE', 'LAW', 'PERCENT', 'FACILITY', 'DISEASE', 'MONEY', 'ORGANIZATION', 'TIME', 'WORK_OF_ART', 'LOCATION', 'AWARD', 'DATE', 'NUMBER']


In [4]:
def create_label_id_map(labels):
    return {label: idx for idx, label in enumerate(labels)}

label_to_id_map = create_label_id_map(all_labels)
print("Label to ID Map:", label_to_id_map)

Label to ID Map: {'RELIGION': 0, 'PENALTY': 1, 'PROFESSION': 2, 'PERSON': 3, 'NATIONALITY': 4, 'IDEOLOGY': 5, 'EVENT': 6, 'FAMILY': 7, 'ORDINAL': 8, 'AGE': 9, 'CITY': 10, 'DISTRICT': 11, 'PRODUCT': 12, 'LANGUAGE': 13, 'COUNTRY': 14, 'CRIME': 15, 'STATE_OR_PROVINCE': 16, 'LAW': 17, 'PERCENT': 18, 'FACILITY': 19, 'DISEASE': 20, 'MONEY': 21, 'ORGANIZATION': 22, 'TIME': 23, 'WORK_OF_ART': 24, 'LOCATION': 25, 'AWARD': 26, 'DATE': 27, 'NUMBER': 28}


In [5]:
from transformers import AutoTokenizer

model_name='sberbank-ai/ruBert-base'

tokenizer = AutoTokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def encode_tags(text, tags, encodings):
    # Initialize label array with -100 (ignore index in PyTorch CrossEntropyLoss)
    labels = [-100] * len(encodings['input_ids'])
    tag_index = 0
    
    for idx, offset in enumerate(encodings['offset_mapping']):
        if tag_index < len(tags) and offset[1] != 0:  # Not a special token
            if offset[0] == tags[tag_index][0]:
                while offset[0] == tags[tag_index][0] and offset[1] <= tags[tag_index][1]:
                    # Only label the first token of the entity span
                    labels[idx] = tags[tag_index][2]  
                    break
            if offset[1] == tags[tag_index][1]:
                tag_index += 1  # Move to the next tag
                
    return labels

def label_to_id(label):
    return label_to_id_map[label]

def preprocess_data(data, tokenizer):
    tokenized_inputs = []
    label_outputs = []
    
    for item in data:
        text = item['sentences']
        entities = item['ners']
        tags = [(start, end, label_to_id(label)) for start, end, label in entities]  # Convert labels to IDs
        
        # Tokenize text and generate offset mapping
        encodings = tokenizer(text, return_offsets_mapping=True, padding='max_length', max_length=128, truncation=True)
        
        # Encode the tags according to the token offsets
        labels = encode_tags(text, tags, encodings)
        
        tokenized_inputs.append(encodings)
        label_outputs.append(labels)
    
    return tokenized_inputs, label_outputs

tokenized_inputs, label_outputs = preprocess_data(data, tokenizer)

## Model Training

In [7]:
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

def create_dataloader(tokenized_data, label_outputs):
    input_ids = torch.tensor([td["input_ids"] for td in tokenized_data])
    attention_masks = torch.tensor([td["attention_mask"] for td in tokenized_data])
    labels = torch.tensor(label_outputs)

    dataset = TensorDataset(input_ids, attention_masks, labels)
    sampler = RandomSampler(dataset)  # Random sampler for training
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=8) 
    return dataloader

train_dataloader = create_dataloader(tokenized_inputs, label_outputs)


In [8]:
from transformers import AutoModelForTokenClassification, AdamW

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=29, 
    output_attentions = False,
    output_hidden_states = False
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at sberbank-ai/ruBert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
!set CUDA_LAUNCH_BLOCKING=1

In [10]:
optimizer = AdamW(model.parameters(), lr=5e-5)

def train_model(model, dataloader, optimizer):
    model.train()  # Set the model to training mode

    for epoch in range(5):  # Number of epochs
        total_loss = 0
        for batch in dataloader:
            batch = tuple(t.to(model.device) for t in batch)  # Move batch to device
            b_input_ids, b_input_mask, b_labels = batch

            model.zero_grad()  # Clear any previously calculated gradients

            # Perform a forward pass. This will return the model's loss.
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

            loss = outputs.loss
            loss.backward()  # Perform backpropagation
            optimizer.step()  # Update parameters

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}: Loss {total_loss / len(dataloader)}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Start training
train_model(model, train_dataloader, optimizer)




Epoch 1: Loss 1.6098168946229494
Epoch 2: Loss 0.6137536784204153
Epoch 3: Loss 0.26771202396888
Epoch 4: Loss 0.11073513372013202
Epoch 5: Loss 0.05984992935107304


In [11]:
torch.save(model, 'model.pth')

## Evaluating the Model

In [12]:
dev_data = load_data('../dev.jsonl')
test_data = load_data('../test.jsonl')

In [13]:
print(dev_data[0])

{'senences': 'Генерал Д.Петреус назначен на пост главы ЦРУ.\n\nГенерал Дэвид Петреус назначен на пост главы Центрального разведывательного управления (ЦРУ). Такое решение сегодня принял Сенат США, передает Reuters.\n\nСообщается, что Д.Петреус избран главой ЦРУ подавляющим большинством членов Сената. Таким образом, генерал сменяет на этом посту Леона Панетту, который сегодня был назначен министром обороны США вместо ушедшего в отставку Роберта Гейтса. Д.Петреус приступит к исполнению своих обязанностей в сентябре 2011г.\n\nДо сегодняшнего назначения 58-летний Д.Петреус командовал контингентом НАТО в Афганистане. Теперь этот ставший вакантным пост займет генерал-лейтенант Джон Аллен.', 'id': 519}


In [14]:
def predict_entities(text, model, tokenizer, label_to_id_map):
    # Switch model to evaluation mode
    model.eval()
    
    # Prepare text input
    encoding = tokenizer.encode_plus(
        text,
        return_tensors="pt",  # Return PyTorch tensors
        padding='max_length',
        truncation=True,
        max_length=128,
        return_offsets_mapping=True
    )
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    
    # Move tensors to the same device as model
    input_ids = input_ids.to(model.device)
    attention_mask = attention_mask.to(model.device)
    
    # Predict
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    # Convert logits to entity IDs
    logits = outputs.logits
    entity_ids = torch.argmax(logits, dim=-1).squeeze().tolist()  # Remove batch dimension and get the highest probability label
    
    # Decode entity IDs to spans
    entities = []
    last_entity = None
    offset_mapping = encoding['offset_mapping'].squeeze().tolist()
    
    for idx, entity_id in enumerate(entity_ids):
        if entity_id != -100 and entity_id > 0:  # Ignore 'O' and special tokens
            label = list(label_to_id_map.keys())[list(label_to_id_map.values()).index(entity_id)]
            start, end = offset_mapping[idx]
            if last_entity and last_entity[2] == label and last_entity[1] == start:
                # Extend the entity
                last_entity[1] = end
            else:
                # Append the last entity if exists
                if last_entity:
                    entities.append(last_entity)
                last_entity = [start, end, label]
    
    if last_entity:
        entities.append(last_entity)
    
    return entities


In [15]:
def gets_predictions(test_data, model, tokenizer, label_to_id_map):
    updated_data = []
    for item in test_data:
        sentence = item['senences']
        entities = predict_entities(sentence, model, tokenizer, label_to_id_map)
        updated_record = {'ners': entities, 'id': item['id']}
        updated_data.append(updated_record)
    return updated_data

dev_data_inference = gets_predictions(dev_data, model, tokenizer, label_to_id_map)
test_data_inference = gets_predictions(test_data, model, tokenizer, label_to_id_map)

## Saving the Data file with Inference

In [16]:
def save_data_to_jsonl(data, file_path):
    with open(file_path, 'w', encoding='utf-8') as f:
        for item in data:
            # Serialize dict to JSON formatted string
            json_string = json.dumps(item, ensure_ascii=False)
            # Write the JSON string to the file with a newline to separate records
            f.write(json_string + '\n')

save_data_to_jsonl(test_data_inference, 'test.jsonl')