In [1]:
import spacy
from spacy.tokens import DocBin
from spacy.training import Example

import jsonlines
import random

In [3]:
# Ask spaCy to run on the GPU; the recorded cell output shows this call returned True.
spacy.require_gpu()  

True

In [4]:
# Load the "ru_core_news_lg" pipeline into the module-level `nlp`;
# load_data and train_ner_model below both read this global.
nlp = spacy.load("ru_core_news_lg")


def load_data(file_path):
    """Read an annotated .jsonl file and return its documents in a DocBin.

    Every record must provide a "sentences" value (the text) and a "ners"
    value (an iterable of ``(start, end, label)`` character-offset triples).
    The text is tokenized with the module-level ``nlp``. Candidate entities
    are visited longest first (ties keep their file order); a candidate is
    dropped when it shares at least one character with an already accepted
    entity, or when ``doc.char_span`` returns None for its offsets.

    Args:
        file_path: Path handed to ``jsonlines.open``.

    Returns:
        The DocBin holding one document per record, with ``doc.ents`` set.
    """
    doc_bin = DocBin()

    with jsonlines.open(file_path) as records:
        for record in records:
            raw_text = record["sentences"]
            triples = record["ners"]
            doc = nlp.make_doc(raw_text)

            # Ascending stable sort on the negated length is the same ordering
            # as a descending stable sort on the length itself.
            longest_first = sorted(triples, key=lambda ner: ner[0] - ner[1])

            accepted = []
            taken_chars = set()
            for begin, stop, tag in longest_first:
                char_range = range(begin, stop)
                if not taken_chars.isdisjoint(char_range):
                    # Shares a character with an accepted entity.
                    continue
                entity = doc.char_span(begin, stop, label=tag)
                if entity is None:
                    # char_span produced no span for these offsets.
                    continue
                accepted.append(entity)
                taken_chars.update(char_range)

            doc.ents = accepted
            doc_bin.add(doc)

    return doc_bin


def train_ner_model(train_data, output_model="ru_ner_model", n_iter=100, drop=0.5):
    """Train the "ner" component of the module-level ``nlp`` pipeline.

    All other pipes are disabled while training. After each epoch the summed
    "ner" training loss is printed, and the pipeline is written to
    ``output_model`` whenever that loss is strictly lower than every
    previous epoch's loss.

    Args:
        train_data: Iterable of ``(text, annotations)`` pairs, where
            ``annotations`` is a dict whose optional "entities" value is a
            list of ``(start_char, end_char, label)`` triples. The iterable
            is copied, so the caller's sequence is never reordered.
        output_model: Directory passed to ``nlp.to_disk``.
        n_iter: Number of passes over the training data.
        drop: Dropout rate forwarded to ``nlp.update``.

    Raises:
        ValueError: If ``train_data`` contains no examples.
    """
    # Work on a private copy: shuffling must not reorder the caller's list.
    samples = list(train_data)
    if not samples:
        raise ValueError("train_data must contain at least one example")

    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every label seen in the data; a missing or None "entities"
    # entry simply contributes no labels.
    for _text, annotations in samples:
        for ent in annotations.get("entities") or ():
            ner.add_label(ent[2])

    # Disable other pipes during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    # Start from +inf so the first epoch is always checkpointed, whatever the
    # scale of the loss.
    min_loss = float("inf")
    with nlp.disable_pipes(*other_pipes):
        # NOTE(review): initialize() on an already loaded pipeline presumably
        # re-initializes the enabled component's weights; if fine-tuning was
        # intended, nlp.resume_training() may be the right call - confirm.
        optimizer = nlp.initialize()
        for itn in range(n_iter):
            random.shuffle(samples)
            losses = {}
            # Examples are fed one at a time (batch size 1).
            for text, annotations in samples:
                example = Example.from_dict(nlp.make_doc(text), annotations)
                nlp.update([example], drop=drop, sgd=optimizer, losses=losses)
            loss = losses["ner"]
            print(f"Epoch {itn}, Loss: {loss}")
            # Model selection uses the training loss, not a held-out score.
            if loss < min_loss:
                min_loss = loss
                nlp.to_disk(output_model)
                print(f"Saved model to {output_model}")


# Read the annotated training corpus into a DocBin
train_db = load_data("data/public_data/train.jsonl")

# Convert each stored document back into the (text, {"entities": [...]})
# pair that train_ner_model consumes
train_data = []
for doc in train_db.get_docs(nlp.vocab):
    entity_offsets = [
        (ent.start_char, ent.end_char, ent.label_) for ent in doc.ents
    ]
    train_data.append((doc.text, {"entities": entity_offsets}))

# Fit the NER component on the prepared pairs
train_ner_model(train_data)

  jitify._init_module()


Epoch 0, Loss: 4493.005781329067
Saved model to ru_ner_model
Epoch 1, Loss: 575.5816081126676
Saved model to ru_ner_model
Epoch 2, Loss: 878.710912739215
Epoch 3, Loss: 842.5309573816911
Epoch 4, Loss: 1296.6850911893848
Epoch 5, Loss: 965.7312281668344
Epoch 6, Loss: 1272.5708262220717
Epoch 7, Loss: 1344.860793013894
Epoch 8, Loss: 1244.115213938884
Epoch 9, Loss: 1006.2494468113614
Epoch 10, Loss: 938.411341103796
Epoch 11, Loss: 1686.3125738544536
Epoch 12, Loss: 1415.5119604915626
Epoch 13, Loss: 1903.0011714997374
Epoch 14, Loss: 1039.117847860448
Epoch 15, Loss: 2072.933374936782
Epoch 16, Loss: 1159.8350409318314
Epoch 17, Loss: 1113.230507966867
Epoch 18, Loss: 1492.6382262693312
Epoch 19, Loss: 1347.3779260717117
Epoch 20, Loss: 1226.0192295949078
Epoch 21, Loss: 1583.8874984633715
Epoch 22, Loss: 1306.445245536049
Epoch 23, Loss: 1778.0710467328965
Epoch 24, Loss: 1489.7662180222048
Epoch 25, Loss: 1300.345126708341
Epoch 26, Loss: 2474.8229488427723
Epoch 27, Loss: 1960.609

In [5]:
# Rebind the global `nlp` to the pipeline stored at "ru_ner_model", which is
# the default output path train_ner_model writes its checkpoints to.
nlp = spacy.load("ru_ner_model")

In [6]:
# Predictions for the test set: one {"ners": [...], "id": ...} dict per record.
labeled_result = []

with jsonlines.open('data/public_data/test.jsonl') as reader:
    for obj in reader:
        # The text key is spelled "senences" here but "sentences" in
        # load_data. NOTE(review): the test file's real schema is not visible
        # from this notebook, so accept either spelling instead of failing
        # with a KeyError - confirm which one the file actually uses.
        text = obj["senences"] if "senences" in obj else obj["sentences"]
        # Named sample_id so the builtin `id` is not shadowed in the kernel.
        sample_id = obj["id"]
        doc = nlp(text)
        # Each entity is reported as [start_char, end_char, label].
        ners = [[ent.start_char, ent.end_char, ent.label_] for ent in doc.ents]
        labeled_result.append({"ners": ners, "id": sample_id})

In [7]:
import json

# Dump the predictions as JSON Lines: one serialized record per line
with open('test.jsonl', 'w') as out_file:
    out_file.writelines(json.dumps(record) + '\n' for record in labeled_result)

In [8]:
import zipfile

# Package the predictions file into a fresh archive named test.zip
with zipfile.ZipFile('test.zip', mode='w') as archive:
    # The archive holds a single member: test.jsonl
    archive.write('test.jsonl')