python -m spacy download uk_core_news_sm

In [1]:
import ast
import json
import math
import random
import warnings
from collections import defaultdict

import pandas as pd
import spacy
import torch
from spacy.tokens import DocBin

from paths import *

In [2]:
# Show which spaCy pipeline packages are installed in this environment.
spacy.util.get_installed_models()

['uk_core_news_sm']

In [3]:
# Load the "uk_core_news_sm" pipeline; `nlp` is what convert_to_spacy_format
# uses to tokenize raw text into Doc objects.
nlp = spacy.load("uk_core_news_sm")


In [36]:
# Read the annotation export from disk and parse it as JSON.
annotations_path = f'{DATA_FOLDER}/DocProperties/annotations.json'
with open(annotations_path, encoding='utf-8') as annotations_file:
    json_data = json.loads(annotations_file.read())

In [40]:
# Reduce every non-null annotation record to a (text, entities) pair.
present_items = (entry for entry in json_data["annotations"] if entry is not None)
converted_data = [
    (text, annotation["entities"])
    for text, annotation in present_items
]

In [42]:
# Write each (text, entities) pair as its Python repr, one pair per line.
augmented_path = f"{PROCESSED_DATA_FOLDER}/training_data_augmented.txt"
with open(augmented_path, "w", encoding="utf-8") as out_file:
    out_file.writelines(f"{entry}\n" for entry in converted_data)

In [4]:
def convert_to_spacy_format(data, nlp):
    """Convert annotated examples into a spaCy ``DocBin``.

    Args:
        data: Iterable of ``(text, annotations)`` pairs, where ``annotations``
            is an iterable of ``(start, end, label)`` character-offset triples.
        nlp: spaCy ``Language`` object; only its tokenizer is used
            (``nlp.make_doc``).

    Returns:
        A ``DocBin`` holding one ``Doc`` per input pair, with ``doc.ents`` set
        to the entity spans that could be aligned to token boundaries.

    Notes:
        * ``Doc.char_span`` yields no usable span when the character offsets do
          not line up with token boundaries. Such annotations are dropped, and
          a warning is emitted so the loss is visible instead of silent.
        * Overlapping spans are not allowed in ``doc.ents``. They are resolved
          with ``spacy.util.filter_spans`` (the longest span wins) rather than
          aborting the whole conversion with an error.
    """
    doc_bin = DocBin()
    for text, annotations in data:
        doc = nlp.make_doc(text)
        spans = []
        for start, end, label in annotations:
            span = doc.char_span(start, end, label=label)
            if not span:
                warnings.warn(
                    f"Skipping entity {label!r} at [{start}:{end}] "
                    f"({text[start:end]!r}): offsets do not align with token boundaries",
                    stacklevel=2,
                )
                continue
            spans.append(span)
        # doc.ents rejects overlapping spans, so de-duplicate before assigning.
        doc.ents = spacy.util.filter_spans(spans)
        doc_bin.add(doc)
    return doc_bin

In [5]:
# Parse the training file: one Python-literal record per line; a trailing
# comma after the record is tolerated.
# SECURITY: the original used eval(), which executes arbitrary code found in
# the file. ast.literal_eval only accepts literals (str, numbers, tuples,
# lists, dicts, ...) and raises on anything else.
# The `with` block also guarantees the file handle is closed.
with open(f"{PROCESSED_DATA_FOLDER}/training_data.txt", encoding="utf-8") as training_file:
    stripped_lines = (line.strip() for line in training_file)
    data = [
        ast.literal_eval(line.rstrip(","))
        for line in stripped_lines
        if line  # blank lines carry no record
    ]

In [6]:
# Bucket every well-formed (text, annotations) record under the label of its
# first annotation; records without annotations end up in no bucket.
class_data = defaultdict(list)
for sample in data:
    if not isinstance(sample, tuple) or len(sample) != 2:
        continue
    for _start, _end, first_label in sample[1]:
        class_data[first_label].append(sample)
        break

In [7]:
# Share of each label group assigned to train and dev. test_ratio is kept for
# reference only: the split loop gives the test set whatever remains.
train_ratio, dev_ratio, test_ratio = 0.7, 0.2, 0.1

# Accumulators for the three splits (three distinct list objects).
train_data, dev_data, test_data = [], [], []

In [8]:
# Start from empty lists so that re-running this cell does not append the same
# records a second time.
train_data, dev_data, test_data = [], [], []

for label, items in class_data.items():
    n_items = len(items)
    # Round up so small label groups feed train first, then dev; clamp so the
    # counts never exceed what is actually available in the group.
    n_train = min(math.ceil(train_ratio * n_items), n_items)
    n_dev = min(math.ceil(dev_ratio * n_items), n_items - n_train)
    n_test = n_items - n_train - n_dev  # whatever is left goes to the test set

    # Add the group's records to the three splits.
    dev_start = n_train
    test_start = n_train + n_dev
    train_data.extend(items[:dev_start])
    dev_data.extend(items[dev_start:test_start])
    test_data.extend(items[test_start:test_start + n_test])

# Every grouped record must land in exactly one split.
assert len(train_data) + len(dev_data) + len(test_data) == sum(
    len(items) for items in class_data.values()
)

In [9]:
# Shuffle each split in place with a dedicated, seeded generator so that a
# Restart & Run All produces the same record order (and the same .spacy files)
# every time, without touching the global `random` state.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)
for split in (train_data, dev_data, test_data):
    split_rng.shuffle(split)

In [10]:
def _export_split(records, file_name):
    """Convert `records` to a DocBin, write it under PROCESSED_DATA_FOLDER, and return it."""
    doc_bin = convert_to_spacy_format(records, nlp)
    doc_bin.to_disk(f"{PROCESSED_DATA_FOLDER}/{file_name}")
    return doc_bin


# One .spacy file per split, written in train -> dev -> test order.
train_doc_bin = _export_split(train_data, "train.spacy")
dev_doc_bin = _export_split(dev_data, "dev.spacy")
test_doc_bin = _export_split(test_data, "test.spacy")

In [53]:
# Report the first CUDA device. Fall back to "cpu" so the cell also runs on
# machines without a usable GPU instead of raising.
device_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"
device_name

'NVIDIA GeForce RTX 3060'

In [None]:
# TODO: change the config (cfg) so it can run on CPU