# Initialize paths to data

In [None]:
train_path_dataset = '/kaggle/input/github/Assignment_3/data/train.jsonl'
test_path_dataset = '/kaggle/input/github/Assignment_3/data/test.jsonl'

# Install and import libraries

In [10]:
# Install required packages
!pip install -q peft transformers datasets evaluate seqeval wandb spancat thinc==8.2.3
!python3 -m spacy download ru_core_news_lg

^C
Traceback (most recent call last):
  File "/opt/conda/bin/pip", line 10, in <module>
    sys.exit(main())
  File "/opt/conda/lib/python3.10/site-packages/pip/_internal/cli/main.py", line 77, in main
    command = create_command(cmd_name, isolated=("--isolated" in cmd_args))
  File "/opt/conda/lib/python3.10/site-packages/pip/_internal/commands/__init__.py", line 114, in create_command
    module = importlib.import_module(module_path)
  File "/opt/conda/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _c

In [19]:
import pandas as pd
import spacy
from spacy.training import Example
from typing import List
import random
from spacy.util import compounding, minibatch
from spacy.pipeline.spancat import DEFAULT_SPANCAT_MODEL
from thinc.api import RAdam

# Preprocessing

In [12]:
# Read training data from a JSONL file
json_data = pd.read_json(train_path_dataset, lines=True)
json_data = json_data[json_data.columns[::-1]]

# Display training data from a JSONL file
json_data

Unnamed: 0,id,sentences,ners
0,0,Бостон взорвали Тамерлан и Джохар Царнаевы из ...,"[[0, 5, CITY], [16, 23, PERSON], [34, 41, PERS..."
1,1,Умер избитый до комы гитарист и сооснователь г...,"[[21, 28, PROFESSION], [53, 67, ORGANIZATION],..."
2,2,Путин подписал распоряжение о выходе России из...,"[[0, 4, PERSON], [37, 42, COUNTRY], [47, 76, O..."
3,3,Бенедикт XVI носил кардиостимулятор\nПапа Римс...,"[[0, 11, PERSON], [36, 47, PROFESSION], [49, 6..."
4,4,Обама назначит в Верховный суд латиноамериканк...,"[[0, 4, PERSON], [17, 29, ORGANIZATION], [48, ..."
...,...,...,...
514,514,Глава Малайзии: мы не хотим противостоять Кита...,"[[42, 46, COUNTRY], [82, 87, COUNTRY], [104, 1..."
515,515,«Союз» впервые пристыковался к МКС за 6 часов\...,"[[1, 4, PRODUCT], [31, 33, FACILITY], [35, 44,..."
516,516,Трамп и Путин сделали совместное заявление к 7...,"[[0, 4, PERSON], [8, 12, PERSON], [45, 52, AGE..."
517,517,Российский магнат устроил самую дорогую свадьб...,"[[0, 9, NATIONALITY], [58, 72, PERSON], [101, ..."


In [13]:
def get_tokens(text: str, annotations: List[List[int]]) -> List[str]:
    """
    Split the text based on annotations.

    Args:
    - text (str): Input text.
    - annotations (List[List[int]]): List of annotations.

    Returns:
    - List[str]: List of tokens.
    """
    split_sentences = []

    for annotation in annotations:
        start_idx, end_idx = annotation[0], annotation[1]
        word = text[start_idx:end_idx+1]
        split_sentences.append(word)

    return split_sentences

In [14]:
# Add tokens and tags columns to the DataFrame
json_data['tokens'] = [get_tokens(data[1]['sentences'], data[1]['ners']) for data in json_data.iterrows()]
json_data['tags'] = [[tags[2] for tags in data] for data in json_data['ners']]

# Training

In [15]:
# Load pretrained spaCy model
nlp = spacy.load("ru_core_news_lg")

# Define span key
span_key = "sc"

In [16]:
def get_span_and_text(tags, entities):
    """
    Get span entities and cleaned text.

    Args:
    - tags (List[str]): List of tags.
    - entities (List[str]): List of entities.

    Returns:
    - Tuple[List[Tuple[int, int, str]], str]: Tuple containing span entities and cleaned text.
    """
    span_ents = []
    text = ''
    for label, entity in zip(tags, entities):
        start = len(text)
        entity = entity.strip()
        text = text + entity + ' '
        end = len(text)-1
        span_ents.append((start, end, label))

    return span_ents, text

In [17]:
# Create spaCy compliant training data
train_data = []
for i, row in json_data.iterrows():
    text = row['tokens']
    span_ents, text = get_span_and_text(row['tags'], text)
    doc = nlp(text)
    annotation = {"spans": {span_key: span_ents}}
    train_data.append(Example.from_dict(doc, annotation))

In [20]:
# Define label list
label_list = [
    'AGE', 'AWARD', 'CITY', 'COUNTRY', 'CRIME', 'DATE', 'DISEASE', 'DISTRICT', 'EVENT', 'FACILITY',
    'FAMILY', 'IDEOLOGY', 'LANGUAGE', 'LAW', 'LOCATION', 'MONEY', 'NATIONALITY', 'NUMBER', 'ORDINAL',
    'ORGANIZATION', 'PENALTY', 'PERCENT', 'PERSON', 'PRODUCT', 'PROFESSION', 'RELIGION',
    'STATE_OR_PROVINCE', 'TIME', 'WORK_OF_ART'
]

# Define spancat config
config = {
    "threshold": 0.5,
    "spans_key": span_key,
    "max_positive": 1,
    "model": DEFAULT_SPANCAT_MODEL,
    "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
}

In [21]:
# Add spancat component to spaCy pipeline
nlp.add_pipe("spancat", config=config)
span = nlp.get_pipe('spancat')

# Add labels to spancat component
for label in label_list:
    span.add_label(label)

In [22]:
# Define optimizer
optimizer = RAdam(
    learn_rate=0.01,
    beta1=0.9,
    beta2=0.999,
    eps=1e-08,
    grad_clip=1.0,
    use_averages=True,
)

# Get pipes for training
pipe_exceptions = ["spancat"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Initialize spaCy object
nlp.initialize()

[2024-04-23 13:01:13,742] [INFO] Created vocabulary
[2024-04-23 13:01:13,743] [INFO] Finished initializing nlp object


<thinc.optimizers.Optimizer at 0x7ae03f1258a0>

In [24]:
# Start training the spancat component
all_losses = []
with nlp.disable_pipes(*unaffected_pipes):
    for iteration in range(50):
        random.shuffle(train_data)
        losses = {}
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            nlp.update(list(batch), losses=losses, drop=0.2, sgd=optimizer)
        print("epoch: {} Losses: {}".format(iteration, str(losses)))
        all_losses.append(losses['spancat'])

epoch: 0 Losses: {'spancat': 3350.5360975265503}
epoch: 1 Losses: {'spancat': 3396.0509462356567}
epoch: 2 Losses: {'spancat': 3325.958384513855}
epoch: 3 Losses: {'spancat': 3234.6865282058716}
epoch: 4 Losses: {'spancat': 3136.7268104553223}
epoch: 5 Losses: {'spancat': 3185.872924566269}
epoch: 6 Losses: {'spancat': 3084.6125264167786}
epoch: 7 Losses: {'spancat': 3057.916656970978}
epoch: 8 Losses: {'spancat': 3015.0736298561096}
epoch: 9 Losses: {'spancat': 2956.8053131103516}
epoch: 10 Losses: {'spancat': 3029.975591659546}
epoch: 11 Losses: {'spancat': 2976.8134326934814}
epoch: 12 Losses: {'spancat': 2948.631758213043}
epoch: 13 Losses: {'spancat': 2897.289381504059}
epoch: 14 Losses: {'spancat': 2865.2538924217224}
epoch: 15 Losses: {'spancat': 2814.6702075004578}
epoch: 16 Losses: {'spancat': 2839.282263278961}
epoch: 17 Losses: {'spancat': 2830.113566160202}
epoch: 18 Losses: {'spancat': 2797.7954564094543}
epoch: 19 Losses: {'spancat': 2718.7410435676575}
epoch: 20 Losses: 

# Prediction

In [25]:
# Read test data from a JSONL file
test_data = pd.read_json(test_path_dataset, lines=True)
test_data = test_data[test_data.columns[::-1]]

In [28]:
from spacy import displacy

# Perform NER on test data
ners = []
for text in test_data['senences']:
    doc = nlp(text)
    preds = []
    spans = doc.spans[span_key]
    ents_parse = displacy.parse_spans(doc, options={"spans_key": span_key})
    for span in ents_parse['spans']:
        preds.append([span['start'], span['end'], span['label']])

    ners.append(preds)

test_data['ners'] = ners

  matches = self.matcher(doc, allow_missing=True, as_spans=False)


In [30]:
# Save results to a JSONL file
submission_df = test_data[['id', 'ners']]
submission_df.to_json('test.jsonl', lines=True, orient='records', force_ascii=False)

In [31]:
submission_df

Unnamed: 0,id,ners
0,519,"[[0, 7, PROFESSION], [8, 17, PERSON], [47, 54,..."
1,520,"[[213, 222, PERSON], [272, 280, PROFESSION], [..."
2,521,"[[0, 10, EVENT], [52, 55, COUNTRY], [55, 57, P..."
3,522,"[[125, 130, PROFESSION], [136, 143, CITY], [54..."
4,523,"[[0, 5, PROFESSION], [163, 169, EVENT], [170, ..."
...,...,...
60,579,"[[0, 6, ORDINAL], [114, 121, PROFESSION], [155..."
61,580,"[[0, 8, PERSON], [9, 15, EVENT], [16, 29, CITY..."
62,581,"[[17, 23, CITY], [51, 53, PROFESSION], [53, 57..."
63,582,"[[22, 28, PERSON], [29, 35, PROFESSION], [36, ..."
