In [None]:
from IPython.display import clear_output
import ipywidgets as widgets
import os

def inf(msg='\u2714 Done', style='success', wdth='50px'):
    """Replace the current cell output with a disabled status button.

    Args:
        msg: Text shown on the button.
        style: ipywidgets ``button_style`` value.
        wdth: Minimum CSS width of the button.
    """
    badge_layout = widgets.Layout(min_width=wdth)
    badge = widgets.Button(
        description=msg,
        disabled=True,
        button_style=style,
        layout=badge_layout,
    )
    # Wipe whatever the cell printed so far, then show only the badge.
    clear_output()
    display(badge)

In [None]:
# Install the Weights & Biases client used for experiment logging further down.
# NOTE(review): the version is not pinned and `!pip` runs in a subshell; consider
# `%pip install` with an explicit version so reruns are reproducible.
! pip install -qqq wandb

In [None]:
import spacy
from spacy import displacy
from spacy.tokens import Span, Doc
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Download the `ru_core_news_md` spaCy pipeline that the training section loads
# with `spacy.load("ru_core_news_md")`.
! python -m spacy download ru_core_news_md

# inf() clears the cell output and shows the "Done" badge instead.
inf()

Button(button_style='success', description='✔ Done', disabled=True, layout=Layout(min_width='50px'), style=But…

# Data Preprocessing

In [None]:
# Fetch the two dataset files from Google Drive by file id. According to the
# recorded output they are saved as `train_data.csv` and `train_data_fixed.csv`;
# only `train_data_fixed.csv` is read by the preprocessing cell below.
! gdown 1VE6VvBQZ3GL1s6glHaa8UTa7h1WHQijW
! gdown 1fzBIyZr1_VQnwuiDdJP60xEs3e4ebTVT

Downloading...
From: https://drive.google.com/uc?id=1VE6VvBQZ3GL1s6glHaa8UTa7h1WHQijW
To: /content/train_data.csv
100% 8.85M/8.85M [00:00<00:00, 66.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1fzBIyZr1_VQnwuiDdJP60xEs3e4ebTVT
To: /content/train_data_fixed.csv
100% 12.9M/12.9M [00:00<00:00, 36.4MB/s]


In [None]:
import pandas as pd
import json

def _convert_str_to_json(s):
    try:
        s = s.replace('\'', '\"')
        return json.loads(s)
    except:
        return None

def _expand_labels(text, labels_positions):
    words = text.split()
    positions = _convert_str_to_json(labels_positions)
    i_value = positions.get('I-value', [None])
    b_value = positions.get('B-value', [None])
    b_discount = positions.get('B-discount', [None])
    labels = []

    # first, label all words from b_value idx to i_value idx as 'value' entity
    char_start_pos = 0
    if b_value is not None:
        i_value = i_value if i_value is not None else b_value
        for idx, word in enumerate(words):
            if idx in b_value or idx in i_value:
                char_end_pos = char_start_pos + len(word)
                # print(f"{word} | {text[char_start_pos:char_end_pos]} | VALUE")
                labels.append({'start': char_start_pos, 'end': char_end_pos, 'labels': 'value', 'text': word})
            char_start_pos += len(word) + 1

    # next, label b_discount entity (relabel if needed)
    char_start_pos = 0
    if b_discount is not None:
        for idx, word in enumerate(words):
            if idx in b_discount:
                char_end_pos = char_start_pos + len(word)
                # print(f"{word} | {text[char_start_pos:char_end_pos]} | DISCOUNT")
                labels.append({'start': char_start_pos, 'end': char_end_pos, 'labels': 'discount', 'text': word})
            char_start_pos += len(word) + 1

    return labels




# Read the corrected dataset and attach character-level entity records per row.
df = pd.read_csv('train_data_fixed.csv')
df['spacy_labels'] = df.apply(
    lambda row: _expand_labels(row['processed_text'], row['target_labels_positions']),
    axis=1,
)

df.head()

Unnamed: 0,processed_text,labels,target_labels_positions,Проверка,Процент_для_классификатора,Процент_в_тексте,Unnamed: 6,spacy_labels
0,аа союзная тридцать пять дробь один лариса сое...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",{},,,,,[]
1,аа приложение мне показывает к оплате у меня п...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",{},,,,,[]
2,да лисное по призрак лишнее ну почему иду пять...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","{'I-value': [140], 'B-value': [139], 'B-discou...",ok,2.0,2.0,,"[{'start': 904, 'end': 907, 'labels': 'value',..."
3,а что добрый день NAME у меня пришел какой то ...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",{'B-discount': [12]},ok,,,,"[{'start': 60, 'end': 66, 'labels': 'discount'..."
4,у меня западный с утра да да еще да да самый в...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",{},,,,,[]


In [None]:
# label studio annotations format:
"""
{'data':
    {'text': ...,
     'predictions': {
      "model_version": "toloka",
      "result": [
        {
          "from_name": "label",
          "to_name": "text",
          "type": "labels",
          "value": {
            "start": ...,
            "end": ...,
            "text": "... ... ...",
            "labels": [
              "LABEL_1"
            ]
          }
        },
        ...

"""

'\n{\'data\':\n    {\'text\': ...,\n     \'preditions\': {\n      "model_version": "toloka",\n      "result": [\n        {\n          "from_name": "label",\n          "to_name": "text",\n          "type": "labels",\n          "value": {\n            "start": ...,\n            "end": ...,\n            "text": "... ... ...",\n            "labels": [\n              "LABEL_1"\n            ]\n          }\n        },\n        ...\n\n'

In [None]:
def convert_to_label_studio_format(text, labels):
    """Wrap entity records into a Label Studio pre-annotation task.

    Args:
        text: The task text.
        labels: Iterable of dicts with keys ``'start'``, ``'end'``, ``'text'``
            and ``'labels'`` (a single label name).

    Returns:
        ``{'data': {'text': ...}, 'predictions': [{'model_version': 'toloka',
        'result': [...]}]}`` with one result item per input record.
    """
    results = [
        {
            'from_name': 'label',
            'to_name': 'text',
            'type': 'labels',
            'value': {
                'start': span['start'],
                'end': span['end'],
                'text': span['text'],
                # Label Studio expects a list of label names per region.
                'labels': [span['labels']],
            },
        }
        for span in labels
    ]
    prediction = {'model_version': 'toloka', 'result': results}
    return {'data': {'text': text}, 'predictions': [prediction]}


def convert_to_spacy_format(text, labels):
    """Build a ``(text, {'entities': [(start, end, label), ...]})`` training tuple.

    Args:
        text: The utterance.
        labels: Iterable of dicts with keys ``'start'``, ``'end'`` and ``'labels'``.
    """
    spans = [(item['start'], item['end'], item['labels']) for item in labels]
    return text, {'entities': spans}




# Derive both export formats from the per-row entity records.
df['label_studio_format'] = df.apply(
    lambda row: convert_to_label_studio_format(row['processed_text'], row['spacy_labels']),
    axis=1,
)
df['spacy_format'] = df.apply(
    lambda row: convert_to_spacy_format(row['processed_text'], row['spacy_labels']),
    axis=1,
)
df.head()

Unnamed: 0,processed_text,labels,target_labels_positions,Проверка,Процент_для_классификатора,Процент_в_тексте,Unnamed: 6,spacy_labels,label_studio_format,spacy_format
0,аа союзная тридцать пять дробь один лариса сое...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",{},,,,,[],{'data': {'text': 'аа союзная тридцать пять др...,(аа союзная тридцать пять дробь один лариса со...
1,аа приложение мне показывает к оплате у меня п...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",{},,,,,[],{'data': {'text': 'аа приложение мне показывае...,(аа приложение мне показывает к оплате у меня ...
2,да лисное по призрак лишнее ну почему иду пять...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","{'I-value': [140], 'B-value': [139], 'B-discou...",ok,2.0,2.0,,"[{'start': 904, 'end': 907, 'labels': 'value',...",{'data': {'text': 'да лисное по призрак лишнее...,(да лисное по призрак лишнее ну почему иду пят...
3,а что добрый день NAME у меня пришел какой то ...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",{'B-discount': [12]},ok,,,,"[{'start': 60, 'end': 66, 'labels': 'discount'...",{'data': {'text': 'а что добрый день NAME у ме...,(а что добрый день NAME у меня пришел какой то...
4,у меня западный с утра да да еще да да самый в...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",{},,,,,[],{'data': {'text': 'у меня западный с утра да д...,(у меня западный с утра да да еще да да самый ...


In [None]:
# save Label Studio annotations to a JSON file
import json

# One Label Studio task dict per dataframe row, in row order.
list_of_annotations = df['label_studio_format'].tolist()

with open('annotations.json', 'w', encoding='utf-8') as f:
    json.dump(list_of_annotations, f)

# Training

In [None]:
# Plain list of (text, {'entities': [...]}) tuples consumed by the training cells.
spacy_annotations = df['spacy_format'].tolist()

In [None]:
# ▶️ ANNOTATIONS PREPROCESSING

import hashlib

def hash_text(text):
    """Return the hexadecimal SHA-256 digest of ``text`` (encoded with ``str.encode()``)."""
    digest = hashlib.sha256()
    digest.update(text.encode())
    return digest.hexdigest()

def remove_overlapping_entities(data, deduplicate=False, verbose=False):
    """Drop entity spans that overlap an earlier-starting span.

    For every example the spans are ordered by start offset and kept greedily:
    a span is retained only if it starts at or after the end of the last
    retained span.

    Args:
        data: Sequence of ``(text, {'entities': [(start, end, label), ...]})``.
        deduplicate: When true, only the first example for each distinct text
            (compared by ``hash_text``) is kept.
        verbose: When true, print the example count before and after.

    Returns:
        A new list of ``(text, {'entities': [...]})`` tuples. The input entity
        lists are left untouched.
    """
    if verbose:
        print(f"remove_overlapping_entities: {len(data)}", end=" >>> ")

    cleaned_data = []
    seen_hashes = set()

    for text, annotations in data:
        if deduplicate:
            # Hash only when needed; skip texts that were already emitted.
            text_hash = hash_text(text)
            if text_hash in seen_hashes:
                continue
            seen_hashes.add(text_hash)

        # sorted() builds a copy, so the caller's list is not reordered in place.
        entity_spans = sorted(annotations['entities'], key=lambda span: span[0])

        non_overlapping_entities = []
        last_end = 0
        for start, end, label in entity_spans:
            if start >= last_end:
                non_overlapping_entities.append((start, end, label))
                last_end = end

        cleaned_data.append((text, {'entities': non_overlapping_entities}))

    if verbose:
        print(f"\033[094m{len(cleaned_data)}\033[0m")
    return cleaned_data


def upsample_entities(data,
                      upsample_label='PER',
                      upsample_factor=4,
                      negative_label='DATE',
                      wo_negative=True,
                      verbose=True):
    """Append extra copies of the examples that contain ``upsample_label``.

    Args:
        data: List of ``(text, {'entities': [(start, end, label), ...]})``.
        upsample_label: Examples having at least one entity with this label are
            duplicated.
        upsample_factor: Number of extra copies appended for each selected example.
        negative_label: Label that disqualifies an example when ``wo_negative``
            is true.
        wo_negative: When true, examples that also contain ``negative_label``
            are not duplicated.
        verbose: When true, print the example count before and after.

    Returns:
        A new list: the original examples followed by the duplicated ones.
    """
    if verbose:
        print(f"upsample_entities [{upsample_label}]: {len(data)}", end=" >>> ")

    def _is_candidate(ents):
        # Selected when the target label is present and, if requested, the
        # negative label is absent.
        if not any(label == upsample_label for _, _, label in ents):
            return False
        if wo_negative and any(label == negative_label for _, _, label in ents):
            return False
        return True

    label_examples = [example for example in data
                      if _is_candidate(example[1]['entities'])]

    upsampled_data = list(data) + label_examples * upsample_factor

    if verbose:
        print(f"\033[094m{len(upsampled_data)}\033[0m")

    # The result must be returned; callers assign it back to their training set.
    return upsampled_data


In [None]:
# ▶️ TARGET BATCH ACCURACY

# Entity labels kept by flat_accuracy when `use_spacy_ents=True`; currently none.
SPACY_ENTS = []

def simple_check(actual, preds):
    """Score predicted labels against gold labels for one text.

    Args:
        actual: List of gold entity labels.
        preds: List of predicted entity labels.

    Returns:
        ``1`` when both lists are equal, ``0`` when exactly one of them is
        empty, otherwise the fraction of gold labels matched by a prediction.
        Each gold label can be matched at most once, so the result never
        exceeds 1 even when a label is predicted more often than it occurs.
    """
    if actual == preds:
        return 1
    if len(actual) == 0 or len(preds) == 0:
        return 0

    # Consume gold labels as they are matched so repeated predictions of the
    # same label are not counted more often than the label actually occurs.
    unmatched = list(actual)
    correct = 0
    for e in preds:
        if e in unmatched:
            unmatched.remove(e)
            correct += 1
    return correct / len(actual)

def flat_accuracy(text, annotations, nlp_model, use_spacy_ents=False,
                  important_entities=SPACY_ENTS):
    """Compare the labels predicted for ``text`` with its gold labels.

    Only label names are compared (via ``simple_check``); span offsets are ignored.

    Args:
        text: Input text passed to ``nlp_model``.
        annotations: Gold entity tuples; the label is read from index 2 and an
            empty tuple contributes an empty-string label.
        nlp_model: Callable returning an object with an ``ents`` iterable whose
            items expose ``label_``.
        use_spacy_ents: When true, both label lists are restricted to
            ``important_entities`` before comparison.
        important_entities: Labels retained when ``use_spacy_ents`` is true.

    Returns:
        The score produced by ``simple_check``.
    """
    gold_labels = []
    for span in annotations:
        gold_labels.append(span[2] if len(span) > 0 else "")

    doc = nlp_model(text)
    predicted_labels = [entity.label_ for entity in doc.ents]

    if use_spacy_ents:
        # Keep only the labels of interest on both sides.
        gold_labels = [lab for lab in gold_labels if lab in important_entities]
        predicted_labels = [lab for lab in predicted_labels if lab in important_entities]

    return simple_check(gold_labels, predicted_labels)

def batch_accuracy(batch, nlp_model):
    """Average ``flat_accuracy`` over a batch, expressed as a percentage.

    Args:
        batch: Sized collection of ``(text, {'entities': [...]})`` examples.
        nlp_model: Model forwarded to ``flat_accuracy``.

    Returns:
        Mean per-example score multiplied by 100; ``0.0`` for an empty batch
        (instead of raising ``ZeroDivisionError``).
    """
    if len(batch) == 0:
        return 0.0
    predict_points = sum(
        flat_accuracy(example[0], example[1]['entities'], nlp_model)
        for example in batch
    )
    return (predict_points / len(batch)) * 100

In [None]:
# ▶️ METRICS

from collections import defaultdict
from tqdm.notebook import tqdm_notebook as tqdm

def calculate_metrics(test, nlp_model):
    """Compute per-label precision, recall and F1 with exact span matching.

    A predicted entity counts as a true positive only if a gold entity has the
    same ``(start, end, label_)`` triple, as read from the spans of the gold
    ``Example`` reference and of the predicted doc.

    Args:
        test: Iterable of ``(text, annotations)`` pairs accepted by
            ``Example.from_dict``.
        nlp_model: spaCy pipeline used for tokenisation and prediction.

    Returns:
        ``{"f1_scores": {...}, "precisions": {...}, "recalls": {...}}`` keyed by
        label. Every label seen in the gold or predicted entities is reported,
        including labels with zero true positives (which score 0).
    """
    TP = defaultdict(int)
    FP = defaultdict(int)
    FN = defaultdict(int)

    for text, annot in tqdm(test):
        gold = Example.from_dict(nlp_model.make_doc(text), annot)
        pred_doc = nlp_model(text)

        gold_ents = {(ent.start, ent.end, ent.label_) for ent in gold.reference.ents}
        pred_ents = {(ent.start, ent.end, ent.label_) for ent in pred_doc.ents}

        for _, _, label in gold_ents & pred_ents:
            TP[label] += 1
        for _, _, label in gold_ents - pred_ents:
            FN[label] += 1
        for _, _, label in pred_ents - gold_ents:
            FP[label] += 1

    # Iterate over every label that occurred anywhere, not only those with at
    # least one true positive; otherwise a completely missed class would
    # silently disappear from the report.
    all_labels = sorted(set(TP) | set(FP) | set(FN))

    f1_scores = {}
    precisions = {}
    recalls = {}
    for label in all_labels:
        tp, fp, fn = TP[label], FP[label], FN[label]
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1_scores[label] = (2 * (precision * recall) / (precision + recall)
                            if precision + recall > 0 else 0)
        precisions[label] = precision
        recalls[label] = recall

    return {"f1_scores": f1_scores, "precisions": precisions, "recalls": recalls}

In [None]:
# ▶️ STRATIFY TEST AND TRAIN

from sklearn.model_selection import train_test_split

def has_label(annotations, target_label):
    """Return True when any entity in ``annotations['entities']`` carries ``target_label``."""
    return any(entity[2] == target_label
               for entity in annotations['entities'])

def train_test_split_stratified(data, test_size=0.2, label='PER', random_state=None):
    """Split ``data`` into train/test, stratified on the presence of ``label``.

    Args:
        data: List of ``(text, {'entities': [...]})`` examples.
        test_size: Forwarded to ``train_test_split``.
        label: Entity label whose presence/absence forms the two strata. Pass
            ``None`` for a plain, non-stratified split.
        random_state: Forwarded to ``train_test_split``; set it to make the
            split reproducible. Defaults to ``None`` (previous behaviour).

    Returns:
        ``(train_data, test_data)``.
    """
    # Skip the per-example scan when no stratification was requested.
    strata = None
    if label is not None:
        strata = [has_label(annotations, label) for _, annotations in data]

    train_data, test_data = train_test_split(data,
                                             test_size=test_size,
                                             stratify=strata,
                                             random_state=random_state)
    return train_data, test_data


In [None]:
# ▶️ PROCESS ANNOTATIONS
# Custom entity labels; the wandb/model cell adds each of them to the NER pipe.
LABELS = {'discount', 'value'}

from spacy.training.example import Example
import random

# Seeds Python's `random` module, which the training loop uses for random.shuffle.
# NOTE(review): nothing here seeds the splitter used by train_test_split_stratified,
# so the split below is presumably not reproducible from this seed alone — confirm.
random.seed(2589)
# Not referenced anywhere else in the visible notebook.
labels_to_ignore = []

print(f"\n\033[094mspacy_annotations: {len(spacy_annotations)}\033[0m\n")

# Two-stage hold-out: first carve out 10% stratified on 'value', then 10% of the
# remainder stratified on 'discount'; the test set is the union of both hold-outs.
train, test1 = train_test_split_stratified(spacy_annotations, test_size=0.1, label='value')
train, test2 = train_test_split_stratified(train, test_size=0.1, label='discount')
test = test1 + test2

# upsampling for underrepresented class
# train = upsample_entities(train, 'value', 2)

# Report the resulting split sizes between two 100-character rules.
print("-"*100+f"\n\033[094mtrain: {len(train)} | test: {len(test)}\033[0m\n"+"-"*100)


[094mspacy_annotations: 3399[0m

----------------------------------------------------------------------------------------------------
[094mtrain: 2753 | test: 646[0m
----------------------------------------------------------------------------------------------------


In [None]:
def calculate_stats(data):
    """Count, per entity label, how many texts and how many entities contain it.

    Args:
        data: Iterable of ``(text, {'entities': [(start, end, label), ...]})``.

    Returns:
        ``{label: {"num_texts": int, "num_entities": int}, ...}`` where
        ``num_texts`` counts each text at most once per label.
    """
    stats = {}
    for _text, annotations in data:
        counted_here = set()  # labels already credited to this text
        for _start, _end, label in annotations['entities']:
            entry = stats.get(label)
            if entry is None:
                stats[label] = {"num_texts": 1, "num_entities": 1}
            else:
                entry['num_entities'] += 1
                if label not in counted_here:
                    entry['num_texts'] += 1
            counted_here.add(label)
    return stats

In [None]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
# Same report for both splits: a heading line followed by the pretty-printed stats.
for heading, subset in (("train stats:", train), ("\ntest stats:", test)):
    print(heading)
    pp.pprint(calculate_stats(subset))

train stats:
{   'discount': {'num_entities': 536, 'num_texts': 407},
    'value': {'num_entities': 709, 'num_texts': 298}}

test stats:
{   'discount': {'num_entities': 110, 'num_texts': 96},
    'value': {'num_entities': 156, 'num_texts': 70}}


In [None]:
# ▶️ INITIALIZE WANDB

import getpass
import wandb
import uuid
from datetime import datetime

# Initialize new model version (run name carries the start timestamp)
model_version = f"ner_m24_{datetime.now().strftime('%Y-%m-%d_%H-%M')}"

# Logging setup.
# SECURITY: the API key must never be stored in the notebook. It is taken from
# the WANDB_API_KEY environment variable, or prompted for without echo when the
# variable is missing. The key that was previously committed here should be
# revoked in the W&B account settings.
if not os.environ.get('WANDB_API_KEY'):
    os.environ['WANDB_API_KEY'] = getpass.getpass('W&B API key: ')

# If init failed >>> wandb login --relogin
wandb.init(
    project="hack_m24",
    name=model_version,
    config={
      "lr": 1e-3,
      "batch_size": 4,
      "decay_factor": 0.6,
      "decay_after": 20,
      "decay_every": 6,
      "epochs": 42,
      "comments": {"split": True, "upsample": False, "stratify": "value"}
      },
    settings=wandb.Settings(start_method="thread")
           )

# Start from the pretrained Russian pipeline and extend its NER component with
# the project's labels.
nlp = spacy.load("ru_core_news_md")
ner = nlp.get_pipe("ner")

# sorted(): a set has no stable iteration order, so fix the order in which the
# labels are registered.
for label in sorted(LABELS):
    ner.add_label(label)

# Training examples built from the (text, annotations) tuples of the train split.
examples = [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in train]

VBox(children=(Label(value='0.001 MB of 0.020 MB uploaded\r'), FloatProgress(value=0.06431867864950207, max=1.…

0,1
Iteration,▁

0,1
Iteration,0


In [None]:
# Colab form fields (the `#@param` annotations are kept verbatim).
# The values below only take effect when FINE_TUNING is enabled; otherwise the
# hyperparameters passed to wandb.init stay in force.
FINE_TUNING = False  #@param {type: "boolean"}
EPOCHS = 42  #@param {type: "integer"}
BATCH_SIZE = 4  #@param {type: "integer"}
LEARNING_RATE = 1e-3  #@param {type: "number"}
RELOAD_EXAMPLES = False  #@param {type: "boolean"}

if FINE_TUNING:
    # Override the run config that the training cell reads its settings from.
    # update wandb learning rate
    wandb.config.lr = LEARNING_RATE
    # update batch size
    wandb.config.batch_size = BATCH_SIZE
    # update epochs
    wandb.config.epochs = EPOCHS

if RELOAD_EXAMPLES:
    # Rebuild the Example objects from the current `train` list.
    examples = [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in train]

In [None]:
# ▶️ TRAIN

def pretty_print_metrics(itn, losses, metrics):
    """Print one coloured summary line with the NER loss and per-label F1.

    Args:
        itn: Iteration number.
        losses: Mapping that contains the ``'ner'`` loss.
        metrics: Mapping with an ``'f1_scores'`` dict of ``{label: float}``.
    """
    metrics_ = " ".join(f"{k}: {v:.4f}" for k, v in metrics['f1_scores'].items())
    # The loss is an absolute value, not a percentage, so no '%' suffix.
    print(f"\033[090mIteration: {itn} | Loss: {losses['ner']}\033[094m | {metrics_}\033[0m")

n_iter = wandb.config.epochs

# Training loop: only the NER component is updated, every other pipe is disabled.
disabled_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
optimizer = nlp.resume_training()
optimizer.learn_rate = wandb.config.lr
decay_factor = wandb.config.decay_factor
decay_after = wandb.config.decay_after
decay_every = wandb.config.decay_every
batch_size = wandb.config.batch_size
all_losses = []

wandb.log({"Iteration": 0})

with nlp.disable_pipes(*disabled_pipes):
    for itn in range(n_iter):
        random.shuffle(examples)
        losses = {}
        for batch in spacy.util.minibatch(examples, size=batch_size):
            # Pass the optimizer explicitly so the learning-rate schedule below
            # visibly applies to the optimizer that performs the updates.
            nlp.update(batch, drop=0.4, sgd=optimizer, losses=losses)

        # Read the loss once, tolerating a missing key (e.g. no batches ran),
        # and use the same value for printing, history and logging.
        ner_loss = losses.get('ner', 0.0)
        print(f"Iteration: {itn} | Loss: {ner_loss}")

        all_losses.append(ner_loss)
        wandb.log({
            "Iteration": itn,
            "Loss": ner_loss,
            "Learning Rate": optimizer.learn_rate,
        })

        # Measure and log metrics every 4 iterations
        if itn % 4 == 0 and itn != 0:
            test_accuracy = batch_accuracy(test, nlp)
            # Train accuracy is measured on a slice of the same size as the test set.
            train_accuracy = batch_accuracy(train[:len(test)], nlp)
            metrics = calculate_metrics(test, nlp)
            wandb.log({"Test_Accuracy": test_accuracy,
                       "Train_Accuracy": train_accuracy,
                       "F1_per_class": metrics['f1_scores'],
                       "Recall_per_class": metrics['recalls'],
                       "Precision_per_class": metrics['precisions'],
                       "Threshold": 0.94})

            pretty_print_metrics(itn, losses, metrics)

        # if itn % 6 == 0 and itn != 0:
        #     svg = wandb_snapshot(nlp)
        #     wandb.log({"Entity Example": wandb.Html(svg)})

        # Adjust learning rate: after `decay_after` iterations, multiply it by
        # `decay_factor` on every iteration divisible by `decay_every`.
        if itn > decay_after and itn % decay_every == 0:
            optimizer.learn_rate *= decay_factor

Iteration: 0 | Loss: 2348.2571551308993
Iteration: 1 | Loss: 2246.1032361776993
Iteration: 2 | Loss: 2080.1600313020194
Iteration: 3 | Loss: 2051.466635489423
Iteration: 4 | Loss: 2045.1298669968871


  0%|          | 0/646 [00:00<?, ?it/s]

[090mIteration: 4 | Loss: 2045.1298669968871%[094m | discount: 0.3270 value: 0.4901[0m
Iteration: 5 | Loss: 1973.2982268534897
Iteration: 6 | Loss: 1919.1122326455493
Iteration: 7 | Loss: 1793.2316554158529
Iteration: 8 | Loss: 1719.3441853886995


  0%|          | 0/646 [00:00<?, ?it/s]

[090mIteration: 8 | Loss: 1719.3441853886995%[094m | discount: 0.4628 value: 0.5233[0m
Iteration: 9 | Loss: 1615.2223732310633
Iteration: 10 | Loss: 1486.57166576068


KeyboardInterrupt: 