In [1]:
from ast import literal_eval
import pandas as pd

train_data = pd.read_csv('data_base/train_with_aug.csv', sep=';')
train_df = train_data.drop_duplicates()

train_data = train_data.drop_duplicates()

def parse_annotation(ann):
    return literal_eval(ann)

def align_tokens_and_labels_v2(text, annotations):
    """
    Улучшенная версия с обработкой граничных случаев.
    """
    tokens = text.split()
    labels = ['O'] * len(tokens)
    
    # Сортируем аннотации по начальной позиции
    annotations = sorted(annotations, key=lambda x: x[0])
    
    # Находим позиции токенов
    current_pos = 0
    token_positions = []
    for token in tokens:
        start = text.find(token, current_pos)
        end = start + len(token)
        current_pos = end
        token_positions.append((start, end))
    
    # Сопоставляем токены с аннотациями
    for i, (token_start, token_end) in enumerate(token_positions):
        best_annotation = None
        best_overlap = 0
        
        for start, end, label in annotations:
            # Вычисляем перекрытие токена и аннотации
            overlap = min(token_end, end) - max(token_start, start)
            
            if overlap > 0 and overlap > best_overlap:
                best_overlap = overlap
                best_annotation = label
        
        if best_annotation:
            labels[i] = best_annotation
    
    return tokens, labels

def make_txt(df, txt_path):
    # Применяем функции ко всему датафрейму
    formatted_data = []
    for index, row in df.iterrows():
        text = row['sample']
        annotations = parse_annotation(row['annotation'])
        tokens, bio_labels = align_tokens_and_labels_v2(text, annotations)

        # Формируем строки для записи в файл
        for token, label in zip(tokens, bio_labels):
            formatted_data.append(f"{token} {label}")
        formatted_data.append('')  # Пустая строка между предложениями

    # Сохраняем данные в файл train.txt в формате, ожидаемом DeepPavlov
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(formatted_data))

In [3]:
new_train_df = train_df[train_df["sample"].str.split(" ").apply(lambda x: len(x) > 0)]
new_train_df["len_pred"] = new_train_df["annotation"].apply(lambda x: len(literal_eval(x)))
new_train_df = new_train_df[new_train_df.len_pred > 0]
new_train_df["sample"] = new_train_df["sample"].str.lower()
new_train_df = new_train_df.loc[new_train_df["sample"].str.lower().drop_duplicates().index]
new_train_df["annotation"][new_train_df["annotation"].str.contains("'0'")] = "[(0, 7, 'B-TYPE'), (8, 9, 'O'), (10, 17, 'O')]"
new_train_df.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_train_df["annotation"][new_train_df["annotation"].str.contains("'0'")] = "[(0, 7, 'B-TYPE'), (8, 9, 'O'), (10, 17, 'O')]"


(89278, 3)

In [5]:
from sklearn.model_selection import train_test_split

train_df_new, val_df_new = train_test_split(new_train_df, test_size=0.2)

In [6]:
make_txt(train_df_new, 'data_base/train_all_word.txt')
make_txt(val_df_new, 'data_base/val_all_word.txt')

In [7]:
!python3 -m spacy convert data_base/train_all_word.txt data_base/ -c ner
!python3 -m spacy convert data_base/val_all_word.txt data_base/ -c ner

[38;5;4mℹ Auto-detected token-per-line NER format[0m
[38;5;4mℹ Grouping every 1 sentences into a document.[0m
[38;5;3m⚠ To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;2m✔ Generated output file (71422 documents):
data_base/train_all_word.spacy[0m
[38;5;4mℹ Auto-detected token-per-line NER format[0m
[38;5;4mℹ Grouping every 1 sentences into a document.[0m
[38;5;3m⚠ To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;2m✔ Generated output file (17856 documents):
data_base/val_all_word.spacy[0m


In [None]:
!python3 -m spacy train config/config_transformer.cfg --output model_ner_all_word/ --paths.train data_base/train_all_word.spacy --paths.dev data_base/val_all_word.spacy --gpu-id 1


[38;5;4mℹ Saving to output directory: model_ner_all_word[0m
[38;5;4mℹ Using GPU: 1[0m
[1m
Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
!python3 -m spacy debug config config/config_transformer.cfg

[1m
[38;5;1m✘ Config validation error[0m
disabled	field required
before_creation	field required
after_creation	field required
after_pipeline_creation	field required
{'lang': 'ru', 'pipeline': ['transformer', 'ner'], 'batch_size': 128, 'tokenizer': {'@tokenizers': 'spacy.Tokenizer.v1'}, 'vectors': {'@vectors': 'spacy.Vectors.v1'}}

If your config contains missing values, you can run the 'init fill-config'
command to fill in all the defaults, if possible:

python -m spacy init fill-config config/config_transformer.cfg config/config_transformer.cfg 

