### Подготовка данных

In [21]:
import re

import pandas as pd
import torch
from transformers import CanineTokenizer, CanineForTokenClassification, BertTokenizer, BertForTokenClassification, pipeline

def load_model_CANINE(model_pth, device, label2id, id2label):
    """Load a CANINE tokenizer and token-classification model from one checkpoint.

    Args:
        model_pth: Path or identifier handed to both ``from_pretrained`` calls.
        device: Target passed to ``model.to`` (for example ``'cuda'`` or ``'cpu'``).
        label2id: Mapping from tag name to class index.
        id2label: Mapping from class index to tag name; its length is used as
            ``num_labels``.

    Returns:
        A ``(tokenizer, model)`` tuple, where ``model`` is the value returned by
        ``model.to(device)``.
    """
    label_config = dict(
        num_labels=len(id2label),
        id2label=id2label,
        label2id=label2id,
    )
    canine_tokenizer = CanineTokenizer.from_pretrained(model_pth)
    canine_model = CanineForTokenClassification.from_pretrained(model_pth, **label_config)
    canine_model = canine_model.to(device)
    return canine_tokenizer, canine_model

def load_model_BERT(model_pth, device, label2id, id2label):
    """Load a BERT token-classification model and its tokenizer from one checkpoint.

    Args:
        model_pth: Path or identifier handed to both ``from_pretrained`` calls.
        device: Target passed to ``model.to`` (for example ``'cuda'`` or ``'cpu'``).
        label2id: Mapping from tag name to class index.
        id2label: Mapping from class index to tag name; its length is used as
            ``num_labels``.

    Returns:
        A ``(tokenizer, model)`` tuple, where ``model`` is the value returned by
        ``model.to(device)``.
    """
    label_config = dict(
        num_labels=len(id2label),
        id2label=id2label,
        label2id=label2id,
    )
    # The model is loaded before the tokenizer, as in the original cell.
    bert_model = BertForTokenClassification.from_pretrained(model_pth, **label_config)
    bert_tokenizer = BertTokenizer.from_pretrained(model_pth)
    bert_model = bert_model.to(device)
    return bert_tokenizer, bert_model

# Tag name -> class index used by both token-classification heads.
# Only the `line` entity has a continuation tag ("I-line"); the other entities
# have a "B-" tag only.
label2id = {'B-width': 1,
            'B-height': 2,
            'B-radius': 3,
            'B-brand': 4,
            'B-line': 5,
            'I-line': 6,
            'O': 0}

# Derived from label2id so the two mappings cannot drift apart when a tag is
# added or renumbered.
id2label = {idx: tag for tag, idx in label2id.items()}

# Prefer the GPU, but fall back to the CPU so the loaders do not fail on a
# machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
test_csv_pth = '/home/sondors/Documents/price/BERT_NER/csv/prod_train/test_02our.csv'
df_test = pd.read_csv(test_csv_pth, sep=';')
df_test


Unnamed: 0,offer,width,height,radius,v_ind,line,brand,type,BIO_Tags
0,14 / 165 / 65 Bridgestone Blizzak LM005 79T,165,65,16,79T,Blizzak LM005,['Bridgestone'],наши_данные,"O,O,B-width,O,B-height,B-brand,B-line,I-line,O"
1,15 / 175 / 65 Bridgestone Turanza T005 84H,175,65,15,84H,Turanza T005,['Bridgestone'],наши_данные,"B-radius,O,B-width,O,B-height,B-brand,B-line,I..."
2,15 / 185 / 55 Bridgestone Turanza T005 82V,185,55,18,82V,Turanza T005,['Bridgestone'],наши_данные,"O,O,B-width,O,B-height,B-brand,B-line,I-line,O"
3,15 / 205 / 70 Bridgestone Blizzak LM005 96T,205,70,15,96T,Blizzak LM005,['Bridgestone'],наши_данные,"B-radius,O,B-width,O,B-height,B-brand,B-line,I..."
4,15169 BRIDGESTONE Автошина R 15 185 / 60 Bridg...,185,60,15,88T,Blizzak LM005,"['BRIDGESTONE', 'Bridgestone']",наши_данные,"O,B-brand,O,O,B-radius,B-width,O,B-height,B-br..."
...,...,...,...,...,...,...,...,...,...
1651,Автошина R 17 215 / 60 Bridgestone Dueler H/T ...,215,60,17,96H,Dueler H/T D843,['Bridgestone'],наши_данные,"O,O,B-radius,B-width,O,B-height,B-brand,B-line..."
1652,Автошина R 17 215 / 65 Bridgestone Blizzak LM0...,215,65,17,99H,Blizzak LM001,['Bridgestone'],наши_данные,"O,O,B-radius,B-width,O,B-height,B-brand,B-line..."
1653,Шины R 17 225 / 50 Bridgestone Blizzak Ice 98T...,225,50,17,98T,Blizzak Ice,['Bridgestone'],наши_данные,"O,O,B-radius,B-width,O,B-height,B-brand,B-line..."
1654,R 18 225 / 45 91S Bridgestone Blizzak VRX BRID...,225,45,18,91S,Blizzak VRX,"['Bridgestone', 'BRIDGESTONE']",наши_данные,"O,B-radius,B-width,O,B-height,O,B-brand,B-line..."


In [22]:
def extract_brands(text):
    """Return every Bridgestone spelling found in ``text``, in order of appearance.

    Only the exact Latin and Cyrillic spellings listed in the pattern are
    recognised; the match is case-sensitive. ``Bridgestone`` is listed before
    ``Bridgeston`` so the longer spelling wins when both could match.

    Args:
        text: String to search.

    Returns:
        A list with the matched substrings (empty when nothing matches).
    """
    brand_pattern = re.compile(r'Bridgestone|BRIDGESTONE|Бриджстоун|bridgestone|Бриджстон|Bridgeston|бриджстоун|бриджстон')
    return [match.group(0) for match in brand_pattern.finditer(text)]

# The `brand` column is read from the CSV as a string; rebuild it from the
# offer text so each cell holds a real list. The type of the first row's value
# is printed before and after to show the change.
print(type(df_test['brand'].iloc[0]))
df_test['brand'] = df_test['offer'].apply(extract_brands)
print(type(df_test['brand'].iloc[0]))
df_test

<class 'str'>
<class 'list'>


Unnamed: 0,offer,width,height,radius,v_ind,line,brand,type,BIO_Tags
0,14 / 165 / 65 Bridgestone Blizzak LM005 79T,165,65,16,79T,Blizzak LM005,[Bridgestone],наши_данные,"O,O,B-width,O,B-height,B-brand,B-line,I-line,O"
1,15 / 175 / 65 Bridgestone Turanza T005 84H,175,65,15,84H,Turanza T005,[Bridgestone],наши_данные,"B-radius,O,B-width,O,B-height,B-brand,B-line,I..."
2,15 / 185 / 55 Bridgestone Turanza T005 82V,185,55,18,82V,Turanza T005,[Bridgestone],наши_данные,"O,O,B-width,O,B-height,B-brand,B-line,I-line,O"
3,15 / 205 / 70 Bridgestone Blizzak LM005 96T,205,70,15,96T,Blizzak LM005,[Bridgestone],наши_данные,"B-radius,O,B-width,O,B-height,B-brand,B-line,I..."
4,15169 BRIDGESTONE Автошина R 15 185 / 60 Bridg...,185,60,15,88T,Blizzak LM005,"[BRIDGESTONE, Bridgestone]",наши_данные,"O,B-brand,O,O,B-radius,B-width,O,B-height,B-br..."
...,...,...,...,...,...,...,...,...,...
1651,Автошина R 17 215 / 60 Bridgestone Dueler H/T ...,215,60,17,96H,Dueler H/T D843,[Bridgestone],наши_данные,"O,O,B-radius,B-width,O,B-height,B-brand,B-line..."
1652,Автошина R 17 215 / 65 Bridgestone Blizzak LM0...,215,65,17,99H,Blizzak LM001,[Bridgestone],наши_данные,"O,O,B-radius,B-width,O,B-height,B-brand,B-line..."
1653,Шины R 17 225 / 50 Bridgestone Blizzak Ice 98T...,225,50,17,98T,Blizzak Ice,[Bridgestone],наши_данные,"O,O,B-radius,B-width,O,B-height,B-brand,B-line..."
1654,R 18 225 / 45 91S Bridgestone Blizzak VRX BRID...,225,45,18,91S,Blizzak VRX,"[Bridgestone, BRIDGESTONE]",наши_данные,"O,B-radius,B-width,O,B-height,O,B-brand,B-line..."


### Проверка CANINE

In [23]:
# CANINE checkpoint to evaluate.
model_pth = "/home/sondors/Documents/price/BERT_NER/weights/CANINE/our_data/epoch_3"
tokenizer_CANINE, model_CANINE = load_model_CANINE(
    model_pth=model_pth,
    device=device,
    label2id=label2id,
    id2label=id2label,
)

Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


In [53]:
def inference(offer, model, tokenizer):
    """Run the token-classification (NER) pipeline on one offer string.

    A new ``pipeline`` object is built on every call, and ``model.to("cpu")``
    is called first, so the model that was passed in is moved to the CPU.

    Args:
        offer: Text to tag.
        model: Token-classification model exposing ``.to``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        Whatever the pipeline returns for ``offer``.
    """
    cpu_model = model.to("cpu")
    ner = pipeline(
        task="ner",
        model=cpu_model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
    )
    predictions = ner(offer)
    return predictions

def brand_line_width_height_radius(result, min_score=0.7, sep='', verbose=True):
    """Collapse NER pipeline output into one string per tyre attribute.

    Tokens are filtered one by one: a token is kept only when its score is
    strictly greater than ``min_score``. A low-confidence token in the middle
    of an entity is therefore dropped and the remaining pieces are joined
    around the gap. Kept tokens of the same ``entity_group`` are joined in the
    order they appear in ``result``, even if they are not adjacent.

    Args:
        result: Iterable of dicts with the keys ``'entity_group'``, ``'word'``
            and ``'score'``.
        min_score: Exclusive lower bound on ``score`` for a token to be kept.
        sep: String placed between the kept tokens of one entity. The default
            ``''`` suits character-level tokens; pass ``' '`` when the tokens
            are whole words.
        verbose: When true, print the grouped tokens and the five resulting
            strings.

    Returns:
        A ``(brand, line, width, height, radius)`` tuple of strings; an entity
        with no kept tokens yields ``''``.
    """
    # Group the kept tokens by entity name.
    entities = {}
    for item in result:
        if item['score'] > min_score:
            entities.setdefault(item['entity_group'], []).append(item['word'])

    # Build one string per entity, in the order of the returned tuple.
    brand_str, line_str, width_str, height_str, radius_str = (
        sep.join(entities.get(group, []))
        for group in ('brand', 'line', 'width', 'height', 'radius')
    )

    if verbose:
        print("entities = ", entities)
        # Print the resulting strings.
        print(f"Brand: {brand_str}")
        print(f"Line: {line_str}")
        print(f"Width: {width_str}")
        print(f"Height: {height_str}")
        print(f"Radius: {radius_str}")
    return brand_str, line_str, width_str, height_str, radius_str
    
# Tag the first test offer exactly as stored (with " / " between the sizes).
offer_1 = df_test['offer'].iloc[0]
result = inference(offer=offer_1, model=model_CANINE, tokenizer=tokenizer_CANINE)
print(f"result: {result}")
print(f"offer_1: {offer_1}\n")

brand_line_width_height_radius(result)

result: [{'entity_group': 'radius', 'score': 0.93779564, 'word': '1', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.9507278, 'word': '4', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.9963199, 'word': '1', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.6828842, 'word': '6', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.9882548, 'word': '5', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.5473908, 'word': ' ', 'start': None, 'end': None}, {'entity_group': 'height', 'score': 0.76421386, 'word': '6', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.99978477, 'word': 'B', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.9998091, 'word': 'r', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.9998368, 'word': 'i', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.9998018, 'word': 'd', 'start': None, 'end': None}, {'entity_group': '

('Bridgestone', 'Blizzak LM005 79', '415', '6', '1')

In [35]:
# Same offer with the spaces around "/" removed before tagging.
offer_1 = df_test['offer'].iloc[0].replace(" / ", "/")
result = inference(offer=offer_1, model=model_CANINE, tokenizer=tokenizer_CANINE)
print(f"result: {result}")
print(f"offer_1: {offer_1}\n")

brand_line_width_height_radius(result)

result: [{'entity_group': 'radius', 'score': 0.98937607, 'word': '1', 'start': None, 'end': None}, {'entity_group': 'radius', 'score': 0.98118675, 'word': '4', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.9541591, 'word': '1', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.98878163, 'word': '6', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.9996069, 'word': '5', 'start': None, 'end': None}, {'entity_group': 'height', 'score': 0.99947137, 'word': '6', 'start': None, 'end': None}, {'entity_group': 'height', 'score': 0.99863106, 'word': '5', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.99982435, 'word': 'B', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.9998259, 'word': 'r', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.99984455, 'word': 'i', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.999818, 'word': 'd', 'start': None, 'end': None}, {'entity_grou

('Bridgestone', 'Blizzak LM005 79', '165', '65', '14')

### Проверка BERT

In [26]:
# BERT (rubert-tiny2) checkpoint to evaluate; this rebinds `model_pth`.
model_pth = "/home/sondors/Documents/price/BERT_NER/weights/BERT/rubert-tiny2_our_data/epoch_13"
tokenizer, model = load_model_BERT(
    model_pth=model_pth,
    device=device,
    label2id=label2id,
    id2label=id2label,
)

In [31]:
# Tag the first test offer exactly as stored (with " / " between the sizes).
offer_1 = df_test['offer'].iloc[0]
result = inference(offer=offer_1, model=model, tokenizer=tokenizer)
print(f"result: {result}")
print(f"offer_1: {offer_1}\n")

brand_line_width_height_radius(result)

result: [{'entity_group': 'width', 'score': 0.85786164, 'word': '14', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.4455394, 'word': '165', 'start': None, 'end': None}, {'entity_group': 'height', 'score': 0.94862944, 'word': '65', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.34516385, 'word': '[UNK]', 'start': None, 'end': None}, {'entity_group': 'line', 'score': 0.523839, 'word': '[UNK]', 'start': None, 'end': None}, {'entity_group': 'line', 'score': 0.4635591, 'word': '[UNK]', 'start': None, 'end': None}]
offer_1: 14 / 165 / 65 Bridgestone Blizzak LM005 79T

Brand: 
Line: 
Width: 14
Height: 65
Radius: 


('', '', '14', '65', '')

In [30]:
# Same offer with the spaces around "/" removed before tagging.
offer_1 = df_test['offer'].iloc[0].replace(" / ", "/")
result = inference(offer=offer_1, model=model, tokenizer=tokenizer)
print(f"result: {result}")
print(f"offer_1: {offer_1}\n")

brand_line_width_height_radius(result)

result: [{'entity_group': 'width', 'score': 0.85786164, 'word': '14', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.4455394, 'word': '165', 'start': None, 'end': None}, {'entity_group': 'height', 'score': 0.94862944, 'word': '65', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.34516385, 'word': '[UNK]', 'start': None, 'end': None}, {'entity_group': 'line', 'score': 0.523839, 'word': '[UNK]', 'start': None, 'end': None}, {'entity_group': 'line', 'score': 0.4635591, 'word': '[UNK]', 'start': None, 'end': None}]
offer_1: 14/165/65 Bridgestone Blizzak LM005 79T

Brand: 
Line: [UNK]
Width: 14
Height: 65
Radius: 


('', '[UNK]', '14', '65', '')