### Подготовка данных

In [1]:
from transformers import CanineTokenizer, CanineForTokenClassification, BertTokenizer, BertForTokenClassification, pipeline
import pandas as pd
import re

def load_model_CANINE(model_pth, device, label2id, id2label):
    tokenizer = CanineTokenizer.from_pretrained(model_pth)
    model = CanineForTokenClassification.from_pretrained(model_pth, 
                                            num_labels=len(id2label),
                                            id2label=id2label,
                                            label2id=label2id)
    return tokenizer, model.to(device)

def load_model_BERT(model_pth, device, label2id, id2label):
    model = BertForTokenClassification.from_pretrained(model_pth, 
                                        num_labels=len(id2label),
                                        id2label=id2label,
                                        label2id=label2id)
    tokenizer = BertTokenizer.from_pretrained(model_pth)
    return tokenizer, model.to(device)

def inference(offer, model, tokenizer):
    pipe = pipeline(task="ner", model=model.to("cpu"), tokenizer=tokenizer, aggregation_strategy="simple")
    return pipe(offer)

label2id = {'B-width': 1,
            'B-height': 2,
            'B-radius': 3,
            'B-brand': 4,
            'B-line': 5,
            'I-line': 6,
            'O': 0}

id2label = {1: 'B-width',
            2: 'B-height', 
            3: 'B-radius', 
            4: 'B-brand', 
            5: 'B-line', 
            6: 'I-line',
            0: 'O'}

device = "cpu"#'cuda'

test_csv_pth = '/home/sondors/Documents/price/BERT_NER/csv/prod_train/test_02our.csv'
df_test = pd.read_csv(test_csv_pth, sep=';')
df_test


Unnamed: 0,offer,width,height,radius,v_ind,line,brand,type,BIO_Tags
0,14 / 165 / 65 Bridgestone Blizzak LM005 79T,165,65,16,79T,Blizzak LM005,['Bridgestone'],наши_данные,"O,O,B-width,O,B-height,B-brand,B-line,I-line,O"
1,15 / 175 / 65 Bridgestone Turanza T005 84H,175,65,15,84H,Turanza T005,['Bridgestone'],наши_данные,"B-radius,O,B-width,O,B-height,B-brand,B-line,I..."
2,15 / 185 / 55 Bridgestone Turanza T005 82V,185,55,18,82V,Turanza T005,['Bridgestone'],наши_данные,"O,O,B-width,O,B-height,B-brand,B-line,I-line,O"
3,15 / 205 / 70 Bridgestone Blizzak LM005 96T,205,70,15,96T,Blizzak LM005,['Bridgestone'],наши_данные,"B-radius,O,B-width,O,B-height,B-brand,B-line,I..."
4,15169 BRIDGESTONE Автошина R 15 185 / 60 Bridg...,185,60,15,88T,Blizzak LM005,"['BRIDGESTONE', 'Bridgestone']",наши_данные,"O,B-brand,O,O,B-radius,B-width,O,B-height,B-br..."
...,...,...,...,...,...,...,...,...,...
1651,Автошина R 17 215 / 60 Bridgestone Dueler H/T ...,215,60,17,96H,Dueler H/T D843,['Bridgestone'],наши_данные,"O,O,B-radius,B-width,O,B-height,B-brand,B-line..."
1652,Автошина R 17 215 / 65 Bridgestone Blizzak LM0...,215,65,17,99H,Blizzak LM001,['Bridgestone'],наши_данные,"O,O,B-radius,B-width,O,B-height,B-brand,B-line..."
1653,Шины R 17 225 / 50 Bridgestone Blizzak Ice 98T...,225,50,17,98T,Blizzak Ice,['Bridgestone'],наши_данные,"O,O,B-radius,B-width,O,B-height,B-brand,B-line..."
1654,R 18 225 / 45 91S Bridgestone Blizzak VRX BRID...,225,45,18,91S,Blizzak VRX,"['Bridgestone', 'BRIDGESTONE']",наши_данные,"O,B-radius,B-width,O,B-height,O,B-brand,B-line..."


In [2]:
def extract_brands(text):
    brands = re.findall(r'Bridgestone|BRIDGESTONE|Бриджстоун|bridgestone|Бриджстон|Bridgeston|бриджстоун|бриджстон', text)
    return brands
def concat_brands(lst):
    return ''.join(lst)

print(type(df_test.iloc[0].brand))
df_test['brand'] = df_test['offer'].apply(extract_brands)
print(type(df_test.iloc[0].brand))
df_test['brand'] = df_test['brand'].apply(concat_brands)
print(type(df_test.iloc[0].brand))

clms = ['BIO_Tags',
        'type',
        'v_ind']
df_test = df_test.drop(columns=clms)

df_test = df_test.astype(str)
df_test

<class 'str'>
<class 'list'>
<class 'str'>


Unnamed: 0,offer,width,height,radius,line,brand
0,14 / 165 / 65 Bridgestone Blizzak LM005 79T,165,65,16,Blizzak LM005,Bridgestone
1,15 / 175 / 65 Bridgestone Turanza T005 84H,175,65,15,Turanza T005,Bridgestone
2,15 / 185 / 55 Bridgestone Turanza T005 82V,185,55,18,Turanza T005,Bridgestone
3,15 / 205 / 70 Bridgestone Blizzak LM005 96T,205,70,15,Blizzak LM005,Bridgestone
4,15169 BRIDGESTONE Автошина R 15 185 / 60 Bridg...,185,60,15,Blizzak LM005,BRIDGESTONEBridgestone
...,...,...,...,...,...,...
1651,Автошина R 17 215 / 60 Bridgestone Dueler H/T ...,215,60,17,Dueler H/T D843,Bridgestone
1652,Автошина R 17 215 / 65 Bridgestone Blizzak LM0...,215,65,17,Blizzak LM001,Bridgestone
1653,Шины R 17 225 / 50 Bridgestone Blizzak Ice 98T...,225,50,17,Blizzak Ice,Bridgestone
1654,R 18 225 / 45 91S Bridgestone Blizzak VRX BRID...,225,45,18,Blizzak VRX,BridgestoneBRIDGESTONE


In [18]:
def concat_brands(lst):
    return ''.join(lst)

def unfuck_offer(text):
    pattern1 = r'(R) (\d+)'
    processed_text = re.sub(pattern1, r'R\2', text)

    pattern2 = r'(\d+) (R)'
    processed_text = re.sub(pattern2, r'\1R', processed_text)

    pattern3 = r'(\/) (\d+)'
    processed_text = re.sub(pattern3, r'/\2', processed_text)

    pattern4 = r'(\d+) (\/)'
    processed_text = re.sub(pattern4, r'\1/', processed_text)

    pattern5 = r'(\\) (\d+)'
    processed_text = re.sub(pattern5, r'\\\2', processed_text)

    pattern6 = r'(\d+) (\\)'
    processed_text = re.sub(pattern6, r'\1\\', processed_text)

    return processed_text

df_test_CANINE = df_test.copy()
df_test_CANINE['offer'] = df_test_CANINE['offer'].apply(unfuck_offer)
df_test_CANINE

Unnamed: 0,offer,width,height,radius,line,brand,brand_pred,width_pred,height_pred,radius_pred,line_pred
0,14/165/65 Bridgestone Blizzak LM005 79T,165,65,16,Blizzak LM005,Bridgestone,Bridgestone,165,65,14,Blizzak LM005 79
1,15/175/65 Bridgestone Turanza T005 84H,175,65,15,Turanza T005,Bridgestone,Bridgestone,175,65,15,Turanza T005 84
2,15/185/55 Bridgestone Turanza T005 82V,185,55,18,Turanza T005,Bridgestone,Bridgestone,185,55,15,Turanza T005 82
3,15/205/70 Bridgestone Blizzak LM005 96T,205,70,15,Blizzak LM005,Bridgestone,Bridgestone,205,70,15,Blizzak LM005 96
4,15169 BRIDGESTONE Автошина R15 185/60 Bridgest...,185,60,15,Blizzak LM005,BRIDGESTONEBridgestone,BRIDGESTONEBridgestone,185,6,,Blizzak LM005
...,...,...,...,...,...,...,...,...,...,...,...
1651,Автошина R17 215/60 Bridgestone Dueler H/T D84...,215,60,17,Dueler H/T D843,Bridgestone,Bridgestone,215,6,1,Dueler H/T D
1652,Автошина R17 215/65 Bridgestone Blizzak LM001 ...,215,65,17,Blizzak LM001,Bridgestone,Bridgestone,215,65,1,Blizzak LM001
1653,Шины R17 225/50 Bridgestone Blizzak Ice 98T XL...,225,50,17,Blizzak Ice,Bridgestone,Bridgestone,225,5,1,Blizzak Ice
1654,R18 225/45 91S Bridgestone Blizzak VRX BRIDGES...,225,45,18,Blizzak VRX,BridgestoneBRIDGESTONE,BridgestoneBRIDGESTONE,225,4,1,Blizzak VR Blizzak VRX


### Проверка CANINE

In [3]:
model_pth = "/home/sondors/Documents/price/BERT_NER/weights/CANINE/our_data/epoch_3"
tokenizer_CANINE, model_CANINE = load_model_CANINE(model_pth, device, label2id, id2label)

Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


In [19]:
def inference(offer, model, tokenizer):
    pipe = pipeline(task="ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
    return pipe(offer)

def brand_line_width_height_radius(result):
    # Создаем словари для каждой сущности
    entities = {}
    for item in result:
        entity_group = item['entity_group']
        word = item['word']
        score = item['score']
        if score > 0.7:
            if entity_group in entities:
                entities[entity_group].append(word)
            else:
                entities[entity_group] = [word]

    for key, value in entities.items():
        entities[key] = ''.join(value)
    return entities

def apply_on_df(model, tokenizer, df):
    for index, row in df.iterrows():
        offer = row['offer']
        result = inference(offer, model, tokenizer)
        entities = brand_line_width_height_radius(result)

        # Заполнение DataFrame
        df.at[index, 'brand_pred'] = entities.get('brand', '')
        df.at[index, 'width_pred'] = entities.get('width', '')
        df.at[index, 'height_pred'] = entities.get('height', '')
        df.at[index, 'radius_pred'] = entities.get('radius', '')
        df.at[index, 'line_pred'] = entities.get('line', '')
    return df

df_pred = apply_on_df(model_CANINE, tokenizer_CANINE, df_test_CANINE)
df_pred

Unnamed: 0,offer,width,height,radius,line,brand,brand_pred,width_pred,height_pred,radius_pred,line_pred
0,14/165/65 Bridgestone Blizzak LM005 79T,165,65,16,Blizzak LM005,Bridgestone,Bridgestone,165,65,14,Blizzak LM005 79
1,15/175/65 Bridgestone Turanza T005 84H,175,65,15,Turanza T005,Bridgestone,Bridgestone,175,65,15,Turanza T005 84
2,15/185/55 Bridgestone Turanza T005 82V,185,55,18,Turanza T005,Bridgestone,Bridgestone,185,55,15,Turanza T005 82
3,15/205/70 Bridgestone Blizzak LM005 96T,205,70,15,Blizzak LM005,Bridgestone,Bridgestone,205,70,15,Blizzak LM005 96
4,15169 BRIDGESTONE Автошина R15 185/60 Bridgest...,185,60,15,Blizzak LM005,BRIDGESTONEBridgestone,BRIDGESTONEBridgestone,185,6,,Blizzak LM005
...,...,...,...,...,...,...,...,...,...,...,...
1651,Автошина R17 215/60 Bridgestone Dueler H/T D84...,215,60,17,Dueler H/T D843,Bridgestone,Bridgestone,215,6,1,Dueler H/T D8
1652,Автошина R17 215/65 Bridgestone Blizzak LM001 ...,215,65,17,Blizzak LM001,Bridgestone,Bridgestone,215,6,1,Blizzak LM0010
1653,Шины R17 225/50 Bridgestone Blizzak Ice 98T XL...,225,50,17,Blizzak Ice,Bridgestone,Bridgestone,225,5,1,Blizzak Ice 9
1654,R18 225/45 91S Bridgestone Blizzak VRX BRIDGES...,225,45,18,Blizzak VRX,BridgestoneBRIDGESTONE,BridgestoneBRIDGESTONE,225,45,18,Blizzak VRX Blizzak VRX


In [20]:
accuracy_brand = round(len(df_pred[df_pred.brand == df_pred.brand_pred])/len(df_pred), 3)
accuracy_line = round(len(df_pred[df_pred.line == df_pred.line_pred])/len(df_pred), 3)
accuracy_width = round(len(df_pred[df_pred.width == df_pred.width_pred])/len(df_pred), 3)
accuracy_height = round(len(df_pred[df_pred.height == df_pred.height_pred])/len(df_pred), 3)
accuracy_radius = round(len(df_pred[df_pred.radius == df_pred.radius_pred])/len(df_pred), 3)

print(f"accuracy_brand = {accuracy_brand}")
print(f"accuracy_line = {accuracy_line}")
print(f"accuracy_width = {accuracy_width}")
print(f"accuracy_height = {accuracy_height}")
print(f"accuracy_radius = {accuracy_radius}")

accuracy_brand = 0.932
accuracy_line = 0.042
accuracy_width = 0.992
accuracy_height = 0.968
accuracy_radius = 0.599


In [23]:
offer_1 = df_test.iloc[0].offer
result = inference(offer_1, model_CANINE, tokenizer_CANINE)
print(f"result: {result}")
print(f"offer_1: {offer_1}\n")

entities = brand_line_width_height_radius(result)
print(entities)

result: [{'entity_group': 'radius', 'score': 0.93779564, 'word': '1', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.9507278, 'word': '4', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.9963199, 'word': '1', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.6828842, 'word': '6', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.9882548, 'word': '5', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.5473908, 'word': ' ', 'start': None, 'end': None}, {'entity_group': 'height', 'score': 0.76421386, 'word': '6', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.99978477, 'word': 'B', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.9998091, 'word': 'r', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.9998368, 'word': 'i', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.9998018, 'word': 'd', 'start': None, 'end': None}, {'entity_group': '

In [8]:
offer_1 = df_test.iloc[0].offer.replace(" / ", "/")
result = inference(offer_1, model_CANINE, tokenizer_CANINE)
print(f"result: {result}")
print(f"offer_1: {offer_1}\n")

entities = brand_line_width_height_radius(result)
print(entities)

result: [{'entity_group': 'radius', 'score': 0.98937607, 'word': '1', 'start': None, 'end': None}, {'entity_group': 'radius', 'score': 0.98118675, 'word': '4', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.9541591, 'word': '1', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.98878163, 'word': '6', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.9996069, 'word': '5', 'start': None, 'end': None}, {'entity_group': 'height', 'score': 0.99947137, 'word': '6', 'start': None, 'end': None}, {'entity_group': 'height', 'score': 0.99863106, 'word': '5', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.99982435, 'word': 'B', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.9998259, 'word': 'r', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.99984455, 'word': 'i', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.999818, 'word': 'd', 'start': None, 'end': None}, {'entity_grou

### Проверка BERT

In [9]:
model_pth = "/home/sondors/Documents/price/BERT_NER/weights/BERT/rubert-tiny2_our_data/epoch_13"
tokenizer, model = load_model_BERT(model_pth, device, label2id, id2label)

In [21]:
df_pred = apply_on_df(model, tokenizer, df_test)
df_pred

Unnamed: 0,offer,width,height,radius,line,brand,brand_pred,width_pred,height_pred,radius_pred,line_pred
0,14 / 165 / 65 Bridgestone Blizzak LM005 79T,165,65,16,Blizzak LM005,Bridgestone,,14,65,,
1,15 / 175 / 65 Bridgestone Turanza T005 84H,175,65,15,Turanza T005,Bridgestone,,15,65,,
2,15 / 185 / 55 Bridgestone Turanza T005 82V,185,55,18,Turanza T005,Bridgestone,,15,55,,
3,15 / 205 / 70 Bridgestone Blizzak LM005 96T,205,70,15,Blizzak LM005,Bridgestone,,15,70,,
4,15169 BRIDGESTONE Автошина R 15 185 / 60 Bridg...,185,60,15,Blizzak LM005,BRIDGESTONEBridgestone,,185,60,15,
...,...,...,...,...,...,...,...,...,...,...,...
1651,Автошина R 17 215 / 60 Bridgestone Dueler H/T ...,215,60,17,Dueler H/T D843,Bridgestone,,215,60,17,
1652,Автошина R 17 215 / 65 Bridgestone Blizzak LM0...,215,65,17,Blizzak LM001,Bridgestone,,215,65,17,
1653,Шины R 17 225 / 50 Bridgestone Blizzak Ice 98T...,225,50,17,Blizzak Ice,Bridgestone,,225,50,17,
1654,R 18 225 / 45 91S Bridgestone Blizzak VRX BRID...,225,45,18,Blizzak VRX,BridgestoneBRIDGESTONE,,225,45,,


In [22]:
accuracy_brand = round(len(df_pred[df_pred.brand == df_pred.brand_pred])/len(df_pred), 3)
accuracy_line = round(len(df_pred[df_pred.line == df_pred.line_pred])/len(df_pred), 3)
accuracy_width = round(len(df_pred[df_pred.width == df_pred.width_pred])/len(df_pred), 3)
accuracy_height = round(len(df_pred[df_pred.height == df_pred.height_pred])/len(df_pred), 3)
accuracy_radius = round(len(df_pred[df_pred.radius == df_pred.radius_pred])/len(df_pred), 3)

print(f"accuracy_brand = {accuracy_brand}")
print(f"accuracy_line = {accuracy_line}")
print(f"accuracy_width = {accuracy_width}")
print(f"accuracy_height = {accuracy_height}")
print(f"accuracy_radius = {accuracy_radius}")

accuracy_brand = 0.001
accuracy_line = 0.0
accuracy_width = 0.98
accuracy_height = 0.998
accuracy_radius = 0.948


In [10]:
offer_1 = df_test.iloc[0].offer
result = inference(offer_1, model, tokenizer)
print(f"result: {result}")
print(f"offer_1: {offer_1}\n")

brand_line_width_height_radius(result)

result: [{'entity_group': 'width', 'score': 0.85786164, 'word': '14', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.4455394, 'word': '165', 'start': None, 'end': None}, {'entity_group': 'height', 'score': 0.94862944, 'word': '65', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.34516385, 'word': '[UNK]', 'start': None, 'end': None}, {'entity_group': 'line', 'score': 0.523839, 'word': '[UNK]', 'start': None, 'end': None}, {'entity_group': 'line', 'score': 0.4635591, 'word': '[UNK]', 'start': None, 'end': None}]
offer_1: 14 / 165 / 65 Bridgestone Blizzak LM005 79T



{'width': '14', 'height': '65'}

In [11]:
offer_1 = df_test.iloc[0].offer.replace(" / ", "/")
result = inference(offer_1, model, tokenizer)
print(f"result: {result}")
print(f"offer_1: {offer_1}\n")

brand_line_width_height_radius(result)

result: [{'entity_group': 'width', 'score': 0.85786164, 'word': '14', 'start': None, 'end': None}, {'entity_group': 'width', 'score': 0.4455394, 'word': '165', 'start': None, 'end': None}, {'entity_group': 'height', 'score': 0.94862944, 'word': '65', 'start': None, 'end': None}, {'entity_group': 'brand', 'score': 0.34516385, 'word': '[UNK]', 'start': None, 'end': None}, {'entity_group': 'line', 'score': 0.523839, 'word': '[UNK]', 'start': None, 'end': None}, {'entity_group': 'line', 'score': 0.4635591, 'word': '[UNK]', 'start': None, 'end': None}]
offer_1: 14/165/65 Bridgestone Blizzak LM005 79T



{'width': '14', 'height': '65'}