### Загрузка нейронок

In [1]:
from transformers import CanineTokenizer, CanineForTokenClassification, BertTokenizer, BertForTokenClassification, pipeline
import pandas as pd
import re

def load_model_CANINE(model_pth, device, label2id, id2label):
    """Load a CANINE tokenizer and token-classification model from ``model_pth``.

    Args:
        model_pth: path or identifier passed to both ``from_pretrained`` calls.
        device: target passed to ``model.to``.
        label2id: mapping from tag name to class id, forwarded to the model config.
        id2label: mapping from class id to tag name; its length sets ``num_labels``.

    Returns:
        A ``(tokenizer, model)`` tuple, with the model already moved to ``device``.
    """
    # Label-related settings forwarded to the classification head configuration.
    label_settings = {
        'num_labels': len(id2label),
        'id2label': id2label,
        'label2id': label2id,
    }
    canine_tokenizer = CanineTokenizer.from_pretrained(model_pth)
    canine_model = CanineForTokenClassification.from_pretrained(model_pth, **label_settings)
    return canine_tokenizer, canine_model.to(device)

def load_model_BERT(model_pth, device, label2id, id2label):
    """Load a BERT token-classification model and its tokenizer from ``model_pth``.

    Args:
        model_pth: path or identifier passed to both ``from_pretrained`` calls.
        device: target passed to ``model.to``.
        label2id: mapping from tag name to class id, forwarded to the model config.
        id2label: mapping from class id to tag name; its length sets ``num_labels``.

    Returns:
        A ``(tokenizer, model)`` tuple, with the model already moved to ``device``.
    """
    # Label-related settings forwarded to the classification head configuration.
    label_settings = {
        'num_labels': len(id2label),
        'id2label': id2label,
        'label2id': label2id,
    }
    bert_model = BertForTokenClassification.from_pretrained(model_pth, **label_settings)
    bert_tokenizer = BertTokenizer.from_pretrained(model_pth)
    return bert_tokenizer, bert_model.to(device)

def inference(offer, model, tokenizer):
    """Run a freshly built NER pipeline on ``offer`` and return its output.

    Args:
        offer: input handed to the pipeline as-is.
        model: model object given to ``pipeline``.
        tokenizer: tokenizer object given to ``pipeline``.

    Returns:
        Whatever the pipeline returns for ``offer``.
    """
    # A new pipeline is created on every call, with "simple" aggregation of tokens.
    ner_pipeline = pipeline(
        task="ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
    )
    predictions = ner_pipeline(offer)
    return predictions

def brand_line_width_height_radius(result, min_score=0.7):
    """Collapse NER predictions into one string per entity group.

    Args:
        result: iterable of dicts, each providing the keys ``'entity_group'``,
            ``'word'`` and ``'score'``.
        min_score: confidence cutoff. Only predictions whose score is strictly
            greater than this value are kept. Defaults to 0.7, the value that
            used to be hard-coded.

    Returns:
        dict mapping each entity group that has at least one accepted prediction
        to the concatenation of its accepted words, in input order and without
        any separator. Groups with no accepted prediction are absent.
    """
    words_by_group = {}
    for item in result:
        if item['score'] > min_score:
            words_by_group.setdefault(item['entity_group'], []).append(item['word'])

    # Fragments of the same group are glued together with no separator.
    return {group: ''.join(words) for group, words in words_by_group.items()}

def apply_on_df(model, tokenizer, df, column = 'offer'):
    """Run the NER model over ``df[column]`` and write the extracted fields into ``df``.

    Adds (or overwrites) the columns ``brand_pred``, ``width_pred``,
    ``height_pred``, ``radius_pred`` and ``line_pred``. A field for which
    nothing was extracted is stored as an empty string.

    Args:
        model: token-classification model handed to the ``transformers`` pipeline.
        tokenizer: tokenizer handed to the pipeline together with ``model``.
        df: DataFrame to annotate. It is modified in place.
        column: name of the column that holds the offer text.

    Returns:
        The same ``df`` object with the prediction columns filled in.
    """
    fields = ('brand', 'width', 'height', 'radius', 'line')

    # Build the pipeline once: its construction does not depend on the row, so
    # creating it per offer only repeated the same setup. The configuration is
    # the same one `inference` uses.
    ner = pipeline(task="ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

    extracted = []
    for offer in df[column]:
        if isinstance(offer, str) and offer.strip():
            extracted.append(brand_line_width_height_radius(ner(offer)))
        else:
            # Blank cells arrive as NaN (a float) or an empty string; there is
            # nothing to tag, so record "no entities" instead of failing.
            extracted.append({})

    # Assign whole columns positionally. This also creates the columns for an
    # empty frame and stays correct when the index holds duplicate labels.
    for field in fields:
        df[field + '_pred'] = [entities.get(field, '') for entities in extracted]
    return df

# Tag name -> class id used by both token-classification models.
label2id = {
    'B-width': 1,
    'B-height': 2,
    'B-radius': 3,
    'B-brand': 4,
    'B-line': 5,
    'I-line': 6,
    'O': 0,
}

# Class id -> tag name, derived from label2id so the two tables cannot drift apart.
id2label = {class_id: tag for tag, class_id in label2id.items()}

device = "cpu"

# CANINE checkpoint.
model_pth = "/home/sondors/Documents/price/BERT_NER/weights/CANINE/our_data/epoch_3"
tokenizer_CANINE, model_CANINE = load_model_CANINE(model_pth, device, label2id, id2label)

# BERT checkpoint; `model_pth` is reused and ends up pointing at this path.
model_pth = "/home/sondors/Documents/price/BERT_NER/weights/BERT/rubert-tiny2_our_data/epoch_13"
tokenizer, model = load_model_BERT(model_pth, device, label2id, id2label)


Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


### Загрузка данных

In [2]:
# Alternative workbooks in the same folder that can be assigned to `pth` instead:
#   /home/sondors/Documents/price/BERT_NER/csv_to_label/Gislaved.xlsx
#   /home/sondors/Documents/price/BERT_NER/csv_to_label/Nordman.xlsx
#   /home/sondors/Documents/price/BERT_NER/csv_to_label/Pirelli.xlsx
#   /home/sondors/Documents/price/BERT_NER/csv_to_label/Yokohama.xlsx
pth = "/home/sondors/Documents/price/BERT_NER/csv_to_label/Кама.xlsx"

# Every column is read as text.
df_original = pd.read_excel(pth, dtype=str)

# Working frame with only the offer text (the 'Brand' column is deliberately left out).
df = df_original[['PRICE_NAME']].copy()

df

Unnamed: 0,PRICE_NAME
0,165/70R13 Кама НК-244 79N
1,175/70R13 Кама Alga НК-531 82T шип
2,175/70R13 Кама НК-531 82T шип
3,175/80R16 Кама И-511 88Q шип (без камеры)
4,185/60R14 КАМА-365 (НК-241) 82H
...,...
6066,Автошина КАМА 365 LT (НК-243) 185/75 R16С 104/...
6067,135/80 R12 Кама-503 68Q шип.
6068,Шина летняя R16C 185/75 104/102N Кама Euro-131
6069,Автошина КАМА 365 LT (НК-243) 175 R16С 98/96 N


In [3]:
def process_text(input_text):
    """Space out the letter R, slashes, backslashes and pipes from adjacent text.

    A space is inserted between ``R`` and a digit run on either side of it,
    and likewise for ``/`` and the backslash. Every ``|`` is surrounded by
    spaces. Finally, runs of two or more spaces are collapsed into one.

    Args:
        input_text: string to normalise.

    Returns:
        The normalised string.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    spacing_rules = (
        (r'(R)(\d+)', r'R \2'),     # "R13"  -> "R 13"
        (r'(\d+)(R)', r'\1 R'),     # "70R"  -> "70 R"
        (r'(\/)(\d+)', r'/ \2'),    # "/70"  -> "/ 70"
        (r'(\d+)(\/)', r'\1 /'),    # "165/" -> "165 /"
        (r'(\\)(\d+)', r'\\ \2'),   # backslash before digits
        (r'(\d+)(\\)', r'\1 \\'),   # backslash after digits
    )

    spaced = input_text
    for regex, replacement in spacing_rules:
        spaced = re.sub(regex, replacement, spaced)

    # Pipes are always padded, whatever surrounds them.
    spaced = spaced.replace("|", " | ")

    # Squeeze repeated spaces produced above (or already present in the input).
    return re.sub(r'  +', ' ', spaced)

def unfuck_offer(text):
    """Remove the single space between R, slashes or backslashes and adjacent digits.

    Exactly one space separating ``R``, ``/`` or a backslash from a digit run
    (on either side) is dropped. Nothing else in the string is changed.

    Args:
        text: string to compact.

    Returns:
        The compacted string.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    joining_rules = (
        (r'(R) (\d+)', r'R\2'),      # "R 13"  -> "R13"
        (r'(\d+) (R)', r'\1R'),      # "70 R"  -> "70R"
        (r'(\/) (\d+)', r'/\2'),     # "/ 70"  -> "/70"
        (r'(\d+) (\/)', r'\1/'),     # "165 /" -> "165/"
        (r'(\\) (\d+)', r'\\\2'),    # backslash, space, digits
        (r'(\d+) (\\)', r'\1\\'),    # digits, space, backslash
    )

    compacted = text
    for regex, replacement in joining_rules:
        compacted = re.sub(regex, replacement, compacted)
    return compacted

# Copy for CANINE: spaces between R, '/', backslash and adjacent digits are removed.
df_CANINE = df.assign(PRICE_NAME=df['PRICE_NAME'].apply(unfuck_offer))

# Copy for BERT: the same separators are spaced out from the digits instead.
df_BERT = df.assign(PRICE_NAME=df['PRICE_NAME'].apply(process_text))

print(df_CANINE)
print(df_BERT)

                                             PRICE_NAME
0                             165/70R13 Кама НК-244 79N
1                    175/70R13 Кама Alga НК-531 82T шип
2                         175/70R13 Кама НК-531 82T шип
3             175/80R16 Кама И-511 88Q шип (без камеры)
4                       185/60R14 КАМА-365 (НК-241) 82H
...                                                 ...
6066  Автошина КАМА 365 LT (НК-243) 185/75R16С 104/1...
6067                        135/80R12 Кама-503 68Q шип.
6068     Шина летняя R16C 185/75 104/102N Кама Euro-131
6069      Автошина КАМА 365 LT (НК-243) 175R16С 98/96 N
6070                       Kama Euro-518 155/65R13 73 T

[6071 rows x 1 columns]
                                             PRICE_NAME
0                         165 / 70 R 13 Кама НК-244 79N
1                175 / 70 R 13 Кама Alga НК-531 82T шип
2                     175 / 70 R 13 Кама НК-531 82T шип
3         175 / 80 R 16 Кама И-511 88Q шип (без камеры)
4                   185

### BERT

In [4]:
# Annotate the BERT-formatted offers with the model's predictions.
df_BERT = apply_on_df(
    model=model,
    tokenizer=tokenizer,
    df=df_BERT,
    column='PRICE_NAME',
)
df_BERT

Unnamed: 0,PRICE_NAME,brand_pred,width_pred,height_pred,radius_pred,line_pred
0,165 / 70 R 13 Кама НК-244 79N,,165,70,13,
1,175 / 70 R 13 Кама Alga НК-531 82T шип,,175,70,13,
2,175 / 70 R 13 Кама НК-531 82T шип,,175,70,13,
3,175 / 80 R 16 Кама И-511 88Q шип (без камеры),,175,80,16,
4,185 / 60 R 14 КАМА-365 (НК-241) 82H,,185,60,14,
...,...,...,...,...,...,...
6066,Автошина КАМА 365 LT (НК-243) 185 / 75 R 16С 1...,,185,75,,
6067,135 / 80 R 12 Кама-503 68Q шип.,,135,80,12,
6068,Шина летняя R 16C 185 / 75 104 / 102N Кама Eur...,,185,75,,
6069,Автошина КАМА 365 LT (НК-243) 175 R 16С 98 / 96 N,,98,,,


In [5]:
# Copy the size predictions back onto the full original table (aligned on the index).
for pred_column in ('width_pred', 'height_pred', 'radius_pred'):
    df_original[pred_column] = df_BERT[pred_column]

df_original

Unnamed: 0,PRICE_FRNO,PRICE_FNAM,PRICE_CAID,PRICE_CNAM,PRICE_TMNO,Brand,PRICE_NAME,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,kpl,ppl,PRICE_IMGN,PRICE_LOCD,lineNumber,PRICE_SALES,PRICE_PLID,width_pred,height_pred,radius_pred
0,99084,,8101,,4932,Kama,165/70R13 Кама НК-244 79N,,,,...,Шины,,https://zavgar.ru/uploads/catalog_items/images...,"[8101, 832132, 1403, 2002]",2023-08-16 15:25:16,Продажа от 2х штук; меньше по согласованию.,1622539920,165,70,13
1,99084,,8101,,4932,Kama,175/70R13 Кама Alga НК-531 82T шип,,,,...,Шины,,https://zavgar.ru/uploads/catalog_items/images...,[8101],2023-08-09 08:45:04,Продажа от 2х штук; меньше по согласованию.,1628982103,175,70,13
2,99084,,8101,,4932,Kama,175/70R13 Кама НК-531 82T шип,,,,...,Шины,,,[8101],2022-11-25 12:53:27,Продажа от 2х штук; меньше по согласованию.,1627952538,175,70,13
3,99084,,8101,,4932,Kama,175/80R16 Кама И-511 88Q шип (без камеры),,,,...,Шины,,https://zavgar.ru/uploads/catalog_items/images...,"[8101, 3319, 1403, 2812, 719811]",2023-05-30 06:10:51,Продажа от 2х штук; меньше по согласованию.,1651213808,175,80,16
4,99084,,8101,,4932,Kama,185/60R14 КАМА-365 (НК-241) 82H,,,,...,Шины,,https://zavgar.ru/uploads/catalog_items/images...,[8101],2023-07-27 10:40:31,Продажа от 2х штук; меньше по согласованию.,1622530518,185,60,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6066,96794,,8101,,4932,Kama,Автошина КАМА 365 LT (НК-243) 185/75 R16С 104/...,185,,,...,Все товары :: Авто :: Шины и диски :: Шины,КАМА,https://avatars.mds.yandex.net/get-mpic/448422...,[8101],2023-05-31 19:52:55,,1696364312,185,75,
6067,96794,,8101,,4932,Kama,135/80 R12 Кама-503 68Q шип.,,,,...,Все товары :: Авто :: Шины и диски :: Шины,КАМА,https://avatars.mds.yandex.net/get-mpic/522193...,[8101],2023-08-15 01:15:45,,1696350809,135,80,12
6068,96794,,8101,,4932,Kama,Шина летняя R16C 185/75 104/102N Кама Euro-131,185,,,...,Все товары :: Авто :: Шины и диски :: Шины,КАМА,https://avatars.mds.yandex.net/get-mpic/542530...,[8101],2023-04-13 06:37:43,,1696284765,185,75,
6069,96794,,8101,,4932,Kama,Автошина КАМА 365 LT (НК-243) 175 R16С 98/96 N,,,,...,Все товары :: Авто :: Шины и диски :: Шины,КАМА,https://avatars.mds.yandex.net/get-mpic/448422...,[8101],2023-05-24 14:21:51,,1696212025,98,,


In [6]:
# Save the annotated table next to the source workbook.
df_original.to_excel(
    excel_writer="/home/sondors/Documents/price/BERT_NER/csv_to_label/Кама_Igor.xlsx",
)

### Проверка CANINE

In [None]:
# df_CANINE = apply_on_df(model_CANINE, tokenizer_CANINE, df_CANINE, column = 'PRICE_NAME')
# df_CANINE