### Загрузка нейронок

In [1]:
from transformers import CanineTokenizer, CanineForTokenClassification, BertTokenizer, BertForTokenClassification, AutoTokenizer, LukeForTokenClassification, pipeline
import pandas as pd
import re

def load_model_CANINE(model_pth, device, label2id, id2label):
    """Load a CANINE token-classification checkpoint and its tokenizer.

    Args:
        model_pth: Path or identifier handed to ``from_pretrained`` for both
            the tokenizer and the model.
        device: Target passed to ``model.to(...)``.
        label2id: Mapping from label name to class id.
        id2label: Mapping from class id to label name; its length is used as
            ``num_labels``.

    Returns:
        A ``(tokenizer, model)`` tuple; the model is the result of
        ``model.to(device)``.
    """
    canine_tokenizer = CanineTokenizer.from_pretrained(model_pth)
    label_config = {
        "num_labels": len(id2label),
        "id2label": id2label,
        "label2id": label2id,
    }
    canine_model = CanineForTokenClassification.from_pretrained(model_pth, **label_config)
    return canine_tokenizer, canine_model.to(device)

def load_model_BERT(model_pth, device, label2id, id2label):
    """Load a BERT token-classification checkpoint and its tokenizer.

    Args:
        model_pth: Path or identifier handed to ``from_pretrained`` for both
            the model and the tokenizer.
        device: Target passed to ``model.to(...)``.
        label2id: Mapping from label name to class id.
        id2label: Mapping from class id to label name; its length is used as
            ``num_labels``.

    Returns:
        A ``(tokenizer, model)`` tuple; the model is the result of
        ``model.to(device)``.
    """
    label_config = {
        "num_labels": len(id2label),
        "id2label": id2label,
        "label2id": label2id,
    }
    bert_model = BertForTokenClassification.from_pretrained(model_pth, **label_config)
    bert_tokenizer = BertTokenizer.from_pretrained(model_pth)
    return bert_tokenizer, bert_model.to(device)

def load_model_LUKE(model_pth, device, label2id, id2label):
    """Load a LUKE token-classification checkpoint and its tokenizer.

    The tokenizer is resolved through ``AutoTokenizer`` rather than a
    LUKE-specific tokenizer class.

    Args:
        model_pth: Path or identifier handed to ``from_pretrained`` for both
            the model and the tokenizer.
        device: Target passed to ``model.to(...)``.
        label2id: Mapping from label name to class id.
        id2label: Mapping from class id to label name; its length is used as
            ``num_labels``.

    Returns:
        A ``(tokenizer, model)`` tuple; the model is the result of
        ``model.to(device)``.
    """
    label_config = {
        "num_labels": len(id2label),
        "id2label": id2label,
        "label2id": label2id,
    }
    luke_model = LukeForTokenClassification.from_pretrained(model_pth, **label_config)
    luke_tokenizer = AutoTokenizer.from_pretrained(model_pth)
    return luke_tokenizer, luke_model.to(device)

def inference(offer, model, tokenizer, device):
    """Run token-classification (NER) on ``offer`` and return the pipeline output.

    A fresh ``pipeline(task="ner", aggregation_strategy="simple")`` is built on
    every call, so callers processing many offers pay the construction cost
    each time.

    Args:
        offer: Input handed directly to the pipeline.
        model: Token-classification model.
        tokenizer: Tokenizer matching ``model``.
        device: Device argument forwarded to ``pipeline``.

    Returns:
        Whatever the pipeline returns for ``offer``.
    """
    ner = pipeline(
        task="ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
        device=device,
    )
    predictions = ner(offer)
    return predictions

def brand_line_width_height_radius(result, min_score=0.7):
    """Collapse aggregated NER predictions into one string per entity group.

    Args:
        result: Iterable of dicts, each providing the keys ``'entity_group'``,
            ``'word'`` and ``'score'``.
        min_score: Confidence cut-off. Only items whose score is strictly
            greater than this value are kept. Defaults to 0.7.

    Returns:
        Dict mapping each surviving entity group to the concatenation of its
        words, in order of appearance. Words are joined with no separator, so
        any spacing has to come from the words themselves. Groups with no item
        above the cut-off are absent from the dict.
    """
    words_by_group = {}
    for item in result:
        if item['score'] > min_score:
            words_by_group.setdefault(item['entity_group'], []).append(item['word'])

    # Several spans of the same group are glued together as-is.
    return {group: ''.join(words) for group, words in words_by_group.items()}

def apply_on_df(model, tokenizer, df, device, column = 'offer'):
    """Run NER over one text column and store the predictions as new columns.

    The columns ``brand_pred``, ``width_pred``, ``height_pred``,
    ``radius_pred``, ``line_pred`` and ``v_ind_pred`` are added to ``df``
    itself (the frame is modified in place) and the same frame is returned.
    An entity group that was not predicted for a row is stored as ``''``.

    Args:
        model: Token-classification model.
        tokenizer: Tokenizer matching ``model``.
        df: DataFrame holding the texts; it receives the ``*_pred`` columns.
        device: Device argument forwarded to ``pipeline``.
        column: Name of the column containing the offer text.

    Returns:
        ``df`` with the six prediction columns filled in.
    """
    # Build the pipeline once; constructing it per row repeats the same
    # setup work for every offer.
    ner = pipeline(task="ner", model=model, tokenizer=tokenizer,
                   aggregation_strategy="simple", device=device)

    entity_names = ('brand', 'width', 'height', 'radius', 'line', 'v_ind')
    collected = {name: [] for name in entity_names}

    for offer in df[column]:
        if isinstance(offer, str):
            entities = brand_line_width_height_radius(ner(offer))
        else:
            # Missing / non-text cells (e.g. NaN) get blank predictions
            # instead of aborting the whole run.
            entities = {}
        for name in entity_names:
            collected[name].append(entities.get(name, ''))

    # Assign whole columns positionally: one write per column, and correct
    # even when the index contains duplicate labels.
    for name in entity_names:
        df[f'{name}_pred'] = collected[name]
    return df

# BIO tag scheme used by the token classifiers: label name -> class id.
label2id = {
    'B-width': 1,
    'B-height': 2,
    'B-radius': 3,
    'I-radius': 4,
    'B-brand': 5,
    'B-line': 6,
    'I-line': 7,
    'B-v_ind': 8,
    'I-v_ind': 9,
    'O': 0,
}

# Inverse mapping (class id -> label name), derived so the two cannot drift apart.
id2label = {class_id: label for label, class_id in label2id.items()}

# Device string handed to the model loader below.
device = "cpu"

# Alternative CANINE checkpoint, currently disabled:
# model_pth = "/home/sondors/CANINE-epoch_4"
# tokenizer_CANINE, model_CANINE = load_model_CANINE(model_pth, device, label2id, id2label)

# Active checkpoint: LUKE. `tokenizer` and `model` defined here are the
# module-level objects available to later cells.
# NOTE(review): absolute local path — consider moving it to a configurable constant.
model_pth = "/home/sondors/luke-base-epoch_5"
tokenizer, model = load_model_LUKE(model_pth, device, label2id, id2label)


### Загрузка данных

In [2]:
# Input workbook: exactly one `pth` line should be active; the others are
# alternative brand files kept for switching between runs.
# pth = "/home/sondors/Documents/price/BERT_NER/csv_to_label/Gislaved.xlsx"
# pth = "/home/sondors/Documents/price/BERT_NER/csv_to_label/Nordman.xlsx"
# pth = "/home/sondors/Documents/price/BERT_NER/csv_to_label/Pirelli.xlsx"
# pth = "/home/sondors/Documents/price/BERT_NER/csv_to_label/Yokohama.xlsx"
pth = "/home/sondors/Documents/price/BERT_NER/csv_to_label/Кама.xlsx"

# Destination workbook for the table enriched with predictions.
pth_dst = "/home/sondors/Documents/price/BERT_NER/csv_to_label/luke-base-241123/Кама_Igor241123.xlsx"

# dtype=str: read every column as text.
df_original = pd.read_excel(pth, dtype=str)

# Working frame holding only the offer text column.
df = df_original[['PRICE_NAME']].copy()
# df['PRICE_NAME'] = df_original['Unnamed: 6'] # Pirelli

df

Unnamed: 0,PRICE_NAME
0,165/70R13 Кама НК-244 79N
1,175/70R13 Кама Alga НК-531 82T шип
2,175/70R13 Кама НК-531 82T шип
3,175/80R16 Кама И-511 88Q шип (без камеры)
4,185/60R14 КАМА-365 (НК-241) 82H
...,...
6066,Автошина КАМА 365 LT (НК-243) 185/75 R16С 104/...
6067,135/80 R12 Кама-503 68Q шип.
6068,Шина летняя R16C 185/75 104/102N Кама Euro-131
6069,Автошина КАМА 365 LT (НК-243) 175 R16С 98/96 N


In [3]:
# Display the full source table as read from the workbook.
df_original

Unnamed: 0,PRICE_FRNO,PRICE_FNAM,PRICE_CAID,PRICE_CNAM,PRICE_TMNO,Brand,PRICE_NAME,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,PRICE_PCOD,date,PRICE_RSLT,kpl,ppl,PRICE_IMGN,PRICE_LOCD,lineNumber,PRICE_SALES,PRICE_PLID
0,99084,,8101,,4932,Kama,165/70R13 Кама НК-244 79N,,,,...,,Active,Auto,Шины,,https://zavgar.ru/uploads/catalog_items/images...,"[8101, 832132, 1403, 2002]",2023-08-16 15:25:16,Продажа от 2х штук; меньше по согласованию.,1622539920
1,99084,,8101,,4932,Kama,175/70R13 Кама Alga НК-531 82T шип,,,,...,,Active,Auto,Шины,,https://zavgar.ru/uploads/catalog_items/images...,[8101],2023-08-09 08:45:04,Продажа от 2х штук; меньше по согласованию.,1628982103
2,99084,,8101,,4932,Kama,175/70R13 Кама НК-531 82T шип,,,,...,,,Auto,Шины,,,[8101],2022-11-25 12:53:27,Продажа от 2х штук; меньше по согласованию.,1627952538
3,99084,,8101,,4932,Kama,175/80R16 Кама И-511 88Q шип (без камеры),,,,...,,,Auto,Шины,,https://zavgar.ru/uploads/catalog_items/images...,"[8101, 3319, 1403, 2812, 719811]",2023-05-30 06:10:51,Продажа от 2х штук; меньше по согласованию.,1651213808
4,99084,,8101,,4932,Kama,185/60R14 КАМА-365 (НК-241) 82H,,,,...,,,Auto,Шины,,https://zavgar.ru/uploads/catalog_items/images...,[8101],2023-07-27 10:40:31,Продажа от 2х штук; меньше по согласованию.,1622530518
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6066,96794,,8101,,4932,Kama,Автошина КАМА 365 LT (НК-243) 185/75 R16С 104/...,185,,,...,,,Auto,Все товары :: Авто :: Шины и диски :: Шины,КАМА,https://avatars.mds.yandex.net/get-mpic/448422...,[8101],2023-05-31 19:52:55,,1696364312
6067,96794,,8101,,4932,Kama,135/80 R12 Кама-503 68Q шип.,,,,...,,Active,Auto,Все товары :: Авто :: Шины и диски :: Шины,КАМА,https://avatars.mds.yandex.net/get-mpic/522193...,[8101],2023-08-15 01:15:45,,1696350809
6068,96794,,8101,,4932,Kama,Шина летняя R16C 185/75 104/102N Кама Euro-131,185,,,...,,,Auto,Все товары :: Авто :: Шины и диски :: Шины,КАМА,https://avatars.mds.yandex.net/get-mpic/542530...,[8101],2023-04-13 06:37:43,,1696284765
6069,96794,,8101,,4932,Kama,Автошина КАМА 365 LT (НК-243) 175 R16С 98/96 N,,,,...,,,Auto,Все товары :: Авто :: Шины и диски :: Шины,КАМА,https://avatars.mds.yandex.net/get-mpic/448422...,[8101],2023-05-24 14:21:51,,1696212025


In [4]:
def process_text(input_text):
    """Insert spaces between digit runs, non-digit runs and bracket/pipe symbols.

    Steps, in order:
      1. Split the text into alternating runs of digits and non-digits and
         join the runs with a single space.
      2. Surround each of ``| ( ) [ ]`` with spaces.
      3. Collapse every run of two or more spaces into one space.

    Leading/trailing whitespace is not stripped.

    Args:
        input_text: String to normalise.

    Returns:
        The re-spaced string.
    """
    # Step 1: digit / non-digit runs separated by a space.
    runs = re.findall(r'(\D+|\d+)', input_text)
    spaced = ' '.join(runs)

    # Step 2: pad the delimiter characters.
    padded_symbols = (
        ("|", " | "),
        ("(", " ( "),
        (")", " ) "),
        ("[", " [ "),
        ("]", " ] "),
    )
    for symbol, padded in padded_symbols:
        spaced = spaced.replace(symbol, padded)

    # Step 3: squeeze repeated spaces.
    return re.sub(r'  +', ' ', spaced)

def unfuck_offer(text):
    """Remove the single space that separates a number from ``R``, ``/`` or ``\\``.

    Six substitutions are applied one after another, each on the output of
    the previous one:
      ``R <digits>`` -> ``R<digits>``,   ``<digits> R`` -> ``<digits>R``,
      ``/ <digits>`` -> ``/<digits>``,   ``<digits> /`` -> ``<digits>/``,
      ``\\ <digits>`` -> ``\\<digits>``, ``<digits> \\`` -> ``<digits>\\``.

    Only an uppercase Latin ``R`` and exactly one space character are matched.

    Args:
        text: Offer string.

    Returns:
        The string with those separators glued to the adjacent number.
    """
    # (pattern, replacement) pairs; order matters because each rule sees the
    # result of the previous one.
    rules = (
        (r'(R) (\d+)', r'R\2'),
        (r'(\d+) (R)', r'\1R'),
        (r'(\/) (\d+)', r'/\2'),
        (r'(\d+) (\/)', r'\1/'),
        (r'(\\) (\d+)', r'\\\2'),
        (r'(\d+) (\\)', r'\1\\'),
    )
    cleaned = text
    for regex, replacement in rules:
        cleaned = re.sub(regex, replacement, cleaned)
    return cleaned

# Two differently pre-processed copies of the offer text; `df` itself is left untouched.
# CANINE variant: separators glued to the neighbouring numbers.
df_CANINE = df.assign(PRICE_NAME=df['PRICE_NAME'].apply(unfuck_offer))

# BERT-style variant: digits, letters and brackets separated by spaces.
df_BERT = df.assign(PRICE_NAME=df['PRICE_NAME'].apply(process_text))

print(df_CANINE)
print(df_BERT)

                                             PRICE_NAME
0                             165/70R13 Кама НК-244 79N
1                    175/70R13 Кама Alga НК-531 82T шип
2                         175/70R13 Кама НК-531 82T шип
3             175/80R16 Кама И-511 88Q шип (без камеры)
4                       185/60R14 КАМА-365 (НК-241) 82H
...                                                 ...
6066  Автошина КАМА 365 LT (НК-243) 185/75R16С 104/1...
6067                        135/80R12 Кама-503 68Q шип.
6068     Шина летняя R16C 185/75 104/102N Кама Euro-131
6069      Автошина КАМА 365 LT (НК-243) 175R16С 98/96 N
6070                       Kama Euro-518 155/65R13 73 T

[6071 rows x 1 columns]
                                             PRICE_NAME
0                       165 / 70 R 13 Кама НК- 244 79 N
1              175 / 70 R 13 Кама Alga НК- 531 82 T шип
2                   175 / 70 R 13 Кама НК- 531 82 T шип
3     175 / 80 R 16 Кама И- 511 88 Q шип ( без камер...
4              185 / 60

### Инференс LUKE (на тексте с BERT-препроцессингом)

In [5]:
# Run NER over the space-separated offers. `model` / `tokenizer` are the LUKE
# checkpoint loaded in the first cell, despite the `df_BERT` name.
# NOTE(review): df_BERT is reassigned from itself, and apply_on_df also adds
# the *_pred columns to the frame it receives — re-running this cell
# recomputes and overwrites them.
df_BERT = apply_on_df(model, tokenizer, df_BERT, device, column = 'PRICE_NAME')
df_BERT

Unnamed: 0,PRICE_NAME,brand_pred,width_pred,height_pred,radius_pred,line_pred,v_ind_pred
0,165 / 70 R 13 Кама НК- 244 79 N,��ама,165,70,R 13,�- 244,79
1,175 / 70 R 13 Кама Alga НК- 531 82 T шип,�ама,175,70,R 13,Alga 531,82
2,175 / 70 R 13 Кама НК- 531 82 T шип,�ама,175,70,R 13,��- 531,82
3,175 / 80 R 16 Кама И- 511 88 Q шип ( без камер...,��ама,175,80,R 16,��- 511 ( ),88
4,185 / 60 R 14 КАМА- 365 ( НК- 241 ) 82 H,,185,60,R 14,,82
...,...,...,...,...,...,...,...
6066,Автошина КАМА 365 LT ( НК- 243 ) 185 / 75 R 16...,,185,75,R 16,365 ( 243,104 Q
6067,135 / 80 R 12 Кама- 503 68 Q шип.,��ама,135,80,R 12,- 503,68 Q
6068,Шина летняя R 16 C 185 / 75 104 / 102 N Кама E...,,185,75,R 16,,104
6069,Автошина КАМА 365 LT ( НК- 243 ) 175 R 16 С 98...,,175,,R 16,365 243,98


In [6]:
def process_digits(txt):
    """Return ``txt`` with everything except ASCII digits and ASCII letters removed.

    Spaces, punctuation and non-Latin characters (e.g. Cyrillic) are dropped.
    """
    not_alnum_ascii = r"[^0123456789A-Za-z]"
    return re.sub(not_alnum_ascii, "", txt)
def process_brand_line(txt):
    """Return ``txt`` keeping only digits, Latin/Cyrillic letters, ``/`` and spaces.

    Everything else (hyphens, brackets, dots, undecodable replacement
    characters, ...) is removed. Whitespace is not collapsed or stripped.
    """
    # Ё/ё (U+0401 / U+0451) sit outside the contiguous А-я range, so they have
    # to be listed explicitly or they would be stripped with the punctuation.
    return re.sub(r"[^0123456789A-Za-zА-Яа-яЁё/ ]","", txt)

# Copy the cleaned predictions onto the source table (aligned on the index).
# Numeric-like fields keep only ASCII digits/letters.
for _pred_col in ('width_pred', 'height_pred', 'radius_pred', 'v_ind_pred'):
    df_original[_pred_col] = df_BERT[_pred_col].apply(process_digits)

# Brand predictions are not exported for now:
# df_original['brand_pred'] = df_BERT['brand_pred'].apply(process_brand_line)
df_original['line_pred'] = df_BERT['line_pred'].apply(process_brand_line)

df_original

Unnamed: 0,PRICE_FRNO,PRICE_FNAM,PRICE_CAID,PRICE_CNAM,PRICE_TMNO,Brand,PRICE_NAME,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,PRICE_IMGN,PRICE_LOCD,lineNumber,PRICE_SALES,PRICE_PLID,width_pred,height_pred,radius_pred,v_ind_pred,line_pred
0,99084,,8101,,4932,Kama,165/70R13 Кама НК-244 79N,,,,...,https://zavgar.ru/uploads/catalog_items/images...,"[8101, 832132, 1403, 2002]",2023-08-16 15:25:16,Продажа от 2х штук; меньше по согласованию.,1622539920,165,70,R13,79,244
1,99084,,8101,,4932,Kama,175/70R13 Кама Alga НК-531 82T шип,,,,...,https://zavgar.ru/uploads/catalog_items/images...,[8101],2023-08-09 08:45:04,Продажа от 2х штук; меньше по согласованию.,1628982103,175,70,R13,82,Alga 531
2,99084,,8101,,4932,Kama,175/70R13 Кама НК-531 82T шип,,,,...,,[8101],2022-11-25 12:53:27,Продажа от 2х штук; меньше по согласованию.,1627952538,175,70,R13,82,531
3,99084,,8101,,4932,Kama,175/80R16 Кама И-511 88Q шип (без камеры),,,,...,https://zavgar.ru/uploads/catalog_items/images...,"[8101, 3319, 1403, 2812, 719811]",2023-05-30 06:10:51,Продажа от 2х штук; меньше по согласованию.,1651213808,175,80,R16,88,511
4,99084,,8101,,4932,Kama,185/60R14 КАМА-365 (НК-241) 82H,,,,...,https://zavgar.ru/uploads/catalog_items/images...,[8101],2023-07-27 10:40:31,Продажа от 2х штук; меньше по согласованию.,1622530518,185,60,R14,82,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6066,96794,,8101,,4932,Kama,Автошина КАМА 365 LT (НК-243) 185/75 R16С 104/...,185,,,...,https://avatars.mds.yandex.net/get-mpic/448422...,[8101],2023-05-31 19:52:55,,1696364312,185,75,R16,104Q,365 243
6067,96794,,8101,,4932,Kama,135/80 R12 Кама-503 68Q шип.,,,,...,https://avatars.mds.yandex.net/get-mpic/522193...,[8101],2023-08-15 01:15:45,,1696350809,135,80,R12,68Q,503
6068,96794,,8101,,4932,Kama,Шина летняя R16C 185/75 104/102N Кама Euro-131,185,,,...,https://avatars.mds.yandex.net/get-mpic/542530...,[8101],2023-04-13 06:37:43,,1696284765,185,75,R16,104,
6069,96794,,8101,,4932,Kama,Автошина КАМА 365 LT (НК-243) 175 R16С 98/96 N,,,,...,https://avatars.mds.yandex.net/get-mpic/448422...,[8101],2023-05-24 14:21:51,,1696212025,175,,R16,98,365 243


In [7]:
# Frequency of each cleaned radius prediction, used as a quick sanity check.
dict(df_original.radius_pred.value_counts())

{'R16': 2038,
 'R14': 1241,
 'R15': 781,
 'R13': 426,
 'R': 193,
 '': 120,
 'R22': 120,
 '14': 119,
 '5R': 108,
 'R20': 107,
 '16': 95,
 '19R14': 93,
 'R12': 75,
 'R17': 66,
 '1R': 59,
 '15': 42,
 '13': 39,
 'R18': 36,
 '18R13': 36,
 '19R13': 26,
 '16R': 25,
 'R19': 22,
 '20': 21,
 '15R': 20,
 '515R': 16,
 '19R': 14,
 'R1419': 12,
 '14R': 10,
 '2R': 7,
 'R24': 5,
 '12': 4,
 '18': 4,
 '1914': 4,
 '519R': 4,
 '18R': 4,
 '6R': 3,
 '22': 3,
 'r16': 3,
 'R21': 3,
 'R1319': 3,
 '15R15': 3,
 '1': 3,
 '1419': 3,
 'R1318': 3,
 'R10': 2,
 '17': 2,
 '1414': 2,
 'r14': 2,
 'R2016': 2,
 '1322': 2,
 '20R': 2,
 '13R': 2,
 '19R16': 2,
 '1919R14': 2,
 '1813': 2,
 'R155': 2,
 'R2012': 1,
 'r13': 1,
 'R2018': 1,
 '10': 1,
 'r17': 1,
 'R1322': 1,
 '165': 1,
 '19': 1,
 '16r20': 1,
 'R2112': 1,
 'R5': 1,
 '16R20': 1,
 '1919R13': 1,
 '1420': 1,
 '1514': 1,
 '22R16': 1,
 '2016': 1,
 '14R8': 1,
 '1218': 1,
 '1515': 1,
 '1020': 1,
 '1620': 1,
 '6': 1,
 '1516': 1,
 '282R': 1,
 '515': 1,
 '21': 1,
 'r15': 1}

In [8]:
# Write the source table together with the added *_pred columns to pth_dst.
df_original.to_excel(pth_dst)

### Проверка CANINE

In [9]:
# df_CANINE = apply_on_df(model_CANINE, tokenizer_CANINE, df_CANINE, device, column = 'PRICE_NAME')
# df_CANINE