### Загрузка нейронок

In [1]:
from transformers import CanineTokenizer, CanineForTokenClassification, BertTokenizer, BertForTokenClassification, pipeline
import pandas as pd
import re

def load_model_CANINE(model_pth, device, label2id, id2label):
    """Load a CANINE token-classification checkpoint and its tokenizer.

    Args:
        model_pth: Location handed to ``from_pretrained`` for both the
            tokenizer and the model.
        device: Target passed to ``model.to``.
        label2id: Mapping from label name to class id, forwarded to the model.
        id2label: Mapping from class id to label name, forwarded to the model;
            its length is used as ``num_labels``.

    Returns:
        A ``(tokenizer, model)`` tuple; the model is the result of
        ``model.to(device)``.
    """
    canine_tokenizer = CanineTokenizer.from_pretrained(model_pth)

    # Classification-head settings, passed through as keyword arguments.
    head_settings = {
        "num_labels": len(id2label),
        "id2label": id2label,
        "label2id": label2id,
    }
    canine_model = CanineForTokenClassification.from_pretrained(model_pth, **head_settings)
    canine_model = canine_model.to(device)
    return canine_tokenizer, canine_model

def load_model_BERT(model_pth, device, label2id, id2label):
    """Load a BERT token-classification checkpoint and its tokenizer.

    Args:
        model_pth: Location handed to ``from_pretrained`` for both the model
            and the tokenizer.
        device: Target passed to ``model.to``.
        label2id: Mapping from label name to class id, forwarded to the model.
        id2label: Mapping from class id to label name, forwarded to the model;
            its length is used as ``num_labels``.

    Returns:
        A ``(tokenizer, model)`` tuple; the model is the result of
        ``model.to(device)``.
    """
    # Classification-head settings, passed through as keyword arguments.
    head_settings = {
        "num_labels": len(id2label),
        "id2label": id2label,
        "label2id": label2id,
    }
    bert_model = BertForTokenClassification.from_pretrained(model_pth, **head_settings)
    bert_tokenizer = BertTokenizer.from_pretrained(model_pth)
    bert_model = bert_model.to(device)
    return bert_tokenizer, bert_model

def inference(offer, model, tokenizer):
    """Run the NER pipeline on ``offer`` and return its raw output.

    A new ``"ner"`` pipeline with ``aggregation_strategy="simple"`` is built
    from ``model`` and ``tokenizer`` on every call.

    Args:
        offer: Input passed straight to the pipeline.
        model: Model object handed to ``pipeline``.
        tokenizer: Tokenizer object handed to ``pipeline``.

    Returns:
        Whatever the pipeline call returns for ``offer``.
    """
    ner_pipeline = pipeline(
        task="ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
    )
    predictions = ner_pipeline(offer)
    return predictions

def brand_line_width_height_radius(result, min_score=0.7):
    """Collapse NER predictions into one string per entity group.

    Args:
        result: Iterable of mappings, each with the keys ``'entity_group'``,
            ``'word'`` and ``'score'``.
        min_score: Confidence threshold. Only items whose score is strictly
            greater than this value are kept. Defaults to ``0.7``.

    Returns:
        A dict mapping each entity group that has at least one kept item to
        the concatenation of its kept words, in input order and with no
        separator between them.
    """
    words_by_group = {}
    for item in result:
        # Strict comparison: an item scoring exactly ``min_score`` is dropped.
        if item['score'] > min_score:
            words_by_group.setdefault(item['entity_group'], []).append(item['word'])

    # Build a fresh dict rather than rewriting values while iterating.
    return {group: ''.join(words) for group, words in words_by_group.items()}

def apply_on_df(model, tokenizer, df, column = 'offer'):
    """Run NER over one text column and store the predictions as new columns.

    The columns ``brand_pred``, ``width_pred``, ``height_pred``,
    ``radius_pred`` and ``line_pred`` are written onto ``df`` itself. An
    entity that was not predicted for a row is stored as an empty string.

    Args:
        model: Model object handed to ``pipeline``.
        tokenizer: Tokenizer object handed to ``pipeline``.
        df: DataFrame to annotate; it is modified in place.
        column: Name of the column holding the text to analyse.

    Returns:
        The same ``df`` object, with the five prediction columns set.
    """
    # Build the pipeline once: it depends only on the model and tokenizer,
    # so constructing it again for every row is wasted work.
    ner_pipeline = pipeline(task="ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

    entity_names = ('brand', 'width', 'height', 'radius', 'line')
    predictions = {name: [] for name in entity_names}
    for offer in df[column]:
        entities = brand_line_width_height_radius(ner_pipeline(offer))
        for name in entity_names:
            predictions[name].append(entities.get(name, ''))

    # Assign whole columns positionally. Label-based per-cell writes would
    # overwrite every row that shares an index label when the index has duplicates.
    for name in entity_names:
        df[f'{name}_pred'] = predictions[name]
    return df

# Tag scheme passed to both checkpoints below; "O" has class id 0.
label2id = {
    'B-width': 1,
    'B-height': 2,
    'B-radius': 3,
    'B-brand': 4,
    'B-line': 5,
    'I-line': 6,
    'O': 0,
}

# Inverse mapping, derived from label2id so the two dictionaries always agree.
id2label = {class_id: tag for tag, class_id in label2id.items()}

device = "cpu"

# CANINE checkpoint
model_pth = "/home/sondors/Documents/price/BERT_NER/weights/CANINE/our_data/epoch_3"
tokenizer_CANINE, model_CANINE = load_model_CANINE(model_pth, device, label2id, id2label)

# BERT checkpoint
model_pth = "/home/sondors/Documents/price/BERT_NER/weights/BERT/rubert-tiny2_our_data/epoch_13"
tokenizer, model = load_model_BERT(model_pth, device, label2id, id2label)


Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


### Загрузка данных

In [2]:
# Read the source spreadsheet with every column typed as str.
pth = "/home/sondors/Documents/price/BERT_NER/csv_to_label/Gislaved.xlsx"
df_original = pd.read_excel(pth, dtype=str)

# Working frame: an independent copy of just the offer title and brand columns.
df = df_original[['PRICE_NAME', 'Brand']].copy()

df

Unnamed: 0,PRICE_NAME,Brand
0,14/185 Gislaved Nord Frost Van SD 102/100Q 8PR,Gislaved
1,155/65 R14 Gislaved Soft Frost 200 75T,Gislaved
2,155/70 R13 Gislaved Nord Frost 200 HD 75T,Gislaved
3,155/70 R13 Gislaved Nord Frost 200 ID 75T шип,Gislaved
4,16/205/55 Gislaved Soft Frost 200 94T XL,Gislaved
...,...,...
4637,Легковые шины Gislaved Nord*Frost 200 SUV 215/...,Gislaved
4638,Легковые шины Gislaved Soft*Frost 200 SUV 215/...,Gislaved
4639,Gislaved Soft Frost 200 SUV 235/55 R19 105T зи...,Gislaved
4640,235/65 R17 108T SF200 SUV XL FR Gislaved а/шина,Gislaved


In [3]:
def process_text(input_text):
    """Separate digits from adjacent separators with single spaces.

    A space is inserted wherever a run of digits directly touches an
    uppercase ``R``, a forward slash or a backslash, on either side. Every
    ``|`` is then surrounded by spaces, and any run of two or more spaces is
    collapsed to one. For example ``155/65R14`` becomes ``155 / 65 R 14``.

    Args:
        input_text: The string to normalise.

    Returns:
        The normalised string.
    """
    # (pattern, replacement) pairs, applied in this order.
    spacing_rules = (
        (r'(R)(\d+)', r'R \2'),      # R14   -> R 14
        (r'(\d+)(R)', r'\1 R'),      # 65R   -> 65 R
        (r'(\/)(\d+)', r'/ \2'),     # /65   -> / 65
        (r'(\d+)(\/)', r'\1 /'),     # 155/  -> 155 /
        (r'(\\)(\d+)', r'\\ \2'),    # backslash directly before digits
        (r'(\d+)(\\)', r'\1 \\'),    # backslash directly after digits
    )
    spaced = input_text
    for regex, replacement in spacing_rules:
        spaced = re.sub(regex, replacement, spaced)

    # Surround every "|" with spaces.
    spaced = spaced.replace("|", " | ")

    # Collapse repeated spaces.
    return re.sub(r'  +', ' ', spaced)

def unfuck_offer(text):
    """Remove the single space between digits and an adjacent separator.

    This undoes the spacing around an uppercase ``R``, a forward slash and a
    backslash: ``155 / 65 R 14`` becomes ``155/65R14``. Only a single space
    is matched, and ``|`` is not touched.

    Args:
        text: The string to compact.

    Returns:
        The compacted string.
    """
    # NOTE(review): the "digits, space, R" rule also fires when the R starts a
    # longer word (e.g. "200 RF" -> "200RF"); confirm this is intended.
    joining_rules = (
        (r'(R) (\d+)', r'R\2'),      # R 14   -> R14
        (r'(\d+) (R)', r'\1R'),      # 65 R   -> 65R
        (r'(\/) (\d+)', r'/\2'),     # / 65   -> /65
        (r'(\d+) (\/)', r'\1/'),     # 155 /  -> 155/
        (r'(\\) (\d+)', r'\\\2'),    # backslash, space, digits
        (r'(\d+) (\\)', r'\1\\'),    # digits, space, backslash
    )
    compacted = text
    for regex, replacement in joining_rules:
        compacted = re.sub(regex, replacement, compacted)
    return compacted

# CANINE input: separators glued back onto the digits.
df_CANINE = df.assign(PRICE_NAME=df['PRICE_NAME'].apply(unfuck_offer))

# BERT input: separators spaced away from the digits.
df_BERT = df.assign(PRICE_NAME=df['PRICE_NAME'].apply(process_text))

print(df_CANINE)
print(df_BERT)

                                             PRICE_NAME     Brand
0        14/185 Gislaved Nord Frost Van SD 102/100Q 8PR  Gislaved
1                 155/65R14 Gislaved Soft Frost 200 75T  Gislaved
2              155/70R13 Gislaved Nord Frost 200 HD 75T  Gislaved
3          155/70R13 Gislaved Nord Frost 200 ID 75T шип  Gislaved
4              16/205/55 Gislaved Soft Frost 200 94T XL  Gislaved
...                                                 ...       ...
4637  Легковые шины Gislaved Nord*Frost 200 SUV 215/...  Gislaved
4638  Легковые шины Gislaved Soft*Frost 200 SUV 215/...  Gislaved
4639  Gislaved Soft Frost 200 SUV 235/55R19 105T зимняя  Gislaved
4640     235/65R17 108T SF200 SUV XL FR Gislaved а/шина  Gislaved
4641     Автошина Gislaved SOFT FROST 200 225/50R17 98T  Gislaved

[4642 rows x 2 columns]
                                             PRICE_NAME     Brand
0     14 / 185 Gislaved Nord Frost Van SD 102 / 100Q...  Gislaved
1             155 / 65 R 14 Gislaved Soft Frost 200

### Проверка CANINE

In [4]:
# Annotate the CANINE frame with the CANINE model's predictions.
df_CANINE = apply_on_df(
    model=model_CANINE,
    tokenizer=tokenizer_CANINE,
    df=df_CANINE,
    column='PRICE_NAME',
)
df_CANINE

Unnamed: 0,PRICE_NAME,Brand,brand_pred,width_pred,height_pred,radius_pred,line_pred
0,14/185 Gislaved Nord Frost Van SD 102/100Q 8PR,Gislaved,,,,1,laved Nrd Frost Van SD
1,155/65R14 Gislaved Soft Frost 200 75T,Gislaved,,155,65,14,laed Soft Frost 200 75
2,155/70R13 Gislaved Nord Frost 200 HD 75T,Gislaved,,155,70,1,drd Frost 200 HD 75
3,155/70R13 Gislaved Nord Frost 200 ID 75T шип,Gislaved,,155,70,1,ved rd Frost 200 I
4,16/205/55 Gislaved Soft Frost 200 94T XL,Gislaved,Gs,205,55,16,lavdt Frost 200 94 X
...,...,...,...,...,...,...,...
4637,Легковые шины Gislaved Nord*Frost 200 SUV 215/...,Gislaved,,215,65,1,ilaved Nord*Frost 200 S
4638,Легковые шины Gislaved Soft*Frost 200 SUV 215/...,Gislaved,,215,60,17,laved Soft*Frost 200 SU
4639,Gislaved Soft Frost 200 SUV 235/55R19 105T зимняя,Gislaved,Gislae,235,55,19,Frost 200 SUV
4640,235/65R17 108T SF200 SUV XL FR Gislaved а/шина,Gislaved,ашн,235,65,17,SUV X FR Gislaved


### Проверка BERT

In [6]:
# Annotate the BERT frame with the BERT model's predictions.
df_BERT = apply_on_df(
    model=model,
    tokenizer=tokenizer,
    df=df_BERT,
    column='PRICE_NAME',
)
df_BERT

Unnamed: 0,PRICE_NAME,Brand,brand_pred,width_pred,height_pred,radius_pred,line_pred
0,14 / 185 Gislaved Nord Frost Van SD 102 / 100Q...,Gislaved,,14,185,,
1,155 / 65 R 14 Gislaved Soft Frost 200 75T,Gislaved,,155,65,14,
2,155 / 70 R 13 Gislaved Nord Frost 200 HD 75T,Gislaved,,155,70,13,
3,155 / 70 R 13 Gislaved Nord Frost 200 ID 75T шип,Gislaved,,155,70,13,
4,16 / 205 / 55 Gislaved Soft Frost 200 94T XL,Gislaved,,16,55,,
...,...,...,...,...,...,...,...
4637,Легковые шины Gislaved Nord*Frost 200 SUV 215 ...,Gislaved,,215,65,16,
4638,Легковые шины Gislaved Soft*Frost 200 SUV 215 ...,Gislaved,,215,60,17,
4639,Gislaved Soft Frost 200 SUV 235 / 55 R 19 105T...,Gislaved,,235,55,19,
4640,235 / 65 R 17 108T SF200 SUV XL FR Gislaved а/...,Gislaved,,235,65,17,


In [7]:
# Copy the BERT size predictions onto the full original table.
for pred_column in ('width_pred', 'height_pred', 'radius_pred'):
    df_original[pred_column] = df_BERT[pred_column]

df_original

Unnamed: 0,PRICE_FRNO,PRICE_FNAM,PRICE_CAID,PRICE_CNAM,PRICE_TMNO,Brand,PRICE_NAME,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,kpl,ppl,PRICE_IMGN,PRICE_LOCD,lineNumber,PRICE_SALES,PRICE_PLID,width_pred,height_pred,radius_pred
0,61852,,8101,,7365,Gislaved,14/185 Gislaved Nord Frost Van SD 102/100Q 8PR,,,,...,Шины,Gislaved,https://euro-diski.ru/upload/iblock/54e/9890ed...,[8101],44932.77446759259,,1655305854,14,185,
1,61852,,8101,,7365,Gislaved,155/65 R14 Gislaved Soft Frost 200 75T,,,,...,Шины,Gislaved,https://euro-diski.ru/upload/iblock/04b/dbcb85...,[8101],45155.68755787037,,1628286460,155,65,14
2,61852,,8101,,7365,Gislaved,155/70 R13 Gislaved Nord Frost 200 HD 75T,,,,...,Шины,Gislaved,https://euro-diski.ru/upload/iblock/cae/4a8486...,[8101],45154.097708333335,,1628297189,155,70,13
3,61852,,8101,,7365,Gislaved,155/70 R13 Gislaved Nord Frost 200 ID 75T шип,,,,...,Шины,Gislaved,https://euro-diski.ru/upload/iblock/fae/98d76f...,[8101],45119.503541666665,,1655173261,155,70,13
4,96794,,8101,,7365,Gislaved,16/205/55 Gislaved Soft Frost 200 94T XL,,,,...,Все товары :: Авто :: Шины и диски :: Шины,Gislaved,https://avatars.mds.yandex.net/get-mpic/452559...,[8101],45135.841099537036,,1695881808,16,55,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4637,96794,,8101,,7365,Gislaved,Легковые шины Gislaved Nord*Frost 200 SUV 215/...,215,,,...,Все товары :: Авто :: Шины и диски :: Шины,Gislaved,https://avatars.mds.yandex.net/get-mpic/164436...,[8101],2023-06-24 11:40:45,,1696351747,215,65,16
4638,96794,,8101,,7365,Gislaved,Легковые шины Gislaved Soft*Frost 200 SUV 215/...,215,,,...,Все товары :: Авто :: Шины и диски :: Шины,Gislaved,https://avatars.mds.yandex.net/get-mpic/528853...,[8101],2023-07-01 04:48:49,,1696330169,215,60,17
4639,96794,,8101,,7365,Gislaved,Gislaved Soft Frost 200 SUV 235/55 R19 105T зи...,235,,,...,Все товары :: Авто :: Шины и диски :: Шины,Gislaved,https://avatars.mds.yandex.net/get-mpic/528853...,[8101],2023-08-14 21:45:54,,1696316775,235,55,19
4640,96794,,8101,,7365,Gislaved,235/65 R17 108T SF200 SUV XL FR Gislaved а/шина,235,,,...,Все товары :: Авто :: Шины и диски :: Шины,Gislaved,https://avatars.mds.yandex.net/get-mpic/528853...,[8101],2023-05-20 13:42:18,,1696303595,235,65,17


In [8]:
# Save the original table, now carrying the prediction columns, to a new workbook.
output_pth = "/home/sondors/Documents/price/BERT_NER/csv_to_label/Gislaved_Igor.xlsx"
df_original.to_excel(output_pth)