### Загрузка нейронок

In [1]:
from transformers import CanineTokenizer, CanineForTokenClassification, BertTokenizer, BertForTokenClassification, pipeline
import pandas as pd
import re

def load_model_CANINE(model_pth, device, label2id, id2label):
    """Load a CANINE tokenizer and token-classification model from one checkpoint.

    Args:
        model_pth: Path or identifier given to ``from_pretrained`` for both
            the tokenizer and the model.
        device: Target passed to ``model.to`` (e.g. ``"cpu"``).
        label2id: Mapping from label name to class id.
        id2label: Mapping from class id to label name; its length is used
            as ``num_labels``.

    Returns:
        A ``(tokenizer, model)`` tuple with the model moved to ``device``.
    """
    canine_tokenizer = CanineTokenizer.from_pretrained(model_pth)

    # Classification-head settings forwarded to the model config.
    head_kwargs = {
        "num_labels": len(id2label),
        "id2label": id2label,
        "label2id": label2id,
    }
    canine_model = CanineForTokenClassification.from_pretrained(model_pth, **head_kwargs)
    return canine_tokenizer, canine_model.to(device)

def load_model_BERT(model_pth, device, label2id, id2label):
    """Load a BERT token-classification model and its tokenizer from one checkpoint.

    Args:
        model_pth: Path or identifier given to ``from_pretrained`` for both
            the model and the tokenizer.
        device: Target passed to ``model.to`` (e.g. ``"cpu"``).
        label2id: Mapping from label name to class id.
        id2label: Mapping from class id to label name; its length is used
            as ``num_labels``.

    Returns:
        A ``(tokenizer, model)`` tuple with the model moved to ``device``.
    """
    # Classification-head settings forwarded to the model config.
    head_kwargs = {
        "num_labels": len(id2label),
        "id2label": id2label,
        "label2id": label2id,
    }
    bert_model = BertForTokenClassification.from_pretrained(model_pth, **head_kwargs)
    bert_tokenizer = BertTokenizer.from_pretrained(model_pth)
    return bert_tokenizer, bert_model.to(device)

def inference(offer, model, tokenizer):
    """Run the NER pipeline on ``offer`` and return its output.

    A new ``transformers`` pipeline (task ``"ner"``, aggregation strategy
    ``"simple"``) is created on every call.

    Args:
        offer: Input passed straight to the pipeline.
        model: Token-classification model.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        Whatever the pipeline returns for ``offer``.
    """
    ner = pipeline(
        task="ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
    )
    predictions = ner(offer)
    return predictions

def brand_line_width_height_radius(result, min_score=0.7):
    """Collapse NER pipeline output into one string per entity group.

    Args:
        result: Iterable of dicts, each with the keys ``'entity_group'``,
            ``'word'`` and ``'score'``.
        min_score: Confidence cut-off. Only items whose score is strictly
            greater than this value are kept. Defaults to ``0.7``, the
            previously hard-coded threshold.

    Returns:
        Dict mapping each entity group that has at least one kept item to the
        concatenation of its words, in input order and with no separator.
    """
    words_by_group = {}
    for item in result:
        entity_group = item['entity_group']
        word = item['word']
        score = item['score']
        if score > min_score:
            words_by_group.setdefault(entity_group, []).append(word)

    # Fragments are glued together without a separator.
    return {group: ''.join(words) for group, words in words_by_group.items()}

def apply_on_df(model, tokenizer, df, column = 'offer'):
    """Run NER over one text column and store the entities as ``*_pred`` columns.

    Args:
        model: Token-classification model handed to the ``transformers`` pipeline.
        tokenizer: Tokenizer matching ``model``.
        df: Frame holding the offers. It is modified in place.
        column: Name of the column that contains the offer text.

    Returns:
        The same ``df`` object with ``brand_pred``, ``width_pred``,
        ``height_pred``, ``radius_pred``, ``line_pred`` and ``v_ind_pred``
        filled in. An entity that was not extracted is stored as ``''``.
    """
    # Output column -> entity group name looked up in the extracted entities.
    pred_columns = {
        'brand_pred': 'brand',
        'width_pred': 'width',
        'height_pred': 'height',
        'radius_pred': 'radius',
        'line_pred': 'line',
        'v_ind_pred': 'v_ind',
    }

    # Build the pipeline once; constructing it for every row is pure overhead.
    pipe = pipeline(task="ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

    extracted = [brand_line_width_height_radius(pipe(offer)) for offer in df[column]]

    # Whole-column assignment creates the prediction columns even when the
    # frame has no rows, so downstream cells can always rely on them.
    for pred_column, entity_group in pred_columns.items():
        df[pred_column] = [entities.get(entity_group, '') for entities in extracted]
    return df

# Tag inventory shared by both models: tag name -> class id.
label2id = {
    'B-width': 1,
    'B-height': 2,
    'B-radius': 3,
    'I-radius': 4,
    'B-brand': 5,
    'B-line': 6,
    'I-line': 7,
    'B-v_ind': 8,
    'I-v_ind': 9,
    'O': 0,
}

# Reverse lookup (class id -> tag name), in the same entry order as label2id.
id2label = {class_id: tag for tag, class_id in label2id.items()}

# Both models are placed on this device.
device = "cpu"

# CANINE checkpoint.
model_pth = "/home/sondors/CANINE-epoch_4"
tokenizer_CANINE, model_CANINE = load_model_CANINE(
    model_pth=model_pth,
    device=device,
    label2id=label2id,
    id2label=id2label,
)

# BERT checkpoint; note that model_pth is reused and now points here.
model_pth = "/home/sondors/bert-base-uncased-epoch_7"
tokenizer, model = load_model_BERT(
    model_pth=model_pth,
    device=device,
    label2id=label2id,
    id2label=id2label,
)


Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


### Загрузка данных

In [2]:
# Source spreadsheet; the commented-out paths are alternative inputs.
pth = "/home/sondors/Documents/price/BERT_NER/csv_to_label/Gislaved.xlsx"
# pth = "/home/sondors/Documents/price/BERT_NER/csv_to_label/Nordman.xlsx"
# pth = "/home/sondors/Documents/price/BERT_NER/csv_to_label/Pirelli.xlsx"
# pth = "/home/sondors/Documents/price/BERT_NER/csv_to_label/Yokohama.xlsx"
# pth = "/home/sondors/Documents/price/BERT_NER/csv_to_label/Кама.xlsx"

# Destination of the labelled spreadsheet written at the end of the notebook.
pth_dst = "/home/sondors/Documents/price/BERT_NER/csv_to_label/bert-base-uncased-241123/Gislaved_Igor241123.xlsx"

# dtype=str is requested for every column.
df_original = pd.read_excel(pth, dtype=str)

# Working frame that holds only the offer-name column.
df = df_original[['PRICE_NAME']].copy()
# df['Brand'] = df_original['Brand']

df

Unnamed: 0,PRICE_NAME
0,14/185 Gislaved Nord Frost Van SD 102/100Q 8PR
1,155/65 R14 Gislaved Soft Frost 200 75T
2,155/70 R13 Gislaved Nord Frost 200 HD 75T
3,155/70 R13 Gislaved Nord Frost 200 ID 75T шип
4,16/205/55 Gislaved Soft Frost 200 94T XL
...,...
4637,Легковые шины Gislaved Nord*Frost 200 SUV 215/...
4638,Легковые шины Gislaved Soft*Frost 200 SUV 215/...
4639,Gislaved Soft Frost 200 SUV 235/55 R19 105T зи...
4640,235/65 R17 108T SF200 SUV XL FR Gislaved а/шина


In [3]:
# Display the source frame as loaded, with all of its columns.
df_original

Unnamed: 0,PRICE_FRNO,PRICE_FNAM,PRICE_CAID,PRICE_CNAM,PRICE_TMNO,Brand,PRICE_NAME,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,PRICE_PCOD,date,PRICE_RSLT,kpl,ppl,PRICE_IMGN,PRICE_LOCD,lineNumber,PRICE_SALES,PRICE_PLID
0,61852,,8101,,7365,Gislaved,14/185 Gislaved Nord Frost Van SD 102/100Q 8PR,,,,...,455006,,Auto,Шины,Gislaved,https://euro-diski.ru/upload/iblock/54e/9890ed...,[8101],44932.77446759259,,1655305854
1,61852,,8101,,7365,Gislaved,155/65 R14 Gislaved Soft Frost 200 75T,,,,...,348152,Active,Auto,Шины,Gislaved,https://euro-diski.ru/upload/iblock/04b/dbcb85...,[8101],45155.68755787037,,1628286460
2,61852,,8101,,7365,Gislaved,155/70 R13 Gislaved Nord Frost 200 HD 75T,,,,...,348200,Active,Auto,Шины,Gislaved,https://euro-diski.ru/upload/iblock/cae/4a8486...,[8101],45154.097708333335,,1628297189
3,61852,,8101,,7365,Gislaved,155/70 R13 Gislaved Nord Frost 200 ID 75T шип,,,,...,348003,,Auto,Шины,Gislaved,https://euro-diski.ru/upload/iblock/fae/98d76f...,[8101],45119.503541666665,,1655173261
4,96794,,8101,,7365,Gislaved,16/205/55 Gislaved Soft Frost 200 94T XL,,,,...,,,Auto,Все товары :: Авто :: Шины и диски :: Шины,Gislaved,https://avatars.mds.yandex.net/get-mpic/452559...,[8101],45135.841099537036,,1695881808
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4637,96794,,8101,,7365,Gislaved,Легковые шины Gislaved Nord*Frost 200 SUV 215/...,215,,,...,,,Auto,Все товары :: Авто :: Шины и диски :: Шины,Gislaved,https://avatars.mds.yandex.net/get-mpic/164436...,[8101],2023-06-24 11:40:45,,1696351747
4638,96794,,8101,,7365,Gislaved,Легковые шины Gislaved Soft*Frost 200 SUV 215/...,215,,,...,,,Auto,Все товары :: Авто :: Шины и диски :: Шины,Gislaved,https://avatars.mds.yandex.net/get-mpic/528853...,[8101],2023-07-01 04:48:49,,1696330169
4639,96794,,8101,,7365,Gislaved,Gislaved Soft Frost 200 SUV 235/55 R19 105T зи...,235,,,...,,Active,Auto,Все товары :: Авто :: Шины и диски :: Шины,Gislaved,https://avatars.mds.yandex.net/get-mpic/528853...,[8101],2023-08-14 21:45:54,,1696316775
4640,96794,,8101,,7365,Gislaved,235/65 R17 108T SF200 SUV XL FR Gislaved а/шина,235,,,...,,,Auto,Все товары :: Авто :: Шины и диски :: Шины,Gislaved,https://avatars.mds.yandex.net/get-mpic/528853...,[8101],2023-05-20 13:42:18,,1696303595


In [4]:
def process_text(input_text):
    """Insert spaces between digit runs, non-digit runs and bracket/pipe characters.

    Args:
        input_text: String to normalise.

    Returns:
        The text with a single space at every digit/non-digit boundary and
        around each of ``| ( ) [ ]``; runs of two or more spaces are collapsed
        to one. Leading and trailing spaces are not stripped.
    """
    # Alternating runs of non-digits and digits, in order of appearance.
    chunks = re.findall(r'(\D+|\d+)', input_text)
    spaced = ' '.join(chunks)

    # Pad every delimiter with a space on each side.
    padded_delimiters = (
        ("|", " | "),
        ("(", " ( "),
        (")", " ) "),
        ("[", " [ "),
        ("]", " ] "),
    )
    for delimiter, padded in padded_delimiters:
        spaced = spaced.replace(delimiter, padded)

    # Collapse repeated spaces.
    return re.sub(r'  +', ' ', spaced)

def unfuck_offer(text):
    """Remove the single space between a number and an adjacent ``R``, ``/`` or ``\\``.

    The substitutions are applied one after another, in the order listed
    below, each on the result of the previous one.

    Args:
        text: Offer string.

    Returns:
        The text with e.g. ``"155 / 70 R 13"`` rewritten to ``"155/70R13"``.
    """
    # (pattern, replacement) pairs; the order matters.
    rewrites = (
        (r'(R) (\d+)', r'R\2'),      # "R 14"  -> "R14"
        (r'(\d+) (R)', r'\1R'),      # "65 R"  -> "65R"
        (r'(\/) (\d+)', r'/\2'),     # "/ 65"  -> "/65"
        (r'(\d+) (\/)', r'\1/'),     # "155 /" -> "155/"
        (r'(\\) (\d+)', r'\\\2'),    # "\ 65"  -> "\65"
        (r'(\d+) (\\)', r'\1\\'),    # "155 \" -> "155\"
    )

    processed_text = text
    for pattern, replacement in rewrites:
        processed_text = re.sub(pattern, replacement, processed_text)
    return processed_text

# CANINE input: spaces around "R", "/" and "\" next to numbers are removed.
df_CANINE = df.assign(PRICE_NAME=df['PRICE_NAME'].apply(unfuck_offer))

# BERT input: digit runs, letter runs and brackets are separated by spaces.
df_BERT = df.assign(PRICE_NAME=df['PRICE_NAME'].apply(process_text))

print(df_CANINE)
print(df_BERT)

                                             PRICE_NAME
0        14/185 Gislaved Nord Frost Van SD 102/100Q 8PR
1                 155/65R14 Gislaved Soft Frost 200 75T
2              155/70R13 Gislaved Nord Frost 200 HD 75T
3          155/70R13 Gislaved Nord Frost 200 ID 75T шип
4              16/205/55 Gislaved Soft Frost 200 94T XL
...                                                 ...
4637  Легковые шины Gislaved Nord*Frost 200 SUV 215/...
4638  Легковые шины Gislaved Soft*Frost 200 SUV 215/...
4639  Gislaved Soft Frost 200 SUV 235/55R19 105T зимняя
4640     235/65R17 108T SF200 SUV XL FR Gislaved а/шина
4641     Автошина Gislaved SOFT FROST 200 225/50R17 98T

[4642 rows x 1 columns]
                                             PRICE_NAME
0     14 / 185 Gislaved Nord Frost Van SD 102 / 100 ...
1            155 / 65 R 14 Gislaved Soft Frost 200 75 T
2         155 / 70 R 13 Gislaved Nord Frost 200 HD 75 T
3     155 / 70 R 13 Gislaved Nord Frost 200 ID 75 T шип
4         16 / 205 / 55

### BERT

In [5]:
# Tag every pre-processed offer with the BERT model; the *_pred columns are added to df_BERT.
df_BERT = apply_on_df(
    model=model,
    tokenizer=tokenizer,
    df=df_BERT,
    column='PRICE_NAME',
)
df_BERT

Unnamed: 0,PRICE_NAME,brand_pred,width_pred,height_pred,radius_pred,line_pred,v_ind_pred
0,14 / 185 Gislaved Nord Frost Van SD 102 / 100 ...,gi##sl##ave##d,185,,14,nord frost van,102q
1,155 / 65 R 14 Gislaved Soft Frost 200 75 T,gi##sl##ave##d,155,65,r 14,soft frost 200,75
2,155 / 70 R 13 Gislaved Nord Frost 200 HD 75 T,gi##sl##ave##d,155,70,r 13,nord frost 200,75
3,155 / 70 R 13 Gislaved Nord Frost 200 ID 75 T шип,gi##sl##ave##d,155,70,r 13,nord frost 200,75 t
4,16 / 205 / 55 Gislaved Soft Frost 200 94 T XL,gi##sl##ave##d,205,55,16,soft frost 200,94 t
...,...,...,...,...,...,...,...
4637,Легковые шины Gislaved Nord*Frost 200 SUV 215 ...,лgi##sl##ave##d,215,65,r 16,suv,102
4638,Легковые шины Gislaved Soft*Frost 200 SUV 215 ...,л##гgi##sl##ave##d,215,60,r 17,suv,96
4639,Gislaved Soft Frost 200 SUV 235 / 55 R 19 105 ...,gi##sl##ave##d,235,55,r 19,soft frost 200 suv,105 t
4640,235 / 65 R 17 108 T SF 200 SUV XL FR Gislaved ...,gi##sl##ave##d,235,65,r 17,suv,108 t


In [6]:
def process_digits(txt):
    """Strip every character that is not an ASCII digit or ASCII letter.

    Despite the name, Latin letters are kept as well as digits.

    Args:
        txt: String to clean.

    Returns:
        ``txt`` without spaces, punctuation and non-ASCII characters.
    """
    disallowed = r"[^0123456789A-Za-z]"
    return re.sub(disallowed, "", txt)
def process_brand_line(txt):
    """Keep only digits, Latin/Cyrillic letters, ``/`` and spaces.

    The character class previously read ``A_Z`` instead of ``A-Z``, which
    deleted every uppercase Latin letter except ``A`` and ``Z`` and let
    underscores through. ``Ё``/``ё`` are listed explicitly because they lie
    outside the ``А-Яа-я`` range.

    Args:
        txt: String to clean.

    Returns:
        ``txt`` with every other character removed.
    """
    return re.sub(r"[^0123456789A-Za-zА-Яа-яЁё/ ]", "", txt)

# Copy the cleaned predictions back onto the source frame.
for pred_column in ('width_pred', 'height_pred', 'radius_pred', 'v_ind_pred'):
    df_original[pred_column] = df_BERT[pred_column].apply(process_digits)

# df_original['brand_pred'] = df_BERT['brand_pred'].apply(process_brand_line)
df_original['line_pred'] = df_BERT['line_pred'].apply(process_brand_line)

df_original

Unnamed: 0,PRICE_FRNO,PRICE_FNAM,PRICE_CAID,PRICE_CNAM,PRICE_TMNO,Brand,PRICE_NAME,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,PRICE_IMGN,PRICE_LOCD,lineNumber,PRICE_SALES,PRICE_PLID,width_pred,height_pred,radius_pred,v_ind_pred,line_pred
0,61852,,8101,,7365,Gislaved,14/185 Gislaved Nord Frost Van SD 102/100Q 8PR,,,,...,https://euro-diski.ru/upload/iblock/54e/9890ed...,[8101],44932.77446759259,,1655305854,185,,14,102q,nord frost van
1,61852,,8101,,7365,Gislaved,155/65 R14 Gislaved Soft Frost 200 75T,,,,...,https://euro-diski.ru/upload/iblock/04b/dbcb85...,[8101],45155.68755787037,,1628286460,155,65,r14,75,soft frost 200
2,61852,,8101,,7365,Gislaved,155/70 R13 Gislaved Nord Frost 200 HD 75T,,,,...,https://euro-diski.ru/upload/iblock/cae/4a8486...,[8101],45154.097708333335,,1628297189,155,70,r13,75,nord frost 200
3,61852,,8101,,7365,Gislaved,155/70 R13 Gislaved Nord Frost 200 ID 75T шип,,,,...,https://euro-diski.ru/upload/iblock/fae/98d76f...,[8101],45119.503541666665,,1655173261,155,70,r13,75t,nord frost 200
4,96794,,8101,,7365,Gislaved,16/205/55 Gislaved Soft Frost 200 94T XL,,,,...,https://avatars.mds.yandex.net/get-mpic/452559...,[8101],45135.841099537036,,1695881808,205,55,16,94t,soft frost 200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4637,96794,,8101,,7365,Gislaved,Легковые шины Gislaved Nord*Frost 200 SUV 215/...,215,,,...,https://avatars.mds.yandex.net/get-mpic/164436...,[8101],2023-06-24 11:40:45,,1696351747,215,65,r16,102,suv
4638,96794,,8101,,7365,Gislaved,Легковые шины Gislaved Soft*Frost 200 SUV 215/...,215,,,...,https://avatars.mds.yandex.net/get-mpic/528853...,[8101],2023-07-01 04:48:49,,1696330169,215,60,r17,96,suv
4639,96794,,8101,,7365,Gislaved,Gislaved Soft Frost 200 SUV 235/55 R19 105T зи...,235,,,...,https://avatars.mds.yandex.net/get-mpic/528853...,[8101],2023-08-14 21:45:54,,1696316775,235,55,r19,105t,soft frost 200 suv
4640,96794,,8101,,7365,Gislaved,235/65 R17 108T SF200 SUV XL FR Gislaved а/шина,235,,,...,https://avatars.mds.yandex.net/get-mpic/528853...,[8101],2023-05-20 13:42:18,,1696303595,235,65,r17,108t,suv


In [7]:
# Frequency of every distinct predicted radius string.
dict(df_original['radius_pred'].value_counts())

{'r16': 1372,
 'r17': 1223,
 'r15': 744,
 'r18': 542,
 'r14': 277,
 'r19': 226,
 'r13': 46,
 '': 39,
 'r': 33,
 '17': 27,
 '16': 25,
 'r20': 23,
 '6r': 17,
 '16r': 12,
 '15': 9,
 '14': 9,
 '18': 6,
 '19': 5,
 '16r6': 2,
 '2r': 2,
 'rr16': 1,
 'rr15': 1,
 '2': 1}

In [8]:
# Write the source frame together with the prediction columns to pth_dst.
df_original.to_excel(excel_writer=pth_dst)

### Проверка CANINE

In [9]:
# df_CANINE = apply_on_df(model_CANINE, tokenizer_CANINE, df_CANINE, column = 'PRICE_NAME')
# df_CANINE