In [1]:
from transformers import BertTokenizer, BertForTokenClassification, pipeline, get_linear_schedule_with_warmup
from seqeval.metrics import classification_report
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
import pandas as pd
import torch
import os


In [2]:
from custom_train_loop import load_model, load_data, valid

In [3]:
# Tag -> class id mapping handed to load_model / load_data.
# NOTE(review): 'O' is presumably the "no entity" tag -- it is the only tag
# without a "B-" prefix; confirm against the training code.
label2id = {'B-width': 1,
        'B-height': 2,
        'B-radius': 3,
        'B-brand': 4,
        'B-line': 5,
        'O': 0}

# Reverse lookup derived from label2id so the two mappings cannot drift apart.
# Insertion order (1, 2, 3, 4, 5, 0) matches the previously hand-written dict.
id2label = {class_id: tag for tag, class_id in label2id.items()}

# model_pth = "/home/sondors/29887"
model_pth = '/home/sondors/weights/epoch_5'
# Use the GPU when one is visible; otherwise fall back to the CPU so the
# notebook can still be run on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# AutoModelForSequenceClassification.from_pretrained("path/to/model")
tokenizer, model = load_model(model_pth, device, label2id, id2label)


In [4]:
# Settings for building the train / test loaders from the labelled CSV.
pth = '/home/sondors/Documents/price/BERT_NER/csv/NER_2609.csv'  # labelled dataset passed to load_data
max_seq_length = 512   # forwarded to load_data as its last argument
train_batch_size = 150
val_batch_size = 150
train_size = 0.8       # split ratio; the recorded output shows 5614 train / 1404 test rows out of 7018

# load_data comes from custom_train_loop; arguments are positional, so their order matters.
training_loader, testing_loader = load_data(tokenizer, pth, train_size, label2id, train_batch_size, val_batch_size, max_seq_length)

FULL Dataset: (7018, 8)
TRAIN Dataset: (5614, 8)
TEST Dataset: (1404, 8)


In [5]:
# Evaluate the loaded checkpoint on the held-out loader. `valid` (from
# custom_train_loop) prints the loss / accuracy lines seen in the output and
# returns the per-entity report that is printed below.
report = valid(model, testing_loader, id2label, device)
print(f"{report}\n")

Validation loss per 100 evaluation steps: 0.001532487920485437
Validation Loss: 0.0012766074389219284
Validation Accuracy: 0.9939138061104377
              precision    recall  f1-score   support

       brand       1.00      1.00      1.00      4882
      height       1.00      0.99      1.00      1238
        line       0.90      0.93      0.91       803
      radius       0.99      0.99      0.99      1499
       width       0.99      1.00      1.00      1725

   micro avg       0.99      0.99      0.99     10147
   macro avg       0.98      0.98      0.98     10147
weighted avg       0.99      0.99      0.99     10147




In [6]:
def price_data_raw(pth="/home/sondors/Documents/price/BERT_NER/csv/list1.xlsx"):
    """Read the price-list workbook and drop a fixed set of columns.

    Args:
        pth: Path of the Excel workbook to read. Defaults to the location that
            used to be hard-coded inside this function, so existing zero-argument
            calls behave as before.

    Returns:
        A new DataFrame containing every column of the workbook except the
        ones listed in ``dropped_columns``.

    Raises:
        KeyError: If any of the listed columns is absent from the workbook
            (the default behaviour of ``DataFrame.drop``).
    """
    # Single list of labels to remove. The former second list ('kpl',
    # 'PRICE_NOTE', 'PRICE_PARAM') only repeated labels already present here.
    dropped_columns = [
        'PRICE_FRNO',
        'PRICE_FNAM',
        'PRICE_CAID',
        'PRICE_CNAM',
        'PRICE_TMNO',
        'PRICE_PRCN',
        'PRICE_VERT',
        'PRICE_PCOD',
        'date',
        'PRICE_IMGN',
        'PRICE_LOCD',
        'lineNumber',
        'PRICE_PRDS',
        'PRICE_DPRN',
        'PRICE_PTYP',
        'PRICE_PROMO',
        'PRICE_CURR',
        'PRICE_LOCA',
        'PRICE_WARR',
        'PRICE_RSLT',
        'PRICE_PLID',
        'PRICE_SALES',
        'PRICE_DTSH',
        'PRICE_BARCODE',
        'комментарий',
        'kpl',
        'PRICE_PARAM',
        'PRICE_NOTE',
    ]

    workbook = pd.read_excel(pth)
    return workbook.drop(columns=dropped_columns)
# Load the trimmed price list once; later cells index into `df`.
df = price_data_raw()
# Bare last expression -> rich display of the frame.
df

Unnamed: 0,PRICE_NAME,Ширина,Высота,Радиус,Инд. Скорост,XL,RFT,Линейка,ppl
0,"10,5/31 R15 Bridgestone Dueler A/T 001 109S",,,,109S,,,Dueler,Bridgestone
1,12067 Bridgestone Автошина R20 275/50 Bridgest...,275,50,20,113R,XL,,Blizzak DM-V2,Bridgestone
2,12088 BRIDGESTONE Автошина R20 255/50 Bridgest...,255,50,20,109T,XL,,Blizzak DM-V2,Bridgestone
3,13/175/70 Bridgestone Ice Cruiser 7000S 82T ш,175,70,17,82S,,,Ice Cruiser 7000S,Bridgestone
4,13602 BRIDGESTONE Автошина R18 275/35 Bridgest...,275,35,18,95S,,,Blizzak Ice,Bridgestone
...,...,...,...,...,...,...,...,...,...
23143,"Бриджстоун 2657015 T 112 AT001, BRIDGESTONE 12...",,,,,,,,Bridgestone
23144,"Бриджстоун 2753518 S 95 VRX, BRIDGESTONE 8393 ...",,,,,,,Blizzak VRX,Bridgestone
23145,Бриджстоун 315/70r22.5 R249 Tl 152/154 L/M Рул...,315,70,15,152/154,,,,BRIDGESTONE
23146,шинаBridgestone LT245/75R16 120Q Dueler M/T 674,245,75,16,120Q,,,Dueler,Bridgestone


In [10]:
# Take the offer title of the row at position 1 as the sample input for inference.
offer_1 = df.iloc[1]['PRICE_NAME']
offer_1

'12067 Bridgestone Автошина R20 275/50 Bridgestone Blizzak DM-V2 113R XL зима 12067'

In [None]:
[{'entity_group': 'line',
  'score': 0.6152378,
  'word': '120',
  'start': None,
  'end': None},
 {'entity_group': 'line',
  'score': 0.7517311,
  'word': '##6',
  'start': None,
  'end': None},
 {'entity_group': 'brand',
  'score': 0.43341485,
  'word': '##7',
  'start': None,
  'end': None},
 {'entity_group': 'brand',
  'score': 0.92565125,
  'word': 'bridges',
  'start': None,
  'end': None},
 {'entity_group': 'brand',
  'score': 0.8361361,
  'word': '##tone',
  'start': None,
  'end': None},
 {'entity_group': 'width',
  'score': 0.9795234,
  'word': '275',
  'start': None,
  'end': None},
 {'entity_group': 'height',
  'score': 0.93771434,
  'word': '50',
  'start': None,
  'end': None},
 {'entity_group': 'brand',
  'score': 0.6648184,
  'word': 'bridges',
  'start': None,
  'end': None}]

In [11]:
def inference(offer, model, tokenizer):
    """Run the token-classification pipeline on a single offer string.

    The pipeline is created on the device the model currently lives on.
    Calling ``model.to("cpu")`` here would move the caller's model in place,
    leaving the shared notebook model on the CPU for every later cell.

    Args:
        offer: Text to tag.
        model: Token-classification model; must expose ``parameters()`` whose
            items carry a ``device`` (as a ``torch.nn.Module`` does).
        tokenizer: Tokenizer to pair with ``model``.

    Returns:
        The pipeline's result for ``offer``. In the recorded outputs this is a
        list of dicts with ``entity_group``, ``score``, ``word``, ``start`` and
        ``end`` keys.
    """
    # Read the device from the weights instead of relocating the model.
    model_device = next(model.parameters()).device
    pipe = pipeline(
        task="token-classification",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
        device=model_device,
    )
    return pipe(offer)

# Tag the raw (not yet normalised) offer string and display the entities.
result = inference(offer_1, model, tokenizer)
result

[{'entity_group': 'line',
  'score': 0.6152378,
  'word': '120',
  'start': None,
  'end': None},
 {'entity_group': 'line',
  'score': 0.7517311,
  'word': '##6',
  'start': None,
  'end': None},
 {'entity_group': 'brand',
  'score': 0.43341485,
  'word': '##7',
  'start': None,
  'end': None},
 {'entity_group': 'brand',
  'score': 0.92565125,
  'word': 'bridges',
  'start': None,
  'end': None},
 {'entity_group': 'brand',
  'score': 0.8361361,
  'word': '##tone',
  'start': None,
  'end': None},
 {'entity_group': 'width',
  'score': 0.9795234,
  'word': '275',
  'start': None,
  'end': None},
 {'entity_group': 'height',
  'score': 0.93771434,
  'word': '50',
  'start': None,
  'end': None},
 {'entity_group': 'brand',
  'score': 0.6648184,
  'word': 'bridges',
  'start': None,
  'end': None}]

In [13]:
import re
def process_text(input_text):
    """Normalise the spacing of an offer string.

    Steps, applied in this order:
      1. Insert a space between an uppercase Latin "R" and a digit that is
         glued to it, on either side ("R20" -> "R 20", "113R" -> "113 R").
      2. Surround every forward slash, backslash and vertical bar with spaces.
      3. Collapse each run of two or more spaces into a single space.

    Args:
        input_text: The string to normalise.

    Returns:
        The normalised string. Leading / trailing spaces are not stripped.
    """
    text = input_text

    # Separate "R" from an adjacent digit: first "R<digit>", then "<digit>R".
    r_digit_rules = (
        (r'(R)(\d)', r'R \2'),
        (r'(\d)(R)', r'\1 R'),
    )
    for pattern, replacement in r_digit_rules:
        text = re.sub(pattern, replacement, text)

    # Pad each separator character with one space on both sides.
    for separator in ("/", "\\", "|"):
        text = text.replace(separator, " " + separator + " ")

    # Remove the repeated spaces the padding may have produced.
    return re.sub(r'  +', ' ', text)

# Normalise the sample offer; the result is fed to inference in a later cell.
offer_1_processed = process_text(offer_1)
offer_1_processed

'12067 Bridgestone Автошина R 20 275 / 50 Bridgestone Blizzak DM-V2 113 R XL зима 12067'

In [12]:
# Show every column of the row that offer_1 was taken from
# (presumably to compare the labelled values with the model output).
df.iloc[1]

PRICE_NAME      12067 Bridgestone Автошина R20 275/50 Bridgest...
Ширина                                                        275
Высота                                                         50
Радиус                                                         20
Инд. Скорост                                                 113R
XL                                                             XL
RFT                                                           NaN
Линейка                                             Blizzak DM-V2
ppl                                                   Bridgestone
Name: 1, dtype: object

In [None]:
[
 {'entity_group': 'brand',
  'score': 0.9554528,
  'word': 'bridges',
  'start': None,
  'end': None},
 {'entity_group': 'brand',
  'score': 0.9067478,
  'word': '##tone',
  'start': None,
  'end': None},
 {'entity_group': 'brand',
  'score': 0.71723557,
  'word': '##и',
  'start': None,
  'end': None},
 {'entity_group': 'radius',
  'score': 0.9732476,
  'word': '20',
  'start': None,
  'end': None},
 {'entity_group': 'width',
  'score': 0.91325927,
  'word': '275',
  'start': None,
  'end': None},
 {'entity_group': 'height',
  'score': 0.9093071,
  'word': '50',
  'start': None,
  'end': None},
 {'entity_group': 'brand',
  'score': 0.55411714,
  'word': 'bridges',
  'start': None,
  'end': None}]

In [14]:
# NOTE(review): `inference` is also defined in an earlier cell; this
# redefinition shadows it. Consider keeping a single definition.
def inference(offer, model, tokenizer):
    """Run the token-classification pipeline on a single offer string.

    The pipeline is created on the device the model currently lives on.
    Calling ``model.to("cpu")`` here would move the caller's model in place,
    leaving the shared notebook model on the CPU for every later cell.

    Args:
        offer: Text to tag.
        model: Token-classification model; must expose ``parameters()`` whose
            items carry a ``device`` (as a ``torch.nn.Module`` does).
        tokenizer: Tokenizer to pair with ``model``.

    Returns:
        The pipeline's result for ``offer``. In the recorded outputs this is a
        list of dicts with ``entity_group``, ``score``, ``word``, ``start`` and
        ``end`` keys.
    """
    # Read the device from the weights instead of relocating the model.
    model_device = next(model.parameters()).device
    pipe = pipeline(
        task="token-classification",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
        device=model_device,
    )
    return pipe(offer)

# Tag the normalised offer string. This rebinds `result`, replacing the
# value produced from the raw offer in the earlier cell.
result = inference(offer_1_processed, model, tokenizer)
result

[{'entity_group': 'line',
  'score': 0.6651023,
  'word': '120',
  'start': None,
  'end': None},
 {'entity_group': 'line',
  'score': 0.68972594,
  'word': '##6',
  'start': None,
  'end': None},
 {'entity_group': 'brand',
  'score': 0.5085858,
  'word': '##7',
  'start': None,
  'end': None},
 {'entity_group': 'brand',
  'score': 0.9554528,
  'word': 'bridges',
  'start': None,
  'end': None},
 {'entity_group': 'brand',
  'score': 0.9067478,
  'word': '##tone',
  'start': None,
  'end': None},
 {'entity_group': 'brand',
  'score': 0.71723557,
  'word': '##и',
  'start': None,
  'end': None},
 {'entity_group': 'radius',
  'score': 0.9732476,
  'word': '20',
  'start': None,
  'end': None},
 {'entity_group': 'width',
  'score': 0.91325927,
  'word': '275',
  'start': None,
  'end': None},
 {'entity_group': 'height',
  'score': 0.9093071,
  'word': '50',
  'start': None,
  'end': None},
 {'entity_group': 'brand',
  'score': 0.55411714,
  'word': 'bridges',
  'start': None,
  'end': None