<a href="https://colab.research.google.com/github/MariaGorelik/NER_furniture_stores/blob/main/NER_furniture_stores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [None]:
pip install transformers datasets



In [73]:
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os
from google.colab import drive

data = {
    'tokens': [[]],
    'labels': [[]]  # 0 = O, 1 = B-Product, 2 = I-Product
}

with open('all_annotation.txt', 'r', encoding='utf-8') as bio_file:
    for line in bio_file:
        words = line.split()
        data['tokens'][0].append(words[0])
        if words[1] == 'O':
            data['labels'][0].append(0)
        elif words[1] == 'B-Product':
            data['labels'][0].append(1)
        elif words[1] == 'I-Product':
            data['labels'][0].append(2)

train_data, test_data = train_test_split(data['tokens'][0], test_size=0.2, random_state=42)

dataset = DatasetDict({
    "train": Dataset.from_dict({'tokens': [train_data], 'labels': [data['labels'][0][:len(train_data)]]}),
    "test": Dataset.from_dict({'tokens': [test_data], 'labels': [data['labels'][0][len(train_data):]]})
})

model_name = "bert-base-cased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name, num_labels=3)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []

    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = [-100 if word_id is None else label[word_id] for word_id in word_ids]
        labels.append(aligned_labels)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

data_collator = DataCollatorForTokenClassification(tokenizer)

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Убираем -100 из лейблов (их игнорируем)
    true_labels = []
    true_predictions = []

    for pred_seq, label_seq in zip(predictions, labels):
        current_labels = []
        current_preds = []
        for pred, label in zip(pred_seq, label_seq):
            if label != -100:  # Убираем специальные токены
                current_labels.append(label)
                current_preds.append(pred)
        true_labels.append(current_labels)
        true_predictions.append(current_preds)

    precision, recall, f1, _ = precision_recall_fscore_support(
        [item for sublist in true_labels for item in sublist],
        [item for sublist in true_predictions for item in sublist],
        average='macro'
    )
    accuracy = accuracy_score(
        [item for sublist in true_labels for item in sublist],
        [item for sublist in true_predictions for item in sublist]
    )

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }



training_args = TrainingArguments(
    output_dir="./results",                     # Директория для сохранения модели
    evaluation_strategy="epoch",                # Оценка на каждой эпохе
    save_strategy="epoch",                      # Сохранение модели после каждой эпохе
    save_total_limit=1,                         # Сохранение только последней лучшей модели
    load_best_model_at_end=True,                # Загружать лучшую модель после завершения
    per_device_train_batch_size=16,             # Размер батча для обучения
    per_device_eval_batch_size=16,              # Размер батча для оценки
    num_train_epochs=12,                        # Количество эпох
    weight_decay=0.01,                          # Уменьшение весов для регуляризации
    logging_dir='./logs',                       # Директория для логов
    metric_for_best_model="f1",                 # Метрика для выбора лучшей модели
    greater_is_better=True,
    learning_rate=5e-5,                         # Начальная скорость обучения
    lr_scheduler_type="linear",                 # Тип scheduler'а для изменения скорости обучения
    warmup_steps=500,                           # Количество шагов для "разогрева"
)


from transformers import EarlyStoppingCallback

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Останавливаем обучение после 3 эпох без улучшений
)

trainer.train()

eval_results = trainer.evaluate()
print(f"Test results: {eval_results}")

model_save_path = '/content/drive/MyDrive/bert_best_model'

if not os.path.exists(model_save_path):
    os.makedirs(model_save_path)

trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Модель сохранена в {model_save_path}")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.062837,0.345022,0.302853,0.298106,0.382353
2,No log,1.06209,0.343256,0.300522,0.296303,0.380392
3,No log,1.06061,0.342221,0.299171,0.29608,0.382353
4,No log,1.058418,0.344254,0.301008,0.299514,0.392157
5,No log,1.055533,0.344906,0.301988,0.300904,0.396078
6,No log,1.051985,0.33527,0.297717,0.294685,0.4
7,No log,1.047803,0.336404,0.299554,0.297226,0.409804
8,No log,1.043025,0.326445,0.307671,0.301674,0.431373
9,No log,1.037697,0.334272,0.312819,0.306701,0.447059
10,No log,1.031876,0.35506,0.326674,0.318828,0.480392


Test results: {'eval_loss': 1.0256397724151611, 'eval_precision': 0.3645392193816841, 'eval_recall': 0.33157504746908717, 'eval_f1': 0.3212029401961137, 'eval_accuracy': 0.5, 'eval_runtime': 0.061, 'eval_samples_per_second': 16.399, 'eval_steps_per_second': 16.399, 'epoch': 12.0}
Модель сохранена в /content/drive/MyDrive/bert_best_model


In [74]:
# Обучение модели еще на 12 эпохах

training_args_new = TrainingArguments(
    output_dir="./new_results",                # Новая директория для сохранения модели
    evaluation_strategy="epoch",                # Оценка на каждой эпохе
    save_strategy="epoch",                      # Сохранение модели после каждой эпохи
    save_total_limit=1,                         # Сохранение только последней лучшей модели
    load_best_model_at_end=True,                # Загружать лучшую модель после завершения
    per_device_train_batch_size=16,             # Размер батча для обучения
    per_device_eval_batch_size=16,              # Размер батча для оценки
    num_train_epochs=12,                        # Количество эпох
    weight_decay=0.01,                          # Уменьшение весов для регуляризации
    logging_dir='./new_logs',                   # Директория для логов
    metric_for_best_model="f1",                 # Метрика для выбора лучшей модели
    greater_is_better=True,
    learning_rate=5e-5,                         # Начальная скорость обучения
    lr_scheduler_type="linear",                  # Тип scheduler'а для изменения скорости обучения
    warmup_steps=500,                           # Количество шагов для "разогрева"
)

trainer_new = Trainer(
    model=model,                                # Используем уже обученную модель
    args=training_args_new,
    train_dataset=tokenized_dataset["train"],  # Тренировочный набор
    eval_dataset=tokenized_dataset["test"],    # Тестовый набор
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Останавливаем обучение после 3 эпох без улучшений
)

trainer_new.train()

eval_results_new = trainer_new.evaluate()
print(f"New test results: {eval_results_new}")

new_model_save_path = '/content/drive/MyDrive/bert_best_model_v2'

if not os.path.exists(new_model_save_path):
    os.makedirs(new_model_save_path)

trainer_new.save_model(new_model_save_path)
tokenizer.save_pretrained(new_model_save_path)

print(f"Новая модель сохранена в {new_model_save_path}")




Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.02564,0.364539,0.331575,0.321203,0.5
2,No log,1.025023,0.36727,0.333783,0.323109,0.503922
3,No log,1.023804,0.36692,0.333659,0.322429,0.505882
4,No log,1.022002,0.369256,0.335743,0.323599,0.511765
5,No log,1.019639,0.353218,0.331719,0.315726,0.511765
6,No log,1.016745,0.364897,0.336743,0.316742,0.529412
7,No log,1.013356,0.369978,0.334289,0.312664,0.529412


New test results: {'eval_loss': 1.0220024585723877, 'eval_precision': 0.369256193043255, 'eval_recall': 0.33574306488213773, 'eval_f1': 0.32359872264736683, 'eval_accuracy': 0.5117647058823529, 'eval_runtime': 0.0585, 'eval_samples_per_second': 17.087, 'eval_steps_per_second': 17.087, 'epoch': 7.0}
Новая модель сохранена в /content/drive/MyDrive/bert_best_model_v2


In [75]:
# Обучение модели еще на 12 эпохах

training_args_new = TrainingArguments(
    output_dir="./new_results",                # Новая директория для сохранения модели
    evaluation_strategy="epoch",                # Оценка на каждой эпохе
    save_strategy="epoch",                      # Сохранение модели после каждой эпохи
    save_total_limit=1,                         # Сохранение только последней лучшей модели
    load_best_model_at_end=True,                # Загружать лучшую модель после завершения
    per_device_train_batch_size=16,             # Размер батча для обучения
    per_device_eval_batch_size=16,              # Размер батча для оценки
    num_train_epochs=12,                        # Количество эпох
    weight_decay=0.01,                          # Уменьшение весов для регуляризации
    logging_dir='./new_logs',                   # Директория для логов
    metric_for_best_model="f1",                 # Метрика для выбора лучшей модели
    greater_is_better=True,
    learning_rate=5e-5,                         # Начальная скорость обучения
    lr_scheduler_type="linear",                  # Тип scheduler'а для изменения скорости обучения
    warmup_steps=500,                           # Количество шагов для "разогрева"
)

trainer_new = Trainer(
    model=model,                                # Используем уже обученную модель
    args=training_args_new,
    train_dataset=tokenized_dataset["train"],  # Тренировочный набор
    eval_dataset=tokenized_dataset["test"],    # Тестовый набор
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Останавливаем обучение после 3 эпох без улучшений
)

trainer_new.train()

eval_results_new = trainer_new.evaluate()
print(f"New test results: {eval_results_new}")

new_model_save_path = '/content/drive/MyDrive/bert_best_model_v3'

if not os.path.exists(new_model_save_path):
    os.makedirs(new_model_save_path)

trainer_new.save_model(new_model_save_path)
tokenizer.save_pretrained(new_model_save_path)

print(f"Новая модель сохранена в {new_model_save_path}")




Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.022002,0.369256,0.335743,0.323599,0.511765
2,No log,1.021401,0.369629,0.335743,0.323595,0.511765
3,No log,1.020211,0.353218,0.331719,0.315726,0.511765
4,No log,1.018454,0.354779,0.333556,0.315013,0.521569


New test results: {'eval_loss': 1.0220024585723877, 'eval_precision': 0.369256193043255, 'eval_recall': 0.33574306488213773, 'eval_f1': 0.32359872264736683, 'eval_accuracy': 0.5117647058823529, 'eval_runtime': 0.0584, 'eval_samples_per_second': 17.114, 'eval_steps_per_second': 17.114, 'epoch': 4.0}
Новая модель сохранена в /content/drive/MyDrive/bert_best_model_v3


In [76]:
# Обучение модели еще на 12 эпохах

training_args_new = TrainingArguments(
    output_dir="./new_results",                # Новая директория для сохранения модели
    evaluation_strategy="epoch",                # Оценка на каждой эпохе
    save_strategy="epoch",                      # Сохранение модели после каждой эпохи
    save_total_limit=1,                         # Сохранение только последней лучшей модели
    load_best_model_at_end=True,                # Загружать лучшую модель после завершения
    per_device_train_batch_size=64,             # Размер батча для обучения
    per_device_eval_batch_size=64,              # Размер батча для оценки
    num_train_epochs=12,                        # Количество эпох
    weight_decay=0.01,                          # Уменьшение весов для регуляризации
    logging_dir='./new_logs',                   # Директория для логов
    metric_for_best_model="f1",                 # Метрика для выбора лучшей модели
    greater_is_better=True,
    learning_rate=5e-5,                         # Начальная скорость обучения
    lr_scheduler_type="linear",                  # Тип scheduler'а для изменения скорости обучения
    warmup_steps=500,                           # Количество шагов для "разогрева"
)

trainer_new = Trainer(
    model=model,                                # Используем уже обученную модель
    args=training_args_new,
    train_dataset=tokenized_dataset["train"],  # Тренировочный набор
    eval_dataset=tokenized_dataset["test"],    # Тестовый набор
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Останавливаем обучение после 3 эпох без улучшений
)

trainer_new.train()

eval_results_new = trainer_new.evaluate()
print(f"New test results: {eval_results_new}")

new_model_save_path = '/content/drive/MyDrive/bert_best_model_v4'

if not os.path.exists(new_model_save_path):
    os.makedirs(new_model_save_path)

trainer_new.save_model(new_model_save_path)
tokenizer.save_pretrained(new_model_save_path)

print(f"Новая модель сохранена в {new_model_save_path}")




Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.022002,0.369256,0.335743,0.323599,0.511765
2,No log,1.021401,0.369629,0.335743,0.323595,0.511765
3,No log,1.020211,0.353218,0.331719,0.315726,0.511765
4,No log,1.018454,0.354779,0.333556,0.315013,0.521569


New test results: {'eval_loss': 1.0220024585723877, 'eval_precision': 0.369256193043255, 'eval_recall': 0.33574306488213773, 'eval_f1': 0.32359872264736683, 'eval_accuracy': 0.5117647058823529, 'eval_runtime': 0.0578, 'eval_samples_per_second': 17.291, 'eval_steps_per_second': 17.291, 'epoch': 4.0}
Новая модель сохранена в /content/drive/MyDrive/bert_best_model_v4


In [79]:
# Обучение модели еще на 12 эпохах

training_args_new = TrainingArguments(
    output_dir="./new_results",                # Новая директория для сохранения модели
    evaluation_strategy="epoch",                # Оценка на каждой эпохе
    save_strategy="epoch",                      # Сохранение модели после каждой эпохи
    save_total_limit=1,                         # Сохранение только последней лучшей модели
    load_best_model_at_end=True,                # Загружать лучшую модель после завершения
    per_device_train_batch_size=16,             # Размер батча для обучения
    per_device_eval_batch_size=16,              # Размер батча для оценки
    num_train_epochs=12,                        # Количество эпох
    weight_decay=0.01,                          # Уменьшение весов для регуляризации
    logging_dir='./new_logs',                   # Директория для логов
    metric_for_best_model="f1",                 # Метрика для выбора лучшей модели
    greater_is_better=True,
    learning_rate=5e-5,                         # Начальная скорость обучения
    lr_scheduler_type="linear",                  # Тип scheduler'а для изменения скорости обучения
    warmup_steps=500,                           # Количество шагов для "разогрева"
)

trainer_new = Trainer(
    model=model,                                # Используем уже обученную модель
    args=training_args_new,
    train_dataset=tokenized_dataset["train"],  # Тренировочный набор
    eval_dataset=tokenized_dataset["test"],    # Тестовый набор
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Останавливаем обучение после 3 эпох без улучшений
)

# Запуск дополнительного обучения
trainer_new.train()

# Оценка на тестовом наборе
eval_results_new = trainer_new.evaluate()
print(f"New test results: {eval_results_new}")

# Путь для сохранения новой модели на Google Диск
new_model_save_path = '/content/drive/MyDrive/bert_best_model_v5'

# Создаем директорию, если она не существует
if not os.path.exists(new_model_save_path):
    os.makedirs(new_model_save_path)

# Сохраняем модель
trainer_new.save_model(new_model_save_path)
tokenizer.save_pretrained(new_model_save_path)

print(f"Новая модель сохранена в {new_model_save_path}")




Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.022002,0.369256,0.335743,0.323599,0.511765
2,No log,1.021401,0.369629,0.335743,0.323595,0.511765
3,No log,1.020211,0.353218,0.331719,0.315726,0.511765
4,No log,1.018454,0.354779,0.333556,0.315013,0.521569


New test results: {'eval_loss': 1.0220024585723877, 'eval_precision': 0.369256193043255, 'eval_recall': 0.33574306488213773, 'eval_f1': 0.32359872264736683, 'eval_accuracy': 0.5117647058823529, 'eval_runtime': 0.0577, 'eval_samples_per_second': 17.339, 'eval_steps_per_second': 17.339, 'epoch': 4.0}
Новая модель сохранена в /content/drive/MyDrive/bert_best_model_v5


In [86]:
def predict_products(text, model, tokenizer):
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)
  model.eval()
  inputs = tokenizer(text.split(), return_tensors="pt", is_split_into_words=True, truncation=True)
  inputs = {key: val.to(device) for key, val in inputs.items()}
  with torch.no_grad():
      outputs = model(**inputs)
  logits = outputs.logits
  predictions = torch.argmax(logits, dim=2)
  id_to_label = {0: "O", 1: "B-Product", 2: "I-Product"}
  predicted_labels = [id_to_label[pred.item()] for pred in predictions[0]]
  result = []
  start_word = False
  product = ''
  for word, label in zip(text.split(), predicted_labels):
      if label == "B-Product" or label == "I-Product":
        if not start_word:
          start_word = True
          product = word
        else:
          product += ' ' + word
      elif start_word:
        start_word = False
        if product != '':
          result.append(product)
          product = ''
  if product != '':
          result.append(product)

  return result

In [93]:
products = predict_products('Amish McKee 6pc Dining Set', model, tokenizer)
for product in products:
  print(product)

Amish McKee 6pc Dining Set


In [89]:
import requests
from bs4 import BeautifulSoup

def scrape_webpage(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            full_text = soup.get_text(separator='\n')
            lines = [line.strip() for line in full_text.split('\n') if line.strip()]
            words = []
            for line in lines:
              for w in line.split():
                words.append(w)
            return ' '.join(words)
        else:
            print(f"Failed to retrieve page: {url}, status code={response.status_code}")
            return ''
    except Exception as e:
        print(f"Error fetching {url}: {str(e)}")
        return ''

In [91]:
products = predict_products(scrape_webpage("https://www.royaloakfurniture.co.uk/products/pop-bench"), model, tokenizer)

for product in products:
  print(product)

Arncliffe Pop
Royal
Company Search
Collapse submenu
Ready
Expand
submenu
Tables
Products
Products
TV
Office
Shades
&
About
Knightsbridge
Pop
Ready
Dining
Benches Living
Tables
Samples
About
Delivery
Cart
cart
softened
x H47cm Also
as a stool. Regular price Sale price £450.00 Unit
per
to
Items
We
you to
details
including
and discuss
All
in
Homewares
are
for any reason
is
delay in
these
out
contact you
you
Share Share
Tweet Tweet
Pin on Pinterest Search
Contact Us Delivery
Balmoral Canterbury Clifford County
Linton Pateley Pop
Twitter Instagram ©
Royal Oak Furniture Company Powered
American
Pay Diners Club
Pay Union Pay
choosing
selection results
a full page
