In [1]:
import os
 
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR

import sys
sys.path.append('../')

In [3]:
from src.bert.utils.data_constructor import CompanyDataset
from src.bert.utils.criteriation import LabelSmoothingCrossEntropy
from src.bert.utils.bert_clf_trainer import BertTrainClf

In [4]:
MODEL_INIT = 'DeepPavlov/bert-base-cased-conversational'
tokenizer = AutoTokenizer.from_pretrained(MODEL_INIT)

In [5]:
path_data = '../data/preprocess_train.csv'
train_dataset = CompanyDataset(path_data, tokenizer) 
val_dataset = CompanyDataset(path_data, tokenizer, train=False) 

In [6]:
trainDataLoader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=32
)

valDataLoader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=32
)

next(iter(trainDataLoader))



{'input_ids': tensor([[[  101,   100,   111,  ...,     0,     0,     0]],
 
         [[  101,  1611,   228,  ...,     0,     0,     0]],
 
         [[  101,   180, 14088,  ...,     0,     0,     0]],
 
         ...,
 
         [[  101,  2861,  9915,  ...,     0,     0,     0]],
 
         [[  101,   846,   205,  ...,     0,     0,     0]],
 
         [[  101,  5787,  8376,  ...,     0,     0,     0]]]),
 'attention_mask': tensor([[[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         ...,
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]]]),
 'label': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0])}

In [7]:
len(trainDataLoader), len(valDataLoader)

(14779, 778)

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_INIT)
for name, params in model.named_parameters():
    print(name, params.requires_grad)

Some weights of the model checkpoint at DeepPavlov/bert-base-cased-conversational were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassifi

bert.embeddings.word_embeddings.weight True
bert.embeddings.position_embeddings.weight True
bert.embeddings.token_type_embeddings.weight True
bert.embeddings.LayerNorm.weight True
bert.embeddings.LayerNorm.bias True
bert.encoder.layer.0.attention.self.query.weight True
bert.encoder.layer.0.attention.self.query.bias True
bert.encoder.layer.0.attention.self.key.weight True
bert.encoder.layer.0.attention.self.key.bias True
bert.encoder.layer.0.attention.self.value.weight True
bert.encoder.layer.0.attention.self.value.bias True
bert.encoder.layer.0.attention.output.dense.weight True
bert.encoder.layer.0.attention.output.dense.bias True
bert.encoder.layer.0.attention.output.LayerNorm.weight True
bert.encoder.layer.0.attention.output.LayerNorm.bias True
bert.encoder.layer.0.intermediate.dense.weight True
bert.encoder.layer.0.intermediate.dense.bias True
bert.encoder.layer.0.output.dense.weight True
bert.encoder.layer.0.output.dense.bias True
bert.encoder.layer.0.output.LayerNorm.weight True


In [9]:
for params in model.bert.parameters():
    params.requires_grad = False

for params in model.bert.encoder.layer[11].parameters():
    params.requires_grad = True

for params in model.bert.pooler.parameters():
    params.requires_grad = True

for params in model.classifier.parameters():
    params.requires_grad = True

for name, params in model.named_parameters():
    print(name, params.requires_grad)

bert.embeddings.word_embeddings.weight False
bert.embeddings.position_embeddings.weight False
bert.embeddings.token_type_embeddings.weight False
bert.embeddings.LayerNorm.weight False
bert.embeddings.LayerNorm.bias False
bert.encoder.layer.0.attention.self.query.weight False
bert.encoder.layer.0.attention.self.query.bias False
bert.encoder.layer.0.attention.self.key.weight False
bert.encoder.layer.0.attention.self.key.bias False
bert.encoder.layer.0.attention.self.value.weight False
bert.encoder.layer.0.attention.self.value.bias False
bert.encoder.layer.0.attention.output.dense.weight False
bert.encoder.layer.0.attention.output.dense.bias False
bert.encoder.layer.0.attention.output.LayerNorm.weight False
bert.encoder.layer.0.attention.output.LayerNorm.bias False
bert.encoder.layer.0.intermediate.dense.weight False
bert.encoder.layer.0.intermediate.dense.bias False
bert.encoder.layer.0.output.dense.weight False
bert.encoder.layer.0.output.dense.bias False
bert.encoder.layer.0.output.Lay

In [10]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [11]:
DEVICE = 'cuda:0'
NUM_EPOCHS = 5
LR = 3e-5

In [12]:
optimizer = AdamW(model.parameters(), lr=LR)
scheduler = OneCycleLR(
    optimizer=optimizer, 
    max_lr=LR, 
    steps_per_epoch=len(trainDataLoader), 
    epochs=NUM_EPOCHS, 
    pct_start=0.1, 
    anneal_strategy='cos'
)
criteriation = LabelSmoothingCrossEntropy()

In [13]:
trainer = BertTrainClf(
    model=model, 
    trainDataloader=trainDataLoader, 
    valDataloader=valDataLoader, 
    criteriation=criteriation,
    optimizer=optimizer,
    scheduler=scheduler, 
    device=DEVICE, 
    model_name='BertNameCompany_v1'
)

results = trainer(num_epochs=NUM_EPOCHS)


EPOCH 1 of 5
Training


Loss: 0.1986: 100%|██████████████████████████████████████████████| 14779/14779 [12:38<00:00, 19.48it/s]


Validating


Loss: 0.1990: 100%|██████████████████████████████████████████████████| 778/778 [00:35<00:00, 21.79it/s]


f1_macro_train: 0.770
f1_macro_val: 0.964

EPOCH 2 of 5
Training


Loss: 0.1985: 100%|██████████████████████████████████████████████| 14779/14779 [12:42<00:00, 19.38it/s]


Validating


Loss: 0.1986: 100%|██████████████████████████████████████████████████| 778/778 [00:35<00:00, 21.72it/s]


f1_macro_train: 0.966
f1_macro_val: 0.974
Save best model.

EPOCH 3 of 5
Training


Loss: 0.1985: 100%|██████████████████████████████████████████████| 14779/14779 [12:21<00:00, 19.94it/s]


Validating


Loss: 0.1985: 100%|██████████████████████████████████████████████████| 778/778 [00:35<00:00, 22.05it/s]


f1_macro_train: 0.982
f1_macro_val: 0.975
Save best model.

EPOCH 4 of 5
Training


Loss: 0.1985: 100%|██████████████████████████████████████████████| 14779/14779 [12:25<00:00, 19.81it/s]


Validating


Loss: 0.1986: 100%|██████████████████████████████████████████████████| 778/778 [00:35<00:00, 22.14it/s]


f1_macro_train: 0.989
f1_macro_val: 0.976

EPOCH 5 of 5
Training


Loss: 0.1985: 100%|██████████████████████████████████████████████| 14779/14779 [12:25<00:00, 19.81it/s]


Validating


Loss: 0.1985: 100%|██████████████████████████████████████████████████| 778/778 [00:35<00:00, 21.95it/s]


f1_macro_train: 0.992
f1_macro_val: 0.973


In [14]:
results

{'train_loss_history': [0.21360856620805965,
  0.2005032328878061,
  0.19963990467872694,
  0.19922910446795278,
  0.19904274056523874],
 'val_loss_history': [0.20080078506224566,
  0.2001806767058863,
  0.20002140763639484,
  0.2001537072896038,
  0.20013524879220213],
 'train_f1_history': [0.7695348772693069,
  0.965634098781363,
  0.9817127506209287,
  0.9892434998683126,
  0.992430197641661],
 'val_f1_history': [0.9636258481378064,
  0.974394188156579,
  0.9745370107258291,
  0.9761511350736807,
  0.9734919845495044]}

In [15]:
import json
with open("log_bert_train.json", "w") as outfile:
    json.dump(results, outfile)

### Evaluate

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, classification_report, precision_recall_curve
import torch
from transformers import AutoTokenizer
from tqdm import tqdm

import sys
sys.path.append('../')

In [2]:
from src.bert.utils.inference_bert import BertPipeline

In [3]:
MODEL_INIT = 'DeepPavlov/bert-base-cased-conversational'
tokenizer = AutoTokenizer.from_pretrained(MODEL_INIT)
device='cuda:4'
pipeline_1 = BertPipeline('../weights/BertNameCompany_v1_best.pth', tokenizer, device)
pipeline_2 = BertPipeline('../weights/BertNameCompany_v1_last.pth', tokenizer, device)

In [4]:
df = pd.read_csv('../data/preprocess_train.csv')
_, df_val = train_test_split(
    df, train_size=0.95, stratify=df['is_duplicate'], random_state=17
)

In [5]:
pred_best, pred_last = [], []
for idx in tqdm(range(df_val.shape[0])):
    cmp_1, cmp_2 = df_val['name_1'].iloc[idx], df_val['name_2'].iloc[idx]
    res_1 = pipeline_1(cmp_1, cmp_2)
    res_2 = pipeline_2(cmp_1, cmp_2)
    pred_best.append(res_1)
    pred_last.append(res_2)

100%|████████████████████████████████████████████████████████████| 24891/24891 [06:34<00:00, 63.03it/s]


In [6]:
df_val['pred_best'] = pred_best
df_val['pred_last'] = pred_last

In [7]:
f1_score(df_val['is_duplicate'].tolist(), df_val['pred_best'].tolist(), average='macro')

0.9745370107258291

In [8]:
f1_score(df_val['is_duplicate'].tolist(), df_val['pred_last'].tolist(), average='macro')

0.9734919845495044

In [11]:
classification_report(df_val['is_duplicate'], df_val['pred_best'])

'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00     24708\n           1       0.98      0.92      0.95       183\n\n    accuracy                           1.00     24891\n   macro avg       0.99      0.96      0.97     24891\nweighted avg       1.00      1.00      1.00     24891\n'

In [10]:
classification_report(df_val['is_duplicate'].tolist(), df_val['pred_last'].tolist())

'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00     24708\n           1       0.96      0.93      0.95       183\n\n    accuracy                           1.00     24891\n   macro avg       0.98      0.97      0.97     24891\nweighted avg       1.00      1.00      1.00     24891\n'

In [14]:
confusion_matrix(df_val['is_duplicate'], df_val['pred_best'])

array([[24704,     4],
       [   14,   169]])

In [15]:
confusion_matrix(df_val['is_duplicate'], df_val['pred_last'])

array([[24701,     7],
       [   12,   171]])

**Лучше взять модель с префиксом best**
 - для данной постановки задачи лучше найти подходяющую компанию, которая точно является подходящей
 - выше presicion для label 1 = 0.98 (на 0.02)

In [3]:
MODEL_INIT = 'DeepPavlov/bert-base-cased-conversational'
tokenizer = AutoTokenizer.from_pretrained(MODEL_INIT)
device='cuda:4'

pipeline_1 = BertPipeline('../weights/BertNameCompany_v1_best.pth', tokenizer, device, debug=True)

df = pd.read_csv('../data/preprocess_train.csv')
_, df_val = train_test_split(
    df, train_size=0.95, stratify=df['is_duplicate'], random_state=17
)

In [6]:
pred = []
for idx in tqdm(range(df_val.shape[0])):
    cmp_1, cmp_2 = df_val['name_1'].iloc[idx], df_val['name_2'].iloc[idx]
    res_1 = pipeline_1(cmp_1, cmp_2)
    pred.append(res_1)

100%|███████████████████████████████████████████████████████████| 24891/24891 [03:12<00:00, 129.00it/s]


In [7]:
precision, recall, thresholds = precision_recall_curve(df_val['is_duplicate'], pred)
precision, recall, thresholds

(array([0.01685393, 0.01676338, 0.01676492, ..., 1.        , 1.        ,
        1.        ]),
 array([1.        , 0.99453552, 0.99453552, ..., 0.01092896, 0.00546448,
        0.        ]),
 array([0.05088564, 0.05088566, 0.05088569, ..., 0.96189672, 0.96424681,
        0.9728694 ]))

In [26]:
import numpy as np
np.where(precision > 0.99)

(array([ 9962,  9963,  9964,  9965,  9966,  9967,  9968,  9969,  9970,
         9971,  9972,  9973,  9974,  9975,  9976,  9977,  9978,  9979,
         9980,  9981,  9982,  9983,  9984,  9985,  9986,  9987,  9988,
         9989,  9990,  9991,  9992,  9993,  9994,  9995,  9996,  9997,
         9998,  9999, 10000, 10001, 10002, 10003, 10004, 10005, 10006,
        10007, 10008, 10009, 10010, 10011, 10012, 10013, 10014, 10015,
        10016, 10017, 10018, 10019, 10020, 10021, 10022, 10023, 10024,
        10025, 10026, 10027, 10028, 10029, 10101, 10102, 10103, 10104,
        10105, 10106, 10107, 10108, 10109, 10110, 10111, 10112, 10113,
        10114, 10115, 10116, 10117, 10118, 10119, 10120, 10121, 10122,
        10123, 10124, 10125, 10126, 10127, 10128, 10129, 10130]),)

In [27]:
thresholds[9962]

0.824992835521698

In [28]:
recall[9962]

0.912568306010929

In [29]:
pred = []
for idx in tqdm(range(df_val.shape[0])):
    cmp_1, cmp_2 = df_val['name_1'].iloc[idx], df_val['name_2'].iloc[idx]
    res_1 = pipeline_1(cmp_1, cmp_2)
    if res_1 > thresholds[9962]:
        pred.append(1)
    else:
        pred.append(0)

100%|███████████████████████████████████████████████████████████| 24891/24891 [03:12<00:00, 129.08it/s]


In [30]:
f1_score(df_val['is_duplicate'].tolist(), pred, average='macro')

0.9741036459898735