In [1]:
import os
import sys
from dotenv import load_dotenv

sys.path.append('../')
load_dotenv("../.env")

True

In [2]:
%%capture

import datetime

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR

from src.utils.data_constructor import CompanyDatasetBertClf
from src.bert.utils.criteriation import LabelSmoothingCrossEntropy
from src.bert.utils.bert_clf_trainer import BertTrainClf

In [3]:
import mlflow

remote_server_uri = os.getenv("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(remote_server_uri)
mlflow.set_experiment("company-name-matcher")

<Experiment: artifact_location='s3://arts/2', creation_time=1666734835488, experiment_id='2', last_update_time=1666734835488, lifecycle_stage='active', name='company-name-matcher', tags={}>

In [4]:
%%capture

mlflow.start_run()

In [5]:
MODEL_INIT = 'DeepPavlov/bert-base-cased-conversational'
MODEL_NAME = 'bert_name_company_v1'
BATCH_SIZE = 32
DEVICE = 'cuda:0'
NUM_EPOCHS = 5
LR = 3e-5
OPTIMIZER = 'AdamW'
SAVE_DIR = f'../weights/{MODEL_NAME}-{datetime.datetime.now().strftime("%m-%d-%Y-%H-%M-%S")}'

path_data = '../data/preprocess_train.csv'

In [6]:
import os
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)

In [7]:
params ={
    "tokenizer": MODEL_INIT,
    "batch_size":BATCH_SIZE,
    "num_epochs": NUM_EPOCHS,
    "lr": LR,
    "optimizer": OPTIMIZER
}
mlflow.log_params(params)

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_INIT)
train_dataset = CompanyDatasetBertClf(path_data, tokenizer) 
val_dataset = CompanyDatasetBertClf(path_data, tokenizer, train=False) 

In [9]:
trainDataLoader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=BATCH_SIZE
)

valDataLoader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=BATCH_SIZE
)

In [10]:
len(trainDataLoader), len(valDataLoader)

(60, 4)

In [11]:
%%capture
model = AutoModelForSequenceClassification.from_pretrained(MODEL_INIT)

In [12]:
%%capture

for params in model.bert.parameters():
    params.requires_grad = False

for params in model.bert.encoder.layer[11].parameters():
    params.requires_grad = True

for params in model.bert.pooler.parameters():
    params.requires_grad = True

for params in model.classifier.parameters():
    params.requires_grad = True

for name, params in model.named_parameters():
    print(name, params.requires_grad)

In [13]:
if OPTIMIZER == "AdamW":
    optimizer = AdamW(model.parameters(), lr=LR)

scheduler = OneCycleLR(
    optimizer=optimizer, 
    max_lr=LR, 
    steps_per_epoch=len(trainDataLoader), 
    epochs=NUM_EPOCHS, 
    pct_start=0.1, 
    anneal_strategy='cos'
)
criteriation = LabelSmoothingCrossEntropy()

In [14]:
trainer = BertTrainClf(
    model=model, 
    trainDataloader=trainDataLoader, 
    valDataloader=valDataLoader, 
    criteriation=criteriation,
    optimizer=optimizer,
    scheduler=scheduler, 
    device=DEVICE, 
    model_name=MODEL_NAME,
    save_dir=SAVE_DIR
)

results = trainer(num_epochs=NUM_EPOCHS)


EPOCH 1 of 5
Training


Loss: 0.2005: 100%|██████████| 60/60 [00:16<00:00,  3.66it/s]


Validating


Loss: 0.1997: 100%|██████████| 4/4 [00:00<00:00,  4.97it/s]


f1_macro_train: 0.499
f1_macro_val: 1.000

EPOCH 2 of 5
Training


Loss: 0.1995: 100%|██████████| 60/60 [00:16<00:00,  3.58it/s]


Validating


Loss: 0.1990: 100%|██████████| 4/4 [00:00<00:00,  4.99it/s]


f1_macro_train: 0.499
f1_macro_val: 1.000
Save best model.

EPOCH 3 of 5
Training


Loss: 0.1996: 100%|██████████| 60/60 [00:16<00:00,  3.55it/s]


Validating


Loss: 0.1990: 100%|██████████| 4/4 [00:00<00:00,  4.98it/s]


f1_macro_train: 0.499
f1_macro_val: 1.000
Save best model.

EPOCH 4 of 5
Training


Loss: 0.1999: 100%|██████████| 60/60 [00:16<00:00,  3.58it/s]


Validating


Loss: 0.1989: 100%|██████████| 4/4 [00:00<00:00,  5.01it/s]


f1_macro_train: 0.499
f1_macro_val: 1.000
Save best model.

EPOCH 5 of 5
Training


Loss: 0.1995: 100%|██████████| 60/60 [00:16<00:00,  3.60it/s]


Validating


Loss: 0.1988: 100%|██████████| 4/4 [00:00<00:00,  4.99it/s]


f1_macro_train: 0.499
f1_macro_val: 1.000
Save best model.


In [15]:
log = dict()
for i in range(NUM_EPOCHS):
    for keys in results:
        log[keys] = results[keys][i]
    mlflow.log_metrics(log)

In [16]:
import json
with open(f"{SAVE_DIR}/log.json", "w") as outfile:
    json.dump(results, outfile)

### Evaluate

In [30]:
from tqdm import tqdm

import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, classification_report, precision_recall_curve
from transformers import AutoTokenizer

import sys
sys.path.append('../')

In [24]:
from src.bert.bert_inference import BertPipeline

In [17]:
MODEL_INIT = 'DeepPavlov/bert-base-cased-conversational'
DEVICE='cuda:0'


'../weights/bert_name_company_v1-10-26-2022-05-17-54'

In [28]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_INIT)
pipeline = BertPipeline(f'{SAVE_DIR}/best.pth', tokenizer, DEVICE)

In [31]:
df = pd.read_csv('../data/preprocess_train.csv')
_, df_val = train_test_split(
    df, train_size=0.95, stratify=df['is_duplicate'], random_state=17
)

In [5]:
pred_best, pred_last = [], []
for idx in tqdm(range(df_val.shape[0])):
    cmp_1, cmp_2 = df_val['name_1'].iloc[idx], df_val['name_2'].iloc[idx]
    res_1 = pipeline_1(cmp_1, cmp_2)
    res_2 = pipeline_2(cmp_1, cmp_2)
    pred_best.append(res_1)
    pred_last.append(res_2)

100%|████████████████████████████████████████████████████████████| 24891/24891 [06:34<00:00, 63.03it/s]


In [6]:
df_val['pred_best'] = pred_best
df_val['pred_last'] = pred_last

In [7]:
f1_score(df_val['is_duplicate'].tolist(), df_val['pred_best'].tolist(), average='macro')

0.9745370107258291

In [8]:
f1_score(df_val['is_duplicate'].tolist(), df_val['pred_last'].tolist(), average='macro')

0.9734919845495044

In [11]:
classification_report(df_val['is_duplicate'], df_val['pred_best'])

'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00     24708\n           1       0.98      0.92      0.95       183\n\n    accuracy                           1.00     24891\n   macro avg       0.99      0.96      0.97     24891\nweighted avg       1.00      1.00      1.00     24891\n'

In [10]:
classification_report(df_val['is_duplicate'].tolist(), df_val['pred_last'].tolist())

'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00     24708\n           1       0.96      0.93      0.95       183\n\n    accuracy                           1.00     24891\n   macro avg       0.98      0.97      0.97     24891\nweighted avg       1.00      1.00      1.00     24891\n'

In [14]:
confusion_matrix(df_val['is_duplicate'], df_val['pred_best'])

array([[24704,     4],
       [   14,   169]])

In [15]:
confusion_matrix(df_val['is_duplicate'], df_val['pred_last'])

array([[24701,     7],
       [   12,   171]])

**Лучше взять модель с префиксом best**
 - для данной постановки задачи лучше найти подходяющую компанию, которая точно является подходящей
 - выше presicion для label 1 = 0.98 (на 0.02)

In [33]:
MODEL_INIT = 'DeepPavlov/bert-base-cased-conversational'
tokenizer = AutoTokenizer.from_pretrained(MODEL_INIT)
device='cpu'
pipeline_1 = BertPipeline('../weights/BertNameCompany_v1_best.pth', tokenizer, device, debug=True)

df = pd.read_csv('../data/preprocess_train.csv')
_, df_val = train_test_split(
    df, train_size=0.95, stratify=df['is_duplicate'], random_state=17
)

In [6]:
pred = []
for idx in tqdm(range(df_val.shape[0])):
    cmp_1, cmp_2 = df_val['name_1'].iloc[idx], df_val['name_2'].iloc[idx]
    res_1 = pipeline_1(cmp_1, cmp_2)
    pred.append(res_1)

100%|███████████████████████████████████████████████████████████| 24891/24891 [03:12<00:00, 129.00it/s]


In [7]:
precision, recall, thresholds = precision_recall_curve(df_val['is_duplicate'], pred)
precision, recall, thresholds

(array([0.01685393, 0.01676338, 0.01676492, ..., 1.        , 1.        ,
        1.        ]),
 array([1.        , 0.99453552, 0.99453552, ..., 0.01092896, 0.00546448,
        0.        ]),
 array([0.05088564, 0.05088566, 0.05088569, ..., 0.96189672, 0.96424681,
        0.9728694 ]))

In [26]:
import numpy as np
np.where(precision > 0.99)

(array([ 9962,  9963,  9964,  9965,  9966,  9967,  9968,  9969,  9970,
         9971,  9972,  9973,  9974,  9975,  9976,  9977,  9978,  9979,
         9980,  9981,  9982,  9983,  9984,  9985,  9986,  9987,  9988,
         9989,  9990,  9991,  9992,  9993,  9994,  9995,  9996,  9997,
         9998,  9999, 10000, 10001, 10002, 10003, 10004, 10005, 10006,
        10007, 10008, 10009, 10010, 10011, 10012, 10013, 10014, 10015,
        10016, 10017, 10018, 10019, 10020, 10021, 10022, 10023, 10024,
        10025, 10026, 10027, 10028, 10029, 10101, 10102, 10103, 10104,
        10105, 10106, 10107, 10108, 10109, 10110, 10111, 10112, 10113,
        10114, 10115, 10116, 10117, 10118, 10119, 10120, 10121, 10122,
        10123, 10124, 10125, 10126, 10127, 10128, 10129, 10130]),)

In [27]:
thresholds[9962]

0.824992835521698

In [28]:
recall[9962]

0.912568306010929

In [34]:
pred = []
for idx in tqdm(range(df_val.shape[0])):
    cmp_1, cmp_2 = df_val['name_1'].iloc[idx], df_val['name_2'].iloc[idx]
    res_1 = pipeline_1(cmp_1, cmp_2)
    if res_1 > 0.912568306010929:
        pred.append(1)
    else:
        pred.append(0)
    break

  0%|          | 0/24891 [00:00<?, ?it/s]


In [30]:
f1_score(df_val['is_duplicate'].tolist(), pred, average='macro')

0.9741036459898735

In [15]:
from dotenv import load_dotenv
import mlflow
import pandas as pd
from mlflow.models.signature import infer_signature

In [18]:
remote_server_uri = os.getenv("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(remote_server_uri)
mlflow.set_experiment("company-name-matcher")

<Experiment: artifact_location='s3://arts/2', creation_time=1666734835488, experiment_id='2', last_update_time=1666734835488, lifecycle_stage='active', name='company-name-matcher', tags={}>

In [19]:
params = {
    "tokenizer":'DeepPavlov/bert-base-cased-conversational',
    "batch_size":32,
    "num_epochs":5,
    "lr":3e-5,
}

In [20]:
result = {'train_loss_history': [0.21360856620805965,
                                 0.2005032328878061,
                                 0.19963990467872694,
                                 0.19922910446795278,
                                 0.19904274056523874],
          'val_loss_history': [0.20080078506224566,
                               0.2001806767058863,
                               0.20002140763639484,
                               0.2001537072896038,
                               0.20013524879220213],
          'train_f1_history': [0.7695348772693069,
                               0.965634098781363,
                               0.9817127506209287,
                               0.9892434998683126,
                               0.992430197641661],
          'val_f1_history':   [0.9636258481378064,
                               0.974394188156579,
                               0.9745370107258291,
                               0.9761511350736807,
                               0.9734919845495044]}

In [37]:
log = dict()
for i in range(5):
    for keys in result:
        log[keys] = result[keys][i]
    mlflow.log_metrics(log)

In [35]:
signature = infer_signature(pd.DataFrame(['[CLS] ' + cmp_1 + ' [SEP] ' + cmp_2 + ' [SEP]']), pd.DataFrame([res_1]))

In [36]:
mlflow.pytorch.log_model(pipeline_1.model, "bert", signature=signature)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




KeyboardInterrupt: 

In [39]:
mlflow.end_run()