In [35]:
import os
import sys
from dotenv import load_dotenv

sys.path.append('../')
load_dotenv("../.env")

True

In [36]:
%%capture

import datetime

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR

from src.utils.data_constructor import CompanyDatasetBertClf
from src.bert.utils.criteriation import LabelSmoothingCrossEntropy
from src.bert.utils.bert_clf_trainer import BertTrainClf

In [37]:
import mlflow

remote_server_uri = os.getenv("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(remote_server_uri)
mlflow.set_experiment("company-name-matcher")

<Experiment: artifact_location='s3://arts/2', creation_time=1666734835488, experiment_id='2', last_update_time=1666734835488, lifecycle_stage='active', name='company-name-matcher', tags={}>

In [5]:
MODEL_INIT = 'DeepPavlov/bert-base-cased-conversational'
MODEL_NAME = 'bert'
TIMESTAMP = datetime.datetime.now().strftime("%m-%d-%Y-%H-%M-%S")
BATCH_SIZE = 32
DEVICE = 'cuda:0'
NUM_EPOCHS = 5
LR = 3e-5
OPTIMIZER = 'AdamW'
SAVE_DIR = f'../weights/{MODEL_NAME}-{TIMESTAMP}'

path_data = '../data/preprocess_train.csv'

In [None]:
%%capture

mlflow.start_run()

In [6]:
import os
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)

In [7]:
params ={
    "tokenizer": MODEL_INIT,
    "batch_size":BATCH_SIZE,
    "num_epochs": NUM_EPOCHS,
    "lr": LR,
    "optimizer": OPTIMIZER
}
mlflow.log_params(params)

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_INIT)
train_dataset = CompanyDatasetBertClf(path_data, tokenizer) 
val_dataset = CompanyDatasetBertClf(path_data, tokenizer, train=False) 

In [9]:
trainDataLoader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=BATCH_SIZE
)

valDataLoader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=BATCH_SIZE
)

In [10]:
len(trainDataLoader), len(valDataLoader)

(60, 4)

In [11]:
%%capture
model = AutoModelForSequenceClassification.from_pretrained(MODEL_INIT)

In [12]:
%%capture

for params in model.bert.parameters():
    params.requires_grad = False

for params in model.bert.encoder.layer[11].parameters():
    params.requires_grad = True

for params in model.bert.pooler.parameters():
    params.requires_grad = True

for params in model.classifier.parameters():
    params.requires_grad = True

for name, params in model.named_parameters():
    print(name, params.requires_grad)

In [13]:
if OPTIMIZER == "AdamW":
    optimizer = AdamW(model.parameters(), lr=LR)

scheduler = OneCycleLR(
    optimizer=optimizer, 
    max_lr=LR, 
    steps_per_epoch=len(trainDataLoader), 
    epochs=NUM_EPOCHS, 
    pct_start=0.1, 
    anneal_strategy='cos'
)
criteriation = LabelSmoothingCrossEntropy()

In [14]:
trainer = BertTrainClf(
    model=model, 
    trainDataloader=trainDataLoader, 
    valDataloader=valDataLoader, 
    criteriation=criteriation,
    optimizer=optimizer,
    scheduler=scheduler, 
    device=DEVICE, 
    model_name=MODEL_NAME,
    save_dir=SAVE_DIR
)

results = trainer(num_epochs=NUM_EPOCHS)


EPOCH 1 of 5
Training


Loss: 0.2005: 100%|██████████| 60/60 [00:16<00:00,  3.66it/s]


Validating


Loss: 0.1997: 100%|██████████| 4/4 [00:00<00:00,  4.97it/s]


f1_macro_train: 0.499
f1_macro_val: 1.000

EPOCH 2 of 5
Training


Loss: 0.1995: 100%|██████████| 60/60 [00:16<00:00,  3.58it/s]


Validating


Loss: 0.1990: 100%|██████████| 4/4 [00:00<00:00,  4.99it/s]


f1_macro_train: 0.499
f1_macro_val: 1.000
Save best model.

EPOCH 3 of 5
Training


Loss: 0.1996: 100%|██████████| 60/60 [00:16<00:00,  3.55it/s]


Validating


Loss: 0.1990: 100%|██████████| 4/4 [00:00<00:00,  4.98it/s]


f1_macro_train: 0.499
f1_macro_val: 1.000
Save best model.

EPOCH 4 of 5
Training


Loss: 0.1999: 100%|██████████| 60/60 [00:16<00:00,  3.58it/s]


Validating


Loss: 0.1989: 100%|██████████| 4/4 [00:00<00:00,  5.01it/s]


f1_macro_train: 0.499
f1_macro_val: 1.000
Save best model.

EPOCH 5 of 5
Training


Loss: 0.1995: 100%|██████████| 60/60 [00:16<00:00,  3.60it/s]


Validating


Loss: 0.1988: 100%|██████████| 4/4 [00:00<00:00,  4.99it/s]


f1_macro_train: 0.499
f1_macro_val: 1.000
Save best model.


In [15]:
log = dict()
for i in range(NUM_EPOCHS):
    for keys in results:
        log[keys] = results[keys][i]
    mlflow.log_metrics(log)

In [16]:
import json
with open(f"{SAVE_DIR}/log.json", "w") as outfile:
    json.dump(results, outfile)

In [None]:
mlflow.log_artifact(SAVE_DIR)

### Evaluate

In [2]:
from tqdm import tqdm

import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, classification_report, precision_recall_curve
from transformers import AutoTokenizer

import sys
sys.path.append('../')

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
from src.bert.bert_inference import BertPipeline

In [20]:
MODEL_INIT = 'DeepPavlov/bert-base-cased-conversational'
DEVICE='cuda:0'

In [21]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_INIT)
pipeline = BertPipeline(f'{SAVE_DIR}/best.pth', tokenizer, DEVICE)

In [24]:
df = pd.read_csv('../data/preprocess_train.csv')
_, df_val = train_test_split(
    df, train_size=0.95, stratify=df['is_duplicate'], random_state=17
)

In [25]:
pred_best, pred_last = [], []
for idx in tqdm(range(df_val.shape[0])):
    cmp_1, cmp_2 = df_val['name_1'].iloc[idx], df_val['name_2'].iloc[idx]
    res = pipeline(cmp_1, cmp_2)
    pred_best.append(res)

100%|██████████| 24891/24891 [05:20<00:00, 77.77it/s]


In [27]:
df_val['pred_best'] = pred_best

In [28]:
f1_score(df_val['is_duplicate'].tolist(), df_val['pred_best'].tolist(), average='macro')

0.49815520474203107

In [30]:
print(classification_report(df_val['is_duplicate'], df_val['pred_best']))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00     24708
           1       0.00      0.00      0.00       183

    accuracy                           0.99     24891
   macro avg       0.50      0.50      0.50     24891
weighted avg       0.99      0.99      0.99     24891



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
confusion_matrix(df_val['is_duplicate'], df_val['pred_best'])

array([[24708,     0],
       [  183,     0]])

In [33]:
precision, recall, thresholds = precision_recall_curve(df_val['is_duplicate'], df_val['pred_best'])
precision, recall, thresholds

(array([0.00735205, 1.        ]), array([1., 0.]), array([0]))

In [34]:
import numpy as np
np.where(precision > 0.99)

(array([1]),)

In [27]:
thresholds[9962]

0.824992835521698

In [28]:
recall[9962]

0.912568306010929

In [34]:
pred = []
for idx in tqdm(range(df_val.shape[0])):
    cmp_1, cmp_2 = df_val['name_1'].iloc[idx], df_val['name_2'].iloc[idx]
    res_1 = pipeline(cmp_1, cmp_2)
    if res_1 > 0.912568306010929:
        pred.append(1)
    else:
        pred.append(0)

  0%|          | 0/24891 [00:00<?, ?it/s]


In [30]:
f1_score(df_val['is_duplicate'].tolist(), pred, average='macro')

0.9741036459898735

## Save Model in MLFlow

In [3]:
import mlflow.pyfunc

class FastTextWrapper(mlflow.pyfunc.PythonModel):
    
    from src.bert.bert_inference import BertPipeline

    def load_context(self, context):
        tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/bert-base-cased-conversational")
        self.model = BertPipeline(context.artifacts["model_path"], tokenizer, DEVICE)
        
    def predict(self, context, model_input):
        res = self.model(model_input)
        return res
    

In [4]:
artifacts = {"model_path": "../weights/bert-10-20-2022-00-00-00/best.pth"}

In [5]:
mlflow.pyfunc.save_model(
    path="test",
    python_model=FastTextWrapper(),
    artifacts=artifacts,
)



In [6]:
mlflow.pyfunc.log_model(
    artifact_path="test",
    python_model=FastTextWrapper(),
    artifacts=artifacts,
)



<mlflow.models.model.ModelInfo at 0x7f8f45ac7370>

In [39]:
mlflow.end_run()

In [None]:
import mlflow
logged_model = 'runs:/19401c7e3a4d457dbde232c4e33e4d32/test'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)