In [1]:
import os
import sys
from dotenv import load_dotenv

sys.path.append('../')
load_dotenv("../.env")

True

In [2]:
%cd ..

/home/cva/Desktop/Company-name-matcher


In [3]:
%%capture

import datetime

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR

from src.utils.data_constructor import CompanyDatasetBertClf
from src.bert.utils.criteriation import LabelSmoothingCrossEntropy
from src.bert.utils.bert_clf_trainer import BertTrainClf

In [4]:
import mlflow

# Point the MLflow client at the tracking server named by the
# MLFLOW_TRACKING_URI environment variable (presumably supplied by the .env
# file loaded in the first cell — confirm).
# NOTE(review): os.getenv returns None when the variable is unset; verify that
# MLflow's default tracking location is an acceptable fallback in that case.
remote_server_uri = os.getenv("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(remote_server_uri)
# Select the experiment that the run started below is recorded under.
mlflow.set_experiment("company-name-matcher")

<Experiment: artifact_location='s3://arts/2', creation_time=1666734835488, experiment_id='2', last_update_time=1666734835488, lifecycle_stage='active', name='company-name-matcher', tags={}>

In [5]:
# Hugging Face model id used for both the tokenizer and the initial weights.
MODEL_INIT = 'DeepPavlov/bert-base-cased-conversational'
# Short tag used in the MLflow run name and the weights directory name.
MODEL_NAME = 'bert'
# Launch time; keeps the run name and SAVE_DIR unique per execution.
TIMESTAMP = datetime.datetime.now().strftime("%m-%d-%Y-%H-%M-%S")
BATCH_SIZE = 32
# NOTE(review): hard-coded to the first CUDA device; no CPU fallback here.
DEVICE = 'cuda:0'
NUM_EPOCHS = 5
# Passed to the optimizer and also used as the OneCycleLR peak (max_lr).
LR = 3e-5
# Name of the optimizer to build; only 'AdamW' is handled in this notebook.
OPTIMIZER = 'AdamW'
# Directory passed to the trainer as save_dir; log.json is written here too.
SAVE_DIR = f'weights/{MODEL_NAME}-{TIMESTAMP}'

# Preprocessed CSV that both the training and validation datasets are built from.
path_data = 'data/preprocess_train.csv'

In [6]:
%%capture

# Open the MLflow run that the later log_* calls attach to; it is closed
# explicitly with mlflow.end_run() at the end of the notebook.
mlflow.start_run()
# Name the run after the model tag and launch timestamp.
mlflow.set_tag("mlflow.runName", f"{MODEL_NAME}-{TIMESTAMP}")

In [7]:
import os

# Create the output directory for this run. exist_ok=True keeps the cell safe
# to re-run and avoids the race in a separate "exists, then create" check.
os.makedirs(SAVE_DIR, exist_ok=True)

In [7]:
# Hyper-parameters recorded alongside the MLflow run.
params = dict(
    tokenizer=MODEL_INIT,
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    lr=LR,
    optimizer=OPTIMIZER,
)
mlflow.log_params(params)

In [8]:
# Tokenizer matching the pretrained checkpoint named in MODEL_INIT.
tokenizer = AutoTokenizer.from_pretrained(MODEL_INIT)
# Both datasets are built from the same CSV; the `train` flag presumably
# selects which split the dataset class exposes — TODO confirm against
# CompanyDatasetBertClf.
train_dataset = CompanyDatasetBertClf(path_data, tokenizer) 
val_dataset = CompanyDatasetBertClf(path_data, tokenizer, train=False) 

In [9]:
# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
trainDataLoader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

# Validation batches keep the dataset's own order.
val_sampler = SequentialSampler(val_dataset)
valDataLoader = DataLoader(val_dataset, sampler=val_sampler, batch_size=BATCH_SIZE)

In [10]:
# Number of batches per epoch in the training and validation loaders.
len(trainDataLoader), len(valDataLoader)

(60, 4)

In [11]:
%%capture
# Load the pretrained checkpoint wrapped in a sequence-classification model.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_INIT)

In [12]:
%%capture

# Fine-tune only the top of the network: freeze the whole BERT backbone, then
# re-enable gradients for encoder layer index 11 (the final layer of a
# 12-layer BERT-base encoder), the pooler and the classification head.
#
# The loop variable is `param`, not `params`, so this cell no longer
# overwrites the `params` hyper-parameter dict defined earlier.
for param in model.bert.parameters():
    param.requires_grad = False

trainable_modules = (
    model.bert.encoder.layer[11],
    model.bert.pooler,
    model.classifier,
)
for module in trainable_modules:
    for param in module.parameters():
        param.requires_grad = True

# Report the trainable/frozen status of every parameter.
# NOTE: the %%capture magic at the top of this cell hides this output.
for name, param in model.named_parameters():
    print(name, param.requires_grad)

In [13]:
# Build the optimizer named in the config cell. Fail loudly on an unknown
# name: otherwise `optimizer` stays undefined and the scheduler construction
# below dies with an unrelated-looking NameError.
if OPTIMIZER == "AdamW":
    optimizer = AdamW(model.parameters(), lr=LR)
else:
    raise ValueError(f"Unsupported OPTIMIZER {OPTIMIZER!r}; expected 'AdamW'")

# One-cycle learning-rate schedule spanning the whole training run
# (steps_per_epoch * epochs scheduler steps): the rate rises to its peak
# `max_lr=LR` over the first 10% of steps, then follows cosine annealing.
scheduler = OneCycleLR(
    optimizer=optimizer,
    max_lr=LR,
    steps_per_epoch=len(trainDataLoader),
    epochs=NUM_EPOCHS,
    pct_start=0.1,
    anneal_strategy='cos'
)
# Project-defined loss (label-smoothed cross entropy, per its name).
criteriation = LabelSmoothingCrossEntropy()

In [14]:
# Wire the model, data loaders, loss, optimizer and scheduler into the
# project's training helper. `save_dir` is where it is expected to write
# checkpoints (the evaluation section loads `best.pth` from SAVE_DIR).
trainer = BertTrainClf(
    model=model, 
    trainDataloader=trainDataLoader, 
    valDataloader=valDataLoader, 
    criteriation=criteriation,
    optimizer=optimizer,
    scheduler=scheduler, 
    device=DEVICE, 
    model_name=MODEL_NAME,
    save_dir=SAVE_DIR
)

# Run training. `results` is later indexed as results[key][epoch] and dumped
# to JSON, so it is presumably a dict of per-epoch metric lists — confirm
# against BertTrainClf.
results = trainer(num_epochs=NUM_EPOCHS)


EPOCH 1 of 5
Training


Loss: 0.2005: 100%|██████████| 60/60 [00:16<00:00,  3.66it/s]


Validating


Loss: 0.1997: 100%|██████████| 4/4 [00:00<00:00,  4.97it/s]


f1_macro_train: 0.499
f1_macro_val: 1.000

EPOCH 2 of 5
Training


Loss: 0.1995: 100%|██████████| 60/60 [00:16<00:00,  3.58it/s]


Validating


Loss: 0.1990: 100%|██████████| 4/4 [00:00<00:00,  4.99it/s]


f1_macro_train: 0.499
f1_macro_val: 1.000
Save best model.

EPOCH 3 of 5
Training


Loss: 0.1996: 100%|██████████| 60/60 [00:16<00:00,  3.55it/s]


Validating


Loss: 0.1990: 100%|██████████| 4/4 [00:00<00:00,  4.98it/s]


f1_macro_train: 0.499
f1_macro_val: 1.000
Save best model.

EPOCH 4 of 5
Training


Loss: 0.1999: 100%|██████████| 60/60 [00:16<00:00,  3.58it/s]


Validating


Loss: 0.1989: 100%|██████████| 4/4 [00:00<00:00,  5.01it/s]


f1_macro_train: 0.499
f1_macro_val: 1.000
Save best model.

EPOCH 5 of 5
Training


Loss: 0.1995: 100%|██████████| 60/60 [00:16<00:00,  3.60it/s]


Validating


Loss: 0.1988: 100%|██████████| 4/4 [00:00<00:00,  4.99it/s]


f1_macro_train: 0.499
f1_macro_val: 1.000
Save best model.


In [15]:
# Send the per-epoch metrics to MLflow, using the epoch index as the step.
# Without `step`, every call is recorded at step 0, so the metric history
# cannot be plotted as a curve over epochs.
for epoch in range(NUM_EPOCHS):
    # One value per metric for this epoch: results[name][epoch].
    log = {name: values[epoch] for name, values in results.items()}
    mlflow.log_metrics(log, step=epoch)

In [16]:
import json

# Persist the raw training history inside the run's output directory.
log_path = f"{SAVE_DIR}/log.json"
with open(log_path, "w") as log_file:
    json.dump(results, log_file)

In [None]:
# Attach the run's output directory (SAVE_DIR, which holds log.json and the
# trainer's outputs) to the active MLflow run as an artifact.
mlflow.log_artifact(SAVE_DIR)

## Evaluate

In [7]:
from tqdm import tqdm

import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, classification_report, precision_recall_curve
from transformers import AutoTokenizer

import sys
sys.path.append('../')

In [8]:
from src.bert.bert_inference import BertPipeline

In [9]:
# Evaluation settings; these repeat the values from the training config cell.
MODEL_INIT = 'DeepPavlov/bert-base-cased-conversational'
DEVICE='cuda:0'
# SAVE_DIR is otherwise reused from the training section. Uncomment and edit
# the line below to evaluate an existing checkpoint directory instead.
# SAVE_DIR = f'weights/bert-10-19-22-00-00-00'


In [10]:
# Build the inference pipeline from the `best.pth` checkpoint in SAVE_DIR.
tokenizer = AutoTokenizer.from_pretrained(MODEL_INIT)
pipeline = BertPipeline(tokenizer, f'{SAVE_DIR}/best.pth', DEVICE)

In [18]:
# Rebuild a 5% hold-out from the preprocessed CSV, stratified on the label
# column with a fixed seed so the split is reproducible.
# NOTE(review): this only equals the validation data used during training if
# CompanyDatasetBertClf performs the same split (same ratio and seed) — confirm.
df = pd.read_csv('data/preprocess_train.csv')
_, df_val = train_test_split(
    df, train_size=0.95, stratify=df['is_duplicate'], random_state=17
)

In [19]:
# Score every hold-out name pair with the loaded pipeline, in row order.
pred_best, pred_last = [], []
name_pairs = zip(df_val['name_1'], df_val['name_2'])
for cmp_1, cmp_2 in tqdm(name_pairs, total=df_val.shape[0]):
    pred_best.append(pipeline(cmp_1, cmp_2))

100%|██████████| 24891/24891 [04:26<00:00, 93.46it/s]


In [20]:
# Attach the predictions as a new column. `assign` returns a new frame rather
# than writing into the object returned by train_test_split, which avoids
# pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
df_val = df_val.assign(pred_best=pred_best)

In [21]:
# Macro-averaged F1 of the pipeline's predictions against the hold-out labels.
f1_score(df_val['is_duplicate'].tolist(), df_val['pred_best'].tolist(), average='macro')

0.9756115189143046

In [22]:
# Per-class precision / recall / F1 and support for the hold-out predictions.
print(classification_report(df_val['is_duplicate'], df_val['pred_best']))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     24708
           1       0.99      0.91      0.95       183

    accuracy                           1.00     24891
   macro avg       1.00      0.96      0.98     24891
weighted avg       1.00      1.00      1.00     24891



In [23]:
# Confusion matrix with true labels as rows and predicted labels as columns.
confusion_matrix(df_val['is_duplicate'], df_val['pred_best'])

array([[24707,     1],
       [   16,   167]])

In [24]:
# NOTE(review): `pred_best` is passed where precision_recall_curve expects
# scores. The values are hard 0/1 labels (the returned thresholds are just
# [0, 1]), so this yields a single operating point rather than a curve.
# Pass the positive-class probability instead if the pipeline can expose one.
precision, recall, thresholds = precision_recall_curve(df_val['is_duplicate'], df_val['pred_best'])
precision, recall, thresholds

(array([0.00735205, 0.99404762, 1.        ]),
 array([1.        , 0.91256831, 0.        ]),
 array([0, 1]))

## Save model in MLflow

In [11]:
import mlflow.pyfunc

class FastTextWrapper(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper around the BERT company-name matching pipeline.

    NOTE(review): despite its name this wraps ``BertPipeline``, not a FastText
    model. The name is kept because the save/log cells reference it.
    """

    def load_context(self, context):
        """Build the inference pipeline from the packaged checkpoint.

        The imports live here rather than in the class body. A class-level
        import only creates a class attribute, which methods cannot see as a
        bare name, so the previous version worked only because the notebook
        had already imported the same names globally.

        Args:
            context: MLflow model context; ``context.artifacts["model_path"]``
                is the checkpoint path registered under the ``model_path``
                artifact key.
        """
        from transformers import AutoTokenizer
        from src.bert.bert_inference import BertPipeline

        tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/bert-base-cased-conversational")
        # DEVICE is the module-level constant from the notebook's config cell.
        self.model = BertPipeline(tokenizer, context.artifacts["model_path"], DEVICE)

    def predict(self, context, model_input):
        """Run the pipeline on one pair of company names.

        Args:
            context: MLflow model context (unused here).
            model_input: Sequence of exactly three items,
                ``(company_1, company_2, top_n)``.

        Returns:
            Whatever the wrapped pipeline returns for this input.

        NOTE(review): the evaluation cell calls the pipeline with only the two
        names; confirm that ``BertPipeline`` accepts the third ``top_n``
        argument.
        """
        company_1, company_2, top_n = model_input
        return self.model(company_1, company_2, top_n)


In [12]:
# Files packaged with the pyfunc model; the wrapper reads the checkpoint back
# through context.artifacts["model_path"].
artifacts = {"model_path": f"{SAVE_DIR}/best.pth"}

In [13]:
# Write the packaged pyfunc model to a local directory under SAVE_DIR.
# NOTE(review): re-running this cell against an existing target directory may
# fail — confirm against the installed MLflow version.
mlflow.pyfunc.save_model(
    path=f"{SAVE_DIR}/model",
    python_model=FastTextWrapper(),
    artifacts=artifacts,
)

In [14]:
# Log the same pyfunc model to the active MLflow run. `artifact_path` is
# relative to the run's artifact root, so the model ends up under
# runs:/<run_id>/weights/<model>-<timestamp>/model (see the commented-out
# example in the last cell).
mlflow.pyfunc.log_model(
    artifact_path=f"{SAVE_DIR}/model",
    python_model=FastTextWrapper(),
    artifacts=artifacts,
)

<mlflow.models.model.ModelInfo at 0x7f5ca88d79d0>

In [15]:
# Close the run opened by mlflow.start_run() earlier in the notebook.
mlflow.end_run()

In [16]:
# import mlflow
# logged_model = 'runs:/d7c9cc785a8e4c578990893d66685d04/weights/bert-10-19-22-00-00-00/model'

# loaded_model = mlflow.pyfunc.load_model(logged_model)

# loaded_model.predict(["abba", "abba", 10])