In [None]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer
import pandas as pd
from pandas import DataFrame
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score
import sys
import os
import mlflow
from datetime import datetime

In [None]:
# Resolve the `src` folder that sits in the parent of the current working
# directory, so that project modules can be imported from this notebook.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'src'))

# Register that folder as an additional import location.
sys.path.append(parent_dir)

In [None]:
# Import the MLflow connection settings from the project's `settings` module.
# NOTE(review): per the original comment these values come from the .env file;
# how `settings` loads them is not visible here — confirm.
from settings import (
    MLFLOW_TRACKING_URI,
    MLFLOW_TRACKING_USERNAME,
    MLFLOW_TRACKING_PASSWORD,
)

# Never echo the password itself: cell outputs are saved with the notebook and
# would leak the secret. Only report whether a value was loaded.
print(f"MLflow password loaded: {bool(MLFLOW_TRACKING_PASSWORD)}")

In [None]:
# Select the compute device: CUDA when a GPU is visible to torch, CPU otherwise.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

In [None]:
# Build the BERT classifier architecture, then overwrite its weights with the
# fine-tuned checkpoint stored in `model_path`.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model_path = 'model_2024_05_30_1047_50.pt'

# map_location='cpu' lets a checkpoint that was saved from a GPU be restored on
# a machine without CUDA; without it torch.load tries to recreate the tensors on
# their original device and fails on CPU-only hosts.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted source.
state_dict = torch.load(model_path, map_location='cpu')
model.load_state_dict(state_dict)

# Tokenizer matching the base model the classifier was built from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Only these three columns are read from the CSV.
columns_to_extract: list[str] = ['text', 'text_b', 'label']

# Load the full test split (no row limit is applied).
test_df: DataFrame = pd.read_csv(
    '../data/work_data/test_work.csv',
    usecols=columns_to_extract,
)

In [None]:
# Pull each column out of the frame as a plain Python list:
# first text of the pair, second text of the pair, and the label.
texts, texts_b, labels = (
    test_df[column].tolist() for column in ('text', 'text_b', 'label')
)

In [None]:
# Tokenize every (text, text_b) pair in one call. Sequences longer than 512
# tokens are truncated, shorter ones are padded, and PyTorch tensors are returned.
inputs = tokenizer(
    texts,
    texts_b,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt',
)

In [None]:
# Bundle token ids, attention masks and labels so they are batched together.
# NOTE(review): the tokenizer's `token_type_ids` are not included here, so the
# model will not receive segment ids for the text pairs — confirm this matches
# how the checkpoint was trained.
label_tensor = torch.tensor(labels)
dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], label_tensor)

# Fixed order (no shuffling), 16 examples per batch.
dataloader = DataLoader(dataset, shuffle=False, batch_size=16)

In [None]:
# Name of the MLflow experiment the evaluation run is logged under.
experiment_name = "Nikita_PatentMatchBaseline"

# MLflow user name, taken from the imported settings.
user = MLFLOW_TRACKING_USERNAME


def timestamp():
    """Return the current local time formatted as ``YYYY_MM_DD_HHMM_SS``."""
    now = datetime.now()
    return f"{now:%Y_%m_%d_%H%M_%S}"

In [None]:
# Look up the experiment by name and make sure an active one is available
# before a run is started under it.
experiment = mlflow.get_experiment_by_name(experiment_name)

if experiment is None:
    # No such experiment yet: create it and fetch it through the returned id.
    experiment = mlflow.get_experiment(mlflow.create_experiment(experiment_name))
elif experiment.lifecycle_stage == 'deleted':
    # A soft-deleted experiment still owns its name, so creating a new one with
    # the same name fails; bring the existing experiment back instead.
    mlflow.tracking.MlflowClient().restore_experiment(experiment.experiment_id)
    experiment = mlflow.get_experiment(experiment.experiment_id)

In [3]:
def get_weights_version(weights_path: str) -> str:
    """Return the last '/'-separated component of ``weights_path``.

    Args:
        weights_path: Path to a weights file, using '/' as separator.

    Returns:
        Everything after the final '/', or the whole string when it contains
        no '/'. A path ending in '/' yields an empty string.
    """
    return weights_path.split('/')[-1]


In [None]:
# Evaluate the loaded model on the test data and log the results to MLflow.
# The metric keys keep their existing "val_" prefix so that runs stay
# comparable with earlier ones.
with mlflow.start_run(experiment_id=experiment.experiment_id):
    # Record which weights file produced these numbers.
    mlflow.log_param('weight_version', get_weights_version(model_path))

    # Run on the selected device; the batches are moved to it below so that the
    # model and its inputs always live on the same device.
    model.to(device)
    model.eval()
    predictions, true_labels = [], []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        # `batch_labels` (not `labels`) so the notebook-level `labels` list is
        # not overwritten by the last batch.
        for input_ids, attention_mask, batch_labels in dataloader:
            outputs = model(
                input_ids.to(device),
                attention_mask=attention_mask.to(device),
            )
            # Predicted class = index of the largest logit per example.
            preds = torch.argmax(outputs.logits, dim=1).flatten()

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(batch_labels.cpu().numpy())

    # Calculate and log the accuracy of the predictions
    val_accuracy = accuracy_score(true_labels, predictions)
    print(f"Validation Accuracy: {val_accuracy}")
    mlflow.log_metric("val_accuracy", val_accuracy)

    # Calculate and log the weighted F1 score of the predictions
    val_f1 = f1_score(true_labels, predictions, average='weighted')
    print(f"Validation F1 Score: {val_f1}")
    mlflow.log_metric("val_f1", val_f1)

    # Calculate and log the Matthews correlation coefficient of the predictions
    val_mcc = matthews_corrcoef(true_labels, predictions)
    print(f"Validation MCC: {val_mcc}")
    mlflow.log_metric("val_mcc", val_mcc)