In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix, f1_score
import os
import sys
import numpy as np
from datetime import datetime
import mlflow
import pandas as pd
from pandas import DataFrame

In [None]:
# Install the project's dependencies listed in requirements.txt.
# Original author's note: use `!` instead of `%` to run this on Google Colab.
%pip install -r requirements.txt

In [None]:
# Change the working directory to the repository root so that the `src.*`
# imports and the relative `data/...` and `artifacts/...` paths used below resolve.
# NOTE(review): this is an absolute path on one developer's machine; adjust it
# (or make it configurable) before running the notebook anywhere else.
%cd /Users/nikitalukasevic/Desktop/Work/Projects/classifier-nikita

In [None]:
# Make the current working directory (expected to be the repository root)
# importable, so that the `src` package can be imported in the following cells.
parent_dir = os.path.abspath(os.getcwd())

# Register the directory only once: re-running this cell must not keep
# appending duplicate entries to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
print(parent_dir)

In [None]:
# Import the MLflow connection settings from the project's settings module
# (per the original note, these values originate from the .env file).
from src.settings import (
    MLFLOW_TRACKING_URI,
    MLFLOW_TRACKING_USERNAME,
    MLFLOW_TRACKING_PASSWORD,
)

# Never echo the secret itself: cell output is stored inside the notebook file
# and would leak the password to anyone who can read it. Only report whether
# a non-empty value was loaded.
print(f"MLflow password loaded: {bool(MLFLOW_TRACKING_PASSWORD)}")

In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint.
# It is used below to encode the text pairs of the train/val/test frames.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Load the pre-trained 'bert-base-uncased' weights into a sequence-classification model.
# num_labels=2 requests a two-class classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


In [None]:
# Move the model to the device selected earlier (GPU if available, else CPU).
# As the last expression of the cell, the value returned by `.to()` is also displayed.
model.to(device)

In [None]:
# Only these three columns are read from each CSV split.
columns_to_extract: list[str] = ['text', 'text_b', 'label']

# `nrows` caps how many rows are read so the notebook stays runnable on a CPU;
# remove the `nrows` argument when running on a GPU to load the full files.
train_df: DataFrame = pd.read_csv(
    'data/work_data/train_work.csv',
    usecols=columns_to_extract,
    nrows=10,
)
test_df: DataFrame = pd.read_csv(
    'data/work_data/test_work.csv',
    usecols=columns_to_extract,
    nrows=3,
)
val_df: DataFrame = pd.read_csv(
    'data/work_data/val_work.csv',
    usecols=columns_to_extract,
    nrows=10,
)

In [None]:
# Hyper-parameters and run settings. Every entry is logged to MLflow as a run
# parameter by the training cell.
# NOTE(review): `test_size` and `random_state` are not read anywhere else in the
# visible notebook; confirm whether they are still needed.
config = dict(
    learning_rate=2e-5,
    batch_size=32,
    num_epochs=7,
    max_length=512,
    test_size=0.1,
    random_state=42,
)

In [None]:
def tokenize_data(df, tokenizer, max_length):
    """Encode the ``text`` / ``text_b`` column pairs of ``df`` with ``tokenizer``.

    Args:
        df: Frame providing the ``'text'`` and ``'text_b'`` columns; row *i* of
            both columns is encoded together as one pair.
        tokenizer: Callable accepting two lists plus the keyword arguments
            ``padding``, ``truncation``, ``max_length`` and ``return_tensors``,
            and returning a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Passed through to the tokenizer as ``max_length``.

    Returns:
        A ``(input_ids, attention_mask)`` tuple taken from the tokenizer output.
        Any other entries of that output (e.g. ``token_type_ids``) are not returned.
    """
    first_segments, second_segments = (
        df[column].tolist() for column in ('text', 'text_b')
    )

    # Options forwarded to the tokenizer call for the whole frame at once.
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'max_length': max_length,
        'return_tensors': "pt",
    }
    encoded = tokenizer(first_segments, second_segments, **tokenizer_options)

    return encoded['input_ids'], encoded['attention_mask']

In [None]:
# Encode the three splits (train, validation, test — in that order) with the
# same tokenizer and the maximum length taken from `config`.
(
    (train_input_ids, train_attention_masks),
    (val_input_ids, val_attention_masks),
    (test_input_ids, test_attention_masks),
) = (
    tokenize_data(split_df, tokenizer, config['max_length'])
    for split_df in (train_df, val_df, test_df)
)

In [None]:
# Turn the `label` column of each split into a tensor of Python ints.
train_labels, val_labels, test_labels = (
    torch.tensor(split_df['label'].astype(int).tolist())
    for split_df in (train_df, val_df, test_df)
)

In [None]:
# Bundle input ids, attention masks and labels of each split into one dataset,
# so a single index yields the aligned (ids, mask, label) triple.
train_data, val_data, test_data = (
    TensorDataset(*split_tensors)
    for split_tensors in (
        (train_input_ids, train_attention_masks, train_labels),
        (val_input_ids, val_attention_masks, val_labels),
        (test_input_ids, test_attention_masks, test_labels),
    )
)

In [None]:
# Batch iterators for the three splits; the batch size comes from `config`.
# The training loader reshuffles its samples every epoch so the model does not
# see the exact same batch sequence each time. The shuffle order is driven by a
# generator seeded with config["random_state"], which keeps runs reproducible.
train_dataloader = DataLoader(
    train_data,
    batch_size=config["batch_size"],
    shuffle=True,
    generator=torch.Generator().manual_seed(config["random_state"]),
)
# Evaluation loaders keep the dataset order; shuffling brings nothing there.
validation_dataloader = DataLoader(val_data, batch_size=config["batch_size"])
test_dataloader = DataLoader(test_data, batch_size=config["batch_size"])

In [None]:
# Optimizer over all model parameters, using the learning rate from `config`.
# NOTE(review): this is the AdamW imported from `transformers`; the
# `no_deprecation_warning=True` flag suggests that class is deprecated there.
# Consider `torch.optim.AdamW` instead (confirm its defaults match first).
optimizer = AdamW(model.parameters(), lr=config['learning_rate'], no_deprecation_warning=True)

In [None]:
# Name of the MLflow experiment that the runs of this notebook are logged to.
experiment_name = 'Nikita_PatentMatchBaseline'

# The MLflow user name is appended to the run name to show who started the run.
user = MLFLOW_TRACKING_USERNAME


def timestamp():
    """Return the current local time formatted as ``YYYY_MM_DD_HHMM_SS``."""
    return f"{datetime.now():%Y_%m_%d_%H%M_%S}"

In [None]:
# Resolve the MLflow experiment to log into, creating or restoring it as needed.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    # First use of this name: create the experiment and fetch its Experiment object.
    experiment = mlflow.get_experiment(mlflow.create_experiment(experiment_name))
elif experiment.lifecycle_stage == 'deleted':
    # A soft-deleted experiment still owns its name, so creating a new one with
    # the same name is rejected by the tracking store. Restore it instead and
    # re-read it so `experiment` reflects the active state.
    mlflow.tracking.MlflowClient().restore_experiment(experiment.experiment_id)
    experiment = mlflow.get_experiment(experiment.experiment_id)

In [None]:
# Train for config['num_epochs'] epochs, evaluate on the validation set after
# every epoch, and report parameters, metrics and artifacts to MLflow.
# The run name (timestamp + user) is fixed once when the run starts instead of
# being overwritten at the beginning of every epoch.
with mlflow.start_run(experiment_id=experiment.experiment_id,
                      run_name=f'{timestamp()}_{user}'):

    # Log all hyper-parameters provided in the config variable in one call.
    mlflow.log_params(config)

    # np.savetxt does not create missing directories, so make sure the folder
    # for the confusion-matrix files exists before the first epoch.
    os.makedirs("artifacts", exist_ok=True)

    for epoch in range(config['num_epochs']):  # Number of training epochs
        # ---- Training pass ----
        model.train()
        train_loss = 0.0
        for batch in train_dataloader:
            b_input_ids, b_attention_mask, b_labels = batch

            # Move the batch to the proper device
            b_input_ids = b_input_ids.to(device)
            b_labels = b_labels.to(device)
            b_attention_mask = b_attention_mask.to(device)

            # Clear gradients left over from the previous step, then run
            # forward pass, backward pass and parameter update.
            model.zero_grad()
            outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
            loss = outputs.loss
            train_loss += loss.item()
            loss.backward()
            optimizer.step()

        # Log the average training loss (mean over batches) for the epoch
        avg_train_loss = train_loss / len(train_dataloader)
        mlflow.log_metric("train_loss", avg_train_loss, step=epoch)

        # ---- Validation pass ----
        model.eval()
        predictions, true_labels = [], []
        # No gradients are needed while evaluating.
        with torch.no_grad():
            for batch in validation_dataloader:
                b_input_ids, b_attention_mask, b_labels = batch

                # Move the batch to the proper device
                b_input_ids = b_input_ids.to(device)
                b_labels = b_labels.to(device)
                b_attention_mask = b_attention_mask.to(device)

                outputs = model(b_input_ids, attention_mask=b_attention_mask)

                # Predicted class = index of the largest logit per sample.
                logits = outputs.logits
                predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())
                true_labels.extend(b_labels.cpu().tolist())

        # Calculate and log the accuracy of the predictions
        val_accuracy = accuracy_score(true_labels, predictions)
        print(f"Validation Accuracy: {val_accuracy}")
        mlflow.log_metric("val_accuracy", val_accuracy, step=epoch)

        # Calculate and log the weighted F1 score of the predictions
        val_f1 = f1_score(true_labels, predictions, average='weighted')
        print(f"Validation F1 Score: {val_f1}")
        mlflow.log_metric("val_f1", val_f1, step=epoch)

        # Calculate and log the Matthews correlation coefficient of the predictions
        val_mcc = matthews_corrcoef(true_labels, predictions)
        print(f"Validation MCC: {val_mcc}")
        mlflow.log_metric("val_mcc", val_mcc, step=epoch)

        # Calculate the confusion matrix. Passing the two class ids explicitly
        # keeps it 2x2 even when a class is absent from this epoch's
        # labels/predictions (the model is built with two labels).
        cm = confusion_matrix(true_labels, predictions, labels=[0, 1])

        # Log the confusion matrix as an artifact on MLflow
        cm_filename = f"artifacts/{timestamp()}confusion_matrix_epoch_{epoch}.csv"
        np.savetxt(cm_filename, cm, delimiter=",")
        mlflow.log_artifact(cm_filename)

        # Print the confusion matrix
        print(cm)

    # Save the fine-tuned weights locally once all epochs are done.
    model_filename = f"model_{timestamp()}.pt"
    torch.save(model.state_dict(), model_filename)

    # mlflow.pytorch.log_model(model, "model") #use it to save weights to mlflow.

# Leaving the `with` block ends the MLflow run, so no explicit end_run() is needed.