In [None]:
# Colab only: mount Google Drive at /content/drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Colab only: switch to the project folder on the mounted drive (change the path to match your own Drive)
%cd /content/drive/MyDrive/proj

In [None]:
#%pip install -r requirements.txt

In [None]:
# import transformers
import pandas as pd
import torch
import json 
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import os
import sys
import numpy as np
from datetime import datetime
import mlflow

In [None]:
# Colab only: use the current working directory as the base directory
# and append it to the import path so project modules can be imported.
parent_dir = os.path.abspath(os.curdir)
sys.path.append(parent_dir)

In [None]:
# Local PC only: the base directory is the parent of the current working directory.
# NOTE(review): this rebinds parent_dir, so it overrides the Colab variant if both cells are run.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))

# Appending the parent directory to sys.path makes its modules importable
sys.path.append(parent_dir)


In [None]:
# Import configuration variables from the settings module.
# These variables are used to configure the connection to the MLflow server.

from src.settings import (
    MLFLOW_TRACKING_URI,
    MLFLOW_TRACKING_USERNAME,
    MLFLOW_TRACKING_PASSWORD,
)

In [None]:
# Pick the compute device: CUDA when PyTorch reports it as available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)

In [None]:
# Pre-trained 'bert-base-uncased' weights with a sequence-classification head for two labels
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)


In [None]:
# Move the model to the selected device (CUDA when available, otherwise CPU).
# As the last expression of the cell, the call's return value is also displayed.
model.to(device)

In [None]:
# Run the dataset cleaning step before the data is loaded.
# Per the original note, clearing() deletes null values and repeated rows — confirm in src/clean_dataset.
# The import lives here rather than with the other imports because `src` only becomes
# importable once the sys.path setup has run.
from src.clean_dataset import clearing
clearing()

In [None]:
# Load the cleaned data from the 'clean_data.parquet' file.
# The path components are joined individually so the path resolves on Linux/Colab
# as well as on Windows (a literal backslash is not a separator on POSIX systems).
dataset_path = os.path.join(parent_dir, 'data', 'clean_data.parquet')

# Only the columns needed for training are read: 'text', 'text_b', and 'label'
pair_data = pd.read_parquet(dataset_path, columns=['text', 'text_b', 'label'])

In [None]:
# Cast the label column to integer, leaving the other columns as they are
pair_data = pair_data.astype({'label': int})

In [None]:
# Data shuffling. The shuffle is seeded: an unseeded shuffle would change the row order
# on every run and thereby defeat the fixed random_state of the split below.
pair_data = pair_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the data into training and test sets
# (test_size and random_state match the values recorded in the `config` dictionary)
train_data, test_data = train_test_split(pair_data, test_size=0.2, random_state=42)

# Convert training and test sets to dictionary lists (one dict per row)
train_list = train_data.to_dict(orient='records')
test_list = test_data.to_dict(orient='records')

# Print the resulting sizes of both splits
print(len(train_list))
print(len(test_list))

In [None]:
# Without a GPU, limit each split to its first 10 records
if device.type == 'cpu':
    train_list, test_list = train_list[:10], test_list[:10]

In [None]:
# Model configuration and hyperparameters; every entry is logged to MLflow as a run parameter.
# Raise num_epochs to observe a longer training.
config = dict(
    learning_rate=2e-5,
    batch_size=32,
    num_epochs=7,
    max_length=128,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Data preparation: each sample becomes a (text, text_b) pair with its label kept in a parallel list
train_texts = [(record['text'], record['text_b']) for record in train_list]
train_labels = [record['label'] for record in train_list]

val_texts = [(record['text'], record['text_b']) for record in test_list]
val_labels = [record['label'] for record in test_list]

In [None]:
# Tokenisation of the text pairs, with padding and truncation enabled and the
# sequence length bounded by config['max_length']; results come back as PyTorch tensors
train_encodings = tokenizer(train_texts, padding=True, truncation=True,
                            return_tensors="pt", max_length=config['max_length'])
val_encodings = tokenizer(val_texts, padding=True, truncation=True,
                          return_tensors="pt", max_length=config['max_length'])

# Extract input IDs and attention masks
train_inp_ids = train_encodings['input_ids']
train_att_mask = train_encodings['attention_mask']
val_inp_ids = val_encodings['input_ids']
val_att_mask = val_encodings['attention_mask']


# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(val_labels)

# Creating a TensorDataset (input IDs, attention mask, label) per split
train_data = TensorDataset(train_inp_ids, train_att_mask, train_labels)
val_data = TensorDataset(val_inp_ids, val_att_mask, validation_labels)

# Creating a DataLoader per split.
# The training loader reshuffles every epoch so the model does not see the same
# batches in the same order each time; validation order is irrelevant and stays fixed.
train_dataloader = DataLoader(train_data, batch_size=config["batch_size"], shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=config["batch_size"])

In [None]:
# Optimizer over all model parameters, using the learning rate from config.
# no_deprecation_warning=True silences the deprecation notice of this AdamW class.
optimizer = AdamW(
    model.parameters(),
    lr=config['learning_rate'],
    no_deprecation_warning=True,
)

In [None]:
# MLflow identifiers: the user name becomes part of the run name,
# and the experiment name selects the experiment the run is recorded under
user = MLFLOW_TRACKING_USERNAME
experiment_name = "Herman_PatentMatchBaseline"

def timestamp():
    """Return the current local time formatted as YYYY_MM_DD_HHMM_SS."""
    now = datetime.now()
    return now.strftime("%Y_%m_%d_%H%M_%S")

In [None]:
# Look the experiment up by name. If it does not exist, or exists only in the
# 'deleted' lifecycle stage, create it and fetch the experiment object again.
# NOTE(review): a tracking backend may refuse to create an experiment whose name still
# belongs to a deleted one — confirm against the server in use.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None or experiment.lifecycle_stage == 'deleted':
    mlflow.create_experiment(experiment_name)
    experiment = mlflow.get_experiment_by_name(experiment_name)

In [None]:
# Train and evaluate the model, reporting parameters, metrics and artifacts to MLflow
# under the selected experiment. The `with` block ends the run when it exits, so no
# explicit mlflow.end_run() call is needed afterwards.
with mlflow.start_run(experiment_id=experiment.experiment_id):

    # Name the run once, up front, so the name does not change between epochs
    mlflow.set_tag(key='mlflow.runName',
                   value=f'tes2_{timestamp()}_{user}')

    # Log parameters provided in config variable
    for param_name, param_value in config.items():
        mlflow.log_param(param_name, param_value)

    # Confusion matrices are written to this directory before being logged as artifacts;
    # create it if missing so np.savetxt does not fail on a fresh checkout
    artifacts_dir = os.path.join(parent_dir, 'artifacts')
    os.makedirs(artifacts_dir, exist_ok=True)

    for epoch in range(config['num_epochs']):  # Number of training epochs
        # ---- Training loop ----
        model.train()
        train_loss = 0.0
        for batch in train_dataloader:
            b_input_ids, b_attention_mask, b_labels = batch

            # Move the batch to the proper device
            b_input_ids = b_input_ids.to(device)
            b_labels = b_labels.to(device)
            b_attention_mask = b_attention_mask.to(device)

            model.zero_grad()
            outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
            loss = outputs.loss
            train_loss += loss.item()
            loss.backward()
            optimizer.step()

        # Log the average training loss for the epoch
        avg_train_loss = train_loss / len(train_dataloader)
        mlflow.log_metric("train_loss", avg_train_loss, step=epoch)
        print(f"Average training loss for epoch {epoch}: {avg_train_loss}")

        # ---- Validation loop ----
        model.eval()
        val_loss = 0.0
        predictions, true_labels = [], []
        for batch in val_dataloader:
            b_input_ids, b_attention_mask, b_labels = batch

            # Move the batch to the proper device
            b_input_ids = b_input_ids.to(device)
            b_labels = b_labels.to(device)
            b_attention_mask = b_attention_mask.to(device)

            with torch.no_grad():
                # The labels must be passed here as well: the model only computes a loss
                # when it receives labels, and the loss is accumulated below
                outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)

            logits = outputs.logits
            loss = outputs.loss
            val_loss += loss.item()

            # Predicted class = index of the largest logit
            predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())
            true_labels.extend(b_labels.cpu().tolist())

        # Log the average validation loss for the epoch
        avg_val_loss = val_loss / len(val_dataloader)
        mlflow.log_metric("val_loss", avg_val_loss, step=epoch)
        print(f"Average validation loss for epoch {epoch}: {avg_val_loss}")

        # Calculate the accuracy of the predictions
        val_accuracy = accuracy_score(true_labels, predictions)
        mlflow.log_metric("val_accuracy", val_accuracy, step=epoch)
        print(f"Validation Accuracy: {val_accuracy}")

        # Calculate the confusion matrix. labels=[0, 1] keeps it 2x2 even when one class
        # is absent from this epoch's labels and predictions (the model has two labels)
        cm = confusion_matrix(true_labels, predictions, labels=[0, 1])

        # Save the confusion matrix as a CSV file and log it as an MLflow artifact
        cm_filename = os.path.join(artifacts_dir,
                                   f'{timestamp()}confusion_matrix_epoch_{epoch}.csv')
        np.savetxt(cm_filename, cm, delimiter=",")
        mlflow.log_artifact(cm_filename)

        # Print the confusion matrix
        print(cm)