In [None]:
# code required by Colab
from google.colab import drive
drive.mount('/content/drive')

In [7]:
# import transformers
import pandas as pd
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import os
import sys
import numpy as np
from datetime import datetime
import mlflow
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_metric


In [2]:
# Add src as folder from where to import
parent_dir = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), '.', 'src'))

# Add this directory to sys.path
sys.path.append(parent_dir)

In [3]:
from settings import (
    MLFLOW_TRACKING_URI,
    MLFLOW_TRACKING_USERNAME,
    MLFLOW_TRACKING_PASSWORD,
)

In [4]:
#Check if GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:

tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [9]:
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [11]:
from clean_dataset import clearing
clearing()

Initial number of rows: 3838
Number of rows after removing nulls and duplicates: 3685


In [12]:
pair_data = pd.read_parquet(r'..\data\clean_data.parquet', columns=['text', 'text_b', 'label'])

In [13]:
pair_data['label'] = pair_data['label'].astype(int)

In [14]:
# Data shuffling
pair_data = pair_data.sample(frac=1).reset_index(drop=True)

# Split the data into training and test sets
train_data, test_data = train_test_split(pair_data, test_size=0.2, random_state=42)

# Convert training and test sets to dictionary lists
train_list = train_data.to_dict(orient='records')
test_list = test_data.to_dict(orient='records')

# –í—ã–≤–æ–¥ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–∞
print(len(train_list))
print(len(test_list))

2948
737


In [15]:
if device.type == 'cpu':
  train_list = train_list[:10]
  test_list = test_list[:10]

In [17]:
# Save to jsonl
with open("../data/train.jsonl", "w") as f:
    for item in train_list:
        f.write(json.dumps(item) + "\n")

with open("../data/test.jsonl", "w") as f:
    for item in test_list:
        f.write(json.dumps(item) + "\n")

print('Train size: ', len(train_data))
print('Test size: ', len(test_data))

Train size:  2948
Test size:  737


In [18]:
user = MLFLOW_TRACKING_USERNAME
experiment_name = 'Herman_PatentMatchBaseline'

def timestamp():
    """This function creates current timestamp"""
    return datetime.now().strftime("%Y_%m_%d_%H%M_%S")

In [19]:
# If you have provided the name of an experiment that does not exist or that existed in MLflow but was deleted, create a new experiment.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None or experiment.lifecycle_stage == 'deleted':
    mlflow.create_experiment(experiment_name)
    experiment = mlflow.get_experiment_by_name(experiment_name)

In [20]:
import datasets
def preprocess_function(batch):
    return tokenizer(batch["text"], batch["text_b"], truncation=True, padding="max_length")

dataset = datasets.load_dataset("json", data_files={"train": "../data/train.jsonl", "test": "../data/test.jsonl"})
tokenized_data = dataset.map(preprocess_function, batched=True)
print(tokenized_data)

Generating train split: 10 examples [00:00, 591.23 examples/s]
Generating test split: 10 examples [00:00, 1184.43 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:00<00:00, 201.31 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:00<00:00, 333.32 examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'text_b', 'label', 'input_ids', 'attention_mask'],
        num_rows: 10
    })
    test: Dataset({
        features: ['text', 'text_b', 'label', 'input_ids', 'attention_mask'],
        num_rows: 10
    })
})





In [21]:
def compute_metrics(eval_pred):
    f1_metric = load_metric("f1")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1_metric.add_batch(predictions=predictions, references=labels)
    f1_score = f1_metric.compute()
    accuracy = accuracy_score(labels, predictions)
    return {"f1": f1_score["f1"], "accuracy": accuracy}

# Start tracking the session in MLflow
with mlflow.start_run(experiment_id=experiment.experiment_id):

    # Log configuration parameters
    mlflow.log_param("model_name", "roberta-base")
    mlflow.log_param("num_train_epochs", 3)
    mlflow.log_param("per_device_train_batch_size", 8)
    mlflow.log_param("per_device_eval_batch_size", 32)
    mlflow.log_param("warmup_steps", 100)
    mlflow.log_param("weight_decay", 0.01)
    mlflow.log_param("logging_steps", 50)
    mlflow.log_param("gradient_accumulation_steps", 2)  # Gradient accumulation to use effective larger batch size


    training_args = TrainingArguments(
        output_dir="./results",          # output directory
        num_train_epochs=3,              # total # of training epochs
        per_device_train_batch_size=8,    # batch size per device during training
        per_device_eval_batch_size=32,    # batch size for evaluation
        warmup_steps=100,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=50,
        gradient_accumulation_steps=2,   # Gradient accumulation
    )

    trainer = Trainer(
        model=model,                         # the instantiated ü§ó Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=tokenized_data["train"],         # training dataset
        eval_dataset=tokenized_data["test"],           # evaluation dataset
        compute_metrics=compute_metrics,               # the callback that computes metrics of interest
    )

    # Train the model
    trainer.train()

    for epoch in range(training_args.num_train_epochs):
        # Evaluate the model
        eval_results = trainer.evaluate()

        # Log metrics
        for key, value in eval_results.items():
            mlflow.log_metric(key, value, step=epoch)

        # Generate and log the confusion matrix
        predictions, labels, _ = trainer.predict(tokenized_data["test"])
        preds = np.argmax(predictions, axis=-1)
        cm = confusion_matrix(labels, preds)

        # Calculate accuracy
        accuracy = accuracy_score(labels, preds)
        mlflow.log_metric("accuracy", accuracy, step=epoch)

        # Save the confusion matrix as CSV and log it as an artifact
        cm_filename = f"../artifacts/{timestamp()}_confusion_matrix_epoch_{epoch}.csv"
        os.makedirs(os.path.dirname(cm_filename), exist_ok=True)
        cm_df = pd.DataFrame(cm)
        cm_df.to_csv(cm_filename, index=False)
        mlflow.log_artifact(cm_filename)

        # Print the confusion matrix
        print(f"Confusion Matrix for epoch {epoch}:")
        print(cm)

mlflow.end_run()


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [01:03<00:00, 21.15s/it]


{'train_runtime': 63.4179, 'train_samples_per_second': 0.473, 'train_steps_per_second': 0.047, 'train_loss': 0.734747568766276, 'epoch': 3.0}


  f1_metric = load_metric("f1")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:01<00:00,  1.42s/it]
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.19it/s]


Confusion Matrix for epoch 0:
[[6 0]
 [4 0]]


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.22it/s]
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.21it/s]


Confusion Matrix for epoch 1:
[[6 0]
 [4 0]]


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.29it/s]
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.18it/s]


Confusion Matrix for epoch 2:
[[6 0]
 [4 0]]
