# Explanation
This Jupyter notebook contains all the code to fine-tune the RoBERTa model on the semantic-benchmark dataset. It should always be kept up to date.

**Note**: You may need to adjust the path to the dataset.

In [20]:
import os
from transformers import RobertaTokenizerFast, RobertaTokenizerFast, RobertaForSequenceClassification, RobertaConfig, Trainer, TrainingArguments
from tokenizers.processors import TemplateProcessing
import torch
from torch.utils.data import Dataset
from datasets import Dataset, load_dataset
import os
from pathlib import Path
import numpy as np
import evaluate
import accelerate
from transformers import EarlyStoppingCallback, IntervalStrategy

### Constants

In [2]:
MODEL_NAME = "microsoft/codebert-base"
USE_CPU = True

##### Load Dataset:

In [3]:
# The dataset path is relative to the project base directory, one level above the notebook.
dataset_path = Path("data/semantic_benchmark_dataset.csv")
if not dataset_path.exists():
    # Navigate to the base directory only while the path does not resolve yet.
    # This keeps the cell idempotent: on a re-run the path already resolves, so
    # the working directory is not moved up a second time.
    os.chdir("..")
if not dataset_path.exists():
    raise FileNotFoundError(f"Could not find the dataset in path: {dataset_path.absolute()}")

dataset = load_dataset('csv', data_files=str(dataset_path), split="train") # in kaggle we need to load it into a pandas, then load it into a dataset
dataset

Dataset({
    features: ['Unnamed: 0', 'clone1', 'clone2', 'semantic_clone'],
    num_rows: 2000
})

#### Tokenize the complete Dataset before Fine-Tuning
Note: the tokenized tensors are stored on the CPU at the moment, but the trainer will move them to the GPU automatically during fine-tuning.

In [4]:
def tokenization(row, max_length=256):
    """Tokenize both snippets of one dataset row and join them into one flat sequence.

    The two snippets are tokenized separately, each padded/truncated to
    ``max_length`` tokens, and the resulting two rows are flattened into a
    single sequence of ``2 * max_length`` tokens.

    Args:
        row: A single dataset row providing the keys ``"clone1"`` and ``"clone2"``.
        max_length: Token budget per snippet. The default of 256 yields 512
            tokens in total. RoBERTa-style checkpoints with 514 position
            embeddings can address at most 512 real tokens, so a larger
            per-snippet budget can overflow the position-embedding table when
            both snippets are long.

    Returns:
        The tokenizer output with ``input_ids`` and ``attention_mask``
        flattened to one dimension.
    """
    # `tokenizer` is the module-level tokenizer; it must be created before this function is called.
    tokenized_inputs = tokenizer([row["clone1"], row["clone2"]], padding="max_length", truncation=True,
                                 return_tensors="pt", max_length=max_length)
    # Shape (2, max_length) -> (2 * max_length,): first snippet followed by the second one.
    tokenized_inputs["input_ids"] = tokenized_inputs["input_ids"].flatten()
    tokenized_inputs["attention_mask"] = tokenized_inputs["attention_mask"].flatten()
    return tokenized_inputs

In [5]:
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)

# The Huggingface library expects the target column to be called "label".
# batched=False is required: `tokenization` handles exactly one row per call.
dataset = dataset.rename_column("semantic_clone", "label").map(tokenization, batched=False)

# Expose only the model inputs and the label, all of them as torch tensors.
tensor_columns = ["input_ids", "attention_mask", "label"]
dataset.set_format(type="torch", columns=tensor_columns)

#### Create the dataset splits: (Train: 54%, Evaluation: 6%, Testing: 40%)

In [6]:
# Randomize the dataset: currently first 1000: clones, last 1000 not clones.
# The shuffled copy gets its own name so that re-running this cell does not
# reshuffle already-shuffled data (which would change the splits and the saved test set).
dataset_shuffled = dataset.shuffle(seed=42)

# The first 60% of the rows (1200 of 2000) are used for training and evaluation (during training).
num_rows = len(dataset_shuffled)
num_train_eval = num_rows * 6 // 10
dataset_train = dataset_shuffled.select(range(num_train_eval))
dataset_train = dataset_train.train_test_split(test_size=0.1, seed=42) # 10% of those rows: evaluation split

# All remaining rows form the held-out test set.
proper_test_dataset = dataset_shuffled.select(range(num_train_eval, num_rows))
proper_test_dataset.to_csv("proper_test_dataset.csv") # save them to be able to repeat scores on model

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

3822211

#### Load Model

In [7]:
# Stay on the CPU when USE_CPU is set; otherwise use CUDA (GPU) if it is available.
use_cuda = (not USE_CPU) and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

In [8]:
# Binary classification task: the classification head gets 2 labels.
config = RobertaConfig.from_pretrained(MODEL_NAME, num_labels=2)
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, config=config)
model = model.to(device)

# Metric objects used by `compute_metrics`.
accuracy, recall, precision, f1 = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "recall", "precision", "f1")
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        A dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Each metric object reports its score under its own name.
    metrics = (("accuracy", accuracy), ("precision", precision), ("recall", recall), ("f1", f1))
    scores = {}
    for metric_name, metric in metrics:
        result = metric.compute(predictions=predictions, references=labels)
        scores[metric_name] = result[metric_name]
    return scores

##### Sanity check to see if everything is setup correctly:
Hint: calculating the scores might print some warnings.

In [19]:
# Run a single training row through the model and the metric pipeline to
# verify that tokenization, model and metrics fit together.
sample = dataset_train["train"][1:2] # one-row batch; avoids materialising whole columns
batch_input_ids = sample["input_ids"].to(device)
batch_attention_mask = sample["attention_mask"].to(device)
batch_labels = sample["label"].to(device)

with torch.no_grad(): # inference only, no gradients needed
    output = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)

# Move the tensors back to the CPU before converting them: `.numpy()` fails on CUDA tensors.
compute_metrics((output.logits.detach().cpu().numpy(), batch_labels.cpu().numpy()))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'accuracy': 1.0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

#### Training Setup

In [21]:
BATCH_SIZE = 16
# Log, evaluate and checkpoint every 16 optimizer steps. This is a cadence, not a
# batch size, so it has its own constant and does not change when BATCH_SIZE is tuned.
# Evaluation and saving share the interval so every checkpoint has a matching evaluation.
STEP_INTERVAL = 16

training_args = TrainingArguments(
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=2e-5,             # Learning rate
    adam_epsilon=1e-8,              # Epsilon for Adam optimizer
    num_train_epochs=30,            # Upper bound on training epochs (early stopping may end sooner)
    logging_dir='./logs',           # Directory for storing logs
    logging_steps=STEP_INTERVAL,
    evaluation_strategy="steps",
    eval_steps=STEP_INTERVAL,
    output_dir ="./output",
    dataloader_pin_memory=not USE_CPU, # pinned memory only helps host-to-GPU transfers
    dataloader_num_workers=4, # how many cpus to use to load the data while training
    do_eval=True,                   # Run evaluation on the evaluation dataset
    save_strategy="steps",
    save_steps=STEP_INTERVAL,
    load_best_model_at_end = True,  # restore the checkpoint with the best f1 when training ends
    metric_for_best_model = 'f1',
    save_total_limit=2,
    use_cpu=USE_CPU
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train["train"],
    eval_dataset=dataset_train["test"],      # Evaluation dataset
    compute_metrics=compute_metrics,
    # Stop once the tracked metric has not improved for 3 consecutive evaluations.
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
)


#### Start Training

In [ ]:
# Fine-tune the model using the trainer configured in the training setup
# (evaluation, checkpointing and early stopping are driven by its arguments).
trainer.train()

In [ ]:
# continue training from a checkpoint:
# trainer.train(resume_from_checkpoint=True)

# calculate the scores of the returning/best model on the evaluation dataset
# trainer.evaluate()

# store model to disk (same as best checkpoint)
# trainer.save_model(f"semantic_fine_tuned2")

In [ ]:
# Evaluate on the Test Dataset: the held-out rows that were neither part of
# the training split nor of the evaluation split used during training.
trainer.evaluate(proper_test_dataset)

##### (Cleaning Memory)
Rerun the cell below a few times (especially if you stopped with an error above).

In [54]:
import gc

# Drop the notebook's references first so the memory can actually be reclaimed.
# `pop` with a default keeps the cell re-runnable: a plain `del model` raises
# NameError once the name is already gone.
for _name in ("model", "trainer"):
    globals().pop(_name, None)

# Collect the now-unreachable objects, then hand cached GPU memory back to the driver.
gc.collect()
torch.cuda.empty_cache()

66

In [ ]:
# NOTE(review): this always raises ZeroDivisionError — presumably a deliberate
# "stop here" guard or a leftover debugging cell; confirm and remove if unneeded.
1/0