# Explanation
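This Jupyter notebook contains all the code to fine-tune the RoBERTa-based CodeBERT model (`microsoft/codebert-base`) on the GPT clone-benchmark dataset and to evaluate it on the semantic-benchmark dataset. It should always be kept up to date.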

**Note**: You may need to adjust the dataset paths to match your environment.

In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [2]:
import os
from transformers import RobertaTokenizerFast, RobertaTokenizerFast, RobertaForSequenceClassification, RobertaConfig, Trainer, TrainingArguments
from tokenizers.processors import TemplateProcessing
import torch
from torch.utils.data import Dataset
from datasets import Dataset, load_dataset
import os
from pathlib import Path
import numpy as np
import evaluate
import accelerate
from transformers import EarlyStoppingCallback, IntervalStrategy
import os
import pandas as pd


os.environ["WANDB_DISABLED"] = "true"

2024-04-18 10:58:00.766152: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-18 10:58:00.766343: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-18 10:58:00.903696: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Constants

In [3]:
# Hugging Face checkpoint id; used to load the tokenizer, the config and the classifier weights.
MODEL_NAME = "microsoft/codebert-base"
# Set to True to force CPU execution (device selection and TrainingArguments.use_cpu both read this).
USE_CPU = False

##### Load Dataset:

In [4]:
# Kaggle input location of the first (GPT clone benchmark) CSV; adjust when running elsewhere.
dataset_path = Path("/kaggle/input/gpt-clonebenchmark-dataset/gpt_benchmark_dataset.csv")
# Fail early with the resolved path if the file is not mounted.
assert dataset_path.exists(), f"Could not find the the dataset in path: {dataset_path.absolute()}"

df = pd.read_csv(dataset_path)
dataset = Dataset.from_pandas(df) # on Kaggle the CSV is read with pandas first, then wrapped as a Hugging Face Dataset
dataset  # bare last expression: rendered as the cell output (features and row count)

Dataset({
    features: ['clone1', 'clone2', 'semantic_clone'],
    num_rows: 12195
})

#### Tokenize the complete Dataset before Fine-Tuning
Note: they are stored on the CPU at the moment, but the trainer will move them to the GPU automatically during fine-tuning.

In [5]:
MAX_LENGTH = 255
def tokenization(row):
    """Tokenize one clone pair and join both encodings into a single flat sequence.

    The two snippets in ``row["clone1"]`` and ``row["clone2"]`` are passed to the
    module-level ``tokenizer`` as a batch of two, each padded/truncated to
    ``MAX_LENGTH`` tokens. The resulting two-row ``input_ids`` and
    ``attention_mask`` tensors are then flattened, so the first snippet's tokens
    are followed directly by the second snippet's tokens.

    Args:
        row: A single dataset row (mapping) with the keys ``"clone1"`` and ``"clone2"``.

    Returns:
        The tokenizer output with ``"input_ids"`` and ``"attention_mask"`` replaced
        by their flattened (1-D) versions.
    """
    encoded_pair = tokenizer(
        [row["clone1"], row["clone2"]],
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    # Collapse the (2, MAX_LENGTH) tensors into one long vector per field.
    for field in ("input_ids", "attention_mask"):
        encoded_pair[field] = encoded_pair[field].flatten()
    return encoded_pair

In [6]:
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)

# Rename the target column to "label" (the name the Hugging Face library expects), then
# tokenize row by row: `tokenization` handles exactly one clone pair per call, so batched
# mapping cannot be used here.
dataset = dataset.rename_column("semantic_clone", "label").map(tokenization, batched=False)
# Expose only the model inputs and the label, as torch tensors.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

Map:   0%|          | 0/12195 [00:00<?, ? examples/s]

#### Create the dataset splits: (Train: **70**, ~Evaluation:10~, Testing: 30): FIRST DATASET

In [7]:
TRAIN_PERCENT = 70  # share of the first dataset used for training; the remainder is the held-out test split

# Randomize the row order with a fixed seed so the split membership is reproducible.
dataset = dataset.shuffle(seed=42)

# Derive the split point from the dataset size instead of hard-coding row counts
# (ceiling division; for the 12195-row dataset this yields 8537 train / 3658 test rows).
num_train = (len(dataset) * TRAIN_PERCENT + 99) // 100
dataset_train = dataset.select(range(num_train))
# dataset_train = dataset_train.train_test_split(test_size=0.1, seed=42)

proper_test_dataset = dataset.select(range(num_train, len(dataset)))
assert len(dataset_train) + len(proper_test_dataset) == len(dataset), "train/test split must cover every row exactly once"
# proper_test_dataset.to_csv("proper_test_dataset.csv") # save them to be able to repeat scores on model

#### Create the dataset splits: (~Train: 60~, Evaluation:**60**, Testing: 40): SECOND DATASET

In [8]:
dataset_path_semantic = Path("/kaggle/input/semanticbenchmark-dataset/semantic_benchmark_dataset_2.csv")
assert dataset_path_semantic.exists(), f"Could not find the the dataset in path: {dataset_path_semantic.absolute()}"

NUM_SEMANTIC_EVAL = 1200  # rows used for evaluation during training (60% of the 2000-row file)

df = pd.read_csv(dataset_path_semantic)

# Same preparation as for the first dataset: wrap the frame as a Dataset, rename the
# target column to "label", tokenize every clone pair and expose torch tensors.
dataset_semantic = Dataset.from_pandas(df)
dataset_semantic = dataset_semantic.rename_column("semantic_clone", "label")
dataset_semantic = dataset_semantic.map(tokenization, batched=False)
dataset_semantic.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Fixed seed keeps the evaluation/test membership reproducible.
dataset_semantic = dataset_semantic.shuffle(seed=42)
assert len(dataset_semantic) > NUM_SEMANTIC_EVAL, "semantic dataset is too small to leave any rows for testing"
dataset_semantic_evaluation = dataset_semantic.select(range(NUM_SEMANTIC_EVAL))

# All remaining rows form the held-out test split; using len() instead of a hard-coded
# upper bound avoids silently dropping rows (or crashing) if the CSV changes size.
proper_test_dataset_semantic = dataset_semantic.select(range(NUM_SEMANTIC_EVAL, len(dataset_semantic)))
# Persist the test split so the reported scores can be reproduced later.
proper_test_dataset_semantic.to_csv("proper_test_dataset_semantic.csv")
# 
# dataset_train = dataset_train.train_test_split(test_size=0.16, seed=42)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

3799596

#### Load Model

In [9]:
# USE_CPU forces the CPU; otherwise prefer CUDA when a GPU is available and fall back to the CPU.
device = torch.device("cuda" if not USE_CPU and torch.cuda.is_available() else "cpu")

In [10]:
# Binary classification task: configure the sequence-classification head with two labels.
config = RobertaConfig.from_pretrained(MODEL_NAME, num_labels=2)
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, config=config)
model = model.to(device)

# Metric objects used by compute_metrics (loaded in the order accuracy, recall, precision, f1).
accuracy, recall, precision, f1 = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "recall", "precision", "f1")
)

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [11]:
def compute_metrics(eval_pred):
    """Turn raw model outputs into accuracy, precision, recall and F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            example is the index of the largest logit along the last axis.

    Returns:
        A dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"`` (in that order), each computed by the corresponding
        module-level metric object.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    scorers = (
        ("accuracy", accuracy),
        ("precision", precision),
        ("recall", recall),
        ("f1", f1),
    )
    # Each metric's compute() returns a dict keyed by the metric's own name.
    return {
        name: scorer.compute(predictions=predictions, references=labels)[name]
        for name, scorer in scorers
    }

##### Sanity check to see if everything is setup correctly:
Hint: computing the scores might print some warnings.

In [12]:
# batch_input_ids = dataset_train["train"]["input_ids"][1:2].to(device)
# batch_attention_mask = dataset_train["train"]["attention_mask"][1:2].to(device)
# batch_labels = dataset_train["train"]["label"][1:2].to(device)
# output = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
# 
# compute_metrics((output.logits.detach().cpu().numpy(), batch_labels))

#### Training Setup

In [13]:
BATCH_SIZE = 16
STEPS = 32
training_args = TrainingArguments(
    # --- where artefacts are written ---
    output_dir ="./output",         # checkpoints
    logging_dir='./logs',           # logs
    # --- optimisation ---
    num_train_epochs= 15,
    learning_rate=2e-5,
    adam_epsilon=1e-8,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    use_cpu=USE_CPU,
    # --- data loading ---
    dataloader_num_workers=4,       # worker processes feeding batches during training
    dataloader_pin_memory=True,
    # --- log, evaluate and checkpoint on one shared step interval ---
    logging_steps=STEPS,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=STEPS,
    save_strategy="steps",
    save_steps=STEPS,
    save_total_limit=3,
    # --- model selection: reload the checkpoint with the best F1 once training ends ---
    load_best_model_at_end = True,
    metric_for_best_model = 'f1',
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset_train,                   # training split of the first dataset
    eval_dataset=dataset_semantic_evaluation,      # evaluation split of the second (semantic) dataset
    # callbacks = [EarlyStoppingCallback(early_stopping_patience=12)],
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


#### Start Training

In [14]:
# Run fine-tuning with the arguments configured in the training setup cell
# (logging, evaluation and checkpointing all happen every STEPS steps).
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
32,0.7094,0.706832,0.4975,0.0,0.0,0.0
64,0.6955,0.671658,0.6875,0.641791,0.855721,0.733475
96,0.3784,0.480167,0.818333,0.95082,0.6733,0.78835
128,0.0932,0.76342,0.843333,0.981439,0.701493,0.818182
160,0.1563,1.114492,0.771667,0.993994,0.548922,0.707265
192,0.3573,0.910605,0.7525,0.993548,0.510779,0.674699
224,0.1096,0.614277,0.86,0.944785,0.766169,0.846154
256,0.1369,0.679359,0.8375,0.981132,0.689884,0.810127
288,0.1139,0.682805,0.8475,0.990654,0.703151,0.822502
320,0.09,0.921321,0.816667,1.0,0.635158,0.776876


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=8010, training_loss=0.024748435918979583, metrics={'train_runtime': 12705.3737, 'train_samples_per_second': 10.079, 'train_steps_per_second': 0.63, 'total_flos': 3.3561074138679e+16, 'train_loss': 0.024748435918979583, 'epoch': 15.0})

In [15]:
# continue training from a checkpoint:
# trainer.train(resume_from_checkpoint=True)

# Calculate the scores of the returned/best model on the evaluation dataset.
# The result is kept in a variable: only the last expression of a cell is displayed,
# so a bare call in the middle of the cell would silently discard the metrics.
best_eval_metrics = trainer.evaluate()

# store model to disk (same as best checkpoint)
trainer.save_model("gpt_fine_tuned")

best_eval_metrics

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [16]:
# Evaluate on the Test Dataset
trainer.evaluate(proper_test_dataset)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'eval_loss': 0.061519283801317215,
 'eval_accuracy': 0.9846910880262438,
 'eval_precision': 0.9717935071846727,
 'eval_recall': 0.9983597594313832,
 'eval_f1': 0.9848975188781014,
 'eval_runtime': 64.1847,
 'eval_samples_per_second': 56.992,
 'eval_steps_per_second': 3.568,
 'epoch': 15.0}

#### Test on the Semantic Dataset

In [17]:
# Evaluate on the held-out rows of the semantic dataset (disjoint from dataset_semantic_evaluation).
trainer.evaluate(proper_test_dataset_semantic)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'eval_loss': 0.4882342517375946,
 'eval_accuracy': 0.9225,
 'eval_precision': 0.9772079772079773,
 'eval_recall': 0.8639798488664987,
 'eval_f1': 0.9171122994652405,
 'eval_runtime': 14.2351,
 'eval_samples_per_second': 56.199,
 'eval_steps_per_second': 3.512,
 'epoch': 15.0}

In [18]:
# NOTE(review): dataset_semantic is the complete shuffled semantic dataset, so it also contains the
# dataset_semantic_evaluation rows that were used to select the best checkpoint; these scores are
# therefore not a clean held-out estimate (see proper_test_dataset_semantic for that).
trainer.evaluate(dataset_semantic)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'eval_loss': 0.49776002764701843,
 'eval_accuracy': 0.917,
 'eval_precision': 0.9826388888888888,
 'eval_recall': 0.849,
 'eval_f1': 0.9109442060085837,
 'eval_runtime': 35.2391,
 'eval_samples_per_second': 56.755,
 'eval_steps_per_second': 3.547,
 'epoch': 15.0}

##### (Cleaning Memory)
Rerun the cells below a few times (especially if a cell above stopped with an error).

In [19]:
# torch.cuda.empty_cache()
# import gc
# gc.collect()
# 
# del model
# del trainer
# torch.cuda.empty_cache()

In [20]:
# 1/0