# Explanation
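This Jupyter notebook contains all the code to fine-tune the RoBERTa-based model (`microsoft/codebert-base`) for clone detection, first on the GPT clone-benchmark dataset and then on the semantic-benchmark dataset. It should always be kept up to date.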

**Note**: You may need to adjust the dataset paths to match your environment.

In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [2]:
import os
from transformers import RobertaTokenizerFast, RobertaTokenizerFast, RobertaForSequenceClassification, RobertaConfig, Trainer, TrainingArguments
from tokenizers.processors import TemplateProcessing
import torch
from torch.utils.data import Dataset
from datasets import Dataset, load_dataset
import os
from pathlib import Path
import numpy as np
import evaluate
import accelerate
from transformers import EarlyStoppingCallback, IntervalStrategy
import os
import pandas as pd


# Disable the Weights & Biases integration and the tokenizers library's parallelism
# through environment variables, before any Trainer is created.
os.environ.update({
    "WANDB_DISABLED": "true",
    "TOKENIZERS_PARALLELISM": "false",
})

2024-04-30 09:42:41.760285: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-30 09:42:41.760423: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-30 09:42:41.900067: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Constants

In [3]:
# Hugging Face model id; used to load the tokenizer, the config and the classifier weights.
MODEL_NAME = "microsoft/codebert-base"
# Set to True to force CPU execution even when a CUDA device is available.
USE_CPU = False

##### Load Dataset:

In [4]:
# Location of the GPT clone-benchmark CSV (a Kaggle input path); adjust it when running elsewhere.
dataset_path = Path("/kaggle/input/gpt-clonebenchmark-dataset/gpt_benchmark_dataset.csv")
# Fail early, reporting the absolute path, if the file is not there.
assert dataset_path.exists(), f"Could not find the the dataset in path: {dataset_path.absolute()}"

df = pd.read_csv(dataset_path)
# `Dataset` is `datasets.Dataset` here: that import comes after, and so shadows, `torch.utils.data.Dataset`.
dataset = Dataset.from_pandas(df) # on Kaggle the CSV is read with pandas first, then converted to a Dataset
dataset

Dataset({
    features: ['clone1', 'clone2', 'semantic_clone'],
    num_rows: 12195
})

#### Tokenize the complete Dataset before Fine-Tuning
Note: they are stored on the CPU at the moment, but the trainer will move them to the GPU automatically during fine-tuning.

In [5]:
MAX_LENGTH = 255  # tokens kept per snippet (each snippet is padded/truncated to this length)


def tokenization(row):
    """Encode one clone pair as a single flat model input.

    The two snippets in ``row`` (``clone1`` and ``clone2``) are tokenized
    together as a batch of two, each padded or truncated to ``MAX_LENGTH``
    tokens. The resulting ``input_ids`` and ``attention_mask`` are then
    flattened so both snippets sit back to back in one 1-D sequence.

    Uses the notebook-level ``tokenizer`` and expects a single row, not a
    batch of rows.

    Returns the tokenizer output with the two flattened fields.
    """
    snippet_pair = [row["clone1"], row["clone2"]]
    encoded = tokenizer(
        snippet_pair,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
        max_length=MAX_LENGTH,
    )
    # Collapse the (2, MAX_LENGTH) layout into one sequence per field.
    for field in ("input_ids", "attention_mask"):
        encoded[field] = encoded[field].flatten()
    return encoded

In [6]:
# Pretrained tokenizer matching MODEL_NAME.
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)

# Rename the target column to "label" (the column name the Hugging Face library expects),
# then tokenize row by row: `tokenization` handles exactly one clone pair per call,
# so batched mapping cannot be used.
dataset = (
    dataset
    .rename_column("semantic_clone", "label")
    .map(tokenization, batched=False)
)
# Expose only the model inputs and the label, formatted as torch tensors.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

Map:   0%|          | 0/12195 [00:00<?, ? examples/s]

#### Create the dataset splits: (Train: **90**, Evaluation:10, ~Testing: 30~): FIRST DATASET

In [7]:
dataset = dataset.shuffle(seed=42) # fixed seed for reproducibility; NOTE(review): an earlier note said the file is ordered clones-first -- confirm
# Disabled alternative: train/evaluate on the first 8537 shuffled rows only and hold out the rest as a test set.
# dataset_train = dataset.select(range(8537))
# dataset_train = dataset_train.train_test_split(test_size=0.1, seed=42)
# Active split: 90% train / 10% evaluation over the whole dataset (no held-out test set).
dataset_train = dataset.train_test_split(test_size=0.1, seed=42)
# proper_test_dataset = dataset.select(range(8537,12195))
# proper_test_dataset.to_csv("proper_test_dataset.csv") # save the test rows so scores can be reproduced later

#### Create the dataset splits (Train: 1080, Evaluation: 120, Testing: 800 of 2000 rows, i.e. 54% / 6% / 40%): SECOND DATASET

In [8]:
# Second dataset: the semantic benchmark CSV (a Kaggle input path).
dataset_path_semantic = Path("/kaggle/input/semanticbenchmark-dataset/semantic_benchmark_dataset_2.csv")
assert dataset_path_semantic.exists(), f"Could not find the the dataset in path: {dataset_path_semantic.absolute()}"

# Same preparation as for the first dataset: pandas -> Dataset, rename the target
# column to "label", tokenize each clone pair, expose torch tensors.
df = pd.read_csv(dataset_path_semantic)
dataset_semantic = (
    Dataset.from_pandas(df)
    .rename_column("semantic_clone", "label")
    .map(tokenization, batched=False)
)
dataset_semantic.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

dataset_semantic = dataset_semantic.shuffle(seed=42)

# First 1200 shuffled rows form the training pool, split 90/10 into train and evaluation.
dataset_semantic_train = dataset_semantic.select(range(1200)).train_test_split(test_size=0.1, seed=42)

# Rows 1200-1999 are the held-out test set; it is written to disk so scores can be reproduced.
proper_test_dataset_semantic = dataset_semantic.select(range(1200, 2000))
proper_test_dataset_semantic.to_csv("proper_test_dataset_semantic.csv")
# 
# dataset_train = dataset_train.train_test_split(test_size=0.16, seed=42)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

3799596

#### Load Model

In [9]:
# Use the GPU only when it is both allowed (USE_CPU is False) and actually available;
# otherwise fall back to the CPU.
device = torch.device("cuda" if not USE_CPU and torch.cuda.is_available() else "cpu")

In [10]:
# Two output labels: binary classification task.
config = RobertaConfig.from_pretrained(MODEL_NAME, num_labels=2)
# config.hidden_dropout_prob = 0.3  (optional override, currently disabled)
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, config=config)
model = model.to(device)

# Metric objects from the `evaluate` library; compute_metrics looks them up by these names.
accuracy, recall, precision, f1 = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "recall", "precision", "f1")
)

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [11]:
def compute_metrics(eval_pred):
    """Turn raw evaluation output into accuracy, precision, recall and F1.

    ``eval_pred`` is unpacked as ``(logits, labels)``. The predicted class is
    the arg-max over the last logits axis. Each score comes from the
    notebook-level metric object of the same name (``accuracy``,
    ``precision``, ``recall``, ``f1``).

    Returns a dict with the keys ``accuracy``, ``precision``, ``recall`` and
    ``f1``, in that order.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    scorers = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
    return {
        metric_name: scorer.compute(predictions=predictions, references=labels)[metric_name]
        for metric_name, scorer in scorers.items()
    }

##### Sanity check to see if everything is setup correctly:
Hint: the score calculation might print some warnings.

In [12]:
# batch_input_ids = dataset_train["train"]["input_ids"][1:2].to(device)
# batch_attention_mask = dataset_train["train"]["attention_mask"][1:2].to(device)
# batch_labels = dataset_train["train"]["label"][1:2].to(device)
# output = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
# 
# compute_metrics((output.logits.detach().cpu().numpy(), batch_labels))

#### Training Setup

In [13]:
# Per-device batch size for both training and evaluation.
BATCH_SIZE = 16
# Half the number of full batches in the training split: logging, evaluation and
# checkpointing then happen about twice per epoch (assuming a single device -- confirm for multi-GPU).
STEPS = (len(dataset_train["train"]) // BATCH_SIZE) // 2

training_args = TrainingArguments(
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=2e-5,             # Learning rate
    adam_epsilon=1e-8,              # Epsilon for Adam optimizer
    num_train_epochs= 15,             # Upper bound on epochs; the early-stopping callback below may end training sooner
    logging_dir='./logs',           # Directory for storing logs
    logging_steps=STEPS,
    evaluation_strategy="steps",
    eval_steps=STEPS,
    output_dir ="./output",
    dataloader_pin_memory=True,
    dataloader_num_workers=4, # number of worker processes used to load data while training
    do_eval=True,                 # Enable evaluation (it runs every eval_steps, see evaluation_strategy)
    save_strategy="steps",
    save_steps=STEPS,
    load_best_model_at_end = True,  # after training, reload the checkpoint that scored best on metric_for_best_model
    metric_for_best_model = 'f1',   # must match a key returned by compute_metrics
    save_total_limit=3,             # cap on the number of checkpoints kept in output_dir
    use_cpu=USE_CPU
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train["train"],
    eval_dataset=dataset_train["test"],      # Evaluation dataset
    compute_metrics=compute_metrics,
    # Stop once 3 evaluations in a row fail to improve the best-model metric.
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


#### Start Training

In [14]:
# Stage 1: fine-tune on the GPT clone-benchmark split; early stopping may end the run before all 15 epochs.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
342,0.212,0.106363,0.982787,0.99026,0.976,0.983078
684,0.0706,0.053028,0.990164,0.993559,0.9872,0.990369
1026,0.0561,0.052291,0.991803,0.995169,0.9888,0.991974
1368,0.0449,0.05185,0.990164,0.995153,0.9856,0.990354
1710,0.0271,0.041914,0.989344,0.988818,0.9904,0.989608
2052,0.0325,0.066612,0.990984,0.998377,0.984,0.991136


TrainOutput(global_step=2052, training_loss=0.0738610919223659, metrics={'train_runtime': 1859.0237, 'train_samples_per_second': 88.555, 'train_steps_per_second': 5.535, 'total_flos': 8604194010174000.0, 'train_loss': 0.0738610919223659, 'epoch': 2.99})

In [15]:
# To continue training from a checkpoint instead, use:
# trainer.train(resume_from_checkpoint=True)

# Score the model the trainer holds after training on the evaluation split;
# with load_best_model_at_end=True this is the best checkpoint, not the last one.
trainer.evaluate()

# Optionally store the model to disk (same weights as the best checkpoint):
# trainer.save_model(f"gpt_fine_tuned")

{'eval_loss': 0.052291300147771835,
 'eval_accuracy': 0.9918032786885246,
 'eval_precision': 0.9951690821256038,
 'eval_recall': 0.9888,
 'eval_f1': 0.9919743178170145,
 'eval_runtime': 21.4278,
 'eval_samples_per_second': 56.935,
 'eval_steps_per_second': 3.593,
 'epoch': 2.99}

In [16]:
# Evaluate on the Test Dataset
# trainer.evaluate(proper_test_dataset)

#### Test on the semantic dataset (before the second fine-tuning stage)

In [17]:
# Stage-1 model scored on the held-out semantic test set, i.e. before any training on semantic data.
trainer.evaluate(proper_test_dataset_semantic)

{'eval_loss': 1.373661994934082,
 'eval_accuracy': 0.81125,
 'eval_precision': 0.9959677419354839,
 'eval_recall': 0.6221662468513854,
 'eval_f1': 0.7658914728682171,
 'eval_runtime': 14.1012,
 'eval_samples_per_second': 56.733,
 'eval_steps_per_second': 3.546,
 'epoch': 2.99}

In [18]:
# trainer.evaluate(dataset_semantic)

#### Second stage of fine-tuning

In [19]:
# Second fine-tuning stage: keep training the stage-1 model on the semantic dataset.
# Recompute the interval so logging, evaluation and checkpointing still happen about
# twice per epoch on the much smaller semantic training split.
STEPS = (len(dataset_semantic_train["train"]) // BATCH_SIZE) // 2
training_args.logging_steps = STEPS
training_args.eval_steps = STEPS
training_args.save_steps = STEPS
# Give stage 2 its own checkpoint directory. Stage 1 leaves checkpoints with much
# higher step numbers in "./output"; if that directory were shared, checkpoint rotation
# (save_total_limit), which orders checkpoints by step number in recent transformers
# versions, would treat the stale stage-1 checkpoints as the newest and prune fresh
# stage-2 checkpoints instead.
training_args.output_dir = "./output_semantic"

trainer = Trainer(
    model=model,                                    # already fine-tuned in stage 1
    args=training_args,
    train_dataset=dataset_semantic_train["train"],
    eval_dataset=dataset_semantic_train["test"],    # Evaluation dataset
    compute_metrics=compute_metrics,
    # Stop once 3 evaluations in a row fail to improve the best-model metric.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [20]:
# Stage 2: continue fine-tuning on the semantic training split.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
33,0.3789,0.200014,0.925,0.965517,0.888889,0.92562
66,0.2344,0.294579,0.908333,1.0,0.825397,0.904348
99,0.1751,0.106116,0.95,0.983051,0.920635,0.95082
132,0.1815,0.078915,0.966667,0.953846,0.984127,0.96875
165,0.0539,0.21778,0.95,1.0,0.904762,0.95
198,0.0944,0.186402,0.958333,1.0,0.920635,0.958678
231,0.0616,0.065785,0.983333,0.984127,0.984127,0.984127
264,0.0903,0.082536,0.983333,0.984127,0.984127,0.984127
297,0.0386,0.106024,0.983333,0.969231,1.0,0.984375
330,0.0489,0.125163,0.966667,0.968254,0.968254,0.968254


TrainOutput(global_step=396, training_loss=0.11806501312689348, metrics={'train_runtime': 387.7409, 'train_samples_per_second': 41.78, 'train_steps_per_second': 2.631, 'total_flos': 1650076317028800.0, 'train_loss': 0.11806501312689348, 'epoch': 5.82})

In [21]:
# Persist the two-stage fine-tuned model under a fixed directory name.
trainer.save_model("two_stage_gpt_fine_tuned_semantic")

In [22]:
# Score the stage-2 model on the semantic evaluation split.
trainer.evaluate()

{'eval_loss': 0.10602383315563202,
 'eval_accuracy': 0.9833333333333333,
 'eval_precision': 0.9692307692307692,
 'eval_recall': 1.0,
 'eval_f1': 0.9843749999999999,
 'eval_runtime': 2.2749,
 'eval_samples_per_second': 52.749,
 'eval_steps_per_second': 3.517,
 'epoch': 5.82}

In [23]:
# Final score: stage-2 model on the held-out semantic test set.
trainer.evaluate(proper_test_dataset_semantic)

{'eval_loss': 0.3708450198173523,
 'eval_accuracy': 0.94125,
 'eval_precision': 0.9186602870813397,
 'eval_recall': 0.9672544080604534,
 'eval_f1': 0.9423312883435583,
 'eval_runtime': 14.1132,
 'eval_samples_per_second': 56.684,
 'eval_steps_per_second': 3.543,
 'epoch': 5.82}

##### (Cleaning Memory)
Rerun the cells below a few times (especially if execution stopped with an error above).

In [24]:
# torch.cuda.empty_cache()
# import gc
# gc.collect()
# 
# del model
# del trainer
# torch.cuda.empty_cache()

In [25]:
# 1/0