In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch
from datasets import load_metric
#from codebleu import corpus_bleu
import nltk

# Report which accelerator is visible. `device` is only printed in this cell;
# nothing here moves the model onto it explicitly.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Capability: {torch.cuda.get_device_capability(0)}")

# Pretrained GPT-2 checkpoint and its matching tokenizer.
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Training examples are built from the corpus file with a block size of 128.
# NOTE(review): the file carries a .json extension but is handed to
# TextDataset as-is — presumably it is consumed as raw text; confirm that
# this is intended.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="python-codes-small.json",
    block_size=128,
)

# 80/20 train/validation split. The split is drawn from an explicitly seeded
# generator so that a fresh kernel reproduces the same partition (and the
# validation numbers stay comparable between runs).
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# mlm=False: collate for causal language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Metric objects used by compute_metrics.
metric_bleu = load_metric("sacrebleu")
metric_rouge = load_metric("rouge")
metric_bertscore = load_metric("bertscore")

output_dir = "gpt2-small-finetuned"
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # NOTE(review): no evaluation strategy is configured in this call and the
    # recorded training log contains no evaluation entries, so eval_steps
    # appears to have no effect as written — confirm.
    eval_steps=100,
    save_steps=600,
    warmup_steps=100,
    # NOTE(review): the recorded trainer.evaluate() output contains only the
    # loss and timing entries, i.e. compute_metrics does not contribute any
    # values with this flag on. Turning it off makes the Trainer hand full
    # logits to compute_metrics, which is presumably very memory-hungry for
    # a GPT-2 sized vocabulary — plan for that before changing it.
    prediction_loss_only=True,
)

def compute_metrics(eval_preds):
    """Score greedy next-token predictions against the reference tokens.

    Args:
        eval_preds: A ``(logits, labels)`` pair. ``logits`` is indexed as
            ``[batch, position, vocab]`` and ``labels`` as
            ``[batch, position]``; both must support NumPy-style slicing,
            ``argmax(axis=-1)`` and boolean-mask assignment. ``logits`` may
            also be a tuple whose first element is the logits array.

    Returns:
        dict with float values under the keys ``"bleu"`` (sacreBLEU corpus
        score), ``"rouge-l"`` (ROUGE-L F-measure) and ``"bertscore"``
        (mean BERTScore F1).
    """
    logits, labels = eval_preds
    # Some models return extra outputs next to the logits; keep the logits only.
    if isinstance(logits, tuple):
        logits = logits[0]

    # Causal LM alignment: the logits at position t score the token at
    # position t + 1. Drop the last prediction and the first label so that
    # predictions and targets line up token-for-token.
    predictions = logits[:, :-1].argmax(axis=-1)
    targets = labels[:, 1:].copy()  # copy: do not write into the caller's labels

    # -100 marks positions that are excluded from the loss. It is not a valid
    # token id, so replace it (in both arrays) with a special token that
    # skip_special_tokens=True removes again during decoding.
    ignored = targets == -100
    if ignored.any():
        fill_id = tokenizer.pad_token_id
        if fill_id is None:
            fill_id = tokenizer.eos_token_id
        targets[ignored] = fill_id
        predictions[ignored] = fill_id

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(targets, skip_special_tokens=True)

    # sacreBLEU takes a list of reference lists (one inner list per prediction).
    bleu_score = metric_bleu.compute(
        predictions=decoded_preds,
        references=[[reference] for reference in decoded_labels],
    )["score"]

    # The ROUGE metric reports ROUGE-L under the key "rougeL". Depending on
    # the metric implementation the value is either a plain float or an
    # aggregate exposing low/mid/high scores; use the mid F-measure then.
    rouge_scores = metric_rouge.compute(predictions=decoded_preds, references=decoded_labels)
    rouge_l = rouge_scores["rougeL"]
    if hasattr(rouge_l, "mid"):
        rouge_l = rouge_l.mid.fmeasure

    bertscore_score = metric_bertscore.compute(
        predictions=decoded_preds, references=decoded_labels, lang="en"
    )
    f1_values = bertscore_score["f1"]
    bertscore_f1 = sum(f1_values) / len(f1_values) if len(f1_values) else 0.0
    # CodeBLEU is currently disabled (its import is commented out in this file):
    # codebleu_score = corpus_bleu(decoded_labels, decoded_preds)

    return {
        "bleu": float(bleu_score),
        "rouge-l": float(rouge_l),
        "bertscore": float(bertscore_f1),
        # "codebleu": codebleu_score,
    }

# Create the trainer from the model, arguments, collator, the two dataset
# splits and the metric callback defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model. The tokenizer is not handed to the Trainer in
# the call above, so write its files next to the model explicitly; that way
# output_dir can be reloaded on its own with from_pretrained(output_dir).
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)


Device: cuda
GPU Name: NVIDIA GeForce RTX 3090
GPU Capability: (8, 6)


  metric_bleu = load_metric("sacrebleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  8%|▊         | 501/6384 [00:40<07:55, 12.36it/s]

{'loss': 1.8397, 'grad_norm': 3.7450003623962402, 'learning_rate': 4.681731381285805e-05, 'epoch': 0.23}


 16%|█▌        | 1001/6384 [01:44<07:15, 12.36it/s] 

{'loss': 1.3543, 'grad_norm': 3.167097806930542, 'learning_rate': 4.283895607893062e-05, 'epoch': 0.47}


 24%|██▎       | 1501/6384 [02:29<06:50, 11.91it/s]  

{'loss': 1.2442, 'grad_norm': 2.24015736579895, 'learning_rate': 3.886059834500319e-05, 'epoch': 0.7}


 31%|███▏      | 2001/6384 [03:18<06:09, 11.86it/s]  

{'loss': 1.1882, 'grad_norm': 2.2886240482330322, 'learning_rate': 3.488224061107575e-05, 'epoch': 0.94}


 39%|███▉      | 2501/6384 [04:13<05:24, 11.95it/s]  

{'loss': 1.095, 'grad_norm': 2.321869134902954, 'learning_rate': 3.0903882877148314e-05, 'epoch': 1.17}


 47%|████▋     | 3000/6384 [04:54<04:41, 12.00it/s]

{'loss': 1.0426, 'grad_norm': 2.194209098815918, 'learning_rate': 2.692552514322088e-05, 'epoch': 1.41}


 55%|█████▍    | 3501/6384 [05:46<04:01, 11.95it/s]  

{'loss': 1.0157, 'grad_norm': 2.066540479660034, 'learning_rate': 2.2947167409293445e-05, 'epoch': 1.64}


 63%|██████▎   | 4001/6384 [06:45<03:19, 11.94it/s]  

{'loss': 1.0027, 'grad_norm': 2.217780351638794, 'learning_rate': 1.8968809675366008e-05, 'epoch': 1.88}


 71%|███████   | 4501/6384 [07:46<02:37, 11.97it/s]  

{'loss': 0.9633, 'grad_norm': 2.2332375049591064, 'learning_rate': 1.4990451941438575e-05, 'epoch': 2.11}


 78%|███████▊  | 5001/6384 [08:36<01:55, 11.93it/s]

{'loss': 0.9363, 'grad_norm': 2.306957483291626, 'learning_rate': 1.101209420751114e-05, 'epoch': 2.35}


 86%|████████▌ | 5501/6384 [09:27<01:14, 11.85it/s]

{'loss': 0.9363, 'grad_norm': 2.1316850185394287, 'learning_rate': 7.0337364735837045e-06, 'epoch': 2.58}


 94%|█████████▍| 6000/6384 [10:09<00:32, 11.95it/s]

{'loss': 0.9236, 'grad_norm': 2.2025504112243652, 'learning_rate': 3.05537873965627e-06, 'epoch': 2.82}


100%|██████████| 6384/6384 [10:51<00:00,  9.80it/s]


{'train_runtime': 651.1273, 'train_samples_per_second': 78.427, 'train_steps_per_second': 9.805, 'train_loss': 1.1158026071419394, 'epoch': 3.0}


In [5]:
eval_results = trainer.evaluate()

# The recorded output of this cell contains only `eval_loss` plus runtime and
# throughput entries — no BLEU / ROUGE / BERTScore values.
# NOTE(review): presumably because TrainingArguments sets
# prediction_loss_only=True; if the text metrics are enabled, their keys most
# likely carry the same `eval_` prefix as `eval_loss` (confirm), e.g.:
# print(f"BLEU Score: {eval_results['eval_bleu']}")
# print(f"ROUGE-L Score: {eval_results['eval_rouge-l']}")
# print(f"BERTScore: {eval_results['eval_bertscore']}")

print(eval_results)

  0%|          | 0/532 [00:00<?, ?it/s]

100%|██████████| 532/532 [00:12<00:00, 41.29it/s]

{'eval_loss': 0.9164884090423584, 'eval_runtime': 12.8998, 'eval_samples_per_second': 329.928, 'eval_steps_per_second': 41.241, 'epoch': 3.0}



