In [None]:
!pip install transformers datasets accelerate peft trl bitsandbytes

In [None]:
# Run the Hugging Face CLI login command in a shell to authenticate this session.
# NOTE(review): presumably only needed for gated/private Hub assets or uploads —
# confirm whether the model and dataset loaded in this notebook actually require it.
!huggingface-cli login

In [None]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint id shared by the tokenizer and the model weights.
model_name = "EleutherAI/gpt-neo-1.3B"

# MBPP dataset; later cells index it by split name.
dataset = load_dataset(path="mbpp")

# Tokenizer and causal-LM model resolved from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_name)

In [None]:
import torch

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Move the model onto the selected device.
model = model.to(device)

In [None]:
# Reuse the tokenizer's unknown token as its padding token.
pad_tok = tokenizer.unk_token
tokenizer.pad_token = pad_tok

# Keep the padding id in step with that token by looking it up explicitly.
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(pad_tok)

# Pad on the left, i.e. padding tokens are placed before the real tokens.
tokenizer.padding_side = 'left'

In [None]:
max_input_length = 256
max_target_length = 512

def preprocess_examples(examples):
  """Tokenize a batch of rows into causal-LM training features.

  A decoder-only model compares `labels` with `input_ids` position by position,
  so both must describe the same token sequence. Each row is therefore encoded
  as a single string (text, newline, code, end-of-sequence token) and `labels`
  is a copy of `input_ids` in which padded positions are replaced by -100.

  Args:
    examples: batch mapping holding parallel 'text' and 'code' lists of strings.

  Returns:
    The tokenizer output with an added 'labels' entry. Every sequence is
    padded/truncated to max_input_length + max_target_length tokens, and
    'labels' has exactly the same shape as 'input_ids'.
  """
  # Terminate each sample with eos (when the tokenizer defines one) so the
  # end of the solution is part of the training signal.
  eos = tokenizer.eos_token or ""
  sequences = [
      f"{text}\n{code}{eos}"
      for text, code in zip(examples['text'], examples['code'])
  ]

  model_inputs = tokenizer(
      sequences,
      max_length=max_input_length + max_target_length,
      padding="max_length",
      truncation=True,
  )

  # Mask through attention_mask instead of comparing ids with pad_token_id:
  # the pad token is an alias of another token, so an id comparison would also
  # hide genuine occurrences of that token (which may include eos).
  model_inputs['labels'] = [
      [token if keep else -100 for token, keep in zip(ids, mask)]
      for ids, mask in zip(model_inputs['input_ids'], model_inputs['attention_mask'])
  ]

  return model_inputs

In [None]:
# Apply the batched preprocessing function to the dataset and rebind the
# mapped result to the same name.
dataset = dataset.map(function=preprocess_examples, batched=True)

In [None]:
import torch
from transformers import TrainingArguments
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig, prepare_model_for_kbit_training, TaskType, PeftModel

# Mixed precision and the 8-bit optimizer both need a GPU. Gate them on CUDA
# being present so the notebook's CPU fallback does not request fp16.
cuda_available = torch.cuda.is_available()
use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
use_fp16 = cuda_available and not use_bf16

args = TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        # A positive max_steps takes precedence over num_train_epochs, so the
        # run length is governed by max_steps; the epoch value is inert.
        max_steps=1000,
        num_train_epochs=4,
        learning_rate=2e-4,
        fp16=use_fp16,
        bf16=use_bf16,
        logging_steps=1,
        # 8-bit AdamW relies on bitsandbytes; use the stock torch AdamW without CUDA.
        optim="adamw_8bit" if cuda_available else "adamw_torch",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none"
)

# NOTE(review): dataset_text_field, max_seq_length and tokenizer are passed
# directly to SFTTrainer; some TRL releases expect them on SFTConfig (imported
# but unused here) — confirm against the installed version.
# NOTE(review): dataset_text_field="text" names the problem-description column,
# not the 'code' column — confirm this is the intended training text.
trainer = SFTTrainer(
        model=model,
        train_dataset=dataset['train'],
        eval_dataset=dataset['test'],
        dataset_text_field="text",
        max_seq_length=512,
        tokenizer=tokenizer,
        args=args
)

# Start training
trainer.train()

In [None]:
import matplotlib.pyplot as plt

# Log records accumulated by the trainer during the run.
data = trainer.state.log_history

# Keep only the records that carry a 'loss' value, then split them into the
# x (step) and y (loss) series.
loss_entries = [entry for entry in data if 'loss' in entry]
steps = [entry['step'] for entry in loss_entries]
losses = [entry['loss'] for entry in loss_entries]

# Draw the loss curve on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(steps, losses, linestyle='-', color='b')
ax.set_title('train/loss')
ax.set_xlabel('Steps')
ax.set_ylabel('Loss')
ax.grid()
plt.show()

In [None]:
# Output locations for the fine-tuned artifacts (absolute paths under /content).
model_dir = "/content/finetuned_gpt_mbpp"  # Path to the checkpoint directory
tokenizer_dir = "/content/finetuned_gpt_mbpp_tk/"  # Path to directory with the tokenizer
# NOTE(review): the save cell writes the tokenizer into model_dir; tokenizer_dir
# is not referenced by the cells visible here — confirm whether it is still needed.

In [None]:
# Write the model first, then the tokenizer, into the same checkpoint directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_dir)

In [None]:
# Rebuild the model from the checkpoint directory saved above, replacing the
# in-memory instance bound to `model`.
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_dir)

In [None]:
# Place the reloaded model on the device selected earlier in the notebook.
model = model.to(device=device)

In [None]:
# Fetch the validation row once instead of materialising whole columns for
# each lookup.
sample = dataset['validation'][10]

# Build the prompt and tokenize it. Calling the tokenizer (rather than
# .encode) also returns the attention mask, which is forwarded to generate().
input_text = f"Solve this problem in python: {sample['text']}"
encoded = tokenizer(input_text, return_tensors='pt').to(device)
input_ids = encoded['input_ids']

# Generate the output. pad_token_id is passed explicitly because the pad token
# is an alias of another token in this notebook.
output = model.generate(
    input_ids,
    attention_mask=encoded['attention_mask'],
    max_length=512,
    repetition_penalty=2.0,
    pad_token_id=tokenizer.pad_token_id,
)

# The returned sequence starts with the prompt tokens; slice them off so the
# text printed under "Generated code:" is only the continuation.
prompt_length = input_ids.shape[-1]
generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

# Print the input and output
print(input_text)
print("Generated code:")
print(generated_text)
print("Expected code:")
print(sample['code'])
