In [None]:
!pip install accelerate


In [None]:
import torch
import os
from transformers import GPT2Tokenizer , GPT2LMHeadModel , TrainingArguments, Trainer , DataCollatorWithPadding
from torch.utils.data import Dataset

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Load the pretrained "gpt2" tokenizer and language-model weights, then place
# the model on the device selected earlier in the notebook.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model = model.to(device)

# When the tokenizer defines no padding token, reuse the end-of-sequence token
# (and its id) so that padded batches can be built later.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
prompt_text = "List 5 reasons why someone should learn to code"

# Encode the prompt once and move every returned tensor to the model's device
# in a single step, instead of running the tokenizer twice to obtain the ids
# and the attention mask separately.
encoded_prompt = tokenizer(prompt_text, return_tensors="pt").to(device)
input_ids = encoded_prompt.input_ids
attention_mask = encoded_prompt.attention_mask

# No decoding options are passed here, so generation uses the model's default
# strategy. max_length bounds the total sequence (prompt plus continuation).
output = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    pad_token_id=tokenizer.pad_token_id,
    max_length=100
)

# Decode the first returned sequence back to text, dropping special tokens.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


List 5 reasons why someone should learn to code

1. You can't just write a program that's written in C++.

2. You can't just write a program that's written in C++.

3. You can't just write a program that's written in C++.

4. You can't just write a program that's written in C++.

5. You can't just write a program that's written in C++.



In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Fetch the LaMini instruction dataset, take its "train" split, and keep only
# the first 2000 rows.
train_split = load_dataset("MBZUAI/LaMini-instruction")["train"]
dataset = train_split.select(range(2000))

In [None]:
# Show the dataset summary (feature names and row count) to confirm what was loaded.
print(dataset)

Dataset({
    features: ['instruction', 'response', 'instruction_source'],
    num_rows: 2000
})


In [None]:
# Inspect the first raw row to see which fields each example carries.
print(dataset[0])

{'instruction': 'List 5 reasons why someone should learn to code', 'response': '1. High demand for coding skills in the job market\n2. Increased problem-solving and analytical skills\n3. Ability to develop new products and technologies\n4. Potentially higher earning potential\n5. Opportunity to work remotely and/or freelance', 'instruction_source': 'alpaca'}


In [None]:
class CustomDataSet(Dataset):
  """Map-style dataset that tokenizes instruction/response rows for causal-LM fine-tuning.

  Args:
    tokenizer: Callable tokenizer. It is invoked with the instruction and the
      response as a text pair and must return ``input_ids`` and
      ``attention_mask`` tensors with a leading batch axis of size 1.
    data: Indexable collection of rows exposing ``'instruction'`` and
      ``'response'`` keys.
    block_size: Fixed length every example is truncated or padded to.
  """

  def __init__(self, tokenizer, data, block_size):
    self.tokenizer = tokenizer
    self.data = data
    self.block_size = block_size

  def __len__(self):
    """Return the number of rows in the underlying data."""
    return len(self.data)

  def __getitem__(self, idx):
    """Tokenize row ``idx`` and return its model inputs and training labels.

    Returns:
      dict of 1-D tensors of length ``block_size``: the tokenizer's outputs
      (``input_ids``, ``attention_mask``) plus ``labels``, a copy of
      ``input_ids`` in which padding positions are replaced by -100.
    """
    item = self.data[idx]
    # NOTE(review): the instruction and response are encoded as a plain text
    # pair; the notebook's generated output ("...learn to code1. ...") suggests
    # no separator is inserted between them - consider an explicit delimiter.
    encoded = self.tokenizer(
        item['instruction'],
        item['response'],
        truncation=True,
        padding="max_length",
        max_length=self.block_size,
        return_tensors="pt"
    )
    # return_tensors="pt" yields tensors shaped (1, block_size). Drop that batch
    # axis so the collator stacks examples into (batch, block_size) rather than
    # (batch, 1, block_size).
    example = {key: value.squeeze(0) for key, value in encoded.items()}

    # Exclude padding from the loss. Copying input_ids verbatim would train the
    # model to predict the pad token over most of every padded sequence; -100
    # is the label value the loss function ignores.
    labels = example["input_ids"].clone()
    labels[example["attention_mask"] == 0] = -100
    example["labels"] = labels
    return example


In [None]:
# Wrap the raw rows in the tokenizing dataset; each example is truncated or
# padded to 512 tokens.
data = CustomDataSet(tokenizer=tokenizer, data=dataset, block_size=512)
# Collator that assembles individual examples into padded batches.
data_collator = DataCollatorWithPadding(tokenizer)


In [None]:
# Sanity-check the first encoded example (token ids, attention mask, labels).
print(data[0])

In [None]:
# Hyperparameters for the fine-tuning run; run artifacts go under ./results4
# and the training loss is logged every 100 steps.
training_args = TrainingArguments(
    output_dir='./results4',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    logging_steps=100,
)

# No evaluation set is supplied, so only the training loss is reported.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=data,
    eval_dataset=None,
)

In [None]:
# Run the fine-tuning loop configured above; the returned TrainOutput
# (global step, average training loss, runtime metrics) is shown as the cell output.
trainer.train()


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Step,Training Loss
100,0.5876
200,0.3726
300,0.3664
400,0.3466
500,0.331
600,0.289
700,0.3209
800,0.2912
900,0.2804
1000,0.2722


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

TrainOutput(global_step=2500, training_loss=0.27176709365844726, metrics={'train_runtime': 3110.5598, 'train_samples_per_second': 6.43, 'train_steps_per_second': 0.804, 'total_flos': 5225840640000000.0, 'train_loss': 0.27176709365844726, 'epoch': 10.0})

In [None]:
# Switch the model to evaluation mode before generating text, so training-only
# layers such as dropout are disabled.
model.eval()

In [None]:
prompt_text = "List 5 reasons why someone should learn to code"

# Encode the prompt once and move every returned tensor to the model's device
# in a single step, instead of running the tokenizer twice to obtain the ids
# and the attention mask separately.
encoded_prompt = tokenizer(prompt_text, return_tensors="pt").to(device)
input_ids = encoded_prompt.input_ids
attention_mask = encoded_prompt.attention_mask

# Beam search (5 beams) combined with top-k sampling at temperature 1.5.
# Sampling is enabled and no random seed is set in this cell, so the text
# can differ between runs. max_length bounds prompt plus continuation.
output = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    pad_token_id=tokenizer.pad_token_id,
    max_length=100,
    num_beams=5,
    temperature=1.5,
    top_k=50,
    do_sample=True
)

# Decode the first returned sequence back to text, dropping special tokens.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


List 5 reasons why someone should learn to code1. Improved communication skills: learning to code through experienced professionals can help improve communication skills.

2. Increased motivation: learning to code can lead to increased motivation and motivation to continue learning.

3. Increased motivation to learn: learning to code can lead to increased motivation and motivation to continue learning.

4. Increased flexibility: coding can allow for greater flexibility and flexibility in working arrangements.

5. Increased accessibility: coding can facilitate


In [None]:
# Persist the fine-tuned model to ./finetunedgpt2 so it can be reloaded later.
trainer.save_model("./finetunedgpt2")


In [None]:
# Save the tokenizer files into the same finetunedgpt2 directory as the model,
# so both can be loaded from one path.
tokenizer.save_pretrained("finetunedgpt2")

('finetunedgpt2/tokenizer_config.json',
 'finetunedgpt2/special_tokens_map.json',
 'finetunedgpt2/vocab.json',
 'finetunedgpt2/merges.txt',
 'finetunedgpt2/added_tokens.json')