<a href="https://colab.research.google.com/github/Guhan2348519/LLM-lab-tasks/blob/main/Fine_tuning_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config
from transformers import get_linear_schedule_with_warmup

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split, RandomSampler, SequentialSampler

import pandas as pd

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint to fine-tune; one of: 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'.
model_name = "gpt2"
# Directory the fine-tuned model and tokenizer are saved to.
model_save_path = './model'

In [None]:
# Authenticate this runtime with the Hugging Face Hub; the CLI prompts interactively for an access token.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in yo

In [None]:


# Sample a few continuations from the pretrained checkpoint (nothing has been
# fine-tuned at this point in the notebook).

# Prompt for the sampler. It is defined here so the cell runs on a fresh
# kernel instead of relying on a variable left over from an earlier session.
input_sequence = "beef, salt, pepper"

configuration = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=configuration)

tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
input_ids = tokenizer.encode(input_sequence, return_tensors='pt')
# The prompt is a single unpadded sequence, so every position is attended to.
attention_mask = torch.ones(input_ids.shape, device=device)

model = model.to(device)
sample_outputs = model.generate(
    input_ids.to(device),
    attention_mask=attention_mask,
    do_sample=True, max_length=120,
    top_k=50, top_p=0.85,
    num_return_sequences=3,
    # Pad with the EOS id while generating.
    pad_token_id=tokenizer.eos_token_id
)

# Decode and show the samples rather than discarding the generated ids.
for i, sample_output in enumerate(sample_outputs):
    print(f"{i}: {tokenizer.decode(sample_output, skip_special_tokens=True)}")

In [None]:
# Read the recipe table and renumber its rows 0..n-1 in a single expression.
df_recipes = pd.read_csv('/content/recipes_1000.csv').reset_index(drop=True)

def form_string(ingredient, instruction):
    """Build one training example from a recipe's ingredients and instructions.

    Both arguments are stripped of surrounding whitespace and placed between
    the ``<|startoftext|>`` and ``<|endoftext|>`` markers.
    """
    template = "<|startoftext|>Ingredients: {}. Instructions: {}<|endoftext|>"
    return template.format(ingredient.strip(), instruction.strip())

def _recipe_row_to_text(row):
    """Format one DataFrame row as a single training string."""
    return form_string(row['ingredients'], row['instructions'])

# One formatted string per recipe; display the first as a sanity check.
data = df_recipes.apply(_recipe_row_to_text, axis=1).to_list()
data[0]

"<|startoftext|>Ingredients: blueberries, granulated sugar, vanilla yogurt, lemon juice. Instructions: Toss 2 cups berries with sugar. Let stand for 45 minutes, stirring occasionally. Transfer berry-sugar mixture to food processor. Add yogurt and process until smooth. Strain through fine sieve. Pour into baking pan (or transfer to ice cream maker and process according to manufacturers' directions). Freeze uncovered until edges are solid but centre is soft.  Transfer to processor and blend until smooth again. Return to pan and freeze until edges are solid. Transfer to processor and blend until smooth again. Fold in remaining 2 cups of blueberries. Pour into plastic mold and freeze overnight. Let soften slightly to serve.<|endoftext|>"

In [None]:
# Markers the tokenizer should register as its BOS / EOS / UNK tokens.
special_tokens = {
    'bos_token': '<|startoftext|>',
    'eos_token': '<|endoftext|>',
    'unk_token': '<|unknown|>',
}
tokenizer = GPT2TokenizerFast.from_pretrained(model_name, **special_tokens)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
from transformers import GPT2TokenizerFast

batch_size = 2
max_length = 180

# Set the padding token to the EOS token.
# The tokenizer configured earlier (custom BOS/EOS/UNK tokens) is reused here;
# reloading a stock 'gpt2' tokenizer would silently drop those special tokens.
tokenizer.pad_token = tokenizer.eos_token

# `data` already holds the formatted recipes built from the CSV. It is
# deliberately not replaced with hard-coded sample strings, otherwise the
# model would be fine-tuned on a handful of toy ingredient lists instead of
# the recipe corpus.

# Standard PyTorch approach of loading data using a Dataset class
class RecipeDataset(Dataset):
    """Pre-tokenized texts served as ``(input_ids, attention_mask)`` pairs.

    Every text in ``data`` is encoded once at construction time, truncated or
    padded to the module-level ``max_length``.
    """

    def __init__(self, data, tokenizer):
        self.data = data

        encoded_texts = [
            tokenizer.encode_plus(
                text,
                truncation=True,
                padding='max_length',
                max_length=max_length,
                return_tensors='pt',
            )
            for text in data
        ]
        # Remove dimension 0 from each tensor (size 1 for a single encoded
        # text; presumably the batch axis added by return_tensors='pt').
        self.input_ids = [enc['input_ids'].squeeze(0) for enc in encoded_texts]
        self.attn_masks = [enc['attention_mask'].squeeze(0) for enc in encoded_texts]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        ids = self.input_ids[idx]
        mask = self.attn_masks[idx]
        return ids, mask

# Create the dataset
dataset = RecipeDataset(data, tokenizer)

# Ensure dataset is large enough for a split
min_dataset_size = 3
if len(dataset) < min_dataset_size:
    raise ValueError(f"Dataset must contain at least {min_dataset_size} items, but contains {len(dataset)}.")

# Split into training and validation sets (roughly 90% / 10%).
# With at least `min_dataset_size` items, int(0.9 * n) is always >= 2 and
# strictly less than n, so both parts are non-empty without extra fix-ups.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
assert train_size >= 1 and val_size >= 1, "both splits must be non-empty"

# Seed the split so the same examples land in train/validation on every run.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# Create the DataLoaders for our training and validation datasets
# Get training samples in random order
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Validation batches are read in dataset order.
val_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

# Peek at the first training batch to verify the pipeline.
batch = next(iter(train_dataloader), None)
if batch is not None:
    input_ids, attn_masks = batch
    print(f"input_ids: {input_ids}")
    print(f"attn_masks: {attn_masks}")


input_ids: tensor([[30119,    64,    11, 24240,    11, 37792, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 5025

In [None]:

# Initialize the model and move it to the device
configuration = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=configuration)
model = model.to(device)
# Resize the token embeddings to the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# Training parameters
epochs = 1
learning_rate = 2e-5
epsilon = 1e-8
optim = AdamW(model.parameters(), lr=learning_rate, eps=epsilon)

total_steps = len(train_dataloader) * epochs  # [no batches] x [no epochs]

# Warm up for at most 100 optimizer steps and never more than ~10% of the run.
# A warmup longer than the whole schedule would keep the learning rate close
# to zero for every step. Kept as an int because it is a step count.
warmup_steps = min(100, total_steps // 10)

# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(
    optim,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

# Training loop: one pass over the training set, then one over validation, per epoch.
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids, attn_masks = batch
        input_ids = input_ids.to(device)
        attn_masks = attn_masks.to(device)
        # Padding reuses the EOS id, so padded positions are excluded from the
        # loss through the attention mask (label -100 is ignored by the LM
        # loss). Real tokens keep attention 1 and are still learned.
        labels = input_ids.masked_fill(attn_masks == 0, -100)

        model.zero_grad()

        outputs = model(input_ids, attention_mask=attn_masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        # Cap the gradient norm before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optim.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss}")

    model.eval()
    eval_loss = 0

    for batch in validation_dataloader:
        input_ids, attn_masks = batch
        input_ids = input_ids.to(device)
        attn_masks = attn_masks.to(device)
        # Same padding mask as in training so both losses are comparable.
        labels = input_ids.masked_fill(attn_masks == 0, -100)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attn_masks, labels=labels)
            loss = outputs.loss
            eval_loss += loss.item()

    avg_eval_loss = eval_loss / len(validation_dataloader)
    print(f"Average validation loss: {avg_eval_loss}")



Average training loss: 9.164665222167969
Average validation loss: 10.879677772521973


In [None]:
# Training pass that also prints a generated sample every 100 batches.
for epoch_i in range(0, epochs):
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_masks     = batch[1].to(device)
        # Exclude padded positions from the loss (label -100 is ignored).
        b_labels    = b_input_ids.masked_fill(b_masks == 0, -100)

        model.zero_grad()
        outputs = model( input_ids = b_input_ids, labels = b_labels,
                         attention_mask = b_masks, token_type_ids = None )

        loss = outputs.loss
        total_train_loss += loss.item()

        # Get sample every x batches.
        if step % 100 == 0 and not step == 0:
            model.eval()
            # Generate inline: the `infer` helper is only defined in a later
            # cell, so calling it here would raise NameError on a fresh kernel.
            sample_prompt = tokenizer("Ingredients: eggs, flour, butter, sugar",
                                      return_tensors="pt")
            sample_ids = model.generate(
                sample_prompt["input_ids"].to(device),
                attention_mask=sample_prompt["attention_mask"].to(device),
                max_new_tokens=max_length,
                do_sample=True, top_k=50, top_p=0.85,
                pad_token_id=tokenizer.eos_token_id,
            )
            print(tokenizer.decode(sample_ids[0], skip_special_tokens=True))
            model.train()

        loss.backward()
        # Same gradient-norm cap as the other training loops in this notebook.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optim.step()
        scheduler.step()

    # Report the accumulated loss; it was previously initialised but never used.
    print(f"Average training loss: {total_train_loss / len(train_dataloader)}")

In [None]:
model.eval()
total_eval_loss = 0

# Evaluate data for one epoch
for batch in validation_dataloader:
    b_input_ids = batch[0].to(device)
    b_masks     = batch[1].to(device)
    # Exclude padded positions from the loss (label -100 is ignored).
    b_labels    = b_input_ids.masked_fill(b_masks == 0, -100)

    # No gradients are needed while scoring the validation set.
    with torch.no_grad():
        outputs = model(input_ids = b_input_ids, labels = b_labels,
                        attention_mask = b_masks)
        loss = outputs.loss
        total_eval_loss += loss.item()

print(f"Average validation loss: {total_eval_loss / len(validation_dataloader)}")

In [None]:
# Write the model and tokenizer to disk, then load both back from that directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_save_path)

model = GPT2LMHeadModel.from_pretrained(model_save_path)
tokenizer = GPT2TokenizerFast.from_pretrained(model_save_path)
model = model.to(device)
model

In [None]:
# Load the pretrained checkpoint again, place it on the device and size its
# token embeddings to the tokenizer.
configuration = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=configuration).to(device)
model.resize_token_embeddings(len(tokenizer))

# New optimizer and linear schedule with warmup covering every training step.
total_steps = epochs * len(train_dataloader)
optim = AdamW(model.parameters(), lr=learning_rate, eps=epsilon)
scheduler = get_linear_schedule_with_warmup(
    optim,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

# Define inference function
def infer(prompt):
    """Generate recipe text for an ingredient list.

    Args:
        prompt: Ingredient text. Surrounding whitespace is stripped and the
            result is prefixed with ``"Ingredients: "`` before tokenization.

    Returns:
        The decoded first generated sequence with special tokens removed.

    Relies on the module-level ``tokenizer``, ``model``, ``device`` and
    ``max_length`` (used as the cap on newly generated tokens).
    """
    # Named `encoded` rather than `input` to avoid shadowing the built-in.
    encoded = tokenizer(f"Ingredients: {prompt.strip()}", return_tensors="pt")

    generated = model.generate(
        encoded["input_ids"].to(device),
        attention_mask=encoded["attention_mask"].to(device),
        max_new_tokens=max_length,
        do_sample=True, top_k=50, top_p=0.85,
        # Passing the pad id explicitly avoids the "Setting `pad_token_id` to
        # `eos_token_id`" warning on every call.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Training loop: train, validate and checkpoint once per epoch.
for epoch_i in range(0, epochs):
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        # Padding reuses the EOS id, so padded positions are excluded from the
        # loss through the attention mask (label -100 is ignored by the LM
        # loss). Real tokens keep attention 1 and are still learned.
        b_labels = b_input_ids.masked_fill(b_masks == 0, -100)

        model.zero_grad()
        outputs = model(input_ids=b_input_ids, labels=b_labels,
                        attention_mask=b_masks, token_type_ids=None)

        loss = outputs.loss
        total_train_loss += loss.item()

        # Get sample every 100 batches.
        if step % 100 == 0 and not step == 0:
            model.eval()
            print(infer("eggs, flour, butter, sugar"))
            model.train()

        loss.backward()
        # Cap the gradient norm before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optim.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss}")

    # Evaluate data for one epoch
    model.eval()
    total_eval_loss = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        # Same padding mask as in training so both losses are comparable.
        b_labels = b_input_ids.masked_fill(b_masks == 0, -100)

        with torch.no_grad():
            outputs = model(input_ids=b_input_ids, labels=b_labels,
                            attention_mask=b_masks)
            loss = outputs.loss
            total_eval_loss += loss.item()

    avg_eval_loss = total_eval_loss / len(validation_dataloader)
    print(f"Average validation loss: {avg_eval_loss}")

    # Save the model
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

# Reload the model and tokenizer
model = GPT2LMHeadModel.from_pretrained(model_save_path)
tokenizer = GPT2TokenizerFast.from_pretrained(model_save_path)
model.to(device)

Average training loss: 8.958427906036377
Average validation loss: 10.880024909973145


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# Try the fine-tuned model on a fresh ingredient list.
sample_prompt = "eggs, mushroom, butter, sugar"
infer(sample_prompt)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"Ingredients: eggs, mushroom, butter, sugar, cayenne, garlic, salt, cinnamon, oregano, parsley, ginger, paprika, cayenne pepper, cumin, oregano, ginger, garlic, onion, garlic powder, and herbs.\n\nDirections: Combine all ingredients into a blender and blend until smooth. You will notice that the eggs start to come out a little thick and creamy, but not that thick at all. Add more water, salt, and spices in an effort to get it to taste exactly like you expect it to, and add in as much water as you need.\n\nIf you're using only the eggs, use more water than you need, as the water is the same amount of water you used when you began baking. Use about 2/3 cup for each cup of butter you use.\n\nIf you are using the whole ingredients for this recipe, use"