In [24]:
import supplementary 

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import json
import tiktoken
import supplementary 

In [2]:
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # add batch dimension
    return encoded_tensor

In [None]:
def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)  # remove batch dimension
    return tokenizer.decode(flat.tolist())

In [4]:
class FineTuningDataset(Dataset):
    def __init__(self, json_file, tokenizer, max_length=128):
        # Load JSON data from file
        with open(json_file, 'r') as f:
            self.data = json.load(f)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        instruction = item["instruction"]
        input_text = item["input"]
        output_text = item["output"]

        # Combine instruction, input, and output with <|endoftext|> as separator
        full_text = f"{instruction} Input: {input_text} Output: {output_text}<|endoftext|>"

        # Convert to token IDs
        token_ids = text_to_token_ids(full_text, self.tokenizer).squeeze(0)  # Remove batch dim for now

        # Truncate or pad to max_length
        if token_ids.size(0) > self.max_length:
            token_ids = token_ids[:self.max_length]
        else:
            padding = torch.zeros(self.max_length - token_ids.size(0), dtype=torch.long)
            token_ids = torch.cat([token_ids, padding])

        return token_ids

In [5]:
json_file = "C:/Users/suman/OneDrive/Desktop/From_Scratch_LLM/finetuning/instruction-data.json"

In [None]:
tokenizer = tiktoken.get_encoding("gpt2")
tokenizer.pad_token_id = tokenizer.encode("<|endoftext|>", allowed_special={'<|endoftext|>'})[0]  # 50256
tokenizer.vocab_size = tokenizer.n_vocab  # 50257

# 5. Create dataset and dataloader
dataset = FineTuningDataset(json_file, tokenizer, max_length=128)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# 6. Load the pre-trained model
model_path = r"C:\Users\suman\OneDrive\Desktop\From_Scratch_LLM\weightloading\pretrained_gpt_full_model.pth"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt = torch.load(model_path, map_location=device)
gpt.to(device)

# 7. Set up optimizer and loss function
optimizer = torch.optim.AdamW(gpt.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

  gpt = torch.load(model_path, map_location=device)


In [20]:
def train_model(model, dataloader, tokenizer, epochs=3):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            optimizer.zero_grad()

            # Move batch to device
            input_ids = batch.to(device)

            # Forward pass
            logits = model(input_ids)

            # Shift input_ids and logits for language modeling
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = input_ids[..., 1:].contiguous()

            # Compute loss
            loss = criterion(shift_logits.view(-1, cfg["vocab_size"]), shift_labels.view(-1))
            total_loss += loss.item()

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {avg_loss:.4f}")

    # Example inference after training
    model.eval()
    with torch.no_grad():
        sample_input = "Evaluate the following phrase by transforming it into the spelling given. Input: freind --> friend Output:"
        token_ids = text_to_token_ids(sample_input, tokenizer).to(device)
        output_logits = model(token_ids)
        predicted_ids = torch.argmax(output_logits, dim=-1)
        print("Sample Output:", token_ids_to_text(predicted_ids, tokenizer))

In [22]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}


# Define model configurations in a dictionary for compactness
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Copy the base configuration and update with specific model settings
model_name = "gpt2-small (124M)"  # Example model name
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])
NEW_CONFIG.update({"context_length": 1024, "qkv_bias": True})

cfg=NEW_CONFIG


In [None]:
train_model(gpt, dataloader, tokenizer, epochs=1)


In [None]:
# 9. Save the fine-tuned model (optional)
torch.save(gpt.state_dict(), "fine_tuned_gpt.pth")

In [None]:
gpt = GPTModel(cfg)

state_dict_path = "fine_tuned_gpt.pth"  
# state_dict_path = r"C:\Users\suman\OneDrive\Desktop\From_Scratch_LLM\weightloading\fine_tuned_gpt.pth"

# Load the state_dict
state_dict = torch.load(state_dict_path, map_location="cpu")  # Load to CPU first
gpt.load_state_dict(state_dict)

# Move to the desired device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt.to(device)

# Set to evaluation mode (optional, for inference)
gpt.eval()