# Project 2

Done by: N Gautam

Roll No: 2210110411

Dataset: https://huggingface.co/datasets/Josephgflowers/Finance-Instruct-500k

Since this is a large dataset we are only taking 10k samples for finetuning

Model: https://huggingface.co/EleutherAI/pythia-410m

# Importing Libraries

In [None]:
%%capture
!pip install datasets

In [None]:
import os
import random
import time

import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from matplotlib.ticker import FuncFormatter
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

# Loading the datset

In [None]:
os.makedirs("data", exist_ok=True)

file_name = "train.json"
local_path = os.path.join("data", file_name)

if not os.path.exists(file_name):
    path = hf_hub_download(
        repo_id="Josephgflowers/Finance-Instruct-500k",
        filename=file_name,
        repo_type="dataset",
        local_dir="data",
    )
    print(f"File downloaded to: {path}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


train.json:   0%|          | 0.00/580M [00:00<?, ?B/s]

File downloaded to: data/train.json


In [None]:
df = pd.read_json(local_path, lines=True)

In [None]:
df = df.dropna()
df = df.drop_duplicates(subset=["user", "assistant"])
df = df[df["user"].notna() & df["assistant"].notna()]
df.drop(columns=["system"], inplace=True)

df = df.sample(n=1000, random_state=123)

print(f"Final sample shape: {df.shape}")

Final sample shape: (1000, 2)


In [None]:
print(df.iloc[:5])

                                                     user  \
404530  Question about Personal Finance:\nWhat should ...   
352028    Question:\nGenerate a palindrome of any length.   
382672  Given the headline "stocks slide on borrowing ...   
13733   How can L’Hopital’s Rule be used in day to day...   
250165                         What does the general say?   

                                                assistant  
404530  You may owe taxes, penalties, and interest due...  
352028                                            racecar  
382672                                                 No  
13733   Yes, there are several mathematical concepts t...  
250165  U.S. troops will be out of the country by the ...  


The dataset is already in the phi-3 format

Dividing into train test and validation sets

In [None]:
train_portion = int(len(df) * 0.85)
test_portion = int(len(df) * 0.1)
val_portion = len(df) - train_portion - test_portion

train_data = df[:train_portion]
test_data = df[train_portion : train_portion + test_portion]
val_data = df[train_portion + test_portion :]

print("Number of samples in train set:", len(train_data))
print("Number of samples in test set:", len(test_data))
print("Number of samples in validation set:", len(val_data))

Number of samples in train set: 850
Number of samples in test set: 100
Number of samples in validation set: 50


# Making the dataset class

In [None]:
class InstructionDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = dataframe[["user", "assistant"]].to_dict("records")

        self.encoded_texts = []
        for entry in self.data:
            user_text = entry["user"]
            assistant_text = entry["assistant"]

            full_text = f"<|user|>\n{user_text}<|assistant|>\n{assistant_text}"

            encoded = tokenizer.encode(
                full_text,
                max_length=self.max_length,
                truncation=True,
                return_tensors="pt",
            )

            self.encoded_texts.append(encoded.squeeze())

    def __getitem__(self, index):
        return self.encoded_texts[index]

    def __len__(self):
        return len(self.encoded_texts)

# Loading moddel and tokenizer

In [None]:
model_name = "EleutherAI/pythia-410m"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/911M [00:00<?, ?B/s]

In [None]:
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

special_tokens_dict = {"additional_special_tokens": ["<|user|>", "<|assistant|>"]}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
print(f"Added {num_added_tokens} special tokens to the tokenizer")
model.resize_token_embeddings(len(tokenizer))

print(f"<|user|> token ID: {tokenizer.convert_tokens_to_ids('<|user|>')}")
print(f"<|assistant|> token ID: {tokenizer.convert_tokens_to_ids('<|assistant|>')}")

Added 2 special tokens to the tokenizer
<|user|> token ID: 50277
<|assistant|> token ID: 50278


# Collate function for dataloader

In [None]:
def customized_collate_fn(batch):
    pad_token_id = tokenizer.pad_token_id
    ignore_index = -100

    batch_max_length = max(len(item) for item in batch)

    inputs_lst = []
    targets_lst = []
    attention_masks = []

    for item in batch:
        item_list = item.tolist() if isinstance(item, torch.Tensor) else item
        inputs = torch.tensor(item_list)
        attention_mask = [1] * len(item_list)
        labels = torch.full_like(inputs, ignore_index)
        assistant_positions = (
            inputs == tokenizer.convert_tokens_to_ids("<|assistant|>")
        ).nonzero(as_tuple=True)[0]

        if len(assistant_positions) > 0:
            assistant_pos = assistant_positions[0] + 1
            labels[assistant_pos:] = inputs[assistant_pos:]

        inputs = torch.cat([inputs, torch.tensor([pad_token_id])])
        attention_mask = attention_mask + [1]

        if ( len(assistant_positions) > 0):
            labels = torch.cat( [labels, torch.tensor([pad_token_id])])
        else:
            labels = torch.cat( [labels, torch.tensor([ignore_index])])

        padding_length = batch_max_length - len(inputs) + 1

        if padding_length > 0:
            inputs = torch.cat( [ inputs, torch.full((padding_length,), pad_token_id, dtype=inputs.dtype), ])
            attention_mask = attention_mask + [0] * padding_length
            labels = torch.cat( [ labels, torch.full((padding_length,), ignore_index, dtype=labels.dtype), ])

        inputs_lst.append(inputs)
        attention_masks.append(torch.tensor(attention_mask))
        targets_lst.append(labels)


    inputs_tensor = torch.stack(inputs_lst)
    attention_mask_tensor = torch.stack(attention_masks)
    targets_tensor = torch.stack(targets_lst)


    return {
        "input_ids": inputs_tensor,
        "attention_mask": attention_mask_tensor,
        "labels": targets_tensor,
    }

# Defining the data loaders

In [None]:
batch_size = 4

train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=0,
)

val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=0,
)

test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=0,
)

# Finetuning the model

In [None]:
def calc_loss_batch(batch, model, device):
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    return loss

In [None]:
def evaluate_model(model, dataloader, device, num_iters=None):
    """Evaluate model on the dataloader for num_iters batches."""
    model.eval()
    total_loss = 0
    total_batches = 0

    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            if num_iters is not None and i >= num_iters:
                break

            loss = calc_loss_batch(batch, model, device)
            total_loss += loss.item()
            total_batches += 1


    return total_loss / max(total_batches, 1)

In [None]:
def train_model_simple(
    model,
    train_loader,
    val_loader,
    optimizer,
    device,
    num_epochs,
    eval_freq=100,
    eval_iters=10,
    save_name="best_model.pt",
):

    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen = 0
    global_step = 0
    best_val_loss = float("inf")

    print(f"Starting training for {num_epochs} epochs")


    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        batch_count = 0

        for batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(batch, model, device)
            loss.backward()
            optimizer.step()

            batch_tokens = (batch["attention_mask"].sum()).item()
            tokens_seen += batch_tokens
            running_loss += loss.item()
            batch_count += 1
            global_step += 1

            if global_step % eval_freq == 0:
                avg_train_loss = running_loss / batch_count
                val_loss = evaluate_model(model, val_loader, device, eval_iters)
                train_losses.append(avg_train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(
                    f"Epoch {epoch+1}/{num_epochs} (Step {global_step:06d}): "
                    f"Train loss {avg_train_loss:.4f}, Val loss {val_loss:.4f}, "
                    f"Tokens: {tokens_seen:,}"
                )

                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    torch.save(
                        {
                            "epoch": epoch,
                            "model_state_dict": model.state_dict(),
                            "optimizer_state_dict": optimizer.state_dict(),
                            "val_loss": val_loss,
                            "global_step": global_step,
                        },
                        save_name,
                    )
                    print(f"Saved new best model with val_loss: {val_loss:.4f}")

                running_loss = 0.0
                batch_count = 0
    return train_losses, val_losses, track_tokens_seen

In [None]:
def plot_values(tokens_seen, train_values, val_values, plot_label="loss"):
    eval_steps = list(range(len(train_losses)))
    fig, ax1 = plt.subplots(figsize=(5, 3))
    ax1.plot(eval_steps, train_values, label=f"Training {plot_label}")
    ax1.plot(eval_steps, val_values, linestyle="-.", label=f"Validation {plot_label}")
    ax1.set_xlabel("Evaluation Steps")
    ax1.set_ylabel(plot_label.capitalize())
    ax1.legend()
    ax2 = ax1.twiny()
    ax2.plot(tokens_seen, train_values, alpha=0)
    ax2.set_xlabel("Tokens seen")
    ax2.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f"{int(x):,}"))
    fig.tight_layout()
    plt.savefig(f"{plot_label}-plot.pdf")
    plt.show()

# Training loop

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
start_time = time.time()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)
num_epochs = 5
train_losses, val_losses, tokens_seen = train_model_simple(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    num_epochs=num_epochs,
    eval_freq=50,
    eval_iters=5,
    save_name="best_model.pth",
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Starting training for 5 epochs
Epoch 1/5 (Step 000050): Train loss 2.2633, Val loss 1.7623, Tokens: 33,980
Saved new best model with val_loss: 1.7623
Epoch 1/5 (Step 000100): Train loss 2.2624, Val loss 1.8417, Tokens: 66,609
Epoch 1/5 (Step 000150): Train loss 2.3378, Val loss 1.7610, Tokens: 102,593
Saved new best model with val_loss: 1.7610
Epoch 1/5 (Step 000200): Train loss 2.2436, Val loss 1.6589, Tokens: 138,376
Saved new best model with val_loss: 1.6589
Epoch 2/5 (Step 000250): Train loss 1.4070, Val loss 1.7743, Tokens: 169,170
Epoch 2/5 (Step 000300): Train loss 1.4747, Val loss 1.7860, Tokens: 206,147
Epoch 2/5 (Step 000350): Train loss 1.4804, Val loss 1.8489, Tokens: 239,690
Epoch 2/5 (Step 000400): Train loss 1.5167, Val loss 1.7476, Tokens: 274,059
Epoch 3/5 (Step 000450): Train loss 0.9630, Val loss 1.8985, Tokens: 309,343
Epoch 3/5 (Step 000500): Train loss 0.9835, Val loss 1.8981, Tokens: 342,537
Epoch 3/5 (Step 000550): Train loss 0.9856, Val loss 2.0238, Tokens: 380

KeyboardInterrupt: 

# Results

In [None]:
plot_values(tokens_seen, train_losses, val_losses, plot_label="loss")

In [None]:
best_model_path = "/content/best_model.pth"
if os.path.exists(best_model_path):
    checkpoint = torch.load(best_model_path, map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])
    print( f"Loaded model from {best_model_path} (validation loss: {checkpoint['val_loss']:.4f})")
model.eval()

In [None]:
sample_index = random.randint(0, len(test_data) -1)
prompt = test_data.iloc[sample_index]['user']
expected_output = test_data.iloc[sample_index]['assistant']
input_text = f"<|user|>\n{prompt}<|assistant|>\n"
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
attention_mask = torch.ones_like(input_ids).to(device)

model.eval()
with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=input_ids.shape[1] + 256,
        pad_token_id=tokenizer.pad_token_id,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
    )

generated_text = tokenizer.decode(output[0], skip_special_tokens=False)
actual_output = generated_text.split("<|assistant|>\n")[1].strip() if "<|assistant|>\n" in generated_text else ""
actual_output = actual_output.replace("<|endoftext|>", "")
print(f"\nPrompt: {prompt}")
print(f"\nExpected Output: {expected_output}")
print(f"\nActual Output: {actual_output}")

In [18]:
from google.colab import files

# Assuming your weights are saved in 'best_model.pth'
files.download('best_model.pth')
files.download('Instruction-finetune.ipynb')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

FileNotFoundError: Cannot find file: Instruction-finetune.ipynb