In [2]:
%pip install transformers datasets evaluate

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7

In [28]:
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset

from datasets import load_dataset
from datasets import Dataset as HuggingFaceDataset
from transformers import AutoTokenizer

import pandas as pd

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

yelp = load_dataset("yelp_review_full")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize(example):
    #for this data set, we only need to tokenzie the text col
    text = example["text"]

    return tokenizer(text, padding="max_length", truncation=True)


#keep label but remove text since pytorch models dont take text they onyl take input ids, toekn type and atten mask.
yelp_tokenized = yelp.map(tokenize, batched=True)
yelp_tokenized = yelp_tokenized.remove_columns(["text"])
yelp_tokenized = yelp_tokenized.rename_column("label", "labels")
#convert each element into a tensor, even labels since pytorch only takes tensors.
yelp_tokenized.set_format("torch")

small_tokenized_train = yelp_tokenized["train"].shuffle().select(range(1000))
small_tokenized_test = yelp_tokenized["test"].shuffle().select(range(100))


train_dataloader = DataLoader(small_tokenized_train, shuffle = True, batch_size = 8)
test_dataloader = DataLoader(small_tokenized_test, shuffle = False, batch_size = 8)


#get our model

from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW
from transformers import get_scheduler

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
model.to(device)


import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

accuracy_before_ft = metric.compute()

from tqdm.auto import tqdm

epochs = 3

num_training_steps = epochs * len(train_dataloader)
progress_bar = tqdm(range(num_training_steps))

optimizer = AdamW(model.parameters(), lr = 5e-5)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

loss_fn = torch.nn.CrossEntropyLoss()
model.train()

for epoch in range(epochs):
    for train_batch in train_dataloader:
        batch = {}
        for k, v in train_batch.items():
            # "text" key is the input ids etc as value, thus tensor needs to be sent to gpu not "name"
            batch[k] = v.to(device)

        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)


metric = evaluate.load("accuracy")
model.eval()
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

accuracy_after_ft = metric.compute()

print(f"Accuracy before fine tuning: {accuracy_before_ft}")
print(f"Accuracy after fine tuning: {accuracy_after_ft}")


README.md:   0%|          | 0.00/6.72k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/299M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/23.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/650000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

  0%|          | 0/375 [00:00<?, ?it/s]

Accuracy before fine tuning: {'accuracy': 0.26}
Accuracy after fine tuning: {'accuracy': 0.5}


In [22]:
from datasets import load_dataset
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import LinearLR
from tqdm.auto import tqdm
import random

from torch.utils.tensorboard import SummaryWriter

shks_set = load_dataset("karpathy/tiny_shakespeare")
#single row, giant string for shakespere.. in train/test/validate.
#print(len(shks_set["train"]["text"]))

#so to get it in shakespere form, as part of pre trianing/ or style transfer fine tuning,
#i just need to make a huge string of all the text data, even in instruin fne tuning
#i am just training model to predict next sentence token, so instead of our data being
#rows in a dataframe, it is just chunks of this massive text, which makes things much easier in
#terms of data handling.

for text in shks_set["train"]["text"]:
  print(len(text))

for text in shks_set["test"]["text"]:
  print(len(text))

for text in shks_set["validation"]["text"]:
  print(len(text))


#lets tokenize and chunk

device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained("gpt2")
model.to(device)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

chunk_length = 128

train_tokenized = tokenizer(shks_set["train"]["text"], truncation=True,
                            max_length=chunk_length,
                            return_overflowing_tokens=True,  #this creates overlapping chunks
                            stride=chunk_length // 2,  # 50% overlap between chunks
                            return_length=True,
                            padding = "max_length")
#make sure to pad as well such that each chunk is of same length
#chunk and tokenize since we set up max context length, and we get overlapping as well

shks_set["train"] = {
    "input_ids": train_tokenized["input_ids"],
    "attention_mask": train_tokenized["attention_mask"]
}

test_tokenized = tokenizer(shks_set["test"]["text"], truncation=True,
                            max_length=chunk_length,
                            return_overflowing_tokens=True,  #this creates overlapping chunks
                            stride=chunk_length // 2,  # 50% overlap between chunks
                            return_length=True,
                            padding = "max_length")
#chunk and tokenize since we set up max context length, and we get overlapping as well

shks_set["test"] = {
    "input_ids": test_tokenized["input_ids"],
    "attention_mask": test_tokenized["attention_mask"]
}

val_tokenized = tokenizer(shks_set["validation"]["text"], truncation=True,
                            max_length=chunk_length,
                            return_overflowing_tokens=True,  #this creates overlapping chunks
                            stride=chunk_length // 2,  # 50% overlap between chunks
                            return_length=True,
                            padding = "max_length")

shks_set["validation"] = {
    "input_ids": val_tokenized["input_ids"],
    "attention_mask": val_tokenized["attention_mask"]
}


#chunk and tokenize since we set up max context length, and we get overlapping as well
print(f"Number of chunks in train set: {len(shks_set['train']['input_ids'])}")
print(f"Number of chunks in test set: {len(shks_set['test']['input_ids'])}")
print(f"Number of chunks in validation set: {len(shks_set['validation']['input_ids'])}")



'''
After applying tokenize_and_chunk, shks_set["train"] would look like:

{
  "input_ids": [
    [15496, 307, 477, 262, 582, 2743, 13, 50256, 50256, 50256],  # Chunk 1
    [ 477, 262, 582, 2743, 13, 50256, 50256, 50256, 50256, 50256]  # Chunk 2 (overlap)
  ],
  "attention_mask": [
    [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],  # Mask for Chunk 1
    [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]  # Mask for Chunk 2
  ]
}
'''

'''
chunk_length = 128
stride = chunk_length // 2

def tokenize_and_chunk(data_split):
    tokenized = tokenizer(
        data_split["text"],
        truncation=True,
        max_length=chunk_length,
        return_overflowing_tokens=True,
        stride=stride,
        return_length=True,
    )
    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
    }

# Process and replace each split
shks_set = shks_set.map(tokenize_and_chunk, batched=True, remove_columns=["text"])
'''

print(type(shks_set))

def sample(input_string, tokenizer, model):
  #tokenizer.tokenzie() jsut tokenzies and splits string into chunks thats it doesnt embed.
  #you need to call the tokenzier class itself, to tokenizer? like gen a new object
  tokenized_input = tokenizer(input_string, return_tensors = "pt")
  for k , v in tokenized_input.items():
    tokenized_input[k] = v.to(device)

  output = model.generate(**tokenized_input,
                          max_new_tokens = 512,
                          num_beams = 5,
                          early_stopping = True,
                          no_repeat_ngram_size = 2,
                          temperature = 0.7
                          )
  return tokenizer.decode(output[0], skip_special_tokens = True)

#print(sample("how are you doing", tokenizer, model))

#lets get the data loaders set up

class TextualDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.dataset = data

    def __len__(self):
        return len(self.dataset["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.dataset["input_ids"][idx]),
            "attention_mask": torch.tensor(self.dataset["attention_mask"][idx]),
            "labels": torch.tensor(self.dataset["input_ids"][idx]),
        } #for causal lm label is the input id itself.

    #no need to do sliding window, since model while training does itself trasnformers has that
    #capacbilty vuilt in to perform window and get true label while calcuating Cross entropy loss
    #can do it manually but whats the point.

#no shuffle as order matters! each token representes a piece of sentence!
train_dataloader = DataLoader(TextualDataset(shks_set["train"]), shuffle = False, batch_size = 16)
test_dataloader = DataLoader(TextualDataset(shks_set["test"]), shuffle = False, batch_size = 16)
val_dataloader = DataLoader(TextualDataset(shks_set["validation"]), shuffle = False, batch_size = 16)

#now lets fine tune our model!

def fine_tune_lm(dataloader, model, optimizer, progress_bar) -> float:
  total_loss_epoch = 0.0
  model.train()

  for batch in dataloader:
    #send everything to the device please
    for k, v in batch.items():
      batch[k] = v.to(device)

    outputs = model(input_ids = batch["input_ids"], attention_mask = batch["attention_mask"],
                    labels = batch["input_ids"])

    #loss is built in since i provide the labels argument
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    progress_bar.update(1)

    total_loss_epoch += loss.item()

  #return average loss htis epoch.
  return total_loss_epoch / len(dataloader)

def validate_lm(dataloader, model) -> float:
  total_loss_epoch = 0.0
  model.eval()

  for batch in dataloader:
    for k, v in batch.items():
      batch[k] = v.to(device)

    output = model(input_ids = batch["input_ids"], attention_mask = batch["attention_mask"],
                   labels = batch["input_ids"])
    loss = output.loss
    total_loss_epoch += loss.item()

  return total_loss_epoch / len(dataloader)


writer = SummaryWriter(log_dir="ft_language_model/shakespere")

num_epochs = 4
lr = 0.0001

optimizer = torch.optim.Adam(model.parameters(), lr = lr)
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=num_training_steps
    )

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
  print(f"Epoch {epoch + 1}/{num_epochs}")

  train_loss = fine_tune_lm(train_dataloader, model, optimizer, progress_bar)
  val_loss = validate_lm(val_dataloader, model)

  #sample("how are you doing", tokenizer, model)

  lr_scheduler.step()

  print(f"Train Loss: {train_loss:.4f}")
  print(f"Validation Loss: {val_loss:.4f}")
  print("\n")



sample("how are you doing", tokenizer, model)


1003854
55770
55770
Number of chunks in train set: 4718
Number of chunks in test set: 281
Number of chunks in validation set: 282
<class 'datasets.dataset_dict.DatasetDict'>


  0%|          | 0/1180 [00:00<?, ?it/s]

Epoch 1/4
Train Loss: 4.8483
Validation Loss: 4.5622


Epoch 2/4
Train Loss: 4.3686
Validation Loss: 3.8358


Epoch 3/4
Train Loss: 4.0834
Validation Loss: 3.7001


Epoch 4/4


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Train Loss: 3.9394
Validation Loss: 3.6223




'how are you doing?"\n\n"No, sir," said he, "I am not. I do not know what you are doing, but I know that I am doing it, and that you have done it; and I will tell you how it is done, if you will excuse me, for I have not seen you in a long time, nor have you seen me in any other place than here; for, as you say, I did not see you till you came to my house, when I was a little boy; but now you come to me and tell me how you did it. Tell me your name, my dear sir; what do you mean by that name? What is the name of your father, your mother, or your sister? I think you must have been born in this country, that is, at the time when you were born, in the year of my father\'s death, which is now twenty-two years old. If you please, let me see what I can do for you. You are a good boy, you know me well enough to know how to do it: but if I were to give you a word of advice, it would be to say to you, \'You must be careful not to make any mistake in your life, lest you should be deceived by the

In [23]:
# Save fine-tuned model
model.save_pretrained("fine_tuned_shakespeare_gpt2")
tokenizer.save_pretrained("fine_tuned_shakespeare_gpt2")

('fine_tuned_shakespeare_gpt2/tokenizer_config.json',
 'fine_tuned_shakespeare_gpt2/special_tokens_map.json',
 'fine_tuned_shakespeare_gpt2/vocab.json',
 'fine_tuned_shakespeare_gpt2/merges.txt',
 'fine_tuned_shakespeare_gpt2/added_tokens.json',
 'fine_tuned_shakespeare_gpt2/tokenizer.json')

In [27]:
print(sample("i am lancelot, who are you?", tokenizer, model))
print("\n\n\n\n")
print(sample("my name is maanas", tokenizer, model))
print("\n\n\n\n")
print(sample("i beleive in LLMs do you?", tokenizer, model))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


i am lancelot, who are you?

KING RICHARD II:
I am not, sir, but I am the king of England, and you are my lord.
DUKE VINCENTIO:


Your lordship, my lords, I do not know what you do,
but I will tell you what I know, if you will not tell me
what it is that you have done to me; and if I shall tell
you what it was that I did to you, you shall know it
as well as I can, for I have never seen you
before, nor have you seen me before; but you must know that
it is my duty to tell the truth; for if it be true, then I
will tell it to the queen; if not to her, she
shall not be able to hear it;
and if she shall not hear, it shall be a lie; so that she may not
know what she is doing, or how she did it. I, therefore, will
tell you all that is in my power, which is to say, what
is in your power and what is not in yours; that, in the
same way, we shall have a king, a queen, an
honourable king; a prince, princely prince; an honourable
man, nobleman; noblewoman, fair lady; fair
woman; good-natured gentlem

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


my name is maanas.

DUKE VINCENTIO:
I am, sir, your father's son,
And I am your mother's daughter, and your brother's brother
As well as your sister's sister;
But I do not know your name, nor do I know
Your name's name; but I will tell you
What I have heard from you; and what I
have seen of you, what you have seen
Of my father, my mother and my brother; what
you have read of me, you know not; I shall
tell you the truth, but you shall not tell me
The truth; nor shall I tell the
Truth; neither shall you tell it to me nor to
My father; for he knows not what he is
To do with his life, or to do
With his death; or, for that matter, with the death of
His brother, his sister and his brother-in-law; he
Will not be able to tell his own life
From what is said of him, from what has been
Said of his father and of mine: but he will not
Tell me what it is that he has done with my life; that
He has not done it with mine, that I cannot tell
It to him: nor, indeed, have I heard of it from him
Until now,