In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import torch
from datasets import load_dataset
import numpy as np
import nltk
import evaluate
import accelerate
from nltk.tokenize import sent_tokenize
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
accelerate.__version__

'1.8.1'

# Response Generating Function

In [3]:
print('CUDA available:', torch.cuda.is_available())
print('GPU name:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'No GPU')

CUDA available: True
GPU name: NVIDIA GeForce RTX 4070 Laptop GPU


In [4]:
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Function to generate a response
def generate_response(input_sequence, max_length=100):
    encoded_input = tokenizer(input_sequence + tokenizer.eos_token, return_tensors='pt').to("cuda" if torch.cuda.is_available() else "cpu")
    
    encoded_response = model.generate(
        **encoded_input,
        max_length=max_length,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        # attention_mask=encoded_input['attention_mask'] #required for some models  like DialoGPT but not for T5
        # do_sample=False,  # Set to False for deterministic output
    )
    
    
    
    return tokenizer.decode(encoded_response[0].cpu(), skip_special_tokens=True)

# Testing

In [6]:
# Testing the function

sequence = "You are a knowledgable and factual assistant: What comes after 5?"
generate_response(sequence)

'(no )'

In [7]:
generate_response('Does anyonw know what enegy from mass anniihilation means?')

'a genesis'

In [8]:
generate_response("What are the advantages and disadvantages of nuclear energy?")

'Nuclear energy is a renewable resource'

In [9]:
generate_response("Please answer the following question: What do you think about the benefit of Artificial Intelligence?")

'Especially for people with autism, we have to provide them with the ability to do the right thing.'

In [10]:
generate_response("System: You are a helpful and factual assistant.\nUser's Question: Define the term 'reinforcement'.")

'reinforcement'

# Implementing Chat History

In [11]:
chat_history = [
    # 'You are a knowledgable and factual assistant'
]

def format_chat_history(history, history_length=5):
    return '\n'.join(history[-history_length*2:] if len(history) > history_length*2 else history)

In [12]:
def add_to_chat_history(input_sequence):
    chat_history.append('user: '+input_sequence)
    formatted = format_chat_history(chat_history)
    print(f"Formatted chat history: {formatted}")
    response = generate_response(formatted+'\nbot: ')
    chat_history.append('bot: '+response)

# Chat History Testing

In [13]:
prompt1 = "Translate to German: 'What is the capital of France?'"
add_to_chat_history(prompt1)
chat_history

Formatted chat history: user: Translate to German: 'What is the capital of France?'


["user: Translate to German: 'What is the capital of France?'",
 "bot: 'What is the capital of France?'"]

In [14]:
prompt2 = 'Which city is the most populated in the United States?'
add_to_chat_history(prompt2)
chat_history

Formatted chat history: user: Translate to German: 'What is the capital of France?'
bot: 'What is the capital of France?'
user: Which city is the most populated in the United States?


["user: Translate to German: 'What is the capital of France?'",
 "bot: 'What is the capital of France?'",
 'user: Which city is the most populated in the United States?',
 "bot: 'Walt – die Hauptstadt der Stadt?' bot: 'Walt – die Hauptstadt der Stadt?'"]

# Fine Tuning using Yahoo QA dataset

## Loading Dataset

In [15]:
dataset = load_dataset("yahoo_answers_qa")

## Train Test Split

In [16]:
dataset = dataset['train'].train_test_split(test_size=0.1)

In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'nbestanswers', 'main_category'],
        num_rows: 78625
    })
    test: Dataset({
        features: ['id', 'question', 'answer', 'nbestanswers', 'main_category'],
        num_rows: 8737
    })
})

## Preprocessing

In [None]:
prefix = 'Answer the question: '

def preprocess(input):
    prefixed_input = [prefix + question for question in input['question']]
    model_input = tokenizer(prefixed_input, truncation=True, max_length=128)
    
    with tokenizer.as_target_tokenizer():-
        labels = tokenizer(input['answer'], truncation=True, max_length=512)
        
    model_input['labels'] = labels['input_ids']
    return model_input    


In [19]:
tokenized_dataset = dataset.map(preprocess, batched=True)

Map: 100%|██████████| 78625/78625 [00:06<00:00, 11521.81 examples/s]
Map: 100%|██████████| 8737/8737 [00:00<00:00, 14579.84 examples/s]


In [20]:
sample = tokenized_dataset['train'][0]
print(tokenizer.decode(sample['input_ids']))
print(tokenizer.decode(sample['labels']))

Answer the question: why does reading in the dark make your eyes worse?</s>
If there isn't enough light in the room for your eyes to focus correctly and see what you're reading, your eyes have to strain themselves to try to see.. . Prolonged eye strain can ruin your sight, or make it worse than it was.</s>


In [21]:
print(np.random.choice(tokenized_dataset['train']))

{'id': '3158900', 'question': 'why do people still stuff from the store?', 'answer': "There could be a number of reasons why people steal, it could be peer pressure, it could be they are homeless and hungry, they might have a drug problem, or they might be sick, some people have an illness where they might need help from a professional and, then there are the ones who actually do it cause they like the thrill, you know the thrill seekers, doing it just to see if they can get away with it, Those are a few of the reasons why I think people do it. I wish there was something we could do as human beings looking out for each other, Oh my bad we don't....", 'nbestanswers': ["it really depends they could be doingg it to show off or they mite be doin it cause they don't have the money for it and maybe they really needed the things.", "People steal for a lot of different reasons.  Most of them do it because they don't have the money to buy the merchandise. Others do it because they are angry at 

## Training Arguments

In [36]:
# Checking if the generation_max_length is appropriate

label_lengths = [len(tokenizer.encode(ans)) for ans in dataset['train']['answer']]
import numpy as np
print(f"Max: {max(label_lengths)}")
print(f"95th percentile: {np.percentile(label_lengths, 95)}")


Token indices sequence length is longer than the specified maximum sequence length for this model (1387 > 512). Running this sequence through the model will result in indexing errors


Max: 1387
95th percentile: 120.0


In [22]:
train_args = Seq2SeqTrainingArguments(
    output_dir="./model",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir='./logs',
    logging_steps=1000,
    logging_strategy="steps",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    fp16=False,
    generation_max_length=128,
    generation_num_beams=1,)

## ROUGE Metric for evaluation

In [23]:
nltk.download('punkt')
rouge = evaluate.load("rouge")

[nltk_data] Downloading package punkt to /home/neillucha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
# sent_tokenize without nltk

def sent_tokenize(text):
    return re.split(r'(?<=[.!?])\s+', text.strip())

In [25]:
def compute_rouge(eval_preds):
    preds, labels = eval_preds
    
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    preds = np.clip(preds, 0, tokenizer.vocab_size - 1)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Clean and filter
    cleaned_preds = []
    cleaned_labels = []
    for pred, label in zip(decoded_preds, decoded_labels):
        pred = pred.strip()
        label = label.strip()
        if pred and label and len(pred) < 1000 and len(label) < 1000:
            cleaned_preds.append("\n".join(sent_tokenize(pred)))
            cleaned_labels.append("\n".join(sent_tokenize(label)))

    if not cleaned_preds or not cleaned_labels:
        return {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0, "rougeLsum": 0.0}

    result = rouge.compute(predictions=cleaned_preds, references=cleaned_labels, use_stemmer=True)
    return {k: round(v * 100, 4) for k, v in result.items()}


In [26]:
# Checking if compute_rouge works

sample_preds = ["Artificial Intelligence is the simulation of human intelligence by machines."]
sample_labels = ["Artificial Intelligence refers to the ability of machines to mimic human intelligence."]

preds = tokenizer(sample_preds, padding=True, truncation=True, return_tensors="pt")["input_ids"].numpy()
labels = tokenizer(sample_labels, padding=True, truncation=True, return_tensors="pt")["input_ids"].numpy()

labels = np.where(labels == tokenizer.pad_token_id, -100, labels)

results = compute_rouge((preds, labels))
print(results)

{'rouge1': np.float64(63.6364), 'rouge2': np.float64(20.0), 'rougeL': np.float64(54.5455), 'rougeLsum': np.float64(54.5455)}


## Data Collator

In [27]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100, return_tensors="pt")

## Trainer

In [28]:
trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_rouge,
)

  trainer = Seq2SeqTrainer(


## Training

In [29]:
torch.cuda.empty_cache()

In [30]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,3.6652,3.439815,14.3405,2.3337,12.0715,12.3864
2,3.6126,3.415636,14.5261,2.3752,12.152,12.5063
3,3.5786,3.407772,14.4132,2.3436,12.0356,12.4482


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=58971, training_loss=3.6384050967914843, metrics={'train_runtime': 14735.4218, 'train_samples_per_second': 16.007, 'train_steps_per_second': 4.002, 'total_flos': 2107544852170752.0, 'train_loss': 3.6384050967914843, 'epoch': 3.0})

In [31]:
# sample = tokenized_dataset['train'][0]

# input_ids = torch.tensor([sample['input_ids']])
# labels = torch.tensor([sample['labels']])

# # Move to GPU if available
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# input_ids = input_ids.to(device)
# labels = labels.to(device)
# model = model.to(device)

# with torch.no_grad():
#     output = model(input_ids=input_ids, labels=labels)
#     print("Loss:", output.loss.item())


In [32]:
# print("Labels:", labels)
# print("Are all labels -100?", (labels == -100).all())
# print("Any NaNs?", torch.isnan(labels).any())
# print("Min label:", labels.min())
# print("Max label:", labels.max())


In [33]:
# from torch.utils.data import DataLoader
# from transformers import default_data_collator

# # Filter the dataset to include only fields needed for the model
# processed_dataset = tokenized_dataset["train"].remove_columns(
#     ["question", "answer", "nbestanswers", "main_category", "id"]
# )

# # Now use DataLoader safely
# dataloader = DataLoader(
#     processed_dataset,
#     batch_size=1,
#     collate_fn=default_data_collator
# )

# batch = next(iter(dataloader))

# # Move tensors to correct device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# input_ids = batch["input_ids"].to(device)
# attention_mask = batch["attention_mask"].to(device)
# labels = batch["labels"].to(device)

# # Forward pass
# model.to(device)
# outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

# print("Loss:", outputs.loss.item())
# print("Logits shape:", outputs.logits.shape)
# print("Labels shape:", labels.shape)


In [34]:
# print("All -100?:", torch.all(labels == -100))
# print("Any NaNs in labels?:", torch.isnan(labels).any())
# print("Unique label values:", torch.unique(labels))


In [35]:
# print("Logits contain NaNs?:", torch.isnan(outputs.logits).any())
# print("Logits contain infs?:", torch.isinf(outputs.logits).any())
# print("Max logit:", torch.max(outputs.logits))
# print("Min logit:", torch.min(outputs.logits))
