In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import torch
from datasets import load_dataset
import numpy as np
import nltk
import evaluate
import accelerate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
accelerate.__version__

'1.8.1'

# Response Generation Function

In [3]:
# Report whether a CUDA device is visible to PyTorch and, if so, the name of device 0.
print('CUDA available:', torch.cuda.is_available())
print('GPU name:', 'No GPU' if not torch.cuda.is_available() else torch.cuda.get_device_name(0))

CUDA available: True
GPU name: NVIDIA GeForce RTX 4070 Laptop GPU


In [4]:
# Checkpoint name shared by the tokenizer and the model.
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the seq2seq model, then move it to the GPU when one is available.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Function to generate a response
def generate_response(input_sequence, max_length=100):
    """Generate a sampled text response to a prompt with the loaded model.

    Args:
        input_sequence: Prompt text handed to the tokenizer.
        max_length: Forwarded to `model.generate` as its `max_length` argument.

    Returns:
        The first generated sequence, decoded to a string with special tokens
        removed.
    """
    # The tokenizer call adds the special tokens itself, so the EOS token is not
    # concatenated onto the prompt text (doing so can produce a duplicated EOS).
    # The encoded inputs follow the model to whichever device it lives on.
    encoded_input = tokenizer(input_sequence, return_tensors='pt').to(model.device)

    # Sampling is enabled, so repeated calls with the same prompt may differ.
    encoded_response = model.generate(
        **encoded_input,
        max_length=max_length,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
    )

    return tokenizer.decode(encoded_response[0].cpu(), skip_special_tokens=True)

# Testing

In [6]:
# Smoke-test the function with a persona-style instruction followed by a simple question.
# generate_response samples its output, so the result can differ between runs.

sequence = "You are a knowledgable and factual assistant: What comes after 5?"
generate_response(sequence)

'5'

In [7]:
# Open-ended question with no instruction prefix.
generate_response("What are the advantages and disadvantages of nuclear energy?")

'Nuclear power is an electric energy source and is renewable.'

In [8]:
# Opinion-style question with an explicit "Please answer the following question:" prefix.
generate_response("Please answer the following question: What do you think about the benefit of Artificial Intelligence?")

'the benefits'

In [9]:
# Two-line prompt: a "System:" instruction, then the user's question on the next line.
generate_response("System: You are a helpful and factual assistant.\nUser's Question: Define the term 'reinforcement'.")

"a person's work"

# Implementing Chat History

In [10]:
# Module-level conversation log. add_to_chat_history appends one 'user: ...' line
# and one 'bot: ...' line to it per call.
chat_history = [
    # Optional system-style first entry (currently disabled):
    # 'You are a knowledgeable and factual assistant'
]

def format_chat_history(history, history_length=5):
    """Join the most recent chat lines into one newline-separated string.

    Args:
        history: List of chat lines.
        history_length: Number of exchanges to keep; two lines are kept per
            exchange, so at most `history_length * 2` lines are returned.

    Returns:
        The selected lines joined with newline characters.
    """
    max_lines = history_length * 2
    # Short histories are used whole; longer ones are cut to the trailing window.
    if len(history) <= max_lines:
        return '\n'.join(history)
    return '\n'.join(history[-max_lines:])

In [11]:
def add_to_chat_history(input_sequence):
    """Record a user turn, generate the bot's reply, and record that as well.

    Appends two entries to the module-level `chat_history` list ('user: ...'
    and 'bot: ...') and prints the formatted history used as the prompt.

    Args:
        input_sequence: The user's message text.

    Returns:
        None.
    """
    user_turn = 'user: ' + input_sequence
    chat_history.append(user_turn)

    prompt_context = format_chat_history(chat_history)
    print(f"Formatted chat history: {prompt_context}")

    # The prompt ends with a 'bot: ' cue line after the formatted history.
    bot_reply = generate_response(prompt_context + '\nbot: ')
    chat_history.append('bot: ' + bot_reply)

# Chat History Testing

In [12]:
# First turn: a translation request. The cell then displays the accumulated history.
prompt1 = "Translate to German: 'What is the capital of France?'"
add_to_chat_history(prompt1)
chat_history

Formatted chat history: user: Translate to German: 'What is the capital of France?'


["user: Translate to German: 'What is the capital of France?'",
 "bot: 'Was das Hauptstadt Frankreichs?' bot: 'What is the capital of France?'"]

In [13]:
# Second turn: an unrelated question. The earlier exchange is already in
# chat_history, so it is included in the prompt built for this call.
prompt2 = 'Which city is the most populated in the United States?'
add_to_chat_history(prompt2)
chat_history

Formatted chat history: user: Translate to German: 'What is the capital of France?'
bot: 'Was das Hauptstadt Frankreichs?' bot: 'What is the capital of France?'
user: Which city is the most populated in the United States?


["user: Translate to German: 'What is the capital of France?'",
 "bot: 'Was das Hauptstadt Frankreichs?' bot: 'What is the capital of France?'",
 'user: Which city is the most populated in the United States?',
 'bot: Welche Stadt ist der most populierte Stadt in den USA?']

# Fine-Tuning on the Yahoo Answers QA Dataset

## Loading Dataset

In [14]:
# Load the "yahoo_answers_qa" dataset; its 'train' split is what gets divided below.
dataset = load_dataset("yahoo_answers_qa")

## Train/Test Split

In [15]:
# Hold out 10% of the 'train' split for evaluation. The fixed seed makes the
# split reproducible across runs. The guard keeps this cell idempotent: since
# `dataset` is reassigned, re-running the cell would otherwise split the
# already-reduced 'train' portion a second time.
if 'test' not in dataset:
    dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)

In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'nbestanswers', 'main_category'],
        num_rows: 78625
    })
    test: Dataset({
        features: ['id', 'question', 'answer', 'nbestanswers', 'main_category'],
        num_rows: 8737
    })
})

## Preprocessing

In [17]:
# Instruction text prepended to every question before tokenization.
prefix = 'Answer the question: '

def preprocess(input):
    """Tokenize a batch of question/answer pairs for seq2seq training.

    Args:
        input: Batch mapping with 'question' and 'answer' lists, as supplied by
            `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the prefixed questions (truncated to 128
        tokens), with the answers' token ids (truncated to 512 tokens) stored
        under 'labels'.
    """
    prefixed_input = [prefix + question for question in input['question']]
    model_input = tokenizer(prefixed_input, truncation=True, max_length=128)

    # `text_target` tokenizes the answers as targets; it replaces the deprecated
    # `as_target_tokenizer()` context manager. A separate call is used so the
    # answers keep their own, longer truncation limit.
    labels = tokenizer(text_target=input['answer'], truncation=True, max_length=512)

    model_input['labels'] = labels['input_ids']
    return model_input


In [18]:
# Tokenize every split; batched=True passes `preprocess` lists of examples per call.
tokenized_dataset = dataset.map(preprocess, batched=True)

Map: 100%|██████████| 78625/78625 [00:08<00:00, 8991.22 examples/s] 
Map: 100%|██████████| 8737/8737 [00:00<00:00, 13815.20 examples/s]


## Training Arguments

In [19]:
# Training configuration. Evaluation and checkpointing both run once per epoch,
# and the checkpoint with the lowest eval loss is reloaded when training ends.
train_args = Seq2SeqTrainingArguments(
    output_dir="./model",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir='./logs',
    logging_steps=100,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # T5-family checkpoints are prone to overflow/NaN losses under fp16 mixed
    # precision, and fp16 cannot be used without a CUDA device at all. Use bf16
    # when the GPU supports it and fall back to full precision otherwise.
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
    fp16=False,
)

## ROUGE Metric for evaluation

In [20]:
# nltk.sent_tokenize needs the Punkt sentence-splitting data. Older NLTK
# releases look it up as 'punkt', newer ones as 'punkt_tab', so fetch both.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# ROUGE metric object used by compute_rouge.
rouge = evaluate.load("rouge")

In [21]:
def compute_rouge(eval_preds):
    """Compute ROUGE scores, scaled to 0-100, for generated predictions.

    Args:
        eval_preds: Pair of (predictions, labels) token-id arrays, as passed to
            a Trainer `compute_metrics` callback.

    Returns:
        Dict mapping each ROUGE metric name to its score multiplied by 100 and
        rounded to four decimals.
    """
    preds, labels = eval_preds

    # Predictions may arrive as a tuple with extra outputs; keep the token ids.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and cannot be decoded, so replace it
    # with the pad token id in BOTH arrays before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Put each sentence on its own line before scoring.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: round(v * 100, 4) for k, v in result.items()}

## Data Collator

In [22]:
# Batch collator built from the tokenizer and model; configured to return PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

## Trainer

In [23]:
# Assemble the trainer from the model, training arguments, tokenized splits,
# collator and ROUGE metric callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=data_collator,
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # triggers a deprecation warning on construction.
    processing_class=tokenizer,
    compute_metrics=compute_rouge,
)

  trainer = Seq2SeqTrainer(


## Training

In [24]:
# Ask PyTorch to release cached GPU memory it is not currently using before training starts.
torch.cuda.empty_cache()

In [25]:
# Run fine-tuning with the trainer configured above.
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 