In [1]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments,Seq2SeqTrainer

In [2]:
model_name = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
data = load_dataset("json",data_files="D:/Pythonn/Optimizing VectorDB/data.jsonl")

In [4]:
data = data["train"].train_test_split(test_size=0.2,)

In [5]:
data

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 2265
    })
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 567
    })
})

In [6]:
def preprocess(examples):
    inputs = tokenizer(examples["prompt"], truncation=True, padding="max_length", max_length=256)
    labels = tokenizer(examples["completion"], truncation=True, padding="max_length", max_length=64)
    inputs["labels"] = labels["input_ids"]
    return inputs

In [7]:
tokenized_dataset = data.map(preprocess,batched=True)

Map:   0%|          | 0/2265 [00:00<?, ? examples/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

In [20]:
nltk.download("punkt",quiet=True)
nltk.download('punkt_tab')
metric = evaluate.load("rouge")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [9]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  
    return result
    

In [10]:
L_RATE = 3e-4
BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 3

In [12]:
training_args = Seq2SeqTrainingArguments(
   output_dir="./results",
   eval_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

In [14]:
trainer = Seq2SeqTrainer(
    model = model,
    args = training_args,
    train_dataset= tokenized_dataset["train"],
    eval_dataset = tokenized_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,0.112118,0.585331,0.568793,0.585488,0.585266
2,0.026900,0.065052,0.585299,0.568944,0.585399,0.585147
3,0.026900,0.067827,0.585438,0.56905,0.585491,0.585293


TrainOutput(global_step=852, training_loss=0.021277781383532313, metrics={'train_runtime': 2523.7575, 'train_samples_per_second': 2.692, 'train_steps_per_second': 0.338, 'total_flos': 4652926209884160.0, 'train_loss': 0.021277781383532313, 'epoch': 3.0})

In [2]:
last_checkpoint = "D:/Pythonn/Optimizing VectorDB/results/checkpoint-852"

In [3]:
finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)

In [4]:
paragarph = "Self-driving cars rely on a combination of sensors, cameras, radar, and artificial intelligence to navigate roads without human input. These autonomous vehicles use real-time data to detect obstacles, interpret traffic signs, and make split-second decisions. Advances in deep learning and computer vision have significantly improved their ability to operate safely in complex environments."
inputs = f"Etract Keywords from the Paragraph:\n {paragarph}"

In [5]:
inputs = tokenizer(inputs, return_tensors="pt")
outputs = finetuned_model.generate(**inputs)
answer = tokenizer.decode(outputs[0])
answer

'<pad>Self-driving cars rely combination sensors cameras radar artificial intelligence navigate roads without human input autonomous'