# Finetuning

### Method 1

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

# Define your custom dataset class
class QuestionDataset(Dataset):
    """Map-style dataset that pairs each input text with its question(s)."""

    def __init__(self, texts, questions):
        # The two sequences are stored as given and indexed in parallel.
        self.texts, self.questions = texts, questions

    def __len__(self):
        """Return the number of samples, measured on the text sequence."""
        sample_count = len(self.texts)
        return sample_count

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with 'text' and 'questions' entries."""
        sample = dict(text=self.texts[idx], questions=self.questions[idx])
        return sample

# Checkpoint name shared by the tokenizer and the model
model_name = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Placeholder data: replace both lists with your own data loading logic.
# `texts` holds the input texts, `questions` the corresponding questions.
texts, questions = [...], [...]
dataset = QuestionDataset(texts, questions)

# Tokenize and encode the dataset
def encode_batch(batch):
    """Tokenize a batch of samples into encoder inputs and target encodings.

    When used as a ``DataLoader`` ``collate_fn`` this function receives a list
    of samples (the dicts returned by ``QuestionDataset.__getitem__``), so the
    samples are first regrouped into one list per field. A mapping that already
    holds per-field lists is still accepted unchanged.

    Args:
        batch: Either a list/tuple of ``{'text': ..., 'questions': ...}``
            samples, or a single mapping with list values under the keys
            'text' and 'questions'.

    Returns:
        A tuple ``(inputs, targets)`` of tokenizer encodings returned as
        PyTorch tensors: the texts truncated to 512 tokens and the questions
        truncated to 128 tokens, each padded to the longest item in the batch.
    """
    if isinstance(batch, (list, tuple)):
        # Indexing a list of samples with a string key would raise TypeError.
        texts = [sample['text'] for sample in batch]
        questions = [sample['questions'] for sample in batch]
    else:
        texts, questions = batch['text'], batch['questions']

    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    targets = tokenizer(questions, return_tensors='pt', padding=True, truncation=True, max_length=128)
    return inputs, targets

# DataLoader for the dataset; encode_batch tokenizes each batch on the fly
train_loader = DataLoader(dataset, batch_size=4, collate_fn=encode_batch, shuffle=True)

# Set up training parameters
# NOTE(review): the AdamW class exported by transformers is deprecated in favour
# of torch.optim.AdamW — confirm against the installed transformers version.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 3  # Adjust as needed

for epoch in range(num_epochs):
    model.train()

    # Accumulate the loss over the epoch so the report is not just the last batch.
    running_loss = 0.0
    num_batches = 0

    for batch in train_loader:
        inputs, targets = batch

        inputs = {k: v.to(device) for k, v in inputs.items()}
        targets = {k: v.to(device) for k, v in targets.items()}

        # Padding positions must not contribute to the loss: the model ignores
        # label positions set to -100. masked_fill returns a new tensor, so the
        # tokenizer output itself is left untouched.
        labels = targets['input_ids'].masked_fill(targets['attention_mask'] == 0, -100)

        # Forward pass (decoder inputs are derived from the labels by the model)
        outputs = model(**inputs, labels=labels)

        # Compute loss
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Print the mean training loss for the epoch (nan if the loader was empty)
    mean_loss = running_loss / num_batches if num_batches else float('nan')
    print(f'Epoch {epoch + 1}, Loss: {mean_loss}')

# Save the fine-tuned model and its tokenizer side by side
model.save_pretrained("fine_tuned_flan_t5_large")
tokenizer.save_pretrained("fine_tuned_flan_t5_large")


### Method 2

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW, Trainer, TrainingArguments
from torch.utils.data import DataLoader
from datasets import load_dataset

# Checkpoint name shared by the tokenizer and the model
model_name = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Fetch the question-generation dataset and keep a handle on its training split
dataset = load_dataset("derek-thomas/squad-v1.1-t5-question-generation")
train_data = dataset['train']

# Define your custom dataset class
class QuestionDataset:
    """Indexable view over the 'context' and 'questions' columns of ``data``."""

    def __init__(self, data):
        # Read both columns once; samples are later served from these copies.
        contexts, question_column = data['context'], data['questions']
        self.texts = contexts
        self.questions = question_column

    def __len__(self):
        """Return the number of samples, measured on the context column."""
        sample_count = len(self.texts)
        return sample_count

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with 'text' and 'questions' entries."""
        sample = dict(text=self.texts[idx], questions=self.questions[idx])
        return sample

# Prepare your dataset: wrap the training split so that indexing it yields
# {'text': ..., 'questions': ...} samples
train_dataset = QuestionDataset(train_data)

# Tokenize and encode the dataset
def encode_batch(batch):
    """Tokenize a batch of samples into encoder inputs and target encodings.

    When used as a ``DataLoader`` ``collate_fn`` this function receives a list
    of samples (the dicts returned by ``QuestionDataset.__getitem__``), so the
    samples are first regrouped into one list per field. A mapping that already
    holds per-field lists is still accepted unchanged.

    Args:
        batch: Either a list/tuple of ``{'text': ..., 'questions': ...}``
            samples, or a single mapping with list values under the keys
            'text' and 'questions'.

    Returns:
        A tuple ``(inputs, targets)`` of tokenizer encodings returned as
        PyTorch tensors: the texts truncated to 512 tokens and the questions
        truncated to 128 tokens, each padded to the longest item in the batch.
    """
    if isinstance(batch, (list, tuple)):
        # Indexing a list of samples with a string key would raise TypeError.
        texts = [sample['text'] for sample in batch]
        questions = [sample['questions'] for sample in batch]
    else:
        texts, questions = batch['text'], batch['questions']

    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    targets = tokenizer(questions, return_tensors='pt', padding=True, truncation=True, max_length=128)
    return inputs, targets

# DataLoader for the dataset. Kept for manual iteration only: the Trainer below
# builds its own loader from the dataset and the collator it is given.
train_loader = DataLoader(train_dataset, batch_size=4, collate_fn=encode_batch, shuffle=True)


def _seq2seq_collate(samples):
    """Turn a list of QuestionDataset samples into keyword arguments for the model.

    Args:
        samples: List of ``{'text': ..., 'questions': ...}`` dicts.

    Returns:
        Dict with 'input_ids', 'attention_mask' and 'labels' tensors. Padding
        positions in 'labels' are set to -100 so they are ignored by the loss.
    """
    # encode_batch is handed per-field lists so it never has to index a list
    # of samples by key.
    columns = {
        'text': [sample['text'] for sample in samples],
        'questions': [sample['questions'] for sample in samples],
    }
    inputs, targets = encode_batch(columns)
    labels = targets['input_ids'].masked_fill(targets['attention_mask'] == 0, -100)
    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': labels,
    }


# Set up training arguments
training_args = TrainingArguments(
    output_dir="./question_generation",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    # The collator needs the raw 'text'/'questions' fields, which are not model
    # inputs; without this the Trainer strips them before collation.
    remove_unused_columns=False,
)

# Define Trainer. It expects an indexable dataset (not a DataLoader) plus a
# collator that returns the model's keyword arguments, including `labels`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=_seq2seq_collate,
)

# Train the model
trainer.train()

# Save the fine-tuned model and its tokenizer side by side
model.save_pretrained("fine_tuned_flan_t5_large")
tokenizer.save_pretrained("fine_tuned_flan_t5_large")


### Method 2.1

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW, Trainer, TrainingArguments
from torch.utils.data import DataLoader
from datasets import load_dataset

# Checkpoint name shared by the tokenizer and the model
model_name = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Fetch the question-generation dataset and keep a handle on its training split
dataset = load_dataset("derek-thomas/squad-v1.1-t5-question-generation")
train_data = dataset['train']

# Tokenize and encode the dataset
def encode_batch(batch):
    """Collate raw examples into the tensors needed for one seq2seq training step.

    Args:
        batch: Either a list/tuple of examples, each a mapping with 'context'
            and 'questions' strings (what a ``DataLoader`` or ``Trainer``
            passes to a collate function), or a single mapping holding lists
            under those two keys.

    Returns:
        Dict with 2-D tensors 'input_ids' and 'attention_mask' (contexts,
        truncated to 512 tokens) and 'labels' (questions, truncated to 128
        tokens). Padding positions in 'labels' are -100 so the loss skips them.
    """
    if isinstance(batch, (list, tuple)):
        # Indexing a list of examples with a string key would raise TypeError.
        contexts = [example['context'] for example in batch]
        questions = [example['questions'] for example in batch]
    else:
        contexts, questions = batch['context'], batch['questions']

    # No .squeeze(): it would drop the batch axis whenever a batch holds a
    # single example.
    inputs = tokenizer(contexts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    targets = tokenizer(questions, return_tensors='pt', padding=True, truncation=True, max_length=128)

    labels = targets['input_ids'].masked_fill(targets['attention_mask'] == 0, -100)

    # decoder_input_ids are deliberately not returned. Feeding the unshifted
    # target ids to the decoder lets it see each token it is asked to predict;
    # given only `labels`, the model builds the right-shifted decoder inputs.
    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': labels,
    }

# DataLoader for the dataset. Kept for manual iteration only: the Trainer below
# builds its own loader from the dataset and the collator it is given.
train_loader = DataLoader(train_data, batch_size=4, collate_fn=encode_batch, shuffle=True)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./question_generation",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    # The collator tokenizes the raw 'context'/'questions' strings itself, so
    # the Trainer must not drop those columns as "unused" before batching.
    remove_unused_columns=False,
)


# Define Trainer with DataCollator
def data_collator(batch):
    """Regroup a list of examples into per-field lists and tokenize them.

    Args:
        batch: List of examples, each with 'context' and 'questions' entries.

    Returns:
        Whatever ``encode_batch`` returns for the regrouped batch.
    """
    columns = {
        'context': [example['context'] for example in batch],
        'questions': [example['questions'] for example in batch],
    }
    return encode_batch(columns)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,  # Pass the data collator
    train_dataset=train_data,  # Pass the dataset directly
)

# Train the model
trainer.train()

# Save the fine-tuned model and its tokenizer side by side
model.save_pretrained("fine_tuned_flan_t5_large_squad")
tokenizer.save_pretrained("fine_tuned_flan_t5_large_squad")

### Method 2.2

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorWithPadding, Trainer, TrainingArguments
from datasets import load_dataset

# DataCollatorForSeq2Seq pads `labels` as well as the encoder inputs, which
# DataCollatorWithPadding does not do.
from transformers import DataCollatorForSeq2Seq

# Load pre-trained model and tokenizer
model_name = "google/flan-t5-large"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Load the dataset
dataset = load_dataset("derek-thomas/squad-v1.1-t5-question-generation")

# Extract relevant data from the dataset
train_data = dataset['train']

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./question_generation",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Define the data collator: pads each batch dynamically, fills label padding
# with -100 and lets the model derive the shifted decoder inputs from labels.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


def _tokenize_examples(examples):
    """Tokenize contexts as encoder inputs and questions as labels.

    Args:
        examples: Batch mapping with lists under 'context' and 'questions'.

    Returns:
        Encoding of the contexts (truncated to 512 tokens) with an added
        'labels' entry holding the question token ids (truncated to 128).
    """
    # The questions are encoded separately. Passing them as the tokenizer's
    # second text would merge them into the encoder input and leave the model
    # without any labels to compute a loss from.
    model_inputs = tokenizer(examples['context'], truncation=True, max_length=512)
    question_ids = tokenizer(examples['questions'], truncation=True, max_length=128)
    model_inputs['labels'] = question_ids['input_ids']
    return model_inputs


# Tokenize and encode the dataset; the raw string columns are dropped because
# they cannot be turned into tensors by the collator.
train_data_encoded = train_data.map(
    _tokenize_examples,
    batched=True,
    remove_columns=train_data.column_names,
)

# Define Trainer with DataCollator
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data_encoded,
)

# Train the model
trainer.train()

# Save the fine-tuned model and its tokenizer side by side
model.save_pretrained("fine_tuned_flan_t5_large_squad")
tokenizer.save_pretrained("fine_tuned_flan_t5_large_squad")

### Method 2.3 - working

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorWithPadding, Trainer, TrainingArguments
from datasets import load_dataset

def encode_batch(batch):
    """Tokenize contexts and questions for ``Dataset.map``.

    Args:
        batch: Mapping with 'context' and 'questions' entries. Each entry is
            either a list of strings (``batched=True``) or a single string
            (one example at a time).

    Returns:
        Dict with 'input_ids' and 'attention_mask' (contexts, truncated to 512
        tokens) and 'labels' (questions, truncated to 128 tokens), as plain
        Python lists. Sequences are padded to the longest item in the call.
        Padding positions in 'labels' are -100 so the loss skips them. For a
        single-string input the leading batch axis is removed.
    """
    contexts, questions = batch['context'], batch['questions']

    # Track the single-example case explicitly. A blanket .squeeze() on the
    # tensors would also flatten a batched call that holds just one row.
    is_single = isinstance(contexts, str)
    if is_single:
        contexts, questions = [contexts], [questions]

    inputs = tokenizer(contexts, padding=True, truncation=True, max_length=512)
    targets = tokenizer(questions, padding=True, truncation=True, max_length=128)

    labels = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(targets['input_ids'], targets['attention_mask'])
    ]

    # decoder_input_ids are deliberately not returned. Feeding the unshifted
    # target ids to the decoder lets it see each token it is asked to predict;
    # given only `labels`, the model builds the right-shifted decoder inputs.
    encoded = {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': labels,
    }
    if is_single:
        encoded = {key: rows[0] for key, rows in encoded.items()}
    return encoded

# DataCollatorForSeq2Seq pads `labels` as well as the encoder inputs, which
# DataCollatorWithPadding does not do.
from transformers import DataCollatorForSeq2Seq

# Load pre-trained model and tokenizer
model_name = "google/flan-t5-large"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Load the dataset (first three training examples only)
dataset = load_dataset("derek-thomas/squad-v1.1-t5-question-generation", split='train[:3]')

# Extract relevant data from the dataset.
# The split is already selected by load_dataset above; the entire training set
# is too large to run on the author's computer.
train_data = dataset

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./question_generation",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Define the data collator: pads each batch dynamically, fills label padding
# with -100 and derives the shifted decoder inputs from the labels. With
# DataCollatorWithPadding, targets of differing length cannot be stacked.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Tokenize and encode the dataset using encode_batch; the raw string columns
# are dropped because the collator cannot turn them into tensors.
train_data_encoded = train_data.map(
    encode_batch,
    batched=True,
    remove_columns=train_data.column_names,
)

# Define Trainer with DataCollator
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data_encoded,
)

# Train the model
trainer.train()

# Save the fine-tuned model and its tokenizer side by side
model.save_pretrained("fine_tuned_flan_t5_large_squad")
tokenizer.save_pretrained("fine_tuned_flan_t5_large_squad")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading readme:   0%|          | 0.00/3.35k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18896 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2067 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
from datasets import load_dataset

# Earlier dataset candidates, kept for reference:
# dataset = load_dataset("RUCAIBox/Question-Generation", split='train')
# dataset = load_dataset("derek-thomas/squad-v1.1-t5-question-generation")
# NOTE(review): the variable is named after NewsQA, but the dataset requested
# here is "inquisitive_qg" — confirm which one is intended.
# The recorded run of this cell warned that `trust_remote_code=True` will become
# mandatory for this dataset, then failed with FileNotFoundError while
# downloading the dataset's articles.tgz archive.
newsqa_dataset = load_dataset("inquisitive_qg")
print(newsqa_dataset)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Downloading builder script: 100%|██████████| 6.70k/6.70k [00:00<?, ?B/s]
Downloading readme: 100%|██████████| 3.95k/3.95k [00:00<?, ?B/s]
Downloading data: 4.77MB [00:00, 9.83MB/s]                          


FileNotFoundError: Couldn't find file at https://github.com/wjko2/INQUISITIVE/raw/master/articles.tgz