In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset


# Two-column subset of df2: the model input and the reference limitation text.
data = df2[['input_text', 'Limitation']]

# Seeded split so the partition is reproducible across runs.
# NOTE(review): the split is taken from df2 (all columns), not from `data`,
# and test_size=0.7 holds out 70% of the rows -- confirm both are intended.
train_df3, test_df3 = train_test_split(
    df2,
    test_size=0.7,
    random_state=42,
)

class PegasusDataset(Dataset):
    """Tokenize ``input_text`` / ``Limitation`` rows for seq2seq fine-tuning.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``.
    Both texts are padded or truncated to ``max_length`` tokens, and the
    leading batch dimension added by ``return_tensors="pt"`` is squeezed away.

    Args:
        tokenizer: Callable tokenizer whose result exposes ``input_ids`` and
            ``attention_mask`` tensors (e.g. a Hugging Face tokenizer).
        data: pandas DataFrame with ``'input_text'`` and ``'Limitation'``
            columns; rows are accessed positionally via ``iloc``.
        max_length: Token length used for both the source and the target.
    """

    # Label value skipped by the cross-entropy loss, so padded target
    # positions do not contribute to the training signal.
    IGNORE_INDEX = -100

    def __init__(self, tokenizer, data, max_length=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` to a fixed ``max_length`` batch of one."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return the tokenized source/target pair for row ``idx``."""
        row = self.data.iloc[idx]
        source = self._encode(row['input_text'])
        target = self._encode(row['Limitation'])

        # Mask padded target positions; otherwise the model is trained to
        # emit pad tokens and the loss is dominated by padding.
        labels = target.input_ids.masked_fill(
            target.attention_mask == 0, self.IGNORE_INDEX
        )

        return {
            'input_ids': source.input_ids.squeeze(0),
            'attention_mask': source.attention_mask.squeeze(0),
            'labels': labels.squeeze(0),
        }

# Load the pretrained checkpoint together with its matching tokenizer.
tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-large')

# Checkpoint and evaluate once per epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./pegasus_results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=300,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="epoch",
    evaluation_strategy="epoch",
)

# Use the frames produced by this notebook's train_test_split call
# (train_df3 / test_df3). Building the datasets from any other split risks
# training on rows that are later predicted on via test_df3.
train_dataset = PegasusDataset(tokenizer, train_df3)
test_dataset = PegasusDataset(tokenizer, test_df3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()



In [None]:
def generate_limitation(text):
    """Generate a limitation statement for ``text`` with the global model.

    Relies on the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: Input text; it is truncated to at most 512 tokens.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed.
    """
    # Place the inputs on the model's own device instead of a separate global
    # `device`, so the two cannot disagree after the Trainer moves the model.
    input_ids = tokenizer.encode(
        text, return_tensors='pt', max_length=512, truncation=True
    ).to(model.device)

    # Beam search with 5 beams, capped at 150 generated tokens.
    output_tokens = model.generate(
        input_ids, max_length=150, num_beams=5, early_stopping=True
    )

    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)

# Manually applying the function to ensure device consistency
def apply_generate_limitations(df):
    """Return one generated limitation per ``df['input_text']`` entry.

    Texts are processed one at a time, in row order, through
    ``generate_limitation``; the result is a plain list of the same length.
    """
    return [generate_limitation(source_text) for source_text in df['input_text']]
# Store the model's prediction for every held-out row, in row order.
test_df3['predicted_limitation'] = apply_generate_limitations(test_df3)