In [None]:
# Notebook shell magics: install runtime dependencies quietly (-q).
# -U upgrades accelerate to the newest available release.
!pip -q install accelerate -U
# NOTE(review): sentencepiece is presumably required by T5Tokenizer, which is loaded later in this notebook — confirm.
!pip -q install sentencepiece

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset

import torch

# Pick the compute device once: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report the choice so the notebook output records where the run happened.
if device.type == "cuda":
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("Using CPU")

# model = model.to(device)

# df2['input_text'] = df2['Abstract'] + ' ' + df2['conclusion'] +  ' ' + df2['Introduction']
# df2 = df2[['input_text', 'Limitation']]  # Focus on relevant columns

# Narrow view holding only the model input column and the target column.
# NOTE(review): `data` is not what gets split below — the split runs on the
# full `df2`, so the resulting frames keep every column of `df2`.
feature_columns = ['input_text', 'Limitation']
data = df2[feature_columns]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
train_df2, test_df2 = train_test_split(df2, random_state=42, test_size=0.3)

class T5Dataset(Dataset):
    """Map-style dataset that tokenizes (input_text, Limitation) pairs on access.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``,
    all 1-D tensors padded/truncated to a fixed length.

    Args:
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors; its ``pad_token_id`` attribute (if
            present) is used to mask padding in the labels.
        data: ``pandas.DataFrame`` with ``'input_text'`` and ``'Limitation'``
            columns, indexed positionally via ``.iloc``.
        max_length: Fixed token length of the encoded source text.
        target_max_length: Fixed token length of the encoded target text.

    NOTE(review): the source text is tokenized without any task prefix, while
    ``generate_limitations`` in this file prepends ``"summarize: "`` at
    inference time — confirm that mismatch is intended.
    """

    # Label value ignored by the cross-entropy loss used by Hugging Face
    # seq2seq models, so padded positions do not contribute to training.
    IGNORE_INDEX = -100

    def __init__(self, tokenizer, data, max_length=512, target_max_length=100):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length
        self.target_max_length = target_max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return the model-ready tensors."""
        row = self.data.iloc[idx]
        input_text = row['input_text']
        target_text = row['Limitation']

        source_encodings = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        target_encodings = self.tokenizer(
            target_text,
            max_length=self.target_max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

        # squeeze(0) drops the batch dimension added by return_tensors="pt";
        # clone() so the in-place masking below never touches tokenizer output.
        labels = target_encodings['input_ids'].squeeze(0).clone()

        # Replace padding ids with the ignore index so the loss is computed
        # only on real target tokens, not on the fixed-length padding.
        pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_token_id is not None:
            labels[labels == pad_token_id] = self.IGNORE_INDEX

        return {
            'input_ids': source_encodings['input_ids'].squeeze(0),
            'attention_mask': source_encodings['attention_mask'].squeeze(0),
            'labels': labels,
        }

# Load the pretrained t5-small checkpoint: tokenizer first, then the seq2seq model.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Wrap both splits so examples are tokenized lazily, one row per access.
train_dataset = T5Dataset(tokenizer, train_df2)
test_dataset = T5Dataset(tokenizer, test_df2)

# Hyperparameters gathered in one mapping so they are easy to scan and tweak.
training_config = {
    'output_dir': './results',        # where Trainer writes its outputs
    'num_train_epochs': 3,
    'per_device_train_batch_size': 4,
    'per_device_eval_batch_size': 8,
    'warmup_steps': 300,
    'weight_decay': 0.01,
    'logging_dir': './logs',
    'logging_steps': 10,
    'do_eval': True,
    'evaluation_strategy': "epoch",   # evaluate on the held-out split every epoch
}
training_args = TrainingArguments(**training_config)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model on the training split.
trainer.train()

# model.to('cpu')  # Move model to CPU for inference if not using a GPU

# def generate_limitations(text):
#     inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512)
#     outputs = model.generate(inputs, max_length=150, num_beams=5, early_stopping=True)
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)

# test_df['predicted_limitation'] = test_df['input_text'].apply(generate_limitations)

def generate_limitations(text, prefix="summarize: ", max_input_length=512,
                         max_output_length=150, num_beams=5):
    """Generate a limitation statement for ``text`` with the fine-tuned model.

    Args:
        text: Source text (str) to condition the generation on.
        prefix: Task prefix prepended to ``text`` before tokenization.
        max_input_length: Maximum number of input tokens; longer inputs are
            truncated.
        max_output_length: Maximum number of tokens to generate.
        num_beams: Beam width used for beam search.

    Returns:
        The decoded generated text with special tokens removed.

    NOTE(review): ``T5Dataset`` in this file tokenizes training inputs without
    any prefix, so the default ``prefix`` makes inference inputs differ from
    training inputs — confirm this is intended.
    """
    model.eval()  # Set model to evaluation mode

    # Truncate explicitly: passing max_length alone only truncates through a
    # warning-emitting fallback in the tokenizer.
    input_ids = tokenizer.encode(
        prefix + text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )
    # Follow the model's actual device rather than the module-level `device`:
    # Trainer may have placed the model somewhere else, and mismatched devices
    # make generate() fail.
    input_ids = input_ids.to(model.device)

    outputs = model.generate(
        input_ids,
        max_length=max_output_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Run the model over every held-out input and store each generated text in a
# new column next to its source row.
held_out_inputs = test_df2['input_text']
test_df2['predicted_limitation'] = held_out_inputs.apply(generate_limitations)
