In [None]:
def _compose_input_text(row):
    """Join the four paper sections of one row into a single labelled prompt.

    Each section is emitted on its own line as ``<Section name>: <content>``,
    and the result ends with a trailing newline.
    """
    return f"""Abstract: {row['Abstract']}
Introduction: {row['Introduction']}
Experiment_and_Results: {row['Experiment_and_Results']}
Conclusion: {row['Conclusion']}
"""


# NOTE(review): rows are read from `df` but the column is written to `df2`;
# pandas aligns the result on the index, so this presumably relies on both
# frames sharing the same index -- confirm.
df2['input_text'] = df.apply(_compose_input_text, axis=1)

In [None]:
!pip install transformers[torch]
!pip -q install transformers[torch] accelerate>=0.26.0

In [None]:
import inspect

import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments

# df2 is expected to already carry the combined 'input_text' column alongside
# the reference 'Limitation' text; only those two columns are needed here.
MODEL_COLUMNS = ['input_text', 'Limitation']
data = df2[MODEL_COLUMNS]

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable
# across re-runs.
train_data, test_data = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
)

class TextDataset(Dataset):
    """Seq2seq dataset pairing each row's ``input_text`` with its ``Limitation``.

    Args:
        tokenizer: Callable tokenizer that accepts ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` and returns a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        data: DataFrame with ``input_text`` and ``Limitation`` columns.
        max_len: Length every source and target sequence is truncated/padded to.
    """

    def __init__(self, tokenizer, data, max_len=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (examples) in the underlying frame."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` to exactly ``max_len`` tokens as a (1, max_len) batch."""
        return self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``.

        Padding positions in ``labels`` are set to -100 so the model's
        cross-entropy loss ignores them; otherwise the loss would be dominated
        by the pad tokens appended to reach ``max_len``.
        """
        item = self.data.iloc[idx]
        source_enc = self._encode(item['input_text'])
        target_enc = self._encode(item['Limitation'])

        # Clone so masking does not modify the tokenizer's own output tensor.
        labels = target_enc['input_ids'].flatten().clone()
        # The attention mask is 0 exactly where the tokenizer added padding.
        labels[target_enc['attention_mask'].flatten() == 0] = -100

        return {
            'input_ids': source_enc['input_ids'].flatten(),
            'attention_mask': source_enc['attention_mask'].flatten(),
            'labels': labels
        }

# Tokenizer and model must come from the same checkpoint, so name it once.
BART_CHECKPOINT = 'facebook/bart-large'
tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)
model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword is rejected there. Pick whichever name the
# installed TrainingArguments accepts so this cell runs on both.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./bart_results',
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    report_to="none",
    **{_eval_strategy_kwarg: "epoch"},  # evaluate once per epoch
)

# Wrap both splits so examples are tokenized on access.
train_dataset = TextDataset(tokenizer=tokenizer, data=train_data)
test_dataset = TextDataset(tokenizer=tokenizer, data=test_data)

# The held-out split doubles as the evaluation set during training.
trainer = Trainer(
    model=model, args=training_args,
    train_dataset=train_dataset, eval_dataset=test_dataset,
)

# Fine-tune; kept as the final expression of the cell, as in the original.
trainer.train()


In [None]:
import torch

# Use the GPU when one is available. generate_limitation moves its inputs to
# this same device, so the model and its inputs never end up on different devices.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move model to the appropriate device
# Put the model in evaluation mode so dropout is disabled during generation.
model.eval()

def generate_limitation(text):
    """Generate limitation text for a single input document.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``: the text is
    encoded (truncated to 512 tokens), moved to ``device``, and passed to
    ``model.generate`` with 5 beams and a 150-token output cap.

    Args:
        text: The input text to encode.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        removed.
    """
    encode_options = dict(return_tensors='pt', max_length=512, truncation=True)
    input_ids = tokenizer.encode(text, **encode_options)
    # Keep the input on the same device as the model.
    input_ids = input_ids.to(device)

    generated = model.generate(
        input_ids,
        max_length=150,
        num_beams=5,
        early_stopping=True,
    )

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Manually applying the function to ensure device consistency
def apply_generate_limitations(df):
    """Run ``generate_limitation`` over every entry of ``df['input_text']``.

    Args:
        df: Object whose ``'input_text'`` entry is an iterable of input texts
            (a DataFrame in this notebook).

    Returns:
        A list of generated strings, one per input text, in iteration order.
    """
    return [generate_limitation(source_text) for source_text in df['input_text']]
# Create the output column empty, then fill it with one generated limitation
# per test row.
output_column = 'generated_limitations_bart'
test_data[output_column] = ''
test_data[output_column] = apply_generate_limitations(test_data)

