In [None]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load your dataset
df = pd.read_csv('prepared_data.csv')

# Convert to dictionary for easier processing
data = df.to_dict('records')

In [None]:
# Load the pretrained T5-small model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Set device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [None]:
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset

# Create a custom dataset class
class SummaryDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_len=512, max_output_len=150):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_output_len = max_output_len
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        item = self.data[index]
        
        # Combine title and content for better context
        input_text = f"summarize: {item['title']}. {item['content']}"
        
        # Tokenize inputs
        inputs = self.tokenizer(
            input_text,
            max_length=self.max_input_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        # Tokenize targets (using content as target for self-supervised learning)
        # In a real scenario, you'd have human-written summaries as targets
        targets = self.tokenizer(
            item['content'][:500],  # Using first 500 chars as pseudo-summary
            max_length=self.max_output_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        return {
            "input_ids": inputs["input_ids"].flatten(),
            "attention_mask": inputs["attention_mask"].flatten(),
            "labels": targets["input_ids"].flatten()
        }

# Create dataset
train_dataset = SummaryDataset(data, tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=3e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Start training
trainer.train()

In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pandas as pd

# Initialize the pretrained model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Set device (use CPU if GPU not available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def t5_summarize(text, max_length=150):
    # Prepare input text with task prefix
    input_text = f"summarize: {text}"
    
    # Tokenize and generate
    inputs = tokenizer.encode(
        input_text,
        return_tensors="pt",
        max_length=512,
        truncation=True
    ).to(device)
    
    outputs = model.generate(
        inputs,
        max_length=max_length,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Load your data
df = pd.read_csv('prepared_data.csv')

# Apply summarization in batches to avoid memory issues
batch_size = 8  # Adjust based on your RAM
summaries = []
for i in range(0, len(df), batch_size):
    batch = df['content'].iloc[i:i+batch_size].astype(str).tolist()
    batch_summaries = [t5_summarize(text) for text in batch]
    summaries.extend(batch_summaries)
    print(f"Processed {min(i+batch_size, len(df))}/{len(df)}")

df['t5_summary'] = summaries
df.to_csv('t5_summaries_no_finetuning.csv', index=False)

2025-04-20 07:00:14.615317: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745128814.642653  377345 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745128814.651333  377345 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745128814.673677  377345 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745128814.673706  377345 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745128814.673709  377345 computation_placer.cc:177] computation placer alr

Processed 8/192392
Processed 16/192392
Processed 24/192392
Processed 32/192392
Processed 40/192392
Processed 48/192392
Processed 56/192392
Processed 64/192392
Processed 72/192392
Processed 80/192392
Processed 88/192392
Processed 96/192392
Processed 104/192392
Processed 112/192392
Processed 120/192392
Processed 128/192392
Processed 136/192392
Processed 144/192392
Processed 152/192392
Processed 160/192392
Processed 168/192392
Processed 176/192392
Processed 184/192392
Processed 192/192392
Processed 200/192392
Processed 208/192392
Processed 216/192392
Processed 224/192392
Processed 232/192392
Processed 240/192392
Processed 248/192392
Processed 256/192392
Processed 264/192392
Processed 272/192392
Processed 280/192392
Processed 288/192392
Processed 296/192392
Processed 304/192392
Processed 312/192392
Processed 320/192392
Processed 328/192392
Processed 336/192392
Processed 344/192392
Processed 352/192392
Processed 360/192392
Processed 368/192392
Processed 376/192392
Processed 384/192392
Proce

KeyboardInterrupt: 