In [None]:
import os

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
# Location of the WikiHow CSV. The default is the original machine-specific
# path; set the WIKIHOW_CSV_PATH environment variable to read the file from
# somewhere else without editing this cell.
WIKIHOW_CSV_PATH = os.environ.get(
    "WIKIHOW_CSV_PATH", "/Users/keshavsaraogi/data/wikihowAll.csv"
)
data = pd.read_csv(WIKIHOW_CSV_PATH)

# Show which columns the raw frame provides
print(data.columns)

In [None]:
# Preview the first rows of the raw frame
print(data.head())
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
data.info()
# Summary statistics of the raw frame
print(data.describe())

In [None]:
# Keep only the columns listed below and discard rows missing any of them
relevant_columns = ['title', 'headline', 'text']
df = data[relevant_columns].dropna()
print(df)

In [None]:
# Source articles and their headline summaries as plain Python lists
article_texts = df['text'].tolist()
article_headlines = df['headline'].tolist()

# Hold out 10% of the pairs for validation; the fixed seed keeps the split repeatable
train_texts, val_texts, train_summaries, val_summaries = train_test_split(
    article_texts,
    article_headlines,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Use CUDA when torch reports it as available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

In [None]:
# Load the T5 tokenizer for the pretrained checkpoint named below
pretrained_checkpoint = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(pretrained_checkpoint)

In [None]:
class SummarizationDataset(Dataset):
    """Map-style dataset that tokenizes (text, summary) pairs on access.

    Each item is a dict with three 1-D tensors:

    * ``input_ids``      -- token ids of ``"summarize: " + text``
    * ``attention_mask`` -- mask returned by the tokenizer for the input
    * ``labels``         -- token ids of the summary, with padded positions
      replaced by ``LABEL_IGNORE_INDEX`` so they do not contribute to the loss

    Args:
        texts: Indexable collection of source documents (strings).
        summaries: Indexable collection of target summaries, aligned with
            ``texts`` by position.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=..., padding=..., max_length=...,
            return_tensors="pt")`` and returning a mapping with ``input_ids``
            and ``attention_mask`` tensors whose first dimension is 1. If it
            exposes ``pad_token_id``, that id is masked out of the labels.
        max_input_length: Fixed length the source text is padded/truncated to.
        max_output_length: Fixed length the summary is padded/truncated to.
    """

    # Label value ignored by PyTorch's cross-entropy loss (its default
    # ``ignore_index``) and by Hugging Face seq2seq models.
    LABEL_IGNORE_INDEX = -100

    def __init__(self, texts, summaries, tokenizer, max_input_length=512, max_output_length=128):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        """Return the number of source texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize and return the ``idx``-th (text, summary) pair as tensors."""
        input_text = "summarize: " + self.texts[idx]
        target_text = self.summaries[idx]

        input_encoding = self.tokenizer(
            input_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_input_length,
            return_tensors="pt",
        )
        target_encoding = self.tokenizer(
            target_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_output_length,
            return_tensors="pt",
        )

        # squeeze(0) removes only the batch dimension added by return_tensors="pt";
        # a bare squeeze() would also collapse a length-1 sequence to a 0-d tensor.
        labels = target_encoding['input_ids'].squeeze(0)

        # Padding positions must not be scored by the loss: swap the pad id for
        # the ignore index. Skipped when the tokenizer declares no pad token.
        pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_token_id is not None:
            labels = labels.masked_fill(labels == pad_token_id, self.LABEL_IGNORE_INDEX)

        return {
            'input_ids': input_encoding['input_ids'].squeeze(0),
            'attention_mask': input_encoding['attention_mask'].squeeze(0),
            'labels': labels,
        }


In [None]:
# Wrap the training and validation splits in SummarizationDataset objects
train_dataset = SummarizationDataset(train_texts, train_summaries, tokenizer)
val_dataset = SummarizationDataset(val_texts, val_summaries, tokenizer)

# Both loaders use the same batch size; only the training loader shuffles
loader_batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=loader_batch_size, shuffle=False)

# Load the pre-trained T5 model, then move it to the selected device
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model = model.to(device)

# AdamW optimizer over all model parameters (no separate loss object is created here)
learning_rate = 5e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)