In [1]:
import os

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [2]:
# Location of the WikiHow CSV. Set the WIKIHOW_CSV environment variable to run the
# notebook on another machine; the default is the original local path.
data_path = os.environ.get("WIKIHOW_CSV", "/Users/keshavsaraogi/data/wikihowAll.csv")
data = pd.read_csv(data_path)

print(data.columns)

Index(['headline', 'title', 'text'], dtype='object')


In [3]:
# Quick look at the raw frame: first rows, schema / non-null counts, summary statistics.
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() would append a stray "None" line to the output.
data.info()
print(data.describe())

                                            headline  \
0  \nKeep related supplies in the same area.,\nMa...   
1  \nCreate a sketch in the NeoPopRealist manner ...   
2  \nGet a bachelor’s degree.,\nEnroll in a studi...   
3  \nStart with some experience or interest in ar...   
4  \nKeep your reference materials, sketches, art...   

                                    title  \
0          How to Be an Organized Artist1   
1  How to Create a Neopoprealist Art Work   
2      How to Be a Visual Effects Artist1   
3           How to Become an Art Investor   
4          How to Be an Organized Artist2   

                                                text  
0   If you're a photographer, keep all the necess...  
1   See the image for how this drawing develops s...  
2   It is possible to become a VFX artist without...  
3   The best art investors do their research on t...  
4   As you start planning for a project or work, ...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215365 entri

In [4]:
# Keep only the three columns used downstream and discard any row
# that has a missing value in one of them.
relevant_columns = ['title', 'headline', 'text']
df = data[relevant_columns].dropna()
print(df)

                                         title  \
0               How to Be an Organized Artist1   
1       How to Create a Neopoprealist Art Work   
2           How to Be a Visual Effects Artist1   
3                How to Become an Art Investor   
4               How to Be an Organized Artist2   
...                                        ...   
215360               How to Pick a Stage Name3   
215361               How to Pick a Stage Name4   
215362                 How to Identify Prints1   
215363                 How to Identify Prints2   
215364                 How to Identify Prints3   

                                                 headline  \
0       \nKeep related supplies in the same area.,\nMa...   
1       \nCreate a sketch in the NeoPopRealist manner ...   
2       \nGet a bachelor’s degree.,\nEnroll in a studi...   
3       \nStart with some experience or interest in ar...   
4       \nKeep your reference materials, sketches, art...   
...                              

In [5]:
# Articles are the model inputs, headlines are the target summaries.
article_texts = df['text'].tolist()
article_headlines = df['headline'].tolist()

# Hold out 10% of the pairs for validation; the fixed seed keeps the split repeatable.
train_texts, val_texts, train_summaries, val_summaries = train_test_split(
    article_texts,
    article_headlines,
    test_size=0.1,
    random_state=42,
)

In [6]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


In [7]:
# Load the tokenizer published with the "t5-small" checkpoint
tokenizer_checkpoint = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(tokenizer_checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [8]:
class SummarizationDataset(Dataset):
    """Map-style dataset of tokenized (article, summary) pairs for seq2seq fine-tuning.

    Each item is a dict with ``input_ids`` and ``attention_mask`` for the article
    (prefixed with ``"summarize: "``) and ``labels`` for the summary. Padding
    positions in ``labels`` are replaced with -100 so the cross-entropy loss
    ignores them instead of training the model to emit pad tokens.

    Args:
        texts: Sequence of source article strings.
        summaries: Sequence of target summary strings, aligned index-by-index with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors="pt")`` that returns a
            mapping with ``input_ids`` and ``attention_mask`` tensors of shape
            ``(1, max_length)``. If it exposes ``pad_token_id``, that id is masked
            out of the labels.
        max_input_length: Articles are truncated/padded to this many tokens.
        max_output_length: Summaries are truncated/padded to this many tokens.

    Raises:
        ValueError: If ``texts`` and ``summaries`` differ in length.
    """

    # Label value skipped by torch.nn.CrossEntropyLoss (its default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, texts, summaries, tokenizer, max_input_length=512, max_output_length=128):
        if len(texts) != len(summaries):
            raise ValueError(
                f"texts and summaries must have the same length, got {len(texts)} and {len(summaries)}"
            )
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length
    
    def __len__(self):
        """Return the number of (article, summary) pairs."""
        return len(self.texts)
    
    def __getitem__(self, idx):
        """Tokenize the pair at ``idx`` and return fixed-length 1-D tensors."""
        input_text = "summarize: " + self.texts[idx]
        target_text = self.summaries[idx]
        
        input_encoding = self.tokenizer(
            input_text, truncation=True, padding='max_length', max_length=self.max_input_length, return_tensors="pt"
        )
        target_encoding = self.tokenizer(
            target_text, truncation=True, padding='max_length', max_length=self.max_output_length, return_tensors="pt"
        )
        
        # squeeze(0) drops only the batch axis added by return_tensors="pt"; a bare
        # squeeze() would also collapse the sequence axis when its length is 1.
        labels = target_encoding['input_ids'].squeeze(0)
        pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_token_id is not None:
            # Exclude padding from the loss.
            labels = labels.masked_fill(labels == pad_token_id, self.IGNORE_INDEX)
        
        return {
            'input_ids': input_encoding['input_ids'].squeeze(0),
            'attention_mask': input_encoding['attention_mask'].squeeze(0),
            'labels': labels
        }


In [9]:
# Wrap each split in a dataset, then batch it; only the training split is shuffled
BATCH_SIZE = 8
train_dataset = SummarizationDataset(train_texts, train_summaries, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = SummarizationDataset(val_texts, val_summaries, tokenizer)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Pre-trained "t5-small" weights, moved onto the selected device
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model = model.to(device)

# Optimizer over every model parameter. No separate loss function is built here:
# the training loop reads the loss from the model's output.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [10]:
# Training loop
def train_model(model, train_loader, val_loader, optimizer, epochs=3):
    """Fine-tune ``model`` and print the mean training and validation loss per epoch.

    Args:
        model: ``torch.nn.Module`` called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes a scalar ``.loss`` tensor.
        train_loader: Iterable of batches (dicts with ``input_ids``, ``attention_mask``
            and ``labels`` tensors) that also supports ``len()``.
        val_loader: Same structure as ``train_loader``, evaluated after every epoch
            without gradient tracking. ``None`` or an empty loader skips validation.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of full passes over ``train_loader``.

    Returns:
        None. Progress is reported via ``print``. The model is left in training mode.
    """
    # Run on the device the weights already live on rather than on a notebook-level
    # global, so the function works regardless of which cells were executed.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    def _to_device(batch):
        # Move only the tensors the model consumes.
        return {key: batch[key].to(run_device) for key in ('input_ids', 'attention_mask', 'labels')}

    for epoch in range(epochs):
        # Re-enter training mode every epoch because validation switches to eval mode.
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            outputs = model(**_to_device(batch))
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        # max(..., 1) avoids a ZeroDivisionError when the loader is empty.
        print(f"Epoch {epoch+1}, Loss: {total_loss/max(len(train_loader), 1)}")

        if val_loader is not None and len(val_loader) > 0:
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for batch in val_loader:
                    val_loss += model(**_to_device(batch)).loss.item()
            print(f"Epoch {epoch+1}, Validation Loss: {val_loss/len(val_loader)}")

    # Leave the model in training mode, as the loop did before validation was added.
    model.train()
    print("Training complete!")


In [11]:
# Fine-tune for three epochs on the training loader
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    epochs=3,
)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


In [None]:
# Inference Function
def generate_summary(text, max_input_length=512, max_output_length=128):
    """Generate a summary of ``text`` with the notebook's ``model`` and ``tokenizer``.

    Args:
        text: Article text to summarize; ``"summarize: "`` is prepended before tokenizing.
        max_input_length: Inputs longer than this many tokens are truncated.
        max_output_length: Passed to ``model.generate`` as ``max_length``.

    Returns:
        The decoded summary string with special tokens removed.
    """
    model.eval()
    input_text = "summarize: " + text
    # A single sequence needs no padding. Padding every input to the maximum length
    # makes the encoder process pad positions that the attention mask then ignores.
    input_encoding = tokenizer(input_text, return_tensors="pt", max_length=max_input_length, truncation=True)
    input_ids = input_encoding['input_ids'].to(device)
    attention_mask = input_encoding['attention_mask'].to(device)
    
    summary_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_output_length)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


In [None]:
# Smoke-test the summarizer on a short hand-written paragraph
sample_text = """
WikiHow is a website that provides step-by-step guides on various topics. It is widely used by individuals seeking
how-to information, covering subjects such as health, technology, and daily life tips. The platform relies on volunteer 
contributors and community edits to maintain the accuracy and quality of its articles.
"""
generated_summary = generate_summary(sample_text)
print("Generated Summary:", generated_summary)