In [3]:
!pip install transformers datasets




In [11]:
from IPython import get_ipython
from IPython.display import display
# %%
! pip install transformers datasets
# %%
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset
from torch.cuda.amp import autocast, GradScaler
import os
import warnings
warnings.filterwarnings('ignore')

# %%
def load_data():
  # Create a data directory if it doesn't exist
  data_dir = "./cnn_dailymail_data"
  os.makedirs(data_dir, exist_ok=True)

  # Define a cache directory for the dataset metadata and configuration
  cache_dir = "./datasets_cache"
  os.makedirs(cache_dir, exist_ok=True)

  # Explicitly set the data_dir and cache_dir when loading the dataset
  # Also, consider setting download_mode="force_redownload" if you suspect
  # a corrupted cache, but use with caution as it will hit the Hub again.
  # For now, we will use default download_mode.
  #
  # HfHubHTTPError: 429 Client Error: Too Many Requests indicates a rate limit
  # from the Hugging Face Hub. This is usually a temporary issue.
  # The code itself is correctly specifying local directories for data and cache.
  # The library might still attempt to verify metadata with the Hub, triggering
  # the rate limit if many requests are made in a short period.
  #
  # Action to fix: Wait and retry running the cell that calls load_data().
  # No code changes are needed to fix this specific error.
  dataset = load_dataset(
      "cnn_dailymail",
      "3.0.0",
      split="train[:200]",
      data_dir=data_dir,
      cache_dir=cache_dir
      )
  return dataset

# %%
# Load the dataset once and assign it to the variable 'dataset'
# This is the line that failed due to the HfHubHTTPError.
# The solution is to wait and re-run this cell after the Hugging Face Hub
# rate limit for your connection resets.
# You might need to wait a few minutes to a couple of hours depending on the Hub's policy.
try:
    dataset = load_data()
    # Print the first 1000 characters of the first article to confirm loading
    print(dataset[0]['article'][:1000])
except Exception as e:
    print(f"An error occurred during data loading: {e}")
    print("Please wait a few minutes and try running this cell again.")
    dataset = None # Ensure dataset is None if loading fails

# %%
class SummarizationDataset(Dataset):
  def __init__(self,dataset,tokenizer,max_length=256,target_max_length=128):
    self.dataset=dataset
    self.tokenizer=tokenizer
    self.max_length=max_length
    self.target_max_length=target_max_length
    self.max_input_length = max_length
    self.max_target_length = target_max_length
  def __len__(self):
    return len(self.dataset)
  def __getitem__(self,idx):
    article = self.dataset[idx]['article']
    summary = self.dataset[idx]['highlights']

    article = "summarize: " + article

    input_encoding = self.tokenizer(
        article,
        max_length=self.max_input_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    target_encoding = self.tokenizer(
        summary,
        max_length = self.max_target_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    return {
        "input_ids": input_encoding["input_ids"].squeeze(),
        "attention_mask": input_encoding["attention_mask"].squeeze(),
        "labels": target_encoding["input_ids"].squeeze()
    }

# %%
# Note: The CustomTransformer class seems to implement a standard Transformer architecture,
# but the traceback points to an issue during dataset loading, not model training or inference.
# We will keep this class definition for completeness, but the fix focuses on dataset loading.
class CustomTransformer(nn.Module):
  def __init__(self,vocab_size,d_model=256,nhead=4,num_encoder_layers=3,num_decoder_layers=3,dim_feedforwad=1024,dropout=0.1):
    super(CustomTransformer,self).__init__()
    self.embedding=nn.Embedding(vocab_size,d_model)
    self.pos_encoder=nn.Parameter(torch.zeros(1,512,d_model))
    self.transformer=nn.Transformer(
        d_model=d_model,
        nhead=nhead,
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        dim_feedforward=dim_feedforwad,
        dropout=dropout
    )
    self.fc_out=nn.Linear(d_model,vocab_size)
    self.d_model=d_model

  def forward(self,src,tgt,src_mask=None,tgt_mask=None):
    src_embedded = self.embedding(src) * torch.sqrt(torch.tensor(self.d_model,dtype=torch.float))
    src_embedded = src_embedded + self.pos_encoder[:, :src_embedded.size(1),:]
    tgt_embedded = self.embedding(tgt) * torch.sqrt(torch.tensor(self.d_model,dtype=torch.float))
    tgt_embedded = tgt_embedded + self.pos_encoder[:, :tgt_embedded.size(1),:]
    # The transformer expects input in (sequence_length, batch_size, d_model)
    output=self.transformer(src_embedded.transpose(0,1),tgt_embedded.transpose(0,1),src_mask,tgt_mask)
    return output

# %%
# We will use the T5ForConditionalGeneration model from Hugging Face Transformers
# as it is more standard for summarization tasks than a custom Transformer.
# This also aligns with the T5Tokenizer usage.
def load_model_tokenizer(model_name='t5-small'):
    # Use T5ForConditionalGeneration which is designed for sequence-to-sequence tasks like summarization
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    tokenizer = T5Tokenizer.from_pretrained(model_name) # Use model_name for tokenizer too
    device='cuda' if torch.cuda.is_available() else 'cpu'
    return model.to(device), tokenizer


# %%
def train_model(model, dataloader, tokenizer, epochs=1, device='cuda' if torch.cuda.is_available() else 'cpu'):
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    # T5 models usually handle the label shifting internally for loss calculation
    # We can pass labels directly to the model's forward pass
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    scaler = GradScaler()

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            optimizer.zero_grad()

            with autocast():
                # For T5ForConditionalGeneration, we pass input_ids, attention_mask, and labels
                # The model's forward method returns a dictionary containing loss, logits, etc.
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss # Get the loss from the model output

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()
        print(f"Epoch: {epoch + 1}/{epochs}, Loss: {total_loss / len(dataloader)}")

    # Save the fine-tuned model state dictionary
    torch.save(model.state_dict(), 't5_summarization_model.pth')
    # Save the tokenizer as well
    tokenizer.save_pretrained("./t5_summarizer_tokenizer")


# %%
def summarize_text(text, model, tokenizer,max_length=128,min_length=30,device='cuda' if torch.cuda.is_available() else 'cpu'):
  model.eval()
  model.to(device) # Ensure model is on the correct device
  text="summarize: " + text # Add the task prefix for T5 models
  encoding=tokenizer(
      text,
      max_length=512, # Increased max_length for potentially longer articles
      padding="max_length",
      truncation=True,
      return_tensors="pt"
  )
  input_ids = encoding.input_ids.to(device)
  attention_mask = encoding.attention_mask.to(device)

  # Use the generate method from the T5ForConditionalGeneration model
  generated_ids = model.generate(
      input_ids=input_ids,
      attention_mask=attention_mask,
      max_length=max_length,
      min_length=min_length,
      num_beams=4,
      early_stopping=True
  )
  summary=tokenizer.decode(generated_ids[0],skip_special_tokens=True)
  return summary

# %%
def main():
  # Data Loading and Preparation
  # The dataset should have been loaded in a previous cell.
  # Ensure dataset is defined before creating the SummarizationDataset instance
  # This check is helpful if you run cells out of order, but the primary fix
  # for the HfHubHTTPError is to wait and retry loading the data.
  if 'dataset' not in globals() or dataset is None:
      print("Dataset not loaded. Please run the data loading cell first and ensure it succeeds.")
      return # Exit if dataset is not loaded

  # Model and Tokenizer Loading (Using T5ForConditionalGeneration)
  model,tokenizer = load_model_tokenizer()

  # Training Setup (Using a small subset for demonstration)
  # Using the already loaded dataset
  train_dataset = SummarizationDataset(dataset,tokenizer)
  train_dataloader = DataLoader(train_dataset,batch_size=8,shuffle=True)

  # Train the model (Optional, if you want to fine-tune)
  # print("Starting model training...")
  # train_model(model,train_dataloader,tokenizer, epochs=1) # Uncomment to train
  # print("Model training finished.")

  # Load the trained model state dict if you trained it
  # try:
  #     model.load_state_dict(torch.load('t5_summarization_model.pth'))
  #     print("Loaded trained model state dictionary.")
  # except FileNotFoundError:
  #     print("No trained model state dictionary found. Using the pre-trained model.")

  # Example Summarization
  sample_text="""
  The quick brown fox jumps over the lazy dog. This is a classic pangram used to test typewriters and keyboards.
    It contains every letter of the English alphabet. The fox is known for its agility and cunning, while the dog,
    in this case, is depicted as idle. This sentence has been used in various contexts to demonstrate text processing.
    The pangram is often employed in design and development to ensure that fonts and text rendering systems display
    all characters correctly. Its brevity and inclusivity make it a practical tool for testing.
  """
  print("\n--- Summarization Example ---")
  print("Original Text: ", sample_text)
  print("Length of original Text (words):", len(sample_text.split()))

  # Summarize the sample text
  summary = summarize_text(sample_text, model, tokenizer)

  print("\nSummary:", summary)
  print("Length of Summary (words):", len(summary.split()))

# %%
# This block will execute the main function if the script is run directly
# or if this cell is run in a Jupyter notebook environment.
if __name__ == '__main__':
   main()

# %%
# Example of summarizing user input outside the main function for interactive use
# This cell can be run independently after the main function has completed (or parts of it)
# Make sure 'model' and 'tokenizer' are in the global scope or loaded here.
# model, tokenizer = load_model_tokenizer() # Load model and tokenizer if not already loaded

# input_text = input("Enter text to be summarized:").strip()
# summary = summarize_text(input_text, model, tokenizer)
# print("Length of input (words):", len(input_text.split()))
# print("\nSummary:", summary)
# print("\nLength of Summary (words):", len(summary.split()))

An error occurred during data loading: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/cnn_dailymail/tree/96df5e686bee6baa90b8bee7c28b81fa3fa6223d/.%2Fcnn_dailymail_data?recursive=True&expand=False
Please wait a few minutes and try running this cell again.
Dataset not loaded. Please run the data loading cell first and ensure it succeeds.


In [12]:
def train_model(model,dataloader,tokenizer,epochs=1,device='cuda' if torch.cuda.is_available() else 'cpu'):
  model.to(device)
  optimizer=optim.Adam(model.parameters(),lr=0.0001)
  criterion=nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
  scaler=GradScaler()
  for epoch in range(epochs):
    model.train()
    total_loss=0
    for batch in dataloader:
      input_ids=batch['input_ids'].to(device)
      attention_mask=batch['attention_mask'].to(device)
      labels=batch["labels"].to(device)
      optimizer.zero_grad()
      decoder_input= labels[:,:,1]
      decoder_target=labels[:,:,1]
      tgt_mask=model.generate_square_subsequent_mask(decoder_input.size(1)).to(device)

      with autocast():

        output=model(input_ids,decoder_input,tgt_mask=tgt_mask)
        loss=criterion(output.reshape(-1,output.size(-1)),decoder_target.reshape(-1))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss+=loss.item()
      print(f"Epoch:{epoch+1}/{epoch},Loss{total_loss/len(dataloader)}")
    torch.save(model.state_dict(),"custom_transformer_model.pth")
    os.makedirs("summarizer_tokens",exist_ok=True)
    tokenizer.save_pretrained("tokenizer")

In [13]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
import pickle
import os

In [14]:
def load_model_tokenizer(model_name='t5-small'):
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    return model.to(device).eval(), tokenizer


In [15]:
def summarize_text(text, model, tokenizer, max_length=128, min_length=30, device='cuda' if torch.cuda.is_available() else 'cpu'):
    text = "summarize: " + text
    encoding = tokenizer(text, max_length=512, padding="max_length", truncation=True, return_tensors="pt")
    input_ids = encoding.input_ids.to(device)
    attention_mask = encoding.attention_mask.to(device)

    summary_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        min_length=min_length,
        num_beams=4,
        early_stopping=True
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


In [16]:
def main():
    model, tokenizer = load_model_tokenizer()
    input_text = input("Enter text to be summarized:\n").strip()
    summary = summarize_text(input_text, model, tokenizer)
    print("\n--- Summary ---")
    print(summary)
    print("\nLength of Input (words):", len(input_text.split()))
    print("Length of Summary (words):", len(summary.split()))

if __name__ == '__main__':
    main()


HTTP Error 429 thrown while requesting HEAD https://huggingface.co/t5-small/resolve/main/config.json
Retrying in 1s [Retry 1/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/t5-small/resolve/main/config.json
Retrying in 2s [Retry 2/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/t5-small/resolve/main/config.json
Retrying in 4s [Retry 3/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/t5-small/resolve/main/config.json
Retrying in 8s [Retry 4/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/t5-small/resolve/main/config.json
Retrying in 8s [Retry 5/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/t5-small/resolve/main/config.json
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/t5-small/resolve/main/adapter_config.json
Retrying in 1s [Retry 1/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/t5-small/resolve/main/adapter_config.json
Retrying

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Enter text to be summarized:
if you want to automate uploads using Git inside Colab, you can do it with a few commands. if you want to automate uploads using Git, you can do it with a few commands.

--- Summary ---
if you want to automate uploads using Git inside Colab, you can do it with a few commands. if you want to automate uploads using Git, you can do it with a few commands.

Length of Input (words): 34
Length of Summary (words): 34


In [1]:
!git clone https://github.com/Gowreeshdechhar/t5-summarizer


Cloning into 't5-summarizer'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.


In [3]:
!pwd


/content


In [4]:
!ls

sample_data
