In [None]:
!pip install transformers datasets torch --quiet

In [None]:
# Mount Google Drive at /content/drive. Requires the `google.colab` package.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer


In [None]:
# Load the 'bert-base-uncased' tokenizer and encoder. Both are kept as
# notebook-level globals and are read by `encode_paragraphs` below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
transformer = BertModel.from_pretrained('bert-base-uncased')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
class MultiHeadAttentionPooling(nn.Module):
    """Pool a sequence of token states into a single vector per sequence.

    The feature dimension is split into ``n_heads`` slices. Each head owns one
    learned query vector that scores every token of its slice; the
    softmax-weighted sums of all heads are concatenated, projected, and passed
    through a residual feed-forward block followed by LayerNorm.

    Args:
        d_model: Size of the incoming (and outgoing) feature dimension.
        n_heads: Number of pooling heads; must divide ``d_model``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``n_heads``.
    """

    def __init__(self, d_model=768, n_heads=8):
        super().__init__()
        if d_model % n_heads != 0:
            # Fail at construction time instead of with a shape error in forward().
            raise ValueError(
                f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
            )
        self.n_heads = n_heads
        self.d_head = d_model // n_heads

        # One learned query per head, shape (n_heads, 1, d_head).
        self.query = nn.Parameter(torch.randn(n_heads, 1, self.d_head))
        # NOTE(review): `linear` is never used by forward(). It is retained so
        # parameter names and initialisation order stay unchanged.
        self.linear = nn.Linear(d_model, d_model)
        self.output_proj = nn.Linear(d_model, d_model)

        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, d_model)
        )
        self.layernorm = nn.LayerNorm(d_model)

    def forward(self, Cp, attention_mask=None):
        """Pool ``Cp`` over its sequence axis.

        Args:
            Cp: Tensor of shape (B, T, d_model).
            attention_mask: Optional (B, T) tensor; non-zero / True marks real
                tokens, zero / False marks padding to be ignored. When omitted
                every position is attended to (the original behaviour).

        Returns:
            Tensor of shape (B, d_model).
        """
        B, T, D = Cp.shape
        # (B, T, D) -> (B, H, T, d_head); reshape also accepts non-contiguous input.
        Cp_heads = Cp.reshape(B, T, self.n_heads, self.d_head).transpose(1, 2)
        # (H, 1, d_head) -> (B, H, 1, d_head), shared across the batch.
        query = self.query.unsqueeze(0).expand(B, -1, -1, -1)

        # Scaled dot-product score of each head's query against every token: (B, H, 1, T).
        scores = torch.matmul(query, Cp_heads.transpose(-1, -2)) / (self.d_head ** 0.5)
        if attention_mask is not None:
            keep = attention_mask.to(device=scores.device, dtype=torch.bool)[:, None, None, :]
            # Use the dtype minimum rather than -inf so a fully masked row
            # yields finite weights instead of NaN.
            scores = scores.masked_fill(~keep, torch.finfo(scores.dtype).min)
        attn_weights = torch.softmax(scores, dim=-1)

        # Weighted sum per head, then concatenate heads back to (B, d_model).
        pooled = torch.matmul(attn_weights, Cp_heads)
        pooled = pooled.squeeze(2).reshape(B, -1)

        out = self.output_proj(pooled)
        out = self.layernorm(out + self.ffn(out))

        return out


In [None]:
pooling = MultiHeadAttentionPooling(d_model=768, n_heads=8)

def encode_paragraphs(paragraph_texts):
    """Encode paragraphs with the global BERT model and pool each one.

    Uses the notebook-level `tokenizer`, `transformer` and `pooling` objects.
    The batch is padded to its longest member; `pooling` is given only the
    hidden states, so no padding mask is applied at the pooling step.

    Returns:
        (Cp, φp): the encoder's last hidden states and the pooled vectors.
    """
    encoded = tokenizer(paragraph_texts, padding=True, truncation=True, return_tensors='pt')
    Cp = transformer(**encoded).last_hidden_state
    return Cp, pooling(Cp)


In [None]:
# Toy input: three one-sentence "paragraphs".
paras = ["This is paragraph one.", "This is paragraph two.", "This is paragraph three."]

# Encode each paragraph on its own (batch of one), collecting the token-level
# states and the pooled paragraph vector separately.
Cp_list, φp_list = [], []
for p_text in paras:
    Cp, φ = encode_paragraphs([p_text])
    Cp_list.append(Cp)
    φp_list.append(φ)

  return forward_call(*args, **kwargs)


In [None]:
class PHTDecoderLayer(nn.Module):
    """One decoder layer with self-attention and two cross-attention stages.

    Sub-layers, each followed by a residual connection and LayerNorm:
      1. self-attention over the target sequence,
      2. cross-attention over token-level memory (`memory_tokens`),
      3. cross-attention over paragraph-level memory (`memory_paras`),
      4. position-wise feed-forward network.

    All attention modules are built with ``batch_first=True``, so tensors are
    laid out as (batch, length, d_model).

    Args:
        d_model: Feature size of targets and memories.
        n_heads: Number of attention heads in each attention module.
    """

    def __init__(self, d_model=768, n_heads=8):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.word_cross_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.para_cross_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)

        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, d_model)
        )

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.norm4 = nn.LayerNorm(d_model)

    def forward(self, tgt, memory_tokens, memory_paras, tgt_mask=None,
                tgt_key_padding_mask=None, memory_tokens_key_padding_mask=None,
                memory_paras_key_padding_mask=None):
        """Run the layer.

        Args:
            tgt: Target states, (B, L, d_model).
            memory_tokens: Token-level encoder states, (B, S_tok, d_model).
            memory_paras: Paragraph-level encoder states, (B, S_par, d_model).
            tgt_mask: Optional (L, L) self-attention mask, e.g. a causal mask
                (boolean ``True`` = position may NOT be attended).
            tgt_key_padding_mask: Optional (B, L) padding mask for `tgt`.
            memory_tokens_key_padding_mask: Optional (B, S_tok) padding mask.
            memory_paras_key_padding_mask: Optional (B, S_par) padding mask.

        All masks default to ``None``, which reproduces the unmasked behaviour
        of the three-argument call.

        Returns:
            Updated target states, (B, L, d_model).
        """
        # 1) Self-attention. Without `tgt_mask` every position sees the whole
        #    target sequence, including later positions.
        self_attn_out, _ = self.self_attn(
            tgt, tgt, tgt,
            attn_mask=tgt_mask,
            key_padding_mask=tgt_key_padding_mask,
        )
        tgt = self.norm1(tgt + self_attn_out)

        # 2) Attend to token-level memory.
        word_attn_out, _ = self.word_cross_attn(
            tgt, memory_tokens, memory_tokens,
            key_padding_mask=memory_tokens_key_padding_mask,
        )
        tgt = self.norm2(tgt + word_attn_out)

        # 3) Attend to paragraph-level memory.
        para_attn_out, _ = self.para_cross_attn(
            tgt, memory_paras, memory_paras,
            key_padding_mask=memory_paras_key_padding_mask,
        )
        tgt = self.norm3(tgt + para_attn_out)

        # 4) Feed-forward.
        ffn_out = self.ffn(tgt)
        tgt = self.norm4(tgt + ffn_out)

        return tgt


In [None]:
class PHTDecoder(nn.Module):
    """Stack of `PHTDecoderLayer`s between a token embedding and a vocab projection.

    Target ids are embedded, passed through every layer together with the two
    memories, and projected to vocabulary logits. Nothing in this class adds
    positional information to the embeddings or builds an attention mask.

    Args:
        num_layers: Number of decoder layers.
        d_model: Embedding / hidden size.
        n_heads: Attention heads per layer.
        vocab_size: Size of both the embedding table and the output projection.
    """

    def __init__(self, num_layers=3, d_model=768, n_heads=8, vocab_size=30522):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        stack = []
        for _ in range(num_layers):
            stack.append(PHTDecoderLayer(d_model, n_heads))
        self.layers = nn.ModuleList(stack)
        self.out_proj = nn.Linear(d_model, vocab_size)

    def forward(self, tgt_ids, memory_tokens, memory_paras):
        """Return vocabulary logits for every position of `tgt_ids`."""
        hidden = self.embedding(tgt_ids)
        for block in self.layers:
            hidden = block(hidden, memory_tokens, memory_paras)
        return self.out_proj(hidden)


In [None]:
# Smoke test of the decoder on the toy paragraphs encoded above.
# Token states are concatenated along dim=1; pooled paragraph vectors are
# stacked on a new dim=1.
Cp_combined = torch.cat(Cp_list, dim=1)
φp_combined = torch.stack(φp_list, dim=1)
# NOTE(review): re-imports and reloads the same 'bert-base-uncased' tokenizer
# that an earlier cell already bound to `tokenizer`.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tgt_ids = tokenizer("summarize:", return_tensors="pt").input_ids
# Freshly constructed decoder; nothing in this notebook trains it.
decoder = PHTDecoder()
logits = decoder(tgt_ids, Cp_combined, φp_combined)
# Single forward pass followed by a per-position arg-max; this is not
# token-by-token generation.
output_ids = torch.argmax(logits, dim=-1)
summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Generated summary:", summary)


Generated summary: jim shipment 犬rys 950 camille


In [None]:
class PHTEncoder(nn.Module):
    """Encode a list of paragraphs into token-level and paragraph-level memories.

    Each paragraph is tokenized and run through BERT on its own (batch of one).
    Its last hidden states are kept as token-level memory and also pooled with
    `MultiHeadAttentionPooling` into one paragraph vector.

    Args:
        model_name: Hugging Face checkpoint for both tokenizer and encoder.
        d_model: Hidden size handed to the pooling module.
        n_heads: Number of pooling heads.
    """

    def __init__(self, model_name='bert-base-uncased', d_model=768, n_heads=8):
        super().__init__()
        from transformers import BertModel, BertTokenizer
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.encoder = BertModel.from_pretrained(model_name)
        self.pooling = MultiHeadAttentionPooling(d_model=d_model, n_heads=n_heads)

    def forward(self, paras):
        """Encode `paras`.

        Args:
            paras: Iterable of paragraph strings. A single string is treated
                as one paragraph rather than iterated character by character.

        Returns:
            (Cp_combined, φp_combined): token states of all paragraphs
            concatenated along dim=1, and the pooled paragraph vectors stacked
            along a new dim=1.

        Raises:
            ValueError: If no paragraph is given.
        """
        if isinstance(paras, str):
            paras = [paras]
        paras = list(paras)
        if not paras:
            # torch.cat / torch.stack on an empty list fail with an opaque error.
            raise ValueError("PHTEncoder.forward() needs at least one paragraph")

        # The tokenizer returns CPU tensors; move them to wherever the encoder
        # lives so the module also works after `.to('cuda')`.
        device = next(self.encoder.parameters()).device

        Cp_list, φp_list = [], []

        for p in paras:
            inputs = self.tokenizer(p, return_tensors='pt', padding=True, truncation=True, max_length=512)
            inputs = inputs.to(device)
            outputs = self.encoder(**inputs)
            Cp = outputs.last_hidden_state
            φp = self.pooling(Cp)
            Cp_list.append(Cp)
            φp_list.append(φp)

        Cp_combined = torch.cat(Cp_list, dim=1)
        φp_combined = torch.stack(φp_list, dim=1)

        return Cp_combined, φp_combined


In [None]:
class PHTModel(nn.Module):
    """Full model: `PHTEncoder` feeding a `PHTDecoder`.

    Args:
        vocab_size: Decoder vocabulary size.
        d_model: Hidden size shared by encoder pooling and decoder.
        n_heads: Attention heads used by both halves.
        num_layers: Number of decoder layers.
    """

    def __init__(self, vocab_size=30522, d_model=768, n_heads=8, num_layers=3):
        super().__init__()
        self.encoder = PHTEncoder(d_model=d_model, n_heads=n_heads)
        self.decoder = PHTDecoder(
            num_layers=num_layers,
            d_model=d_model,
            n_heads=n_heads,
            vocab_size=vocab_size,
        )

    def forward(self, paras, decoder_input_ids):
        """Encode `paras`, then return decoder logits for `decoder_input_ids`."""
        token_memory, para_memory = self.encoder(paras)
        return self.decoder(decoder_input_ids, token_memory, para_memory)


In [None]:
# End-to-end smoke test of PHTModel on three hand-written paragraphs.
paras = [
    "The Indian postal system plays a vital role in connecting rural and urban areas.",
    "With digital adoption, tracking and speed have improved significantly.",
    "However, many rural areas still lack proper infrastructure and daily services."
]
# NOTE(review): the BERT tokenizer is imported and loaded again here.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
decoder_input_ids = tokenizer("summarize:", return_tensors="pt").input_ids

# Newly constructed model; nothing in this notebook trains its decoder or pooling.
model = PHTModel()
logits = model(paras, decoder_input_ids)

# Per-position arg-max over one forward pass (not autoregressive decoding).
output_ids = torch.argmax(logits, dim=-1)
summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Generated Summary:", summary)


Generated Summary: submarines debate positions lowers comparison ref


In [None]:
!pip install kagglehub



In [None]:
# Ask the user to upload a file through Colab. The next cell reads the
# entry named "kaggle.json" from `uploaded` and writes it to disk as bytes.
from google.colab import files
uploaded = files.upload()


Saving kaggle.json to kaggle.json


In [None]:
import os

# Install the uploaded Kaggle API credentials under /root/.kaggle.
os.makedirs("/root/.kaggle", exist_ok=True)
with open("/root/.kaggle/kaggle.json", "wb") as f:
    f.write(uploaded["kaggle.json"])
# The mode must be an octal literal: 0o600 is owner read/write only.
# Decimal 600 equals 0o1130, which sets an unrelated set of permission bits.
os.chmod("/root/.kaggle/kaggle.json", 0o600)


In [None]:
# Download the "sandeep16064/wikisum" dataset through kagglehub and show
# the local directory it was placed in.
import kagglehub
path = kagglehub.dataset_download("sandeep16064/wikisum")

print("Path to dataset files:", path)


Path to dataset files: /kaggle/input/wikisum


In [None]:
import os

# NOTE(review): this overwrites the `path` returned by kagglehub with a
# hard-coded directory.
path = "/kaggle/input/wikisum"
print("Files in folder:", os.listdir(path))


Files in folder: ['WikiSumDataset.jsonl']


In [None]:
import json
import os

json_path = '/kaggle/input/wikisum/WikiSumDataset.jsonl'

# JSON Lines: one JSON object per line. Read as UTF-8 explicitly so the result
# does not depend on the platform default, and skip blank lines (for example
# a trailing newline), which would otherwise make json.loads raise.
with open(json_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

print("Keys:", data[0].keys())


Keys: dict_keys(['url', 'title', 'summary', 'article', 'step_headers', 'fold'])


In [None]:
# Pull the fields used below into parallel lists, in the same order as `data`.
articles = [item['article'] for item in data]
summaries = [item['summary'] for item in data]
titles = [item['title'] for item in data]


In [None]:
def split_into_paragraphs(text):
    """Split `text` on newline characters into non-empty, stripped paragraphs.

    Lines that are empty or whitespace-only are dropped; surrounding
    whitespace is removed from the rest. Order is preserved.
    """
    paragraphs = []
    for raw_line in text.split('\n'):
        trimmed = raw_line.strip()
        if trimmed:
            paragraphs.append(trimmed)
    return paragraphs
# One list of paragraphs per article, aligned index-for-index with `articles`.
paragraphs_list = [split_into_paragraphs(article) for article in articles]


In [None]:
# Sanity check: show the first article's title, its first three paragraphs,
# and its reference summary.
print("First article title:", titles[0])
print("First 3 paragraphs:", paragraphs_list[0][:3])
print("Target summary:", summaries[0])


First article title: How to Store Fresh Oysters
First 3 paragraphs: ["Do not shuck or wash your oysters. Oysters taste best when you shuck them immediately before eating them. In addition, keeping oysters in their shells makes them easier to store and reduces the chance that they'll go bad. If your oysters came pre-shucked in a plastic container, store them in the freezer until you're ready to use them. Leave the grit and dirt on the oysters. This will keep them moist and will help to insulate the meat. Pour ice into a small bowl or other open-top container. Grab a bowl, small cooler, or similar container that you can place inside your fridge. Make sure this container has an open top or removable lid. Then, pour a layer of ice into the bottom of the container. Do not keep your oysters in a sealed or closed-top container. Doing so will suffocate them. You may need to change your ice during the refrigeration process, so do not pour any into the container if you won't be able to check you

In [None]:
# Pair each article's paragraph list with its reference summary, one dict per
# article, in the original article order.
dataset = []

for paras, summary in zip(paragraphs_list, summaries):
    dataset.append(dict(paragraphs=paras, summary=summary))


In [None]:
import pickle

# Persist the list of {"paragraphs", "summary"} records so later cells can
# reload it from /content/wikisum_pht_dataset.pkl.
with open("/content/wikisum_pht_dataset.pkl", "wb") as f:
    pickle.dump(dataset, f)

print("Preprocessed dataset saved!")


Preprocessed dataset saved!


In [None]:
# Show the first two paragraphs and the target summary of the first record.
print(dataset[0]['paragraphs'][:2])
print("Target:", dataset[0]['summary'])


["Do not shuck or wash your oysters. Oysters taste best when you shuck them immediately before eating them. In addition, keeping oysters in their shells makes them easier to store and reduces the chance that they'll go bad. If your oysters came pre-shucked in a plastic container, store them in the freezer until you're ready to use them. Leave the grit and dirt on the oysters. This will keep them moist and will help to insulate the meat. Pour ice into a small bowl or other open-top container. Grab a bowl, small cooler, or similar container that you can place inside your fridge. Make sure this container has an open top or removable lid. Then, pour a layer of ice into the bottom of the container. Do not keep your oysters in a sealed or closed-top container. Doing so will suffocate them. You may need to change your ice during the refrigeration process, so do not pour any into the container if you won't be able to check your oysters regularly. Place your oysters on top of the ice bed deep s

In [None]:
# Shared input for the baseline summarizers below: the first two paragraphs
# of the first record, joined with a single space.
input_text = " ".join(dataset[0]['paragraphs'][:2])

print("Input text length (chars):", len(input_text))
print("\nInput for summarization:\n", input_text)


Input text length (chars): 5740

Input for summarization:
 Do not shuck or wash your oysters. Oysters taste best when you shuck them immediately before eating them. In addition, keeping oysters in their shells makes them easier to store and reduces the chance that they'll go bad. If your oysters came pre-shucked in a plastic container, store them in the freezer until you're ready to use them. Leave the grit and dirt on the oysters. This will keep them moist and will help to insulate the meat. Pour ice into a small bowl or other open-top container. Grab a bowl, small cooler, or similar container that you can place inside your fridge. Make sure this container has an open top or removable lid. Then, pour a layer of ice into the bottom of the container. Do not keep your oysters in a sealed or closed-top container. Doing so will suffocate them. You may need to change your ice during the refrigeration process, so do not pour any into the container if you won't be able to check your oysters r

In [None]:
!pip install transformers




In [None]:
# Baseline 1: the "t5-small" checkpoint.
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer_t5 = T5Tokenizer.from_pretrained("t5-small")
model_t5 = T5ForConditionalGeneration.from_pretrained("t5-small")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Prefix the text with the "summarize: " task prompt. The encoding is
# truncated to 512 tokens, so any text beyond that is not seen by the model.
t5_input = "summarize: " + input_text
inputs = tokenizer_t5.encode(t5_input, return_tensors="pt", max_length=512, truncation=True)


In [None]:
# Beam search with 4 beams; generated length constrained to 30-150 tokens.
summary_ids = model_t5.generate(inputs, max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
t5_summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
print("🔹 T5 Summary:\n", t5_summary)


🔹 T5 Summary:
 if your oysters came pre-shucked in a plastic container, store them in the freezer until you're ready to use them. leave the grit and dirt on the oysters. keep them moist and will help to insulate the meat.


In [None]:
# Baseline: "facebook/bart-large-cnn". No task prefix is added; the input is
# truncated to 1024 tokens.
from transformers import BartTokenizer, BartForConditionalGeneration
tokenizer_bart = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model_bart = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

inputs_bart = tokenizer_bart.encode(input_text, return_tensors='pt', max_length=1024, truncation=True)
# Beam search with 4 beams; generated length constrained to 30-150 tokens.
summary_ids_bart = model_bart.generate(inputs_bart, max_length=150, min_length=30, num_beams=4, early_stopping=True)

bart_summary = tokenizer_bart.decode(summary_ids_bart[0], skip_special_tokens=True)
print("BART Summary:\n", bart_summary)


BART Summary:
 Do not shuck or wash your oysters. Oysters taste best when you shuck them immediately before eating them. Keeping oysters in their shells makes them easier to store and reduces the chance that they'll go bad. If you don't have enough room in your freezer to keep full-shelled oysters, you can shucking them before storage.


In [None]:
# Baseline: "google/pegasus-xsum".
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
tokenizer_pegasus = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
model_pegasus = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

# Truncation uses the tokenizer's own maximum length (no max_length is given).
tokens_pegasus = tokenizer_pegasus(input_text, truncation=True, padding="longest", return_tensors="pt")
# No generation arguments are passed, so the checkpoint's defaults apply.
summary_ids_pegasus = model_pegasus.generate(**tokens_pegasus)

pegasus_summary = tokenizer_pegasus.decode(summary_ids_pegasus[0], skip_special_tokens=True)
print("Pegasus Summary:\n", pegasus_summary)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Pegasus Summary:
 Here are some tips on how to safely store oysters.


In [None]:
# Baseline: "sshleifer/distilbart-cnn-12-6". Relies on BartTokenizer and
# BartForConditionalGeneration imported in the BART cell above.
tokenizer_distilbart = BartTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
model_distilbart = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")

# Input truncated to 1024 tokens; beam search with 4 beams, 30-150 output tokens.
inputs_distilbart = tokenizer_distilbart.encode(input_text, return_tensors="pt", max_length=1024, truncation=True)
summary_ids_distilbart = model_distilbart.generate(inputs_distilbart, max_length=150, min_length=30, num_beams=4, early_stopping=True)

distilbart_summary = tokenizer_distilbart.decode(summary_ids_distilbart[0], skip_special_tokens=True)
print(" DistilBART Summary:\n", distilbart_summary)


 DistilBART Summary:
  Oysters taste best when you shuck them immediately before eating them . Keeping oysters in their shells makes them easier to store and reduces the chance that they'll go bad . If your oysters came pre-shucked in a plastic container, store them in the freezer until you're ready to use them . Dampen a towel with cold water and place it on top of the oysters with damp paper towels . If you don't have enough room in your freezer, you can shuck oysters before storage .


In [None]:
# Baseline: "google/mt5-small".
# NOTE(review): the recorded output for this cell contains <extra_id_*>
# sentinel tokens rather than a summary. Presumably this checkpoint is
# pretrained-only and needs fine-tuning before use — confirm against the model card.
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

tokenizer_mt5 = MT5Tokenizer.from_pretrained("google/mt5-small")
model_mt5 = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")

# Same "summarize: " prefix and 512-token truncation as the T5 cell.
mt5_input = "summarize: " + input_text
inputs_mt5 = tokenizer_mt5(mt5_input, return_tensors="pt", max_length=512, truncation=True)


# Only input_ids are passed to generate(); the attention mask is not.
summary_ids_mt5 = model_mt5.generate(inputs_mt5["input_ids"], max_length=150, min_length=30, num_beams=4, early_stopping=True)
mt5_summary = tokenizer_mt5.decode(summary_ids_mt5[0], skip_special_tokens=True)

print("mT5 Summary:\n", mt5_summary)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.mt5.tokenization_mt5.MT5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


mT5 Summary:
 <extra_id_0> the oysters' juices easily. Continue Reading... Continue Reading... <extra_id_51> the oysters' juice


In [None]:
# Baseline: "google/flan-t5-base".
from transformers import T5Tokenizer, T5ForConditionalGeneration

flan_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
flan_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

# NOTE(review): unlike the cells above, which use `input_text` (first two
# paragraphs), this one summarizes only the first paragraph of the record.
flan_input = "summarize: " + dataset[0]['paragraphs'][0]
flan_inputs = flan_tokenizer(flan_input, return_tensors="pt", max_length=512, truncation=True)

# Beam search with 4 beams, at most 100 output tokens.
flan_summary_ids = flan_model.generate(flan_inputs['input_ids'], max_length=100, num_beams=4, early_stopping=True)
flan_summary = flan_tokenizer.decode(flan_summary_ids[0], skip_special_tokens=True)

print("FLAN-T5 Summary:\n", flan_summary)


FLAN-T5 Summary:
 Keep your oysters in their shells. Place ice in a bowl or container. Place your oysters on top of the ice bed. Place a towel on top of the oysters. Cover the oysters with a towel. Refrigerate your oysters.


In [None]:
# Baseline: "google/long-t5-tglobal-base".
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

longt5_tokenizer = AutoTokenizer.from_pretrained("google/long-t5-tglobal-base")
longt5_model = AutoModelForSeq2SeqLM.from_pretrained("google/long-t5-tglobal-base")

# Uses every paragraph of the first record (not just `input_text`),
# truncated to 2048 tokens.
longt5_input = "summarize: " + " ".join(dataset[0]['paragraphs'])
longt5_inputs = longt5_tokenizer(longt5_input, return_tensors="pt", max_length=2048, truncation=True)

# Beam search with 4 beams, at most 200 output tokens.
longt5_ids = longt5_model.generate(longt5_inputs['input_ids'], max_length=200, num_beams=4)
longt5_summary = longt5_tokenizer.decode(longt5_ids[0], skip_special_tokens=True)

print("LongT5 Summary:\n", longt5_summary)


LongT5 Summary:
 You may need to change your ice during the refrigeration process, so do not pour any into the container if you won't be able to check your oysters regularly. Storing your oysters inside their shells will make them less likely to go bad and, in some cases, better preserve their taste. To help your shucked oysters retain their juiciness, pour the liquor you removed during the shucking process into your freezer-safe container. To make sure your oysters aren't going bad, look over them regularly and remove any that have cracked shells or cloudy meat that is a pink, black, brown, or grey color.


In [None]:
# Baseline: "microsoft/prophetnet-large-uncased".
from transformers import ProphetNetTokenizer, ProphetNetForConditionalGeneration

prophet_tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
prophet_model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")

# Summarizes only the first paragraph of the first record, with a
# "summarize: " prefix, truncated to 512 tokens.
prophet_input = prophet_tokenizer("summarize: " + dataset[0]['paragraphs'][0], return_tensors="pt", max_length=512, truncation=True)
# Beam search with 4 beams, at most 100 output tokens.
prophet_summary_ids = prophet_model.generate(prophet_input['input_ids'], max_length=100, num_beams=4)
prophet_summary = prophet_tokenizer.decode(prophet_summary_ids[0], skip_special_tokens=True)

print("ProphetNet Summary:\n", prophet_summary)


`cache.key_cache[idx]` is deprecated and will be removed in v4.56.0. Use `cache.layers[idx].keys` instead.
`cache.value_cache[idx]` is deprecated and will be removed in v4.56.0. Use `cache.layers[idx].values` instead.


ProphetNet Summary:
 keep your oysters in their shells. keep reading for more information.


In [None]:
import pickle

# Reload the records written earlier to /content/wikisum_pht_dataset.pkl.
# pickle.load can execute arbitrary code: only use it on files you created.
with open("/content/wikisum_pht_dataset.pkl", "rb") as f:
    dataset = pickle.load(f)

In [None]:
import torch
from transformers import LEDTokenizer, LEDForConditionalGeneration

# Baseline: "allenai/led-base-16384", run on every paragraph of the first record.
led_tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")
led_model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")

led_input = " ".join(dataset[0]['paragraphs'])
# Truncate at the 16384-token limit but do NOT pad up to it. Padding a single
# document to max_length makes the encoder process 16384 positions no matter
# how short the text is.
led_inputs = led_tokenizer(led_input, return_tensors="pt", truncation=True, max_length=16384)

# The LED documentation advises global attention on the first token for
# summarization; every other token keeps local (windowed) attention.
led_global_attention_mask = torch.zeros_like(led_inputs['input_ids'])
led_global_attention_mask[:, 0] = 1

# Beam search with 4 beams, at most 150 output tokens.
led_summary_ids = led_model.generate(
    input_ids=led_inputs['input_ids'],
    attention_mask=led_inputs['attention_mask'],
    global_attention_mask=led_global_attention_mask,
    max_length=150,
    num_beams=4,
)
led_summary = led_tokenizer.decode(led_summary_ids[0], skip_special_tokens=True)

print("LED Summary:\n", led_summary)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`cache.key_cache[idx]` is deprecated and will be removed in v4.56.0. Use `cache.layers[idx].keys` instead.
`cache.value_cache[idx]` is deprecated and will be removed in v4.56.0. Use `cache.layers[idx].values` instead.


LED Summary:
 Do not shuck or wash your oysters. Oysters taste best when you shuck them immediately before eating them. In addition, keeping oysters in their shells makes them easier to store and reduces the chance that they'll go bad. If your oysters came pre-shucked in a plastic container, store them in the freezer until you're ready to use them. Leave the grit and dirt on the oysters. This will keep them moist and will help to insulate the meat. Pour ice into a small bowl or other open-top container. Grab a bowl, small cooler, or similar container that you can place inside your fridge. Make sure this container has an open top or removable lid. Then, pour a layer of


In [None]:
!pip install --upgrade transformers




In [None]:
!pip uninstall -y transformers


In [None]:
!pip install transformers


In [None]:
!pip uninstall -y transformers tokenizers
!pip install transformers==4.38.2 tokenizers==0.18.1


In [None]:
# Load "allenai/led-base-16384" again, this time through the Auto* classes.
# NOTE(review): this rebinds the notebook-level names `tokenizer` (the BERT
# tokenizer in earlier cells) and `model` (the PHTModel instance in earlier
# cells). Earlier cells re-run after this one will see the LED objects.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
model = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384")


In [None]:
!pip install rouge-score

