In [3]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.model_selection import train_test_split

class CustomDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_input_length=512, max_output_length=150):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        summary = str(self.summaries[idx])

        input_encoding = self.tokenizer(
            text,
            max_length=self.max_input_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        output_encoding = self.tokenizer(
            summary,
            max_length=self.max_output_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": input_encoding.input_ids.flatten(),
            "attention_mask": input_encoding.attention_mask.flatten(),
            "labels": output_encoding.input_ids.flatten(),
            "decoder_attention_mask": output_encoding.attention_mask.flatten()
        }


dataset_path = "TOSDR_labeled_with_summaries.csv"
df = pd.read_csv(dataset_path)


texts = df['Text'].tolist()
summaries = df['eng_summary'].tolist()


train_texts, val_texts, train_summaries, val_summaries = train_test_split(texts, summaries, test_size=0.1, random_state=42)


tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")


train_dataset = CustomDataset(train_texts, train_summaries, tokenizer)
val_dataset = CustomDataset(val_texts, val_summaries, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)


optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

num_epochs = 3
for epoch in range(num_epochs):
    total_loss = 0
    for batch in tqdm(train_dataloader, desc="Epoch " + str(epoch + 1)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        decoder_attention_mask = batch["decoder_attention_mask"].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    print("Epoch {} Loss: {:.4f}".format(epoch + 1, total_loss / len(train_dataloader)))


model.eval()
total_val_loss = 0
for batch in tqdm(val_dataloader, desc="Validation"):
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)
    decoder_attention_mask = batch["decoder_attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )

        loss = outputs.loss
        total_val_loss += loss.item()

avg_val_loss = total_val_loss / len(val_dataloader)
print("Average Validation Loss:", avg_val_loss)





Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Epoch 1: 100%|██████████| 188/188 [48:19<00:00, 15.42s/it]


Epoch 1 Loss: 3.1117


Epoch 2: 100%|██████████| 188/188 [45:56<00:00, 14.66s/it]


Epoch 2 Loss: 2.7726


Epoch 3: 100%|██████████| 188/188 [53:28<00:00, 17.07s/it]


Epoch 3 Loss: 2.7938


Validation: 100%|██████████| 21/21 [00:46<00:00,  2.22s/it]

Average Validation Loss: 2.000438553946359





wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)


In [16]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load the fine-tuned model and tokenizer
# model_path = "path_to_your_saved_model"
# tokenizer = T5Tokenizer.from_pretrained("t5-base")
# model = T5ForConditionalGeneration.from_pretrained(model_path)

long_english_sentence = "We use cookies and other tracking technologies to improve your browsing experience, personalize content, and serve targeted ads."

inputs = tokenizer.encode("summarize: " + long_english_sentence, return_tensors="pt")

with torch.no_grad():
    outputs = model.generate(inputs, max_length=20, num_beams=2, early_stopping=True)

summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Summary:", summary)


Summary: cookies and other tracking technologies are used to improve your browsing experience, personalize content, and serve


In [13]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the fine-tuned model and tokenizer
# model_path = "path_to_your_saved_model"
# tokenizer = T5Tokenizer.from_pretrained("t5-base")
# model = T5ForConditionalGeneration.from_pretrained(model_path)


english_text = "You agree to indemnify and hold Apple, its officers, directors, shareholders, predecessors, successors in interest, employees, agents, subsidiaries and affiliates, harmless from any demands, loss, liability, claims or expenses (including attorneys’ fees), made against Apple by any third party due to or arising out of or in connection with your use of the Site."

inputs = tokenizer.encode("summarize: " + english_text, return_tensors="pt", max_length=512, truncation=True)

with torch.no_grad():
    outputs = model.generate(inputs, max_length=20, num_beams=2, early_stopping=True, length_penalty=2.0)

hindi_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Hindi Summary:", hindi_summary)


Hindi Summary: you agree to indemnify and hold Apple, its officers, directors, shareholders, predecessors
