In [1]:
import pandas as pd

df = pd.read_json('ai_summarization_100.jsonl', lines = True)
df_text = df['text']
df_summary = df['summary']
df_text = pd.DataFrame(df_text)
df_summary = pd.DataFrame(df_summary)

In [2]:
import sentencepiece as spm
spm.SentencePieceTrainer.Train(input = 'ai_summarization_100.jsonl', model_prefix = 'bpe', vocab_size = 200)

In [3]:
sp = spm.SentencePieceProcessor(model_file = 'bpe.model')

In [4]:
text_tokens = []
summary_tokens = []

for i in range(len(df_text)):
    token = sp.Encode(str(df.iloc[i]['text']), out_type = int)
    text_tokens.append(token)

for i in range(len(df_summary)):
    token = sp.Encode(str(df.iloc[i]['summary']), out_type = int)
    summary_tokens.append(token)

In [5]:
lengths = [len(i) for i in text_tokens]
max_len = max(lengths)
max_len

52

In [6]:
lengths = [len(i) for i in summary_tokens]
max_len = max(lengths)
max_len

18

In [7]:
import os
import torch
from torch import nn
from torch.nn import Embedding, LSTM
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


In [8]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(text_tokens, summary_tokens, train_size = 0.9, random_state = 42)

In [9]:
# making tensors
class d(Dataset):
    def __init__(self, input, target):
        self.input = input
        self.target = target

    def __len__(self):
        return len(self.input)

    def __getitem__(self, idx):

        return {
            "input_ids":torch.tensor(self.input[idx], dtype = torch.long),
            "target_ids":torch.tensor(self.target[idx], dtype = torch.long)
        }

# padding sequences 
def collate_fn(batch):
    inputs = [i["input_ids"] for i in batch]
    targets = [i["target_ids"] for i in batch]

    inputs_with_padding = pad_sequence(inputs, batch_first = True, padding_value = 0)
    targets_with_padding = pad_sequence(targets, batch_first = True, padding_value = 0)

    return {
        "input_ids":inputs_with_padding,
        "target_ids":targets_with_padding
    }


# Dataloader for train and test in network
train_dataset = d(x_train, y_train)
test_dataset = d(x_test, y_test)

train_loader = DataLoader(train_dataset, batch_size = 12, shuffle = True, collate_fn = collate_fn)
test_loader = DataLoader(test_dataset, batch_size = 2, shuffle = True, collate_fn = collate_fn)

In [10]:
# Encoder class
class Encoderlstm(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super().__init__()
        self.x1 = nn.Embedding(num_embeddings = vocab_size, embedding_dim = embedding_dim)
        self.x2 = nn.LSTM(input_size = embedding_dim, hidden_size = hidden_size, batch_first = True)

    def ForwardEncode(self, input_ids):
        x = self.x1(input_ids)
        outputs, (h, c) = self.x2(x)
        return outputs, (h, c)
    
#Decoder class
class decoderlstm(torch.nn.Module):
    def __init__(self, vocab_size, embadding_dim, hidden_size):
        super().__init__()
        self.x1 = nn.Embedding(num_embeddings = vocab_size, embedding_dim = embadding_dim)
        self.x2 = nn.LSTM(input_size = embadding_dim, hidden_size = hidden_size, batch_first = True)
        self.x3 = nn.Linear(hidden_size, vocab_size)

    def ForwordDecode(self, input_ids, encoder_hidden):
        a = self.x1(input_ids)
        b, _ = self.x2(a, encoder_hidden)
        c = self.x3(b)
        return c

In [11]:
# Trainer

class Seq2SeqTrainer:
    def __init__(self, encoder, decoder, optimizer, loss_fn, device):
        self.encoder = encoder
        self.decoder = decoder
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        self.device = device

    def train(self, dataloader, epochs=5, teacher_forcing_ratio=0.5):
        self.encoder.train()
        self.decoder.train()

        for epoch in range(epochs):
            total_loss = 0
            for batch in dataloader:
                input_ids = batch["input_ids"].to(self.device)       # (B, src_len)
                target_ids = batch["target_ids"].to(self.device)     # (B, tgt_len)

                batch_size, tgt_len = target_ids.shape
                vocab_size = self.decoder.x3.out_features

                # 1. Encode source
                _, hidden = self.encoder.ForwardEncode(input_ids)

                # 2. Prepare decoder input
                decoder_input = target_ids[:, 0].unsqueeze(1)  # (B, 1)
                decoder_hidden = hidden

                outputs = torch.zeros(batch_size, tgt_len, vocab_size).to(self.device)

                for t in range(1, tgt_len):
                    decoder_output = self.decoder.ForwordDecode(decoder_input, decoder_hidden)
                    decoder_hidden = decoder_output[1] if isinstance(decoder_output, tuple) else decoder_hidden
                    logits = decoder_output[:, -1, :] if decoder_output.dim() == 3 else decoder_output

                    outputs[:, t] = logits

                    teacher_force = torch.rand(1).item() < teacher_forcing_ratio
                    top1 = logits.argmax(1).unsqueeze(1)  # (B, 1)

                    decoder_input = target_ids[:, t].unsqueeze(1) if teacher_force else top1

                # 3. Compute loss
                loss = self.loss_fn(outputs[:, 1:].reshape(-1, vocab_size),
                                    target_ids[:, 1:].reshape(-1))

                # 4. Backward & optimize
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()

            print(f"Epoch [{epoch+1}/{epochs}] Loss: {total_loss/len(dataloader):.4f}")


In [12]:
encoder = Encoderlstm(200, 32, 64).to(device)
decoder = decoderlstm(200, 32, 64).to(device)
optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()))
loss = nn.CrossEntropyLoss(ignore_index = 0)
trainer = Seq2SeqTrainer(encoder, decoder, optimizer, loss, device)

In [13]:
trainer.train(train_loader, epochs = 10, teacher_forcing_ratio = 0.5)

Epoch [1/10] Loss: 5.2813
Epoch [2/10] Loss: 5.1571
Epoch [3/10] Loss: 4.9127
Epoch [4/10] Loss: 4.5475
Epoch [5/10] Loss: 4.2878
Epoch [6/10] Loss: 4.1614
Epoch [7/10] Loss: 4.1097
Epoch [8/10] Loss: 4.0806
Epoch [9/10] Loss: 4.0484
Epoch [10/10] Loss: 4.0133


In [14]:
def generate_summary(encoder, decoder, input_ids, max_len=30, start_token=0, eos_token=None, device='cpu'):
    encoder.eval()
    decoder.eval()

    input_ids = input_ids.to(device)
    with torch.no_grad():
        # Encode input
        _, encoder_hidden = encoder.ForwardEncode(input_ids)

        decoder_input = torch.tensor([[start_token]], device=device)  # start with <BOS>
        decoder_hidden = encoder_hidden

        summary_ids = [start_token]

        for _ in range(max_len):
            output = decoder.ForwordDecode(decoder_input, decoder_hidden)
            logits = output[:, -1, :]  # take output for current step

            next_token = logits.argmax(1).item()
            summary_ids.append(next_token)

            if eos_token is not None and next_token == eos_token:
                break

            decoder_input = torch.tensor([[next_token]], device=device)

    return summary_ids  # token IDs to decode


In [16]:
# Assume test_input is a tensor of shape (1, seq_len)
test_input = torch.tensor([x_test[0]],dtype = torch.long)
summary_ids = generate_summary(encoder, decoder, test_input, start_token=0, eos_token=1, device=device)

# Decode the tokens back to text
decoded_summary = sp.decode(summary_ids)  # or use your tokenizer's decode method
print("Generated Summary:", decoded_summary)
print("test Output : ",sp.Decode(y_test[0]))

Generated Summary:  ⁇ ssssssssssssssssssssssssssssss
test Output :  Balances uneven data distributions.
