<a href="https://colab.research.google.com/github/GilbertKrantz/Scientific-Paper-Summarization/blob/main/Text_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install torch torchvision torchaudio datasets accelerate huggingface_hub[hf_transfer] torch_xla -q

%pip uninstall -y tensorflow && pip install tensorflow-cpu



In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

# Import tqdm
from tqdm import tqdm

In [3]:
from datasets import load_dataset

# Identifier of the dataset to load; the resulting `ds` is indexed by split name below.
DATASET_NAME = "scillm/scientific_papers-archive"
ds = load_dataset(DATASET_NAME)

In [4]:
# Keep only a fixed-size prefix of every split: the first 2000 training rows
# and the first 200 rows of the validation and test splits.
ds_train, ds_val, ds_test = (
    ds[split_name].select(range(num_rows))
    for split_name, num_rows in (("train", 2000), ("validation", 200), ("test", 200))
)
ds_train

Dataset({
    features: ['id', 'input', 'output'],
    num_rows: 2000
})

In [5]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The model dimension is split evenly across ``num_heads`` heads, each of
    width ``d_k = d_model // num_heads``. Queries, keys and values are
    projected with separate linear layers, attended per head, re-assembled
    and passed through a final output projection.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # Every head must receive the same number of features.
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # per-head feature width

        # Input projections (query / key / value) followed by the output projection.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Return the attention-weighted sum of ``V``.

        Scores are ``Q @ K^T / sqrt(d_k)``. Wherever ``mask == 0`` the score
        is replaced by ``-1e9`` before the softmax over the last dimension.
        """
        scores = (Q @ K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        weights = scores.softmax(dim=-1)
        return weights @ V

    def split_heads(self, x):
        """Reshape ``(batch, seq, d_model)`` into ``(batch, num_heads, seq, d_k)``."""
        n_batch, n_tokens, _ = x.size()
        per_head = x.view(n_batch, n_tokens, self.num_heads, self.d_k)
        return per_head.permute(0, 2, 1, 3)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: back to ``(batch, seq, d_model)``."""
        n_batch, _, n_tokens, _ = x.size()
        merged = x.permute(0, 2, 1, 3).contiguous()
        return merged.view(n_batch, n_tokens, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head, then merge and project the result."""
        q_heads = self.split_heads(self.W_q(Q))
        k_heads = self.split_heads(self.W_k(K))
        v_heads = self.split_heads(self.W_v(V))

        context = self.scaled_dot_product_attention(q_heads, k_heads, v_heads, mask)
        return self.W_o(self.combine_heads(context))

In [6]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied to the last dimension of its input.

    Computes ``fc2(relu(fc1(x)))``, expanding ``d_model`` features to ``d_ff``
    and projecting back to ``d_model``.

    Note: an earlier revision also created an ``nn.LSTM(d_model, d_ff)`` that
    ``forward`` never used. It only added dead parameters (and optimizer
    state) to every encoder/decoder layer, so it has been removed. State
    dicts saved with the old layout contain extra ``lstm.*`` keys and must be
    loaded with ``strict=False``.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)   # expansion
        self.fc2 = nn.Linear(d_ff, d_model)   # projection back to model width
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``; the last dimension of ``x`` must be ``d_model``."""
        return self.fc2(self.relu(self.fc1(x)))

In [7]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a ``(batch, seq, d_model)`` input.

    A table of shape ``(1, max_seq_length, d_model)`` is precomputed once and
    stored as the non-trainable buffer ``pe``. Even feature columns hold
    ``sin(position * div_term)`` and odd columns hold ``cos(position * div_term)``.

    Args:
        d_model: Feature width of the embeddings. Odd values are supported.
        max_seq_length: Number of positions to precompute; ``forward`` slices
            the first ``x.size(1)`` of them.
    """

    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        # Table to be filled with the encodings, one row per position.
        pe = torch.zeros(max_seq_length, d_model)

        # Column vector of position indices 0 .. max_seq_length - 1.
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)

        # One frequency per even feature index: 10000^(-i / d_model) for i = 0, 2, 4, ...
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Buffer: moves with .to(device) and is saved in state_dict, but is not trained.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1)]

In [8]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention, then a feed-forward network.

    Each sub-layer's output goes through dropout, is added to the sub-layer's
    input (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Apply the block to ``x``; ``mask`` is forwarded to the self-attention."""
        # Sub-layer 1: self-attention with residual + norm.
        attended = self.norm1(x + self.dropout(self.self_attn(x, x, x, mask)))
        # Sub-layer 2: position-wise feed-forward with residual + norm.
        return self.norm2(attended + self.dropout(self.feed_forward(attended)))

In [9]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, feed-forward.

    Every sub-layer output is passed through dropout, added to the sub-layer
    input and layer-normalised (``norm1``, ``norm2``, ``norm3`` respectively).
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Run the block.

        ``tgt_mask`` is applied to the self-attention over ``x``; ``src_mask``
        is applied to the cross-attention, whose keys and values are ``enc_output``.
        """
        # Self-attention over the decoder input.
        self_context = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_context))

        # Cross-attention: decoder states query the encoder output.
        memory_context = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(memory_context))

        # Position-wise feed-forward.
        return self.norm3(x + self.dropout(self.feed_forward(x)))

In [10]:
class Transformer(nn.Module):
    """Encoder-decoder transformer mapping source token ids to target-vocabulary logits.

    Token id ``0`` is treated as padding when the attention masks are built.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super().__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        # Stacks of `num_layers` identical encoder and decoder blocks.
        encoder_stack = [EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)]
        self.encoder_layers = nn.ModuleList(encoder_stack)
        decoder_stack = [DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)]
        self.decoder_layers = nn.ModuleList(decoder_stack)

        # Final projection from model width to target-vocabulary logits.
        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        """Build boolean attention masks from ``(batch, seq)`` token-id tensors.

        Returns:
            src_mask: ``(batch, 1, 1, src_len)``, True where ``src != 0``.
            tgt_mask: ``(batch, 1, tgt_len, tgt_len)``, True where the row's
                token is non-zero and the column index does not exceed the
                row index (no attending to later positions).
        """
        src_mask = (src != 0)[:, None, None, :]
        non_pad_rows = (tgt != 0)[:, None, :, None]

        tgt_len = tgt.size(1)
        # Lower-triangular matrix (diagonal included): position i may see positions <= i.
        causal = torch.tril(torch.ones(1, tgt_len, tgt_len, device=tgt.device)).bool()

        return src_mask, non_pad_rows & causal

    def forward(self, src, tgt):
        """Return logits of shape ``(batch, tgt_len, tgt_vocab_size)``."""
        src_mask, tgt_mask = self.generate_mask(src, tgt)

        # Embed + add positions + dropout, source first and then target.
        memory = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        decoded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        for encoder_block in self.encoder_layers:
            memory = encoder_block(memory, src_mask)

        for decoder_block in self.decoder_layers:
            decoded = decoder_block(decoded, memory, src_mask, tgt_mask)

        return self.fc(decoded)

In [11]:
from transformers import BertTokenizer
import numpy as np

# Load the pretrained tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print('Tokenizer Loaded')

# Sequence-length cap: the mean number of whitespace-separated words per training input.
# Note this is a word count, while the tokenizer applies it as a limit on tokens.
train_word_counts = [len(text.split()) for text in ds_train['input']]
max_seq_length = int(np.mean(train_word_counts))

# Print the maximum sequence length
print(f"Max Sequence Length: {max_seq_length}")

# Settings shared by every tokenizer call: return PyTorch tensors, pad, and truncate to the cap.
tokenizer_kwargs = dict(return_tensors='pt', padding=True, truncation=True, max_length=max_seq_length)

tokenized_train_src = tokenizer(ds_train['input'], **tokenizer_kwargs)
print('Tokenized Train Input')

tokenized_train_tgt = tokenizer(ds_train['output'], **tokenizer_kwargs)
print('Tokenized Train Output')

tokenized_val_src = tokenizer(ds_val['input'], **tokenizer_kwargs)
print('Tokenized Val Input')

tokenized_val_tgt = tokenizer(ds_val['output'], **tokenizer_kwargs)
print('Tokenized Val Output')



Tokenizer Loaded
Max Sequence Length: 317
Tokenized Train Input
Tokenized Train Output
Tokenized Val Input
Tokenized Val Output


In [12]:
# Pull the token-id tensors out of the tokenizer outputs.
# "src" comes from the dataset's 'input' column, "tgt" from its 'output' column.
train_src_data = tokenized_train_src['input_ids']
train_tgt_data = tokenized_train_tgt['input_ids']
val_src_data = tokenized_val_src['input_ids']
val_tgt_data = tokenized_val_tgt['input_ids']

# Report the shape of every tensor
for _label, _ids in (
    ("train_src_data", train_src_data),
    ("train_tgt_data", train_tgt_data),
    ("val_src_data", val_src_data),
    ("val_tgt_data", val_tgt_data),
):
    print(f"{_label} shape: {_ids.shape}")

train_src_data shape: torch.Size([2000, 317])
train_tgt_data shape: torch.Size([2000, 317])
val_src_data shape: torch.Size([200, 317])
val_tgt_data shape: torch.Size([200, 268])


In [13]:
# Pair source/target tensors and wrap them in batched loaders.
batch_size = 32

dataset = data.TensorDataset(train_src_data, train_tgt_data)
val_dataset = data.TensorDataset(val_src_data, val_tgt_data)

# Training batches are reshuffled; validation batches keep dataset order.
dataloader = data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)
val_dataloader = data.DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [14]:
# Hyperparameters (example values)
src_vocab_size = tokenizer.vocab_size  # Source and target share the tokenizer's vocabulary
tgt_vocab_size = tokenizer.vocab_size
d_model = 1024
num_heads = 4
num_layers = 2
d_ff = 2048
dropout = 0.1

# Transformer Model
transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

num_epochs = 10  # Define number of epochs

# Transformer.generate_mask treats token id 0 as padding; the loss must skip the same id.
assert tokenizer.pad_token_id == 0, "Transformer.generate_mask assumes padding id 0"

# Ignore padded target positions. Without this, the [PAD] tokens that fill the
# right-padded targets are scored like real tokens and dominate the loss.
criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)  # Loss function
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)  # Optimizer

In [15]:
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

def _count_loss_tokens(criterion, targets):
    """Count the target positions that contribute to `criterion`'s mean loss."""
    ignore_index = getattr(criterion, "ignore_index", None)
    if ignore_index is None:
        return targets.numel()
    return int((targets != ignore_index).sum().item())


def _teacher_forced_loss(model, criterion, src_data, tgt_data, tgt_vocab_size):
    """Return (loss, contributing token count) for one teacher-forced batch."""
    # The decoder sees tgt[:, :-1] and is scored against tgt[:, 1:] (next-token targets).
    logits = model(src_data, tgt_data[:, :-1])
    logits = logits.contiguous().view(-1, tgt_vocab_size)
    targets = tgt_data[:, 1:].contiguous().view(-1)
    return criterion(logits, targets), _count_loss_tokens(criterion, targets)


def _evaluate(model, val_dataloader, criterion, device, tgt_vocab_size):
    """Return the token-weighted mean loss over `val_dataloader` (NaN if nothing was scored)."""
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for src_data, tgt_data in tqdm(val_dataloader, desc="Validating", leave=False):
            src_data, tgt_data = src_data.to(device), tgt_data.to(device)
            loss, n_tokens = _teacher_forced_loss(model, criterion, src_data, tgt_data, tgt_vocab_size)
            if n_tokens == 0:
                continue  # every target ignored: the batch mean is undefined
            total_loss += loss.item() * n_tokens
            total_tokens += n_tokens

    return total_loss / total_tokens if total_tokens else float("nan")


def train(model, train_dataloader, val_dataloader, optimizer, criterion, num_epochs, tgt_vocab_size):
    """Train `model` with teacher forcing and report per-epoch losses.

    Args:
        model: Module called as ``model(src, tgt_in)`` that returns logits
            whose last dimension is ``tgt_vocab_size``.
        train_dataloader: Iterable of ``(src, tgt)`` token-id batches used for optimisation.
        val_dataloader: Iterable of ``(src, tgt)`` batches evaluated after every epoch.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking ``(logits, targets)``. If it exposes
            ``ignore_index``, targets equal to it are excluded from the
            token counts used to average the reported losses.
        num_epochs: Number of passes over ``train_dataloader``.
        tgt_vocab_size: Size of the logits' last dimension.

    Returns:
        The same ``model`` object, moved to CUDA when available (otherwise CPU).

    Reported losses are weighted by the number of scored tokens per batch.
    Empty dataloaders yield ``nan`` instead of raising ``ZeroDivisionError``.
    """
    # Determine the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Move model to the device
    model = model.to(device)

    for epoch in range(num_epochs):
        # Train
        model.train()
        train_loss = 0.0
        train_tokens = 0
        train_progress_bar = tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}/{num_epochs}", leave=False)
        for src_data, tgt_data in train_progress_bar:
            src_data, tgt_data = src_data.to(device), tgt_data.to(device)

            optimizer.zero_grad()
            loss, n_tokens = _teacher_forced_loss(model, criterion, src_data, tgt_data, tgt_vocab_size)
            if n_tokens == 0:
                # Nothing to learn from; the mean loss would be NaN and poison the weights.
                continue

            loss.backward()
            optimizer.step()

            train_loss += loss.item() * n_tokens
            train_tokens += n_tokens

            train_progress_bar.set_postfix({"Train Loss": train_loss / train_tokens})

        avg_train_loss = train_loss / train_tokens if train_tokens else float("nan")

        # Validate
        val_loss = _evaluate(model, val_dataloader, criterion, device, tgt_vocab_size)

        print(f"Epoch: {epoch+1}/{num_epochs}")
        print(f"Train Loss: {avg_train_loss:.4f}")
        print(f"Validation Loss: {val_loss:.4f}")

    return model

# Usage example (`dataloader` is the training DataLoader built earlier in this notebook):
# trained_model = train(transformer, dataloader, val_dataloader, optimizer, criterion, num_epochs, tgt_vocab_size)

In [None]:
# Launch training; `train` returns the same model object after moving it to the selected device.
trained_model = train(
    model=transformer,
    train_dataloader=dataloader,
    val_dataloader=val_dataloader,
    optimizer=optimizer,
    criterion=criterion,
    num_epochs=num_epochs,
    tgt_vocab_size=tgt_vocab_size,
)

Using device: cuda


Training Epoch 1/10:   0%|          | 0/63 [00:00<?, ?it/s]

In [None]:
# Model used for generation below (the module-level `transformer`).
model = transformer

# Tokenizer for inference: same pretrained vocabulary as the one used to prepare the training data.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the function for inference with a start token
def inference(model, input_sentence, max_seq_length):
    """Greedily decode an output string for `input_sentence`.

    Uses the module-level `tokenizer` to encode the input and decode the result.
    Generation starts from the tokenizer's CLS token and stops when the SEP
    token is produced or after `max_seq_length` steps, whichever comes first.

    Args:
        model: Module called as ``model(src_ids, tgt_ids)`` that returns logits
            of shape ``(1, tgt_len, vocab)``. It is switched to eval mode.
        input_sentence: Text to encode as the source sequence.
        max_seq_length: Truncation length for the source and the maximum
            number of decoding steps.

    Returns:
        The decoded prediction with special tokens removed.
    """
    # Run on whichever device the model lives on instead of assuming CUDA,
    # so the function also works on CPU-only runtimes.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Step 1: Tokenize the input sentence
    tokenized_input = tokenizer(input_sentence, return_tensors='pt', padding=True, truncation=True, max_length=max_seq_length)

    # Step 2: Extract input_ids and prepare for model input
    src_input = tokenized_input['input_ids'].to(device)

    # Step 3: Seed the target sequence with the start (CLS) token
    tgt_input = torch.full((1, 1), tokenizer.cls_token_id, dtype=torch.long, device=device)

    # Step 4: Greedy decoding, one token per step
    generated_tokens = []
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        for _ in range(max_seq_length):
            output = model(src_input, tgt_input)

            # Most likely next token given everything generated so far; shape (1, 1)
            next_token = torch.argmax(output[:, -1, :], dim=-1).unsqueeze(0)
            token_id = next_token.item()
            generated_tokens.append(token_id)

            # SEP marks end of sequence for this tokenizer
            if token_id == tokenizer.sep_token_id:
                break

            # Feed the prediction back in as the next decoder input
            tgt_input = torch.cat((tgt_input, next_token), dim=1)

    # Step 5: Decode the generated tokens to a sentence
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)

# Prompt to run through the model
input_sentence = "In a quiet corner of a digital universe, there resided a brilliant"

# Generate with the same length cap that was used for tokenization
predicted_output = inference(model, input_sentence, max_seq_length)

# Show the prompt next to the generated text
print(f"Input: {input_sentence}")
print(f"Predicted Output: {predicted_output}")

In [None]:
# Count the trainable parameters of a model
def count_parameters(model):
    """Return the total element count of `model`'s parameters that require gradients."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Report the trainable-parameter total of the module-level `transformer`,
# formatted with thousands separators (the `:,` format spec).
print(f'The model has {count_parameters(transformer):,} trainable parameters')