<div align="center">

# Assignment 6, 7, & 8 - Abstract Generation from COVID-19 Dataset   
### Name: Gauranga Kumar Baishya
### Roll No.: MDS202325  

</div>

In [None]:
# Mount Google Drive so the corpus and the saved weights under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

import os
# Configure PyTorch's CUDA caching allocator through its environment variable.
# This is set here, ahead of the cell that imports torch.
# NOTE(review): "expandable_segments" is presumably meant to reduce memory fragmentation — confirm against the PyTorch docs.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"



- Importing Necessary Libraries

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from collections import Counter
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import numpy as np
from sklearn.model_selection import train_test_split

- Read the extracted abstracts, stored as plain text with one abstract per line

In [None]:
# Load the extracted abstracts (one per line).
# The encoding is stated explicitly because the corpus contains non-ASCII characters;
# without it, decoding silently depends on the platform's default locale.
with open('/content/drive/MyDrive/Assgn678/extracted_abstracts.txt', 'r', encoding='utf-8') as file:
    # Keep only lines whose raw length (measured before stripping) exceeds 50 characters,
    # then strip the surrounding whitespace/newline from the lines that are kept.
    text = [line.strip() for line in file if len(line) > 50]

# Training on first 60,000 (total approx 70,000 lines as some files have more than 1 abstracts) and keeping other for generation
text = text[:60000]

- Tokenize, prepare dataset and create vocabulary

In [None]:
def preprocess_text(text):
    """Lower-case an abstract and split it on whitespace.

    Args:
        text: The raw abstract as a single string.

    Returns:
        A list of lower-cased whitespace-delimited tokens (empty for an empty
        or whitespace-only string).
    """
    lowered = text.lower()
    tokens = lowered.split()
    return tokens

# Tokenize every abstract.
tokenized_data = list(map(preprocess_text, text))

# Flatten all abstracts into one token stream and count how often each word occurs.
all_words = [token for tokens in tokenized_data for token in tokens]
word_counts = Counter(all_words)

# Special tokens occupy the first four IDs: <PAD>=0, <START>=1, <END>=2, <UNK>=3.
pad_token, start_token, end_token, unk_token = "<PAD>", "<START>", "<END>", "<UNK>"
vocab_spcl = dict(zip((pad_token, start_token, end_token, unk_token), range(4)))

# Corpus words are numbered by descending frequency, starting right after the special IDs.
most_common_words = word_counts.most_common()
vocab = {
    word: word_id
    for word_id, (word, _) in enumerate(most_common_words, start=len(vocab_spcl))
}
vocab.update(vocab_spcl)   # special tokens are appended after the corpus words

# Reverse lookup used to turn generated IDs back into words.
id_to_word = dict(zip(vocab.values(), vocab.keys()))

- Convert tokens to IDs and pad the sequences to a fixed length

In [None]:
def tokens_to_ids_with_start_end(tokens, vocab, start_token, end_token, unk_token):
    """Encode a token list as vocabulary IDs wrapped in start/end markers.

    Args:
        tokens: Iterable of word strings.
        vocab: Mapping from word to integer ID; must contain the three special tokens.
        start_token: Key in ``vocab`` whose ID is prepended.
        end_token: Key in ``vocab`` whose ID is appended.
        unk_token: Key in ``vocab`` whose ID replaces words missing from ``vocab``.

    Returns:
        ``[start_id, id_1, ..., id_n, end_id]`` as a list of ints.
    """
    ids = [vocab[start_token]]
    for word in tokens:
        ids.append(vocab.get(word, vocab[unk_token]))
    ids.append(vocab[end_token])
    return ids

# Encode every tokenized abstract as <START> + word IDs + <END>.
tokenized_data_ids = [
    tokens_to_ids_with_start_end(tokens, vocab, start_token, end_token, unk_token)
    for tokens in tokenized_data
]

# Bring every sequence to exactly max_len IDs: shorter ones are right-padded with the
# <PAD> ID, longer ones are cut at the end (truncating='post').
max_len = 256
padded_data_ids = pad_sequences(
    tokenized_data_ids,
    maxlen=max_len,
    padding='post',
    truncating='post',
    value=vocab[pad_token],
)

In [None]:
class COVIDAbstractDataset(Dataset):
    """Dataset of ID sequences that yields next-token prediction pairs.

    Each item is ``(sequence[:-1], sequence[1:])``: the target is the input
    shifted left by one position.
    """

    def __init__(self, tokenized_data):
        # Indexable collection of ID sequences; stored as given.
        self.data = tokenized_data

    def __len__(self):
        n_sequences = len(self.data)
        return n_sequences

    def __getitem__(self, idx):
        sequence = self.data[idx]
        model_input = torch.tensor(sequence[:-1])   # everything except the last ID
        target = torch.tensor(sequence[1:])         # everything except the first ID
        return model_input, target

def collate_fn(batch):
    """Stack a list of (input, target) tensor pairs into two batch tensors.

    Args:
        batch: Sequence of ``(input_tensor, target_tensor)`` pairs.

    Returns:
        ``(inputs, targets)``, each stacked along a new leading dimension.
    """
    input_column, target_column = zip(*batch)
    return torch.stack(input_column, dim=0), torch.stack(target_column, dim=0)

# Hold out 25% of the padded sequences for validation; the fixed seed keeps the split repeatable.
train_data, val_data = train_test_split(
    padded_data_ids,
    test_size=0.25,
    random_state=42,
)

batch_size = 8

# Training pipeline: batches are reshuffled on every pass.
train_dataset = COVIDAbstractDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Validation pipeline: fixed order.
val_dataset = COVIDAbstractDataset(val_data)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

## Assignment - 06

- Description of the architecture

The model consists of three primary components: an **Embedding Layer**, a **Stacked LSTM Layer**, and a **Fully Connected (Linear) Layer**.

#### **1. Embedding Layer**
- **Purpose**: Maps each token to a dense vector representation, capturing semantic relationships.
- **Input**: Sequence of token indices.
- **Output**: Each token is embedded into a **128-dimensional vector**.
- **Details**: Vocabulary size $V = 324{,}033$, output embedding size = **128**.

#### **2. LSTM Layer**
- **Purpose**: Captures temporal dependencies between tokens, learning contextual relationships.
- **Input**: Embedded token sequence (128-dimensional vectors).
- **Hidden State**: *2 stacked LSTM layers* with **256-dimensional hidden states**.
- **Output**: Sequence of hidden states used for prediction.

#### **3. Fully Connected (Linear) Layer**
- **Purpose**: Maps the LSTM hidden state at each time step to a score for every word in the vocabulary.
- **Input**: 256-dimensional hidden state from the LSTM.
- **Output**: **324,033**-dimensional vector of unnormalized scores (logits) for the next token; the softmax that turns them into probabilities is applied inside the cross-entropy loss.

#### **4. Hidden State Initialization**
- The LSTM’s hidden and cell states are initialized as zero vectors of shape **(2, batch_size, 256)**.

#### **Forward Pass**
1. **Input**: Sequence of token indices.
2. **Embedding**: Tokens are converted into 128-dimensional vectors.
3. **LSTM**: The embedded sequence is processed by the stacked LSTM layers to capture sequential dependencies.
4. **Prediction**: The hidden state at every time step is passed through the fully connected layer to produce next-token logits over the vocabulary.


In [None]:
# Model hyperparameters: embedding width, LSTM hidden width, number of stacked LSTM layers.
embedding_dim, hidden_dim, num_layers = 128, 256, 2
# One output class per vocabulary entry (corpus words plus the four special tokens).
vocab_size = len(vocab)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

class AbstractGenerator(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> linear projection.

    Args:
        vocab_size: Number of entries in the vocabulary (size of the output layer).
        embedding_dim: Width of the token embeddings.
        hidden_dim: Width of each LSTM layer's hidden and cell state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        """Run one forward pass.

        Args:
            x: Integer token IDs of shape (batch, seq_len).
            hidden: Tuple ``(h_0, c_0)`` as returned by ``init_hidden`` or by a
                previous call to this method.

        Returns:
            ``(logits, hidden)`` where ``logits`` has shape
            (batch, seq_len, vocab_size) and ``hidden`` is the updated
            ``(h_n, c_n)`` tuple.
        """
        embedded = self.embedding(x)
        out, hidden = self.lstm(embedded, hidden)
        # The projection is applied at every time step, not only the last one.
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        """Return zero-filled ``(h_0, c_0)`` states for a batch.

        The shape, device and dtype are taken from this model's own LSTM
        rather than from module-level globals, so the states always match the
        instance even when it was built with different sizes or moved to
        another device.

        Args:
            batch_size: Number of sequences in the batch.

        Returns:
            Tuple of two tensors, each of shape (num_layers, batch_size, hidden_dim).
        """
        reference = next(self.lstm.parameters())
        shape = (self.lstm.num_layers, batch_size, self.lstm.hidden_size)
        h_0 = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        c_0 = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        return h_0, c_0

model = AbstractGenerator(vocab_size, embedding_dim, hidden_dim, num_layers).to(device)
# Every sequence is right-padded with <PAD> up to max_len, so many target positions are
# padding. ignore_index excludes them from the loss; otherwise the reported loss is
# deflated by trivially predictable <PAD> targets and the model spends capacity on them.
# (Loss values are therefore not comparable with runs that counted padding.)
criterion = nn.CrossEntropyLoss(ignore_index=vocab[pad_token])
optimizer = optim.Adam(model.parameters(), lr=0.005)

Using device: cuda


In [None]:
# Show the module's repr (layer names and sizes) as the cell output.
model

AbstractGenerator(
  (embedding): Embedding(324033, 128)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=324033, bias=True)
)

In [None]:
import torch  # torch is already imported in the libraries cell; repeated here
# Ask PyTorch to release cached GPU memory it is not currently using.
torch.cuda.empty_cache()

In [None]:
from tqdm import tqdm

# Training configuration: at most num_epochs passes over the training set, with early
# stopping once the validation loss has not improved for `patience` consecutive epochs.
num_epochs = 20
loss_history = {'train': [], 'val': []}   # per-epoch mean losses, plotted in a later cell
best_val_loss = float('inf')
patience = 5
epochs_without_improvement = 0

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")

    # Training phase
    model.train()
    train_loss = 0
    train_pbar = tqdm(train_loader, desc="Training", leave=False)

    for inputs, targets in train_pbar:
        # Size the zero state from the batch itself rather than from batch_size.
        actual_batch_size = inputs.size(0)
        hidden = model.init_hidden(actual_batch_size)

        inputs, targets = inputs.to(device), targets.to(device).long()  # Ensure targets are torch.long
        optimizer.zero_grad()

        outputs, hidden = model(inputs, hidden)
        # NOTE(review): `hidden` is rebuilt by init_hidden at the top of every iteration,
        # so this detached state is never fed into a later batch.
        hidden = tuple(h.detach() for h in hidden)

        # Flatten scores to (positions, vocab_size) and targets to (positions,) so that
        # every sequence position is scored as one classification example.
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_pbar.set_postfix({"Batch Loss": loss.item()})

    # Mean of the per-batch losses for this epoch.
    train_loss /= len(train_loader)
    loss_history['train'].append(train_loss)

    # Validation phase
    model.eval()
    val_loss = 0
    val_pbar = tqdm(val_loader, desc="Validation", leave=False)

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for inputs, targets in val_pbar:
            actual_batch_size = inputs.size(0)
            hidden = model.init_hidden(actual_batch_size)

            inputs, targets = inputs.to(device), targets.to(device).long()  # Ensure targets are torch.long
            outputs, hidden = model(inputs, hidden)
            loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))

            val_loss += loss.item()
            val_pbar.set_postfix({"Batch Loss": loss.item()})

    val_loss /= len(val_loader)
    loss_history['val'].append(val_loss)

    print(f"Epoch {epoch+1}/{num_epochs} Results: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    # Early-stopping bookkeeping. Each improving epoch writes its own checkpoint file
    # (relative path, named by epoch); earlier checkpoint files are not removed.
    # NOTE(review): the best checkpoint is never loaded back into `model` in this loop,
    # so after early stopping `model` holds the weights of the last epoch that ran.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0
        print("Validation loss improved. Saving the model...")
        torch.save(model.state_dict(), f"best_model_epoch_{epoch+1}.pth")
    else:
        epochs_without_improvement += 1
        print("Validation loss did not improve.")

    if epochs_without_improvement >= patience:
        print(f"Early stopping at epoch {epoch+1}")
        break

Epoch 1/20
                                                                                
Epoch 1/20 Results: Train Loss: 3.1630, Val Loss: 2.8533
Validation loss improved. Saving the model...

Epoch 2/20
                                                                                
Epoch 2/20 Results: Train Loss: 2.6793, Val Loss: 2.7405
Validation loss improved. Saving the model...

Epoch 3/20
                                                                                
Epoch 3/20 Results: Train Loss: 2.4599, Val Loss: 2.7131
Validation loss improved. Saving the model...

Epoch 4/20
                                                                                
Epoch 4/20 Results: Train Loss: 2.2929, Val Loss: 2.7466
Validation loss did not improve.

Epoch 5/20
                                                                                
Epoch 5/20 Results: Train Loss: 2.1621, Val Loss: 2.8026
Validation loss did not improve.

Epoch 6/20
                                   

In [None]:
# Draw the per-epoch training and validation loss curves on one set of axes.
for history_key, curve_label in (('train', 'Train Loss'), ('val', 'Validation Loss')):
    plt.plot(loss_history[history_key], label=curve_label)

plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Loss over Epochs")
plt.legend()
plt.show()

- Save the model weights

In [None]:
# Persist the model's current in-memory weights to Google Drive.
# NOTE(review): the training loop never reloads its best checkpoint, so these are the
# weights from the last epoch that ran, not necessarily the ones with the lowest
# validation loss — confirm this is intended.
torch.save(model.state_dict(), "/content/drive/MyDrive/Assgn678/generator_LSTM_full_sent.pth")
print("Model weights saved!")

Model weights saved!


- Load the model weights

In [None]:
 model.load_state_dict(torch.load("/content/drive/MyDrive/Assgn678/generator_LSTM_full_sent.pth", weights_only=True))
print("Model weights loaded!")

Model weights loaded!


In [None]:
def sentence_to_token_ids(sentence, word_to_id, start_token=1):
    """Encode a prompt as vocabulary IDs, prefixed with the start-token ID.

    A token is looked up as typed first; if it is missing, its lower-cased
    form is tried before falling back to ``<UNK>``. The vocabulary is built
    from lower-cased text, so without this fallback any capitalised prompt
    word (e.g. "COVID-19", "We") would be encoded as ``<UNK>``.

    Args:
        sentence: Prompt text; split on whitespace.
        word_to_id: Mapping from word to integer ID; must contain ``'<UNK>'``
            whenever the prompt has a word that cannot be resolved.
        start_token: Integer ID placed at the front of the result.

    Returns:
        ``[start_token, id_1, ..., id_n]`` as a list of ints.
    """
    token_ids = [start_token]
    for token in sentence.split():
        if token in word_to_id:
            token_ids.append(word_to_id[token])
        else:
            # Mirror the training-time lower-casing before giving up on the word.
            token_ids.append(word_to_id.get(token.lower(), word_to_id['<UNK>']))
    return token_ids

def generate_abstract_from_sentence(input_sentence, word_to_id, id_to_word, max_length=50, start_token=1, end_token=2):
    """Greedily continue a prompt with the module-level ``model``.

    The whole prompt is fed once to prime the LSTM state; after that each
    predicted token is fed back in, one at a time, with the hidden state
    carried over between steps.

    Args:
        input_sentence: Prompt text.
        word_to_id: Mapping from word to integer ID.
        id_to_word: Mapping from integer ID back to word.
        max_length: Maximum number of tokens to generate after the prompt.
        start_token: ID prepended to the encoded prompt.
        end_token: ID that stops generation as soon as it is predicted.

    Returns:
        The prompt tokens followed by the generated tokens, joined by single
        spaces (the leading start token is omitted; a predicted end token is kept).
    """
    model.eval()
    input_tokens = sentence_to_token_ids(input_sentence, word_to_id, start_token)
    # Work on a copy so the running output does not alias the encoded prompt.
    generated_tokens = list(input_tokens)
    input_tensor = torch.tensor(input_tokens).unsqueeze(0).to(device)
    hidden = model.init_hidden(1)

    # Inference only: without no_grad the autograd graph would keep growing through the
    # hidden state that is carried from one step to the next.
    with torch.no_grad():
        for _ in range(max_length):
            output, hidden = model(input_tensor, hidden)
            # Greedy choice: highest-scoring word at the last position of the sequence.
            next_token = output[0, -1].argmax(dim=-1).item()
            generated_tokens.append(next_token)

            if next_token == end_token:
                break

            # Only the newly generated token is fed next; the context lives in `hidden`.
            input_tensor = torch.tensor([[next_token]]).to(device)

    return ' '.join([id_to_word[token] for token in generated_tokens[1:]])

In [None]:
sen1 = 'COVID-19 caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) primarily appeared in Wuhan, China, in December 2019. At present, no proper therapy and vaccinations are available for the disease, and it is increasing day by day with a high mortality rate. Pharmacophore based virtual screening of the selected natural product databases followed by Glide molecular docking and dynamics studies against SARS-CoV-2 main protease was investigated to identify potential ligands that may act as inhibitors. The molecules SN00293542 and SN00382835 revealed the highest docking score of 214.57 and 212.42 kcal/mol, respectively, when compared with the co-crystal ligands of PDB-6Y2F (O6K) and 6W63 (X77) of the SARS-CoV-2 M pro . To further validate the interactions of top scored molecules SN00293542 and SN00382835, molecular dynamics study of 100 ns was carried out. This indicated that the protein-ligand complex was stable throughout the simulation period, and minimal backbone fluctuations have ensued in the system. Post-MM-GBSA analysis of molecular dynamics data showed free binding energy-71.7004 1/2 7.98, 256.811/2 7.54 kcal/mol, respectively. The computational study identified several ligands that may act as potential inhibitors of SARS-CoV-2 M pro . The top-ranked molecules SN00293542, and SN00382835 occupied the active site of the target, the main protease like that of the co-crystal ligand. These molecules may emerge as a promising ligands against SARS-CoV-2 and thus needs further detailed investigations.'

In [None]:
# Prompt: the opening words of the reference abstract stored in sen1.
# Generation uses the function's defaults (up to 50 new tokens, start ID 1, end ID 2).
input_sentence = 'COVID-19 caused by severe acute respiratory'
generated_text = generate_abstract_from_sentence(input_sentence, vocab, id_to_word)
print("Generated Abstract:", generated_text)

Generated Abstract: <UNK> caused by severe acute respiratory syndrome coronavirus 2 (sars-cov-2) is a novel coronavirus named coronavirus disease 2019 caused by severe acute respiratory syndrome coronavirus 2 (sars-cov-2). the spike protein of sars-cov is a novel coronavirus named coronavirus disease 2019 caused by severe acute respiratory syndrome coronavirus 2 (sars-cov-2). the spike (s) protein binds to the


In [None]:
# Echo the generated string as the cell's output value (same text as printed by the previous cell).
generated_text

'<UNK> caused by severe acute respiratory syndrome coronavirus 2 (sars-cov-2) is a novel coronavirus named coronavirus disease 2019 caused by severe acute respiratory syndrome coronavirus 2 (sars-cov-2). the spike protein of sars-cov is a novel coronavirus named coronavirus disease 2019 caused by severe acute respiratory syndrome coronavirus 2 (sars-cov-2). the spike (s) protein binds to the'

In [None]:
sen2 =  'Results: A cohort of 488 patients was analyzed. Infective causes were found in 137 (28.1%) patients. Bacterial, viral and mixed infections were detected in 86 (17.6%), 41 (8.4%) and 10 (2.0%) patients, respectively. Bacteriology was established mostly by sputum culture and virology by nasopharyngeal aspirate (NPA) viral culture. The commonest bacterial isolates were Haemophilus influenzae (31), Pseudomonas aeruginosa (15), Mycobacterium tuberculosis (14), Klebsiella spp. (9) and Streptococcus pneumoniae (6). Influenza A virus (28, 8 were pandemic 2009 A/H1N1 subtype) and respiratory syncytial virus (16) were the most frequent viral causes. Independent predictors of viral pneumonia included nursing home residence (RR 3.056, P = 0.009) and absence of leukocytosis (RR 0.425, P = 0.026).'

In [None]:
# Prompt: the first words of the reference abstract in sen2, without its leading "Results:" label.
input_sentence = "A cohort of 488 patients was analyzed. Infective causes"
generated_text = generate_abstract_from_sentence(input_sentence, vocab, id_to_word)
print("Generated Abstract:", generated_text)

Generated Abstract: <UNK> cohort of 488 patients was analyzed. <UNK> causes severe hypoxia (pao2 50 mm mmol/l and a four-fold muscarinic cutoff value of the nose/eye were not associated with the covid-19 pandemic. <END>


In [None]:
# Echo the generated string as the cell's output value (same text as printed by the previous cell).
generated_text

'<UNK> cohort of 488 patients was analyzed. <UNK> causes severe hypoxia (pao2 50 mm mmol/l and a four-fold muscarinic cutoff value of the nose/eye were not associated with the covid-19 pandemic. <END>'

In [None]:
sen3 = 'We have identified here the main venom proteins of two braconid wasps, Psyttalia lounsburyi (two strains from South Africa and Kenya) and P. concolor, olive fruit fly parasitoids that differ in host range. Among the shared abundant proteins, we found a GH1 β-glucosidase and a family of leucine-rich repeat (LRR) proteins. Olive is extremely rich in glycoside compounds that are hydrolyzed by β-glucosidases into defensive toxic products in response to phytophagous insect attacks. Assuming that Psyttalia host larvae sequester ingested glycosides, the injected venom GH1 β-glucosidase could induce the release of toxic compounds, thus participating in parasitism success by weakening the host. Venom LRR proteins are similar to truncated Toll-like receptors and may possibly scavenge the host immunity. The abundance of one of these LRR proteins in the venom of only one of the two P. lounsburyi strains evidences intraspecific variation in venom composition. Altogether, venom intra-and inter-specific variation in Psyttalia spp. were much lower than previously reported in the Leptopilina genus (Figitidae), suggesting it might depend upon the parasitoid taxa.'

In [None]:
# Prompt: the opening words of the reference abstract stored in sen3.
input_sentence = 'We have identified here the main venom'
generated_text = generate_abstract_from_sentence(input_sentence, vocab, id_to_word)
print("Generated Abstract:", generated_text)

Generated Abstract: <UNK> have identified here the main venom and g-quadruplex properties of the human coronavirus (hcov) 229e and the non-pathogenic mopeia virus (mopv) were isolated from the human body, and the hemagglutinin-esterase (he) protein was inserted into the hemolymph of the body, and the hemagglutinin-esterase (he) protein was inserted into the golgi group and the ∆na(rbd) of the


In [None]:
# Echo the generated string as the cell's output value (same text as printed by the previous cell).
generated_text

'<UNK> have identified here the main venom and g-quadruplex properties of the human coronavirus (hcov) 229e and the non-pathogenic mopeia virus (mopv) were isolated from the human body, and the hemagglutinin-esterase (he) protein was inserted into the hemolymph of the body, and the hemagglutinin-esterase (he) protein was inserted into the golgi group and the ∆na(rbd) of the'

In [None]:
# Fourth prompt; unlike the earlier examples, no full reference abstract is stored for it in this notebook.
input_sentence = "Background: Evidence on the effectiveness of respiratory"
generated_text = generate_abstract_from_sentence(input_sentence, vocab, id_to_word)
print("Generated Abstract:", generated_text)

Generated Abstract: <UNK> <UNK> on the effectiveness of respiratory viral infections in the respiratory tract and pancreas of the mexican population are not fully known. the aim of this study was to investigate the association between respiratory signs/symptoms and respiratory viruses in children with acute respiratory distress syndrome (ards). <END>


In [None]:
# Echo the generated string as the cell's output value (same text as printed by the previous cell).
generated_text

'<UNK> <UNK> on the effectiveness of respiratory viral infections in the respiratory tract and pancreas of the mexican population are not fully known. the aim of this study was to investigate the association between respiratory signs/symptoms and respiratory viruses in children with acute respiratory distress syndrome (ards). <END>'