# Chatbot for Scientific Paper Summarization using RAG Model

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

import pandas as pd, numpy as np
import json, random

# Reservoir Sampling due to Huge dataset.

In [3]:
def sample_jsonl_file(input_file, output_file, sample_size):
    """
    Randomly samples lines from a JSON Lines file using reservoir sampling.

    Args:
        input_file (str): Path to the input JSON Lines file.
        output_file (str): Path to the output file where sampled lines will be saved.
        sample_size (int): Number of lines to sample.

    """
    sampled_lines = []
    with open(input_file, 'r') as fin:
        for i, line in enumerate(fin):
            if i < sample_size:
                sampled_lines.append(line)
            else:
                r = random.randint(0, i)
                if r < sample_size:
                    sampled_lines[r] = line
    with open(output_file, 'w') as fout:
        fout.writelines(sampled_lines)


In [4]:
random.seed(42)

In [5]:
# Paths to your original dataset files
train_file = '/content/drive/MyDrive/NLP_project/pubmed-dataset/pubmed-dataset/train.txt'
test_file = '/content/drive/MyDrive/NLP_project/pubmed-dataset/pubmed-dataset/test.txt'
val_file = '/content/drive/MyDrive/NLP_project/pubmed-dataset/pubmed-dataset/val.txt'

# Paths where you want to save the sampled datasets
sampled_train_file = '/content/drive/MyDrive/NLP_project/pubmed-dataset/train_sampled.txt'
sampled_test_file = '/content/drive/MyDrive/NLP_project/pubmed-dataset/test_sampled.txt'
sampled_val_file = '/content/drive/MyDrive/NLP_project/pubmed-dataset/val_sampled.txt'


In [None]:
# Sample 50000 entries from the train dataset
sample_jsonl_file(train_file, sampled_train_file, 50000)

# Sample 2500 entries from the test dataset
sample_jsonl_file(test_file, sampled_test_file, 2500)

# Sample 5000 entries from the validation dataset
sample_jsonl_file(val_file, sampled_val_file, 5000)


In [6]:
!wc -l '/content/drive/MyDrive/NLP_project/pubmed-dataset/train_sampled.txt'

!wc -l '/content/drive/MyDrive/NLP_project/pubmed-dataset/test_sampled.txt'

!wc -l '/content/drive/MyDrive/NLP_project/pubmed-dataset/val_sampled.txt'


50000 /content/drive/MyDrive/NLP_project/pubmed-dataset/train_sampled.txt
2500 /content/drive/MyDrive/NLP_project/pubmed-dataset/test_sampled.txt
5000 /content/drive/MyDrive/NLP_project/pubmed-dataset/val_sampled.txt


In [7]:
import pandas as pd

sampled_train_file = '/content/drive/MyDrive/NLP_project/pubmed-dataset/train_sampled.txt'
sampled_valid_file = '/content/drive/MyDrive/NLP_project/pubmed-dataset/val_sampled.txt'

try:
    train_df = pd.read_json(sampled_train_file, lines=True)
    val_df = pd.read_json(sampled_valid_file, lines=True)
    print("Data loaded successfully.")
except ValueError as e:
    print(f"Error loading data: {e}")


Data loaded successfully.


In [8]:
print("Train DataFrame Columns:", train_df.columns.tolist())
print("Number of records in Train DataFrame:", len(train_df))

Train DataFrame Columns: ['article_id', 'article_text', 'abstract_text', 'labels', 'section_names', 'sections']
Number of records in Train DataFrame: 50000


In [9]:
test_df = pd.read_json(sampled_test_file, lines=True)

In [10]:
print("Test DataFrame Columns:", test_df.columns.tolist())
print("Number of records in Test DataFrame:", len(test_df))

Test DataFrame Columns: ['article_id', 'article_text', 'abstract_text', 'labels', 'section_names', 'sections']
Number of records in Test DataFrame: 2500


# **1. Data Pre-Processing:**
* Combine article_text and abstract_text into single string, because both are lists of sentences.

In [11]:
columns_to_drop = ['article_id','labels']
train_df = train_df.drop(columns=columns_to_drop, errors='ignore')
val_df=val_df.drop(columns=columns_to_drop, errors='ignore')

In [12]:
train_df['article_text_combined'] = train_df['article_text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')
train_df['abstract_text_combined'] = train_df['abstract_text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')
val_df['article_text_combined'] = val_df['article_text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')
val_df['abstract_text_combined'] = val_df['abstract_text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')


In [13]:
print(train_df[['article_text_combined', 'abstract_text_combined']].head())

                               article_text_combined  \
0  a recent systematic analysis showed that in 20...   
1  the morphology of the cervix changes during pr...   
2  tardive dystonia ( td ) , a rarer side effect ...   
3  lepidoptera include agricultural pests that , ...   
4  hypertension , diabetes mellitus and obesity t...   

                              abstract_text_combined  
0  <S> background : the present study was carried...  
1  <S> background : we would like to find out tha...  
2  <S> tardive dystonia ( td ) is a serious side ...  
3  <S> many lepidopteran insects are agricultural...  
4  <S> obesity has become a global epidemic over ...  


In [14]:
import re

def clean_text(text):
    # Remove special tokens using regex
    text = re.sub(r'</?S>', '', text)  # Removes both <S> and </S>
    # Strip leading/trailing whitespace
    return text.strip()

train_df['article_text_combined'] = train_df['article_text_combined'].apply(clean_text)
train_df['abstract_text_combined'] = train_df['abstract_text_combined'].apply(clean_text)

val_df['article_text_combined'] = val_df['article_text_combined'].apply(clean_text)
val_df['abstract_text_combined'] = val_df['abstract_text_combined'].apply(clean_text)


In [15]:
train_df.head()

Unnamed: 0,article_text,abstract_text,section_names,sections,article_text_combined,abstract_text_combined
0,[a recent systematic analysis showed that in 2...,[<S> background : the present study was carrie...,"[INTRODUCTION, MATERIALS AND METHODS, Particip...",[[a recent systematic analysis showed that in ...,a recent systematic analysis showed that in 20...,background : the present study was carried out...
1,[the morphology of the cervix changes during p...,[<S> background : we would like to find out th...,"[Introduction, Methods, Results, General chara...",[[the morphology of the cervix changes during ...,the morphology of the cervix changes during pr...,background : we would like to find out that wh...
2,"[tardive dystonia ( td ) , a rarer side effect...",[<S> tardive dystonia ( td ) is a serious side...,"[INTRODUCTION, CASE REPORT, DISCUSSION, Declar...","[[tardive dystonia ( td ) , a rarer side effec...","tardive dystonia ( td ) , a rarer side effect ...",tardive dystonia ( td ) is a serious side effe...
3,"[lepidoptera include agricultural pests that ,...",[<S> many lepidopteran insects are agricultura...,"[1. Introduction, 2. Insect Immunity, 3. Signa...",[[lepidoptera include agricultural pests that ...,"lepidoptera include agricultural pests that , ...",many lepidopteran insects are agricultural pes...
4,"[hypertension , diabetes mellitus and obesity ...",[<S> obesity has become a global epidemic over...,"[1. Introduction, 2. Life-Style Interventions ...","[[hypertension , diabetes mellitus and obesity...","hypertension , diabetes mellitus and obesity t...",obesity has become a global epidemic over the ...


In [16]:
train_df.columns

Index(['article_text', 'abstract_text', 'section_names', 'sections',
       'article_text_combined', 'abstract_text_combined'],
      dtype='object')

# **2. Preparing the Retrieval Component:**

In [3]:
!pip install faiss-gpu
!pip install sentence-transformers

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


## Embedding model initialization

* 'allenai-specter' model is designed for scientific documents and can capture their semantic content effectively.
*  Setting up FAISS Index


In [4]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Initialize the same embedding model used for training data
embedding_model = SentenceTransformer('allenai/scibert_scivocab_uncased')

  from tqdm.autonotebook import tqdm, trange


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

In [None]:
train_embeddings = embedding_model.encode(
    train_df['article_text_combined'].tolist(),
    show_progress_bar=True,
    convert_to_numpy=True
)
val_embeddings = embedding_model.encode(
    val_df['article_text_combined'].tolist(),
    show_progress_bar=True,
    convert_to_numpy=True
)

# Combine embeddings and proceed with FAISS indexing
all_embeddings = np.vstack((train_embeddings, val_embeddings))

# Initialize a FAISS index for the combined embeddings
index = faiss.IndexFlatL2(all_embeddings.shape[1])  # Using L2 (Euclidean) distance
index.add(all_embeddings)

# Save the FAISS index and the embeddings for later use
faiss.write_index(index, "/content/drive/MyDrive/NLP_project/all_faiss_index")
np.save("/content/drive/MyDrive/NLP_project/all_embeddings.npy", all_embeddings)


Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

In [24]:
# Create a combined list of article texts for reference during retrieval
all_texts = train_df['article_text_combined'].tolist() + val_df['article_text_combined'].tolist()

In [9]:
# Load FAISS index and embeddings
faiss_index_path = "/content/drive/MyDrive/NLP_project/all_faiss_index"
embeddings_path = "/content/drive/MyDrive/NLP_project/all_embeddings.npy"
all_texts_path = "/content/drive/MyDrive/NLP_project/all_texts.txt"

In [10]:
index = faiss.read_index(faiss_index_path)
all_embeddings = np.load(embeddings_path)

# Load text data
with open(all_texts_path, 'r') as f:
    all_texts = f.readlines()

In [48]:
type(all_texts)

list

In [49]:
# Ensure all elements in all_texts are strings
all_texts = [str(entry) for entry in all_texts]

In [51]:
import os

save_dir = "/content/drive/MyDrive/NLP_project"
os.makedirs(save_dir, exist_ok=True)
all_texts_txt_path = os.path.join(save_dir, "all_texts.txt")

try:
    with open(all_texts_txt_path, 'w') as f:
        for text in all_texts:
            f.write(text + "\n")
    print(f"all_texts saved successfully as a text file at {all_texts_txt_path}")
except Exception as e:
    print(f"Error saving all_texts as text: {e}")

all_texts saved successfully as a text file at /content/drive/MyDrive/NLP_project/all_texts.txt


In [52]:
print(all_texts[:5])

["a recent systematic analysis showed that in 2011 , 314 ( 296 - 331 ) million children younger than 5 years were mildly , moderately or severely stunted and 258 ( 240 - 274 ) million were mildly , moderately or severely underweight in the developing countries . in iran a study among 752 high school girls in sistan and baluchestan showed prevalence of 16.2% , 8.6% and 1.5% , for underweight , overweight and obesity , respectively . the prevalence of malnutrition among elementary school aged children in tehran varied from 6% to 16% . anthropometric study of elementary school students in shiraz revealed that 16% of them suffer from malnutrition and low body weight . snack should have 300 - 400 kcal energy and could provide 5 - 10 g of protein / day . nowadays , school nutrition programs are running as the national programs , world - wide . national school lunch program in the united states there are also some reports regarding school feeding programs in developing countries . in vietnam 

In [None]:
import sys

print(f"Number of entries in all_texts: {len(all_texts)}")
print(f"Estimated size in memory: {sys.getsizeof(all_texts) / (1024 ** 2):.2f} MB")  # Size in MB

Number of entries in all_texts: 55000
Estimated size in memory: 0.42 MB


In [7]:
!pip install pyMuPDF
!pip install python-docx


Collecting pyMuPDF
  Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m88.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyMuPDF
Successfully installed pyMuPDF-1.24.14
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [11]:
import os
import fitz
from docx import Document
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from flask import Flask, request, jsonify


## Input File Handling Functions:
* Text Extraction from the respective Files uploaded in chatbot.
* Handling File Uploads by determining the type of file uploaded.

In [12]:
def parse_pdf(filepath):
    text = ""
    with fitz.open(filepath) as doc:
        for page in doc:
            text += page.get_text()
    return text

def parse_docx(filepath):
    doc = Document(filepath)
    return "\n".join([para.text for para in doc.paragraphs])

def parse_txt(filepath):
    with open(filepath, 'r') as file:
        return file.read()


In [13]:
def extract_text_from_file(filepath):
    if filepath.endswith('.pdf'):
        return parse_pdf(filepath)
    elif filepath.endswith('.docx'):
        return parse_docx(filepath)
    elif filepath.endswith('.txt'):
        return parse_txt(filepath)
    else:
        raise ValueError("Unsupported file format. Please upload a .pdf, .docx, or .txt file.")


# **3. Fine-tuning the Generator Model**

In [14]:
!pip install transformers
import torch



## Initialize tokenizer and model


In [15]:
from transformers import BartTokenizer, BartForConditionalGeneration


tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

# **4. Integrating Retrieval and Generation**

In [16]:
def retrieve_context(query_text, index, all_texts, embedding_model, k=5):
    query_embedding = embedding_model.encode([query_text], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, k)
    retrieved_contexts = [all_texts[i] for i in indices.flatten()]
    return " ".join(retrieved_contexts)


In [17]:
def generate_summary(context, model, tokenizer, max_length=200):
    inputs = tokenizer(context, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs['input_ids'], max_length=max_length, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# Testing the Pipeline

In [35]:
test_file_path = "/content/A_RAG-based_Medical_Assistant_Especially_for_Infectious_Diseases.pdf"

In [37]:
try:
    # Step 1: Extract text from file
    extracted_text = extract_text_from_file(test_file_path)

    # Step 2: Retrieve relevant context
    retrieved_context = retrieve_context(extracted_text, index, all_texts, embedding_model)

    # Step 3: Combine input and retrieved context
    combined_input = extracted_text + "\n" + retrieved_context

    # Step 4: Generate summary
    summary = generate_summary(combined_input, summarization_model, tokenizer)
    print("Generated Summary:\n", summary)

except Exception as e:
    print(f"Error during pipeline testing: {e}")

Generated Summary:
 A RAG-based Medical Assistant Especially for Infectious Diseases. The chatbot can interpret and respond appropriately with the use of Natural Language Processing (NLP) The data is stored in the graph database as nodes and relationships, and the knowledge graph is constructed on top of it. augmented generation is utilized to extract the pertinent content from the data.


In [18]:
test_file_path ='/content/ciae425.pdf'

In [19]:
try:
    # Step 1: Extract text from file
    extracted_text = extract_text_from_file(test_file_path)

    # Step 2: Retrieve relevant context
    retrieved_context = retrieve_context(extracted_text, index, all_texts, embedding_model)

    # Step 3: Combine input and retrieved context
    combined_input = extracted_text + "\n" + retrieved_context

    # Step 4: Generate summary
    summary = generate_summary(combined_input, summarization_model, tokenizer)
    print("Generated Summary:\n", summary)

except Exception as e:
    print(f"Error during pipeline testing: {e}")

Generated Summary:
 Nontuberculous mycobacterial pulmonary disease (NTM-PD) is increasing globally. Patients with NTM-PD typically present with chronic nonspecific respiratory and constitutional symptoms. Macrolides and aminoglycosides are the 2 most potent anti-microbial classes against most NTM.
