In [7]:
# First, check and enable GPU in Colab
# Go to Runtime -> Change runtime type -> Select T4 GPU

# Install required libraries separately to ensure all are installed
!pip install PyPDF2
!pip install sentence-transformers faiss-cpu
!pip install transformers accelerate
!pip install sentencepiece


import PyPDF2
import re
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
import torch
import os

# Report whether CUDA is usable; when it is, also show the device name and its total memory
cuda_ready = torch.cuda.is_available()
print("GPU available:", cuda_ready)
if cuda_ready:
    print("GPU device:", torch.cuda.get_device_name(0))
    # total_memory is divided by 1e9 so the figure is printed in (decimal) gigabytes
    total_memory_bytes = torch.cuda.get_device_properties(0).total_memory
    print("GPU memory:", total_memory_bytes / 1e9, "GB")

# PDF extraction function with page limit
def extract_text_from_pdf(pdf_path, max_pages=500):
    """Extract text from the leading pages of a PDF file.

    Every page that yields non-blank text contributes one block consisting of
    a "Page <n>:" header line (1-based page number), the page text, and a
    blank separator line. Blocks are concatenated in page order. Progress is
    printed every 50 pages.

    Args:
        pdf_path: Path of the PDF file to read.
        max_pages: Maximum number of pages to process, counted from the first
            page. Values below zero are treated as zero.

    Returns:
        The concatenated text (an empty string when no page produced text),
        or None when the file could not be opened or parsed. In the failure
        case the error is printed rather than raised.
    """
    page_blocks = []
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            total_pages = len(pdf_reader.pages)
            # Clamp at zero so a negative limit cannot produce a negative page count
            pages_to_extract = max(0, min(max_pages, total_pages))

            print(f"Extracting text from {pages_to_extract} pages...")

            for page_num in range(pages_to_extract):
                page = pdf_reader.pages[page_num]
                # Treat a missing (None) result as "no text" so one blank page
                # cannot abort the whole extraction via .strip() on None
                page_text = page.extract_text() or ""
                if page_text.strip():
                    page_blocks.append(f"Page {page_num+1}:\n{page_text}\n\n")

                # Show progress
                if (page_num + 1) % 50 == 0:
                    print(f"Processed {page_num + 1} pages...")

    except Exception as e:
        # Script-level boundary: report the problem and signal failure with
        # None, which the caller tests for before building the index
        print(f"Error reading PDF: {e}")
        return None

    print(f"Successfully extracted text from {pages_to_extract} pages")
    # Join once at the end instead of growing a string page by page
    return "".join(page_blocks)

# Extract text from your PDF (limit to 500 pages)
# Hard-coded location of the source PDF; change this if the file lives elsewhere.
pdf_path = '/content/The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf'
# medical_text is None when reading failed, or a (possibly empty) string otherwise.
medical_text = extract_text_from_pdf(pdf_path, max_pages=500)

# Falsy for both None and "": everything in this branch (chunking, embedding,
# indexing and the interactive loop) is skipped when no text is available.
if medical_text:
    print(f"Extracted {len(medical_text)} characters from PDF")

    # Split text into chunks
    def split_into_chunks(text, chunk_size=500):
        """Group the sentences of ``text`` into chunks of roughly ``chunk_size`` characters.

        The text is split at a whitespace character that follows "." or "?",
        except where the two negative lookbehinds recognise an abbreviation-like
        pattern (for example "e.g." or "Dr."). Sentences are then packed
        greedily: a sentence joins the current chunk while the running length
        stays below ``chunk_size``; otherwise the current chunk is closed and
        the sentence starts a new one. A single sentence longer than
        ``chunk_size`` is kept whole, so chunks can exceed the limit.

        Args:
            text: The text to split.
            chunk_size: Target upper bound, in characters, for a chunk.

        Returns:
            A list of non-empty, stripped chunk strings in document order.
            Empty or whitespace-only input yields an empty list.
        """
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            # Only the final split piece can be blank (empty input, or trailing
            # whitespace after the last sentence). Skipping it avoids emitting
            # an empty chunk that would later be embedded and indexed.
            if not sentence.strip():
                continue

            if len(current_chunk) + len(sentence) < chunk_size:
                current_chunk += sentence + " "
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + " "

        # Flush whatever is left in the last, partially filled chunk
        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    # Create knowledge chunks
    # knowledge_chunks is the list that medical_chatbot later indexes into by
    # the row numbers returned from the FAISS search, so its order must stay
    # aligned with the order in which the embeddings are added to the index.
    knowledge_chunks = split_into_chunks(medical_text)
    print(f"Created {len(knowledge_chunks)} knowledge chunks")

    # Use GPU for embeddings if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Create embeddings (using a model that works well on GPU)
    # The same `model` object is reused by medical_chatbot to embed questions,
    # so questions and chunks are encoded by the same model.
    model = SentenceTransformer('all-mpnet-base-v2', device=device)
    chunk_embeddings = model.encode(knowledge_chunks, show_progress_bar=True)

    # Create FAISS index for efficient searching
    # NOTE(review): assumes encode() returned a 2-D array (one row per chunk);
    # shape[1] is then the embedding width. Presumably this fails when
    # knowledge_chunks is empty - TODO confirm.
    dimension = chunk_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)

    # Convert to float32 for FAISS
    chunk_embeddings_np = np.array(chunk_embeddings).astype('float32')
    # All rows are added in one call, in knowledge_chunks order.
    index.add(chunk_embeddings_np)

    # Set up QA pipeline with a model that can use GPU
    # device=0 selects the first CUDA device; -1 is presumably the CPU setting
    # of the transformers pipeline API - verify against its documentation.
    qa_pipeline = pipeline(
        "question-answering",
        model="deepset/roberta-base-squad2",
        tokenizer="deepset/roberta-base-squad2",
        device=0 if torch.cuda.is_available() else -1
    )

    # Medical chatbot function
    def medical_chatbot(question, top_k=3):
        """Answer ``question`` using the chunks most similar to it.

        The question is embedded with the module-level ``model``, the
        ``top_k`` nearest chunk embeddings are looked up in ``index``, the
        matching entries of ``knowledge_chunks`` are joined with spaces into
        one context string, and ``qa_pipeline`` extracts an answer span from
        that context.

        Args:
            question: The user's question text.
            top_k: Number of nearest chunks to request from the index.

        Returns:
            A tuple ``(answer, context, sources)``: the extracted answer
            string, the joined context that was searched, and the list of
            retrieved chunk strings. When the search yields no usable chunk
            the result is ``("", "", [])``.
        """
        # Find relevant context
        question_embedding = model.encode([question])
        question_embedding_np = np.array(question_embedding).astype('float32')

        # Search for most similar chunks
        _distances, neighbor_ids = index.search(question_embedding_np, k=top_k)

        # FAISS fills missing result slots with -1 when the index holds fewer
        # than top_k vectors. Used as a list index, -1 would silently return
        # the LAST chunk as a bogus source, so such slots are dropped.
        sources = [knowledge_chunks[i] for i in neighbor_ids[0] if i >= 0]
        if not sources:
            # Nothing retrieved: there is no context to run the QA model on
            return "", "", []

        context = " ".join(sources)

        # Use QA model to find answer
        result = qa_pipeline(question=question, context=context)

        return result['answer'], context, sources

    # Test the chatbot: read questions from stdin until the user asks to quit,
    # printing the extracted answer and a 200-character preview of each source.
    print("\nMedical AI Assistant is ready!")
    print("You can ask medical questions based on the encyclopedia content.")
    print("Type 'quit' to exit.\n")

    while True:
        # Strip surrounding whitespace so that input such as "quit " is still
        # recognised as an exit command
        question = input("Ask a medical question: ").strip()
        if question.lower() in ['quit', 'exit', 'q']:
            break
        if not question:
            # An empty line carries nothing to retrieve or answer; prompt
            # again instead of passing it on to retrieval and the QA model
            continue

        answer, context, sources = medical_chatbot(question)
        print(f"\nAnswer: {answer}")
        print("\nSources used:")
        for i, source in enumerate(sources):
            print(f"{i+1}. {source[:200]}...")
        print("-" * 80 + "\n")

else:
    # Reached when medical_text is None (the PDF could not be read) or an
    # empty string (no page produced any text).
    print("Could not extract text from PDF")

GPU available: True
GPU device: Tesla T4
GPU memory: 15.828320256 GB
Extracting text from 500 pages...
Processed 50 pages...
Processed 100 pages...
Processed 150 pages...
Processed 200 pages...
Processed 250 pages...
Processed 300 pages...
Processed 350 pages...
Processed 400 pages...
Processed 450 pages...
Processed 500 pages...
Successfully extracted text from 500 pages
Extracted 2068742 characters from PDF
Created 4809 knowledge chunks
Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/151 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Device set to use cuda:0



Medical AI Assistant is ready!
You can ask medical questions based on the encyclopedia content.
Type 'quit' to exit.

Ask a medical question: fever

Answer: high
fever

Sources used:
1. Symp-
toms of the disease appear suddenly and include high
fever , chills, headache , eye pain , red eyes, enlarged
lymph nodes, a red flush to the face, lower back pain,
extreme weakness, and severe ...
2. The disease is not spread from one person
to another. Approximately 60% of people who are infect-
ed exhibit no symptoms (asymptomatic). In the other
40%, symptoms appear 10–30 days after exposure. Th...
3. Shock can result in damage to
the body’s organs (especially the heart and kidneys)
because low blood flow deprives them of oxygen. Diagnosis
Diagnosis should be suspected in endemic areas when-
ever a...
--------------------------------------------------------------------------------

Ask a medical question: diabatic

Answer: Diabetic ketoacidosis

Sources used:
1. OTHER
“Foot Care.” American Dia