In [2]:
!pip install pandas numpy sentence-transformers transformers torch fuzzywuzzy python-Levenshtein Pillow google-cloud-vision protobuf

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting google-cloud-vision
  Downloading google_cloud_vision-3.10.1-py3-none-any.whl.metadata (9.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_

In [3]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m77.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [1]:
# Mount Google Drive
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Setup
import os
import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

DRIVE_FOLDER = "/content/drive/My Drive/MsDSAI/January 2025/NLP/Project/Data/"
os.makedirs(DRIVE_FOLDER, exist_ok=True)

# Paths
csv_path = os.path.join(DRIVE_FOLDER, "drugbank_clean.csv")
index_path = os.path.join(DRIVE_FOLDER, "drug_index.faiss")
embed_path = os.path.join(DRIVE_FOLDER, "drug_embeddings.npy")

In [3]:
# Load DrugBank Dataset
if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)
    print(df.head())
else:
    # TODO: Parse XML yourself if not exist
    raise FileNotFoundError("Please prepare drugbank_clean.csv first.")

                           drugbank_ids                 name  \
0  ['DB00001', 'BTD00024', 'BIOD00024']            Lepirudin   
1  ['DB00002', 'BTD00071', 'BIOD00071']            Cetuximab   
2  ['DB00003', 'BTD00001', 'BIOD00001']         Dornase alfa   
3  ['DB00004', 'BTD00084', 'BIOD00084']  Denileukin diftitox   
4  ['DB00005', 'BTD00052', 'BIOD00052']           Etanercept   

                                         description  \
0  Lepirudin is a recombinant hirudin formed by 6...   
1  Cetuximab is a recombinant chimeric human/mous...   
2  Dornase alfa is a biosynthetic form of human d...   
3  Denileukin diftitox is an IL2-receptor-directe...   
4  Dimeric fusion protein consisting of the extra...   

                                          indication  \
0  Lepirudin is indicated for anticoagulation in ...   
1  Cetuximab indicated for the treatment of local...   
2  Used as adjunct therapy in the treatment of cy...   
3  Denileukin diftitox was previously indicated f...  

In [5]:
# login to hugging face
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# Load FAISS Index
model = SentenceTransformer('all-MiniLM-L6-v2')

if os.path.exists(index_path) and os.path.exists(embed_path):
    index = faiss.read_index(index_path)
    embeddings = np.load(embed_path)
else:
    print("Building FAISS index...")
    texts = df["description"].fillna("").tolist()
    embeddings = model.encode(texts, show_progress_bar=True)

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.array(embeddings))

    # Save
    np.save(embed_path, embeddings)
    faiss.write_index(index, index_path)
    print("Saved FAISS index and embeddings.")

In [7]:
# OCR and Extract Text
from google.cloud import vision

def ocr_google(path):
    client = vision.ImageAnnotatorClient()
    with open(path, 'rb') as img_file:
        content = img_file.read()
    image = vision.Image(content=content)
    response = client.text_detection(image=image)
    return response.text_annotations[0].description if response.text_annotations else ""

In [8]:
# Fuzzy Match OCR Result to Drug Name
from fuzzywuzzy import process

def find_best_drug(ocr_text, drug_names):
    words = ocr_text.split()
    candidates = []

    for word in words:
        match, score = process.extractOne(word, drug_names)
        if score > 85:
            candidates.append((match, score))

    if candidates:
        candidates.sort(key=lambda x: x[1], reverse=True)
        return candidates[0][0]
    else:
        return None

In [9]:
# Retrieve Info from FAISS
def retrieve_drug_info(drug_name):
    query_emb = model.encode([drug_name])
    D, I = index.search(np.array(query_emb), k=1)
    return df.iloc[I[0][0]].to_dict()

In [12]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
NVIDIA A100-SXM4-40GB


In [11]:
# Generate Summary with LLM
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model="mistralai/Mistral-7B-Instruct-v0.2",
    device=0  # Move to GPU if available
)

def build_prompt(info):
    return f"""
You are a medical assistant. Summarize the following drug information.

Name: {info.get('name', 'N/A')}
Description: {info.get('description', 'N/A')}
Indication: {info.get('indication', 'N/A')}
Mechanism of Action: {info.get('mechanism_of_action', 'N/A')}
Toxicity: {info.get('toxicity', 'N/A')}

Summarize this for a general audience.
"""

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Device set to use cuda:0


In [13]:
def generate_summary(info):
    prompt = build_prompt(info)
    summary = generator(prompt, max_new_tokens=250)[0]["generated_text"]
    return summary

In [16]:
import os

key_path = os.path.join(DRIVE_FOLDER, "google-cloud-service-key.json")
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path

In [17]:
# Input
image_path = os.path.join(DRIVE_FOLDER, "Calpol500.jpg")

# 1. OCR
ocr_text = ocr_google(image_path)
print("OCR Result:", ocr_text)

# 2. Find Drug
best_drug_name = find_best_drug(ocr_text, df["name"].tolist())
print("Best Matched Drug:", best_drug_name)

# 3. Retrieve Drug Info
if best_drug_name:
    drug_info = retrieve_drug_info(best_drug_name)

    # 4. Generate Summary
    final_summary = generate_summary(drug_info)
    print("Summary:\n", final_summary)
else:
    print("No matching drug found.")

OCR Result: «medical zadvice beyond 3 days;
Paracetamol overdose may be injurious to ver
To be used as directed by physician
Store at temperature not exceeding 30°C.
Protect from light and oisture.
Keep out of reach of children Mfg. Lic. No.: 25A/AD/258A
serious liver damage or allergic reactions (e.g. swing)
of the face, mouth and throat, difficulty in breathing,
Filching or rash)
Manufactured by GlaxoSmithKline
Pharmaceuticals Limited
At Plot No. D-5, MIDC Industrial Area,
Paithan, Aurangabad-431 148
Regd. Office: Dr. Annie Besant Road,
Worli, Mumbai 400 030.
Trade marks are owned by or licensed
to the GSK group of companies
For Toll free Customer Care Call 1800222203
Paracetamol Tablets IP 500 mg sk
Calpol 500
Each uncoated tablet contains
Paracetamol IP 500 mg
Analgesic and Antipyretic
15 Tablets
Dose Adults & children 12 years and above: 1-2 tablets
4-6 hourly upto maximum 4000mg per day Children i
6-11years 10-15mg/kg 4-6 houry upto maximum
60mg kg per day Children 6-Byears: 250m



Best Matched Drug: Medical air


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Summary:
 
You are a medical assistant. Summarize the following drug information.

Name: Medical air
Description: Medical air is supplied by a special air compressor to patient care areas using clean outside air.
Indication: For use as a source of clean air [FDA Label].
Mechanism of Action: Air is approximately 21% oxygen which enters the body through the lungs, crossing the alveolar membrane to reach systemic circulation [T36]. One there it is bound by hemoglobin and transported to tissues thoughout the body where it is used as a terminal electron acceptor in oxidative phosphorylation. This allows efficient generation of adenosine triphosphate, the primary storage molecule for energy, in the mitochondria.
Toxicity: nan

Summarize this for a general audience.

Medical air, also known as just plain air, is a type of gas that is used in healthcare settings to provide patients with clean air to breathe. It comes from a special compressor that filters out impurities from the outside air. T