In [1]:
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# --- Files ---------------------------------------------------------------
# JSONL input read by the indexing cell (one JSON record per line).
INPUT_FILE = "parsed_pmc_2_chunked.jsonl"
# Where the FAISS index is written after indexing.
FAISS_INDEX_PATH = "faiss_index.index"
# Where the per-vector metadata records are written, one JSON object per line.
METADATA_PATH = "faiss_metadata.jsonl"

# --- Embedding -----------------------------------------------------------
# Model name handed to SentenceTransformer.
EMBED_MODEL = "all-MiniLM-L6-v2"
# Number of records accumulated before they are embedded and added to the index.
BATCH_SIZE = 1024

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Embedding model used both when building the index and when encoding queries.
model = SentenceTransformer(EMBED_MODEL)

# Indexing state: `index` stays None until the first batch has been embedded
# (the indexing cell creates it once the vector width is known); `metadata`
# collects the source records that were added to the index.
index, metadata = None, []

In [3]:
def embed_texts(texts, batch_size=32):
    """Encode ``texts`` with the shared ``model`` and return float32 vectors.

    Args:
        texts: The strings to embed.
        batch_size: How many texts the encoder processes per step. Defaults to
            32, the value that used to be hard-coded, so existing calls behave
            exactly as before.

    Returns:
        Whatever ``model.encode`` returns, cast to ``float32`` (the dtype FAISS
        indexes work with). Callers read ``.shape[1]`` from it, so it is
        expected to be a 2-D array of one row per text.
    """
    embeddings = model.encode(texts, batch_size=batch_size, show_progress_bar=False)
    return embeddings.astype('float32')

def _iter_record_batches(lines, batch_size):
    """Yield lists of at most ``batch_size`` parsed records from JSONL ``lines``.

    Blank lines are skipped, and records whose ``section`` is ``'id'`` are
    dropped. The last list may be shorter than ``batch_size``; an empty list is
    never yielded.
    """
    batch = []
    for line in lines:
        if not line.strip():
            # Tolerate blank/trailing lines instead of failing in json.loads.
            continue
        item = json.loads(line)
        if item['section'] == 'id':
            continue
        batch.append(item)
        if len(batch) >= batch_size:
            yield batch
            batch = []
    if batch:
        # Leftovers that did not fill a complete batch.
        yield batch


# Start from an empty index so that re-running this cell rebuilds it instead of
# appending every vector (and its metadata row) a second time.
index = None
metadata = []

# The encoding is explicit because the corpus contains non-ASCII text and the
# platform default is not guaranteed to be UTF-8.
with open(INPUT_FILE, "r", encoding="utf-8") as fin:
    progress = tqdm(fin, desc="Processing JSONL")
    for batch_meta in _iter_record_batches(progress, BATCH_SIZE):
        batch_texts = [record['text'] for record in batch_meta]
        embs = embed_texts(batch_texts)
        if index is None:
            # Created lazily: the vector width is only known once the first
            # batch has been embedded.
            dim = embs.shape[1]
            index = faiss.IndexFlatL2(dim)  # IndexFlatIP for cosine
        index.add(embs)
        # Appended in the same order as the vectors, so metadata[i] describes
        # the i-th vector in the index.
        metadata.extend(batch_meta)

Processing JSONL: 1497it [00:07, 191.78it/s]


In [4]:
# Persist the index and the metadata rows that describe its vectors.
# Search results are mapped back to records by position, so refuse to write
# files that are missing or out of step with each other.
if index is None:
    raise RuntimeError(
        "No vectors were indexed; run the indexing cell on a non-empty input first."
    )
assert index.ntotal == len(metadata), (
    f"index holds {index.ntotal} vectors but there are {len(metadata)} metadata rows"
)

faiss.write_index(index, FAISS_INDEX_PATH)
with open(METADATA_PATH, "w", encoding="utf-8") as fout:
    # One JSON object per line, in the same order as the vectors in the index.
    fout.writelines(json.dumps(m) + "\n" for m in metadata)

## Experiment

In [7]:
# Number of nearest neighbours to retrieve for the sample query.
TOP_K = 5

# Reload the artefacts from disk, using the same paths the save cell wrote to,
# so this experiment exercises exactly what was persisted.
index = faiss.read_index(FAISS_INDEX_PATH)
with open(METADATA_PATH, encoding="utf-8") as f:
    meta = [json.loads(line) for line in f if line.strip()]

# Results are mapped to metadata by position, so both must have the same length.
assert index.ntotal == len(meta), (
    f"index holds {index.ntotal} vectors but {len(meta)} metadata rows were loaded"
)

query = "What are the comparative outcomes of Tenon duplication versus dura mater covering techniques in Ahmed glaucoma valve implantation?"
query_emb = model.encode([query]).astype('float32')
# D holds the distances and I the row indices of the nearest neighbours
# (for an L2 index, smaller distance means closer).
D, I = index.search(query_emb, k=TOP_K)
print("Nearest neighbors:")
print("Distances:", D[0])
print("Indices:", I[0])
# Print the metadata for the nearest neighbors
print("Metadata for nearest neighbors:")
for idx in I[0]:
    if idx < 0:
        # FAISS pads the result with -1 when fewer than k vectors exist;
        # indexing meta with -1 would silently print the last record.
        continue
    print(meta[idx]['section'], meta[idx]['text'])



Nearest neighbors:
Distances: [0.12918356 0.4176816  0.68611026 0.7343014  1.0372672 ]
Indices: [620 621 626 625 623]
Metadata for nearest neighbors:
title Comparison of Tenon duplication with dura mater covering technique for Ahmed glaucoma valve implantation
abstract To compare the efficacy and complications of Tenon duplication with dura mater covering technique for Ahmed glaucoma valve (AGV) implantation. This retrospective study included 44 refractory glaucoma patients (44 eyes) who underwent AGV implantation from 2017 to 2020 in the Ophthalmology Clinic of Eskişehir Osmangazi University Hospital and attended regular postoperative follow-ups. The patients were divided based on whether they underwent Tenon duplication technique (group 1: n = 20) or dura mater covering technique (group 2: n = 24) during surgery. The patients’ age, gender, systemic diseases, glaucoma type, pre-op intraocular pressure (IOP), and ocular surgeries were recorded. The groups were compared for IOP level co