In [None]:
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Number of texts accumulated before they are embedded and added to the index.
BATCH_SIZE = 1024
# Model name handed to SentenceTransformer when the embedding model is loaded.
EMBED_MODEL = "all-MiniLM-L6-v2"
# JSONL input, read one record per line; the 'section' and 'text' fields are used.
INPUT_FILE = "chunked_output_paragraph_based.jsonl"
# Where the built FAISS index is written.
FAISS_INDEX_PATH = "faiss_index.index"
# Where the indexed records are written, one JSON object per line.
METADATA_PATH = "faiss_metadata.jsonl"

In [4]:
# Load the sentence-embedding model named by EMBED_MODEL.
model = SentenceTransformer(EMBED_MODEL)
# FAISS index; stays None until the first embedded batch determines the vector dimension.
index = None
# Source records, extended in the same order as their vectors are added to the index.
metadata = []

In [3]:
def embed_texts(texts, batch_size=32):
    """Encode a list of texts with the notebook-level ``model``.

    Args:
        texts: The strings to embed.
        batch_size: Number of texts passed to the model per encode step.
            Defaults to 32, the value that was previously hard-coded.

    Returns:
        A C-contiguous ``float32`` NumPy array of embeddings, ready to be
        handed to a FAISS index (callers read the dimension from ``shape[1]``).
    """
    embeddings = model.encode(texts, batch_size=batch_size, show_progress_bar=False)
    # FAISS only accepts contiguous float32 input, so enforce both explicitly.
    return np.ascontiguousarray(embeddings, dtype=np.float32)

def _add_batch(index, metadata, texts, metas):
    """Embed ``texts``, add them to ``index`` and record ``metas``; return the index.

    The index is created on first use, once the embedding dimension is known.
    """
    embs = embed_texts(texts)
    if index is None:
        index = faiss.IndexFlatL2(embs.shape[1])  # IndexFlatIP for cosine
    index.add(embs)
    metadata.extend(metas)
    return index


# Start from a clean slate so re-running this cell does not append the same
# vectors and records a second time.
index = None
metadata = []

# JSONL is UTF-8 by convention; do not rely on the platform default encoding.
with open(INPUT_FILE, "r", encoding="utf-8") as fin:
    batch_texts, batch_meta = [], []
    for line in tqdm(fin, desc="Processing JSONL"):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        item = json.loads(line)
        if item['section'] == 'id':
            # Records whose section is 'id' are not indexed.
            continue
        batch_texts.append(item['text'])
        batch_meta.append(item)
        if len(batch_texts) >= BATCH_SIZE:
            index = _add_batch(index, metadata, batch_texts, batch_meta)
            batch_texts, batch_meta = [], []
    # Flush whatever is left after the last full batch.
    if batch_texts:
        index = _add_batch(index, metadata, batch_texts, batch_meta)

# `dim` used to be set as a notebook-level name while building the index;
# keep it defined in case a later cell reads it.
if index is not None:
    dim = index.d

Processing JSONL: 67it [00:00, 66988.88it/s]


In [None]:
# Fail early with a clear message instead of an opaque FAISS error when the
# indexing cell has not produced an index (no records, or the state was reset).
if index is None:
    raise ValueError("No index to save: run the indexing cell first.")
# Search results are mapped back to records by position, so both artifacts
# must describe exactly the same number of entries.
if index.ntotal != len(metadata):
    raise ValueError(
        f"Index holds {index.ntotal} vectors but metadata has {len(metadata)} records."
    )

faiss.write_index(index, FAISS_INDEX_PATH)
with open(METADATA_PATH, "w", encoding="utf-8") as fout:
    # One JSON object per line, in the same order as the vectors in the index.
    for m in metadata:
        fout.write(json.dumps(m) + "\n")

## Experiment

In [5]:
# Reload the persisted artifacts through the config constants so this cell
# always reads the files the save cell wrote.
index = faiss.read_index(FAISS_INDEX_PATH)
with open(METADATA_PATH, encoding="utf-8") as f:
    meta = [json.loads(line) for line in f if line.strip()]

query = "Explain the function of CD4 T cells."
query_emb = model.encode([query]).astype('float32')
# D holds the distances, I the positions of the nearest vectors for each query row.
D, I = index.search(query_emb, k=5)
for idx in I[0]:
    if idx < 0:
        # FAISS pads with -1 when fewer than k neighbours exist; meta[-1]
        # would otherwise silently print the last record as a "hit".
        continue
    print(meta[idx]['section'], meta[idx]['text'])



title CD4 Effector T Cell Subsets in the Response to Influenza
discussion It is important to focus on how to achieve this maximum differentiation, as the cell which can go to the site of infection are the ones that can participate in viral clearance. Also of interest is that even in this vigorous response, a broad spectrum of effectors which range over the gamut of properties are retained in spleen and LN and a fraction of each is retained after viral clearance. This multipotential population may have some advantage in a secondary response. The link between division and cytokine expression by T cells remains controversial. Cytokine expression by T cells involves epigenetic changes in chromatin structure, locus accessibility, and DNA methylation that occur during T cell differentiation. Production of high levels of IFN- and IL-4 production require S phase entry and are often well-correlated in vitro with multiple rounds of cell division. However, others have reported that T cell divisio