In [None]:
import zipfile
import os

# Zipped ClapNQ passage corpus and the directory it is unpacked into
# (both relative to the notebook's working directory).
zip_path = "../corpora/passage_level/clapnq.jsonl.zip"
extract_dir = "../corpora/passage_level/"

# If this file is already present the archive is not unpacked again.
target_file = os.path.join(extract_dir, "clapnq.jsonl")

if os.path.exists(target_file):
    print("clapnq.jsonl already exists, skipping unzip.")
else:
    print("Extracting clapnq.jsonl.zip...")
    # Unpack every member of the archive into extract_dir.
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(extract_dir)
    print("✓ Extracted!")


Extracting clapnq.jsonl.zip...
✓ Extracted!


In [None]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import json

#############################################
# 1. Connect to Milvus
#############################################
# Register the connection under the alias "default" for a Milvus server
# reachable at 127.0.0.1:19530.
milvus_endpoint = {"alias": "default", "host": "127.0.0.1", "port": "19530"}
connections.connect(**milvus_endpoint)

#############################################
# 2. Create Collection Schema
#############################################
from pymilvus import utility

collection_name = "clapnq_passages"

# Rebuild from scratch: if a collection with this name already exists it is
# dropped before the new one is created.
if utility.has_collection(collection_name):
    utility.drop_collection(collection_name)

# Three columns: passage id (primary key), dense vector, raw passage text.
id_field = FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=200)
embedding_field = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384)  # MiniLM output dimension
text_field = FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535)
fields = [id_field, embedding_field, text_field]

schema = CollectionSchema(fields, description="ClapNQ passage-level corpus")
collection = Collection(name=collection_name, schema=schema)
print("✓ Collection created")

#############################################
# 3. Load MiniLM Model
#############################################
# Sentence-embedding model used for both the passages and the queries.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

print("Loading MiniLM embeddings model...")
model = SentenceTransformer(model_name)
print("✓ Model loaded")

#############################################
# 4. Load ClapNQ Passages
#############################################
# Relative to the notebook's working directory: the same location the unzip
# cell extracts clapnq.jsonl to. A user-specific absolute path would only
# work on one machine.
path = "../corpora/passage_level/clapnq.jsonl"

passage_ids = []
passage_texts = []

print("Reading passages...")
# The file is JSON Lines: one JSON object per line, each carrying "_id" and
# "text". UTF-8 is requested explicitly so decoding does not depend on the
# platform's default encoding.
with open(path, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        obj = json.loads(line)
        passage_ids.append(obj["_id"])
        passage_texts.append(obj["text"])

print(f"✓ Loaded {len(passage_ids)} passages")

#############################################
# 5. Insert in Batches with Embeddings
#############################################
BATCH = 256  # passages embedded and inserted per round trip
print("Starting embedding + insertion...")

for i in tqdm(range(0, len(passage_texts), BATCH)):
    batch_ids = passage_ids[i:i+BATCH]
    batch_texts = passage_texts[i:i+BATCH]

    # Generate embeddings (MiniLM -> 384-dim)
    batch_embeddings = model.encode(batch_texts, convert_to_numpy=True).tolist()

    # Insert into Milvus; columns are passed in schema order
    # (id, embedding, text).
    collection.insert([batch_ids, batch_embeddings, batch_texts])

# Flush once after the last insert so the inserted rows are sealed and
# persisted before the index is built over them.
collection.flush()

print("✓ Finished inserting all passages")

#############################################
# 6. Create Index
#############################################
# IVF_FLAT index over the "embedding" field, compared with cosine similarity.
index_params = dict(
    index_type="IVF_FLAT",
    metric_type="COSINE",
    params=dict(nlist=2048),
)

print("Building vector index...")
collection.create_index(field_name="embedding", index_params=index_params)
print("✓ Index built")

#############################################
# 7. Load Collection for Searching
#############################################
# The collection has to be loaded before it can be searched.
collection.load()
print("✓ Collection loaded & ready for search!")


✓ Collection created
Loading MiniLM embeddings model...
✓ Model loaded
Reading passages...
✓ Loaded 183408 passages
Starting embedding + insertion...


100%|██████████| 717/717 [13:21<00:00,  1.12s/it]


✓ Finished inserting all passages
Building vector index...
✓ Index built
✓ Collection loaded & ready for search!


In [None]:
# Sanity check: encode the first PREVIEW_COUNT passages and print the leading
# PREVIEW_DIMS components of each vector together with its dimensionality.
PREVIEW_COUNT = 10
PREVIEW_DIMS = 10

emb = model.encode(passage_texts[:PREVIEW_COUNT])
for i, e in enumerate(emb):
    print(f"Embedding {i}: {e[:PREVIEW_DIMS]}... (dim={len(e)})")


Embedding 0: [-0.11855029 -0.02213986  0.01778213 -0.02428699 -0.00693649  0.00115193
 -0.0012625   0.04398824 -0.00718389  0.05651155]... (dim=384)
Embedding 1: [-0.02696555 -0.01277182  0.02977032 -0.08650455  0.06651282  0.03194625
 -0.03068677  0.01504944 -0.00052012 -0.0320821 ]... (dim=384)
Embedding 2: [-0.021848    0.07122426  0.00184692 -0.11089488  0.02228159  0.01925579
 -0.02854635 -0.02844507  0.01250403 -0.02986964]... (dim=384)
Embedding 3: [ 0.00701434  0.00355477 -0.01642065 -0.08840613  0.10574561  0.03929438
  0.01180612 -0.02926991 -0.04262488  0.03408423]... (dim=384)
Embedding 4: [-0.03925532 -0.00158098  0.02350306 -0.01048431  0.04540617  0.03902591
 -0.04975101  0.0625867  -0.0528319  -0.01387044]... (dim=384)
Embedding 5: [ 0.09981296  0.03689335 -0.0677207  -0.06775098  0.08487421  0.00608058
  0.02794629 -0.04413663  0.03554465  0.00873239]... (dim=384)
Embedding 6: [-0.06892251 -0.0256021   0.00053463 -0.0173679  -0.01588602  0.01997466
  0.03531457  0.0413

In [15]:
query = "What caused the fall of the French Directory?"

# Encode the query as a one-element batch and convert it to plain lists.
embedding = model.encode([query]).tolist()

# Same metric as the index; nprobe is the IVF search-time parameter.
search_param = {"metric_type": "COSINE", "params": {"nprobe": 10}}

results = collection.search(
    data=embedding,
    anns_field="embedding",
    param=search_param,
    limit=5,
    output_fields=["text"],
)

# Only one query vector was sent, so its hits are the first result entry.
top_hits = results[0]
for hit in top_hits:
    print(hit.id, hit.distance)
    print(hit.entity.get("text"))
    print("----")


837799097_6931-7548-0-617 0.627049446105957
French Revolution
After the Thermidorian Reaction , an executive council known as the Directory assumed control of the French state in 1795 . They suspended elections , repudiated debt - resulting in financial instability , persecuted the Catholic clergy , and made significant military conquests abroad . Dogged by charges of corruption , the Directory collapsed in a coup led by Napoleon Bonaparte in 1799 . Napoleon , who became the hero of the Revolution through his popular military campaigns , established the Consulate and later the First Empire , setting the stage for a wider array of global conflicts in the Napoleonic Wars .
----
837799097_88359-88956-0-597 0.6068961024284363
French Revolution
Although committed to Republicanism , the Directory distrusted democracy . Historians have seldom praised the Directory ; it was a government of self - interest rather than virtue , thus losing any claim on idealism . It never had a strong base of po

In [16]:
# Encoding the whole corpus is expensive, so reuse the matrix saved by a
# previous run when it is on disk and has one row per loaded passage.
# NOTE: the cache is keyed on row count only; delete the file after changing
# the model or the corpus contents.
import os
import numpy as np

EMBEDDINGS_CACHE = "clapnq_embeddings.npy"

embeddings = None
if os.path.exists(EMBEDDINGS_CACHE):
    cached_embeddings = np.load(EMBEDDINGS_CACHE)
    if cached_embeddings.shape[0] == len(passage_texts):
        embeddings = cached_embeddings

if embeddings is None:
    # Show a progress bar so a long run can be told apart from a hang.
    embeddings = model.encode(
        passage_texts,
        convert_to_numpy=True,
        show_progress_bar=True,
    )



KeyboardInterrupt: 

In [None]:
import numpy as np

# Persist the corpus embeddings to a .npy file in the working directory.
embeddings_file = "clapnq_embeddings.npy"
np.save(embeddings_file, embeddings)
