In [None]:
import json
import re
from pathlib import Path
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
from dotenv import load_dotenv
import os

In [None]:
# Pull variables from a local .env file into the process environment, then
# read the Pinecone credential from there (never hard-code it here).
load_dotenv()
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

# Input JSON for the department and the local dump of everything indexed.
INPUT_FILE = r"/home/logan78/rag_one_iitp/departmental/EEE.json"
OUTPUT_FILE = "EE_department_indexed2.json"
BATCH_SIZE = 100  # number of records sent per index.upsert call

# Client and handle for the target Pinecone index.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("rag-iitpatna-test2")


In [None]:
# Embedding model shared by get_embedding(); loaded once because it is reused
# for every entry.
# NOTE(review): the E5 model family is documented to expect "query: " /
# "passage: " prefixes on its inputs, while get_embedding() passes raw text —
# confirm whether that is intended for this index.
model = SentenceTransformer("intfloat/e5-large-v2")

In [None]:
def entry_to_text(entry: dict, department: str, entry_type: str) -> str:
    """Flatten one JSON entry into a comma-separated ``key: value`` string.

    The result always starts with ``Department: ...`` and ``Category: ...``,
    followed by every field of ``entry`` (in dict order) whose value is truthy
    and is not the ``"-"`` placeholder.

    Args:
        entry: Mapping of field name to value for a single record.
        department: Department name placed in the leading segment.
        entry_type: Category label placed in the second segment.

    Returns:
        The segments joined with ``", "``; used both as the embedding input
        and as the stored text.
    """
    header = [f"Department: {department}", f"Category: {entry_type}"]
    fields = [
        f"{key}: {value}"
        for key, value in entry.items()
        if value and value != "-"
    ]
    return ", ".join(header + fields)

def make_id(prefix: str, name: str) -> str:
    """Build a stable vector ID of the form ``EE-<prefix>-<slug>``.

    The slug is ``name`` with every run of characters outside ``[a-zA-Z0-9]``
    collapsed to a single ``-``, so the ID contains only ASCII letters,
    digits and hyphens.

    Args:
        prefix: Category segment of the ID (e.g. the entry type).
        name: Human-readable name of the entry. ``None`` is treated as
            ``"unknown"`` and any other non-string value is converted with
            ``str()`` instead of raising ``TypeError`` inside ``re.sub``.

    Returns:
        The ID string. Two different names that differ only in punctuation
        or whitespace map to the same ID.
    """
    # A JSON null (or a numeric name) would otherwise crash re.sub.
    raw_name = "unknown" if name is None else str(name)
    slug = re.sub(r'[^a-zA-Z0-9]+', '-', raw_name)
    return f"EE-{prefix}-{slug}"

def get_embedding(text: str):
    """Encode ``text`` with the module-level ``model`` and return a plain list.

    The encoder is called with ``normalize_embeddings=True``; its result is
    converted with ``.tolist()`` so the vector can be JSON-serialized and
    sent in an upsert payload.
    """
    vector = model.encode(text, normalize_embeddings=True)
    return vector.tolist()

# Parse the department JSON; later cells read entries from `data`.
data = json.loads(Path(INPUT_FILE).read_text(encoding="utf-8"))

# Department label attached to every record; "Unknown" when the file omits it.
department_name = data.get("department_name", "Unknown")

# Local mirror of every record sent to the index, grouped by entry type.
local_storage = {kind: [] for kind in ("faculty", "staff", "labs")}

def process_entries(entries, entry_type):
    """Embed every entry of one category, upsert to Pinecone, and mirror locally.

    NOTE: a later cell defines ``process_entries`` again; when cells run top
    to bottom, that later definition replaces this one.

    Args:
        entries: Iterable of dict entries from the department JSON. ``None``
            is treated as an empty list.
        entry_type: Category label. It is used in the record ID, in the
            metadata, and as the key into ``local_storage`` (created on
            demand if missing).

    Side effects:
        Calls ``index.upsert`` once per full batch of ``BATCH_SIZE`` records,
        plus once for any remainder, and appends every record to
        ``local_storage[entry_type]``.
    """
    batch = []
    batch_count = 1  # 1-based batch number recorded in each record's metadata
    stored = local_storage.setdefault(entry_type, [])

    for position, item in enumerate(entries or [], start=1):
        text = entry_to_text(item, department_name, entry_type)
        embedding = get_embedding(text)

        # Prefer "Name", then "Laboratory & Offices". Entries with neither (or
        # with an empty/null value) get a position-based fallback so they do
        # not all share one ID and overwrite each other in the index.
        raw_name = (
            item.get("Name")
            or item.get("Laboratory & Offices")
            or f"unknown-{position}"
        )
        doc_id = make_id(entry_type, str(raw_name))

        # Pinecone does not accept null metadata values; omit those keys.
        extra_fields = {key: value for key, value in item.items() if value is not None}

        record = {
            "id": doc_id,
            "values": embedding,
            "metadata": {
                "text": text,
                "department": department_name,
                "type": entry_type,
                "batch": f"{entry_type}_{batch_count}",
                **extra_fields
            }
        }

        batch.append(record)
        stored.append(record)

        # Flush as soon as a batch is full to keep each request bounded.
        if len(batch) >= BATCH_SIZE:
            index.upsert(batch)
            batch = []
            batch_count += 1

    # Send whatever is left over after the last full batch.
    if batch:
        index.upsert(batch)

In [None]:
# Re-read INPUT_FILE and start with an empty local mirror, so the cells below
# can be re-run without carrying over records from a previous run.
with Path(INPUT_FILE).open("r", encoding="utf-8") as source:
    data = json.load(source)


local_storage = dict(faculty=[], staff=[], labs=[])

In [None]:
def process_entries(entries, entry_type):
    """Embed every entry of one category, upsert to Pinecone, and mirror locally.

    Args:
        entries: Iterable of dict entries from the department JSON. ``None``
            is treated as an empty list.
        entry_type: Category label. It is used in the record ID, in the
            metadata, and as the key into ``local_storage`` (created on
            demand if missing).

    Side effects:
        Calls ``index.upsert`` once per full batch of ``BATCH_SIZE`` records,
        plus once for any remainder, and appends every record to
        ``local_storage[entry_type]``.
    """
    batch = []
    batch_count = 1  # 1-based batch number recorded in each record's metadata
    stored = local_storage.setdefault(entry_type, [])

    for position, item in enumerate(entries or [], start=1):
        text = entry_to_text(item, department_name, entry_type)
        embedding = get_embedding(text)

        # Prefer "Name", then "Laboratory & Offices". Entries with neither (or
        # with an empty/null value) get a position-based fallback so they do
        # not all share one ID and overwrite each other in the index.
        raw_name = (
            item.get("Name")
            or item.get("Laboratory & Offices")
            or f"unknown-{position}"
        )
        doc_id = make_id(entry_type, str(raw_name))

        # Pinecone does not accept null metadata values; omit those keys.
        extra_fields = {key: value for key, value in item.items() if value is not None}

        record = {
            "id": doc_id,
            "values": embedding,
            "metadata": {
                "text": text,
                "department": department_name,
                "type": entry_type,
                "batch": f"{entry_type}_{batch_count}",
                **extra_fields
            }
        }

        batch.append(record)
        stored.append(record)

        # Flush as soon as a batch is full to keep each request bounded.
        if len(batch) >= BATCH_SIZE:
            index.upsert(batch)
            batch = []
            batch_count += 1

    # Send whatever is left over after the last full batch.
    if batch:
        index.upsert(batch)


In [None]:
# Empty the buckets in place first: process_entries appends to local_storage,
# so re-running this cell would otherwise write every record to the output
# file twice.
for bucket in local_storage.values():
    bucket.clear()

# Index each category found in the input file; a missing key means no entries.
for entry_type in ("faculty", "staff", "labs"):
    process_entries(data.get(entry_type, []), entry_type)

# Keep a local JSON copy of exactly what was sent to the index.
Path(OUTPUT_FILE).write_text(json.dumps(local_storage, indent=2), encoding="utf-8")
print(f" Uploaded {OUTPUT_FILE}")