# Retrieval evaluation: adding more PubMed data to Pinecone (250k)

In [1]:
! pip install langchain_pinecone
!pip install pinecone sentence-transformers
! pip install langchain_community
! pip install bertopic
! pip install plotly


Collecting langchain_pinecone
  Downloading langchain_pinecone-0.2.6-py3-none-any.whl.metadata (5.3 kB)
Collecting pinecone<7.0.0,>=6.0.0 (from pinecone[async]<7.0.0,>=6.0.0->langchain_pinecone)
  Downloading pinecone-6.0.2-py3-none-any.whl.metadata (9.0 kB)
Collecting aiohttp<3.11,>=3.10 (from langchain_pinecone)
  Downloading aiohttp-3.10.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting langchain-tests<1.0.0,>=0.3.7 (from langchain_pinecone)
  Downloading langchain_tests-0.3.19-py3-none-any.whl.metadata (3.2 kB)
Collecting pytest-asyncio<1,>=0.20 (from langchain-tests<1.0.0,>=0.3.7->langchain_pinecone)
  Downloading pytest_asyncio-0.26.0-py3-none-any.whl.metadata (4.0 kB)
Collecting syrupy<5,>=4 (from langchain-tests<1.0.0,>=0.3.7->langchain_pinecone)
  Downloading syrupy-4.9.1-py3-none-any.whl.metadata (38 kB)
Collecting pytest-socket<1,>=0.6.0 (from langchain-tests<1.0.0,>=0.3.7->langchain_pinecone)
  Downloading pytest_socket-0.7.0-py3-non

In [2]:
import sys
!{sys.executable} -m pip install --upgrade bertopic




In [3]:
from datasets import Dataset, DatasetDict
import pprint
import json
import logging
logging.basicConfig(level=logging.DEBUG,  # Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
                    format='%(asctime)s - %(levelname)s - %(message)s')  # Define log message format
logger = logging.getLogger(__name__)

import torch
import ast
from collections import Counter
import re
from sentence_transformers import SentenceTransformer

from tqdm import tqdm


from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# loading 250 k random preprocessed sample from previous upsert: topic modeling and keyword extraction

In [70]:
import json

with open("/content/drive/MyDrive/NLP/04_RAG/TopicModeling/PubMed_raw/pubmed_sample_prepro.json", "r") as f:
    pubmed_data = json.load(f)

print(f"Loaded {len(pubmed_data)} records")



Loaded 250000 records


In [13]:
all_cleaned = [item["clean_content"] for item in pubmed_data]

In [10]:
from bertopic import BERTopic

save_path = "/content/drive/MyDrive/NLP/04_RAG/TopicModeling/NEW_TM/bertopic_model"
topic_model = BERTopic.load(save_path)

print("Model loaded successfully.")


Sat May 31 07:02:18 2025 Building and compiling search function
Model loaded successfully.


In [15]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("Device:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))


CUDA available: True
Device: cuda


In [19]:
from tqdm import tqdm
import numpy as np

def batched_transform_with_progress(topic_model, texts, batch_size=256, desc="Embedding"):
    """
    Embed texts with progress bar and apply BERTopic.transform() using precomputed embeddings.
    """
    embedder = topic_model.embedding_model
    all_embs = []

    for i in tqdm(range(0, len(texts), batch_size), desc=desc):
        batch = texts[i : i + batch_size]

        if hasattr(embedder, "model") and hasattr(embedder.model, "encode"):
            embs = embedder.model.encode(batch, show_progress_bar=False)
        else:

            embs = embedder.embed(batch)

        all_embs.append(embs)

    all_embs = np.vstack(all_embs)
    full_topics, full_probs = topic_model.transform(texts, all_embs)
    return full_topics, full_probs


In [20]:
full_topics, full_probs = batched_transform_with_progress(
    topic_model,
    all_cleaned,
    batch_size=256,
    desc="Embedding + Transform 250k"
)

Embedding + Transform 250k: 100%|██████████| 977/977 [12:17<00:00,  1.33it/s]


Epochs completed:   0%|            0/30 [00:00]

	completed  0  /  30 epochs
	completed  3  /  30 epochs
	completed  6  /  30 epochs
	completed  9  /  30 epochs
	completed  12  /  30 epochs
	completed  15  /  30 epochs
	completed  18  /  30 epochs
	completed  21  /  30 epochs
	completed  24  /  30 epochs
	completed  27  /  30 epochs


In [21]:
print(len(all_cleaned))


250000


### create "rag_seed_250k_enriched_1024d.jsonl"

In [22]:
all_enriched = []
for idx, record in enumerate(pubmed_data):
    topic_id = int(full_topics[idx])
    confidence = float(full_probs[idx][topic_id])
    all_enriched.append({
        "id": record["id"],
        "topic_id": topic_id,
        "confidence": confidence,
        "clean_content": record["clean_content"]
    })


### Extracting key phrases + embed

In [24]:
! pip install keybert

Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Downloading keybert-0.9.0-py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keybert
Successfully installed keybert-0.9.0


In [27]:
! pip install joblib



In [38]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer, models
from tqdm import tqdm
import json
import os

# ------------------------
# Config
# ------------------------
BATCH_SIZE = 10_000
SAVE_PATH = "/content/drive/MyDrive/NLP/04_RAG/TopicModeling/NEW_TM/rag_keyphrases_250k_embedded_gpu1.jsonl"
TEST_MODE = True

# ------------------------
# Load models
# ------------------------
model_name = "emilyalsentzer/Bio_ClinicalBERT"
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False
)
embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
kw_model = KeyBERT(model=embedder)

bge = SentenceTransformer("BAAI/bge-large-en-v1.5")  # uses GPU by default

# ------------------------
# Load records
# ------------------------
records = all_enriched[:50000] if TEST_MODE else all_enriched
total = len(records)
print(f"Processing {total} records...")

# ------------------------
# Prepare save location
# ------------------------
os.makedirs(os.path.dirname(SAVE_PATH), exist_ok=True)

# ------------------------
# Process in batches
# ------------------------
for i in range(0, total, BATCH_SIZE):
    batch = records[i:i + BATCH_SIZE]

    processed = []
    keyphrase_texts = []

    for rec in tqdm(batch, desc=f"Extracting keyphrases {i}-{i+len(batch)}"):
        try:
            text = rec["clean_content"]
            kws = kw_model.extract_keywords(
                text,
                keyphrase_ngram_range=(1, 2),
                stop_words="english",
                top_n=5,
                use_mmr=True,
                diversity=0.7
            )
            keyphrases = [kw for kw, _ in kws]
        except Exception:
            keyphrases = []

        joined = "; ".join(keyphrases)
        keyphrase_texts.append(joined)

        processed.append({
            "id": rec["id"],
            "keyphrases": keyphrases
        })

    embeddings = bge.encode(keyphrase_texts, batch_size=64, show_progress_bar=True)

    for rec, emb in zip(processed, embeddings):
        rec["kp_embedding"] = emb.tolist()

    # Save to disk
    mode = "a" if os.path.exists(SAVE_PATH) else "w"
    with open(SAVE_PATH, mode) as f:
        for rec in processed:
            f.write(json.dumps(rec) + "\n")

    print(f"Saved batch {i}-{i+len(batch)} to {SAVE_PATH}")


Processing 50000 records...


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Extracting keyphrases 0-10000:  86%|████████▌ | 8551/10000 [10:52<01:38, 14.70it/s][A[A[A[A[A[A[A






Extracting keyphrases 0-10000:  86%|████████▌ | 8553/10000 [10:52<01:44, 13.82it/s][A[A[A[A[A[A[A






Extracting keyphrases 0-10000:  86%|████████▌ | 8555/10000 [10:52<01:50, 13.07it/s][A[A[A[A[A[A[A






Extracting keyphrases 0-10000:  86%|████████▌ | 8557/10000 [10:53<01:49, 13.23it/s][A[A[A[A[A[A[A






Extracting keyphrases 0-10000:  86%|████████▌ | 8559/10000 [10:53<01:43, 13.89it/s][A[A[A[A[A[A[A






Extracting keyphrases 0-10000:  86%|████████▌ | 8561/10000 [10:53<01:35, 15.13it/s][A[A[A[A[A[A[A






Extracting keyphrases 0-10000:  86%|████████▌ | 8563/10000 [10:53<01:35, 14.99it/s][A[A[A[A[A[A[A






Extracting keyphrases 0-10000:  86%|████████▌ | 8565/10000 [10:53<01:41, 14.11it/s][A[A[A[A[A[A[A






Extracting keyphrases 0-10000:  86%|████████▌ 

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Saved batch 0-10000 to /content/drive/MyDrive/NLP/04_RAG/TopicModeling/NEW_TM/rag_keyphrases_250k_embedded_gpu1.jsonl


[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
Extracting keyphrases 10000-20000:  86%|████████▌ | 8551/10000 [10:49<01:43, 13.96it/s][A[A[A[A[A[A[A






Extracting keyphrases 10000-20000:  86%|████████▌ | 8553/10000 [10:49<01:39, 14.56it/s][A[A[A[A[A[A[A






Extracting keyphrases 10000-20000:  86%|████████▌ | 8555/10000 [10:49<01:55, 12.47it/s][A[A[A[A[A[A[A






Extracting keyphrases 10000-20000:  86%|████████▌ | 8557/10000 [10:50<01:57, 12.24it/s][A[A[A[A[A[A[A






Extracting keyphrases 10000-20000:  86%|████████▌ | 8559/10000 [10:50<01:59, 12.06it/s][A[A[A[A[A[A[A






Extracting keyphrases 10000-20000:  86%|████████▌ | 8561/10000 [10:50<02:11, 10.96it/s][A[A[A[A[A[A[A






Extracting keyphrases 10000-20000:  86%|████████▌ | 8563/10000 [10:50<02:15, 10.63it/s][A[A[A[A[A[A[A






Extracting keyphrases 10000-20000:  86%|████████▌ | 8565/10000 [10:50<02:06, 11.35it/s][A[A[A[A[A[A[A







Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Saved batch 10000-20000 to /content/drive/MyDrive/NLP/04_RAG/TopicModeling/NEW_TM/rag_keyphrases_250k_embedded_gpu1.jsonl


[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
Extracting keyphrases 20000-30000:  86%|████████▌ | 8557/10000 [10:56<01:37, 14.79it/s][A[A[A[A[A[A[A






Extracting keyphrases 20000-30000:  86%|████████▌ | 8559/10000 [10:56<01:36, 15.00it/s][A[A[A[A[A[A[A






Extracting keyphrases 20000-30000:  86%|████████▌ | 8561/10000 [10:57<01:42, 14.00it/s][A[A[A[A[A[A[A






Extracting keyphrases 20000-30000:  86%|████████▌ | 8563/10000 [10:57<01:43, 13.93it/s][A[A[A[A[A[A[A






Extracting keyphrases 20000-30000:  86%|████████▌ | 8565/10000 [10:57<01:45, 13.62it/s][A[A[A[A[A[A[A






Extracting keyphrases 20000-30000:  86%|████████▌ | 8567/10000 [10:57<01:48, 13.22it/s][A[A[A[A[A[A[A






Extracting keyphrases 20000-30000:  86%|████████▌ | 8569/10000 [10:57<01:56, 12.29it/s][A[A[A[A[A[A[A






Extracting keyphrases 20000-30000:  86%|████████▌ | 8571/10000 [10:57<01:57, 12.16it/s][A[A[A[A[A[A[A







Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Saved batch 20000-30000 to /content/drive/MyDrive/NLP/04_RAG/TopicModeling/NEW_TM/rag_keyphrases_250k_embedded_gpu1.jsonl


[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
Extracting keyphrases 30000-40000:  85%|████████▌ | 8549/10000 [10:53<01:44, 13.87it/s][A[A[A[A[A[A[A






Extracting keyphrases 30000-40000:  86%|████████▌ | 8551/10000 [10:54<01:58, 12.18it/s][A[A[A[A[A[A[A






Extracting keyphrases 30000-40000:  86%|████████▌ | 8553/10000 [10:54<01:46, 13.58it/s][A[A[A[A[A[A[A






Extracting keyphrases 30000-40000:  86%|████████▌ | 8555/10000 [10:54<01:56, 12.44it/s][A[A[A[A[A[A[A






Extracting keyphrases 30000-40000:  86%|████████▌ | 8557/10000 [10:54<02:02, 11.80it/s][A[A[A[A[A[A[A






Extracting keyphrases 30000-40000:  86%|████████▌ | 8559/10000 [10:54<01:48, 13.26it/s][A[A[A[A[A[A[A






Extracting keyphrases 30000-40000:  86%|████████▌ | 8561/10000 [10:54<01:39, 14.43it/s][A[A[A[A[A[A[A






Extracting keyphrases 30000-40000:  86%|████████▌ | 8563/10000 [10:55<01:46, 13.47it/s][A[A[A[A[A[A[A







Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Saved batch 30000-40000 to /content/drive/MyDrive/NLP/04_RAG/TopicModeling/NEW_TM/rag_keyphrases_250k_embedded_gpu1.jsonl


[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
Extracting keyphrases 40000-50000:  86%|████████▌ | 8550/10000 [10:56<02:20, 10.30it/s][A[A[A[A[A[A[A






Extracting keyphrases 40000-50000:  86%|████████▌ | 8552/10000 [10:57<02:01, 11.87it/s][A[A[A[A[A[A[A






Extracting keyphrases 40000-50000:  86%|████████▌ | 8554/10000 [10:57<01:53, 12.78it/s][A[A[A[A[A[A[A






Extracting keyphrases 40000-50000:  86%|████████▌ | 8556/10000 [10:57<01:46, 13.62it/s][A[A[A[A[A[A[A






Extracting keyphrases 40000-50000:  86%|████████▌ | 8558/10000 [10:57<02:00, 11.94it/s][A[A[A[A[A[A[A






Extracting keyphrases 40000-50000:  86%|████████▌ | 8560/10000 [10:57<01:57, 12.27it/s][A[A[A[A[A[A[A






Extracting keyphrases 40000-50000:  86%|████████▌ | 8562/10000 [10:57<01:57, 12.20it/s][A[A[A[A[A[A[A






Extracting keyphrases 40000-50000:  86%|████████▌ | 8564/10000 [10:57<01:52, 12.71it/s][A[A[A[A[A[A[A







Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Saved batch 40000-50000 to /content/drive/MyDrive/NLP/04_RAG/TopicModeling/NEW_TM/rag_keyphrases_250k_embedded_gpu1.jsonl


In [64]:
import json


paths = [
    "/content/drive/MyDrive/NLP/04_RAG/TopicModeling/NEW_TM/rag_keyphrases_250k_embedded_gpu.jsonl",
    "/content/drive/MyDrive/NLP/04_RAG/TopicModeling/NEW_TM/rag_keyphrases_250k_embedded_gpu1.jsonl"
]

merged_output_path = "/content/drive/MyDrive/NLP/04_RAG/TopicModeling/NEW_TM/rag_keyphrases_merged.jsonl"

unique_ids = set()
merged_records = []

for path in paths:
    with open(path, "r") as f:
        for line in f:
            try:
                record = json.loads(line)
                record_id = record["id"]
                if record_id not in unique_ids:
                    unique_ids.add(record_id)
                    merged_records.append(record)
            except json.JSONDecodeError:
                continue

with open(merged_output_path, "w") as out_f:
    for rec in merged_records:
        out_f.write(json.dumps(rec) + "\n")

print(f"Merged {len(merged_records)} unique records into: {merged_output_path}")


Merged 100000 unique records into: /content/drive/MyDrive/NLP/04_RAG/TopicModeling/NEW_TM/rag_keyphrases_merged.jsonl


In [62]:
import pprint
pprint.pprint(record)

{'id': 'pubmed_988289',
 'keyphrases': ['coronary artery',
                'synergistically interact',
                'anesthetized dog',
                'blood',
                'value'],
 'kp_embedding': [0.05218308046460152,
                  0.03859206289052963,
                  0.05248124524950981,
                  0.050236135721206665,
                  -0.06885047256946564,
                  -0.03991742804646492,
                  -0.016939789056777954,
                  0.013324574567377567,
                  0.001447366434149444,
                  0.03330853208899498,
                  0.03547537326812744,
                  0.0003296468057669699,
                  -0.015274152159690857,
                  0.002457447350025177,
                  -0.004849624820053577,
                  0.050781745463609695,
                  -0.018914954736828804,
                  -0.05342079699039459,
                  -0.046072423458099365,
                  0.044004276394844055,
         

# add additional PubMed data to pinecone

In [47]:
!git clone https://github.com/LorenaRaichle/ClinIQ.git

Cloning into 'ClinIQ'...
remote: Enumerating objects: 36203, done.[K
remote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 36203 (delta 0), reused 0 (delta 0), pack-reused 36189 (from 1)[K
Receiving objects: 100% (36203/36203), 175.27 MiB | 33.22 MiB/s, done.
Resolving deltas: 100% (12962/12962), done.
Updating files: 100% (128/128), done.


In [51]:
from google.colab import userdata
from pinecone import Pinecone, ServerlessSpec

pinecone_key = userdata.get("PINECONE")

pc.create_index(
    name="balanced-index",
    dimension=1024,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)


{
    "name": "balanced-index",
    "metric": "cosine",
    "host": "balanced-index-sycn1y2.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 1024,
    "deletion_protection": "disabled",
    "tags": null
}

In [53]:
%cd /content/ClinIQ
!git pull origin main

/content/ClinIQ
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 20 (delta 12), reused 20 (delta 12), pack-reused 0 (from 0)[K
Unpacking objects: 100% (20/20), 338.15 KiB | 2.84 MiB/s, done.
From https://github.com/LorenaRaichle/ClinIQ
 * branch              main       -> FETCH_HEAD
   925f866c..87da9c1f  main       -> origin/main
Updating 925f866c..87da9c1f
Fast-forward
 .gitignore                                        |     3 [32m+[m
 2c_TopicModeling_PubMed.ipynb                     |    38 [32m+[m[31m-[m
 3a_Training_7b_LoRA_v2_balanced_data.ipynb        | 28330 [32m++++++++++[m[31m----------[m
 empty_notebooks/5_Demo.ipynb => 5_Demo.ipynb      |     0
 config.py                                         |     5 [32m+[m[31m-[m
 content/Topic_modeling/InteractivePlot_Topics.zip |   Bin [31m0[m -> [32m190536[m bytes
 content/Topic_modeling/doc_per_cluster.png   

### check empty new index

In [59]:
index = pc.Index("balanced-index")
stats = index.describe_index_stats()
print(stats)


{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [65]:
index_name = "balanced-index"

vectors_to_upsert = []
counter = 0
index = pc.Index(index_name)
for rec in tqdm(merged_records, desc="Preparing upsert"):

    pine_id = str(counter)
    counter += 1

    vector = rec["kp_embedding"]

    metadata = {
        "id":     rec["id"],
        "page_content": rec["id"]
    }

    vectors_to_upsert.append((pine_id, vector, metadata))

for i in tqdm(range(0, len(vectors_to_upsert), 100), desc="Upserting"):
    batch = vectors_to_upsert[i : i + 100]
    index.upsert(vectors=batch)

print(f"Upserted {len(vectors_to_upsert)} vectors.")


[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
Upserting:  29%|██▊       | 286/1000 [03:08<08:19,  1.43it/s][A[A[A[A[A[A[A






Upserting:  29%|██▊       | 287/1000 [03:09<08:20,  1.43it/s][A[A[A[A[A[A[A






Upserting:  29%|██▉       | 288/1000 [03:09<08:26,  1.40it/s][A[A[A[A[A[A[A






Upserting:  29%|██▉       | 289/1000 [03:10<08:25,  1.41it/s][A[A[A[A[A[A[A






Upserting:  29%|██▉       | 290/1000 [03:11<08:14,  1.44it/s][A[A[A[A[A[A[A






Upserting:  29%|██▉       | 291/1000 [03:12<08:11,  1.44it/s][A[A[A[A[A[A[A






Upserting:  29%|██▉       | 292/1000 [03:12<07:56,  1.49it/s][A[A[A[A[A[A[A






Upserting:  29%|██▉       | 293/1000 [03:13<08:11,  1.44it/s][A[A[A[A[A[A[A






Upserting:  29%|██▉       | 294/1000 [03:14<08:11,  1.44it/s][A[A[A[A[A[A[A






Upserting:  30%|██▉       | 295/1000 [03:14<08:05,  1.45it/s][A[A[A[A[A[A[A






Upserting:  30%|██▉       | 2

Upserted 100000 vectors.





In [66]:
index.describe_index_stats()


{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 100000}},
 'total_vector_count': 100000,
 'vector_type': 'dense'}

### -> done inserting 100 k pubmed data to pinecone

# Adding 100 k train set (restructured), balanced for each question type

In [72]:
import json

with open("/content/drive/MyDrive/NLP/04_RAG/train_datatset_RAG.json", "r") as f:
    train_dataset = json.load(f)

print(f"Loaded {len(train_dataset)} records")


Loaded 4 records


In [69]:
from utils.RAG_metadata import extract_keywords_and_entities, extract_age_gender


In [73]:
def insert_question(data):
    vectors_to_upsert = []

    for qtype, questions in data.items():
        count = 0

        for item in tqdm(questions, desc=f"Vectorizing {qtype}", unit="q"):

            if count >= 20_000:
                break  #  Limit reached for this qtype

            question = item.get("question")
            source = item.get("source")
            qid = item.get("id")

            if not question:
                continue


            answer, reasoning = None, None

            if qtype == "multiple_choice":
                answer = item.get("correct_answer")
                if not answer:
                    continue

            elif qtype == "multi_hop":
                answer = item.get("answer")
                reasoning = "\n".join(item.get("reasoning", []))

            elif qtype == "true_false":
                answer = item.get("answer")
                if not answer or answer.strip().lower() != "true":
                    continue  # only inserting true answers
            else:
                answer = item.get("answer")


            content_parts = [question]
            if answer: content_parts.append(f"Answer: {answer}")
            if reasoning: content_parts.append(f"Reasoning: {reasoning}")
            page_content = "\n".join(content_parts)


            vector = embedding_model.encode(page_content)


            keywords, diseases, symptoms, procedures = extract_keywords_and_entities(question + " " + answer)
            age, gender = extract_age_gender(question + " " + answer)

            metadata = {
                "id": qid,
                "age": age or [],
                "gender": gender or [],
                "keywords": keywords or [],
                "diseases": diseases or [],
                "symptoms": symptoms or [],
                "procedures": procedures or [],
                "page_content": qid  # optional
            }

            vectors_to_upsert.append((str(qid), vector.tolist(), metadata))
            count += 1


    batch_size = 100
    for i in tqdm(range(0, len(vectors_to_upsert), batch_size), desc="Upserting"):
        batch = vectors_to_upsert[i:i + batch_size]
        index.upsert(vectors=batch)

    print(f"Upserted {len(vectors_to_upsert)} total vectors to Pinecone.")


In [75]:

from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("BAAI/bge-large-en-v1.5")
insert_question(train_dataset)


[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
Upserting:   6%|▌         | 44/758 [00:27<07:46,  1.53it/s][A[A[A[A[A[A[A






Upserting:   6%|▌         | 45/758 [00:28<07:38,  1.55it/s][A[A[A[A[A[A[A






Upserting:   6%|▌         | 46/758 [00:29<07:39,  1.55it/s][A[A[A[A[A[A[A






Upserting:   6%|▌         | 47/758 [00:29<07:39,  1.55it/s][A[A[A[A[A[A[A






Upserting:   6%|▋         | 48/758 [00:30<07:31,  1.57it/s][A[A[A[A[A[A[A






Upserting:   6%|▋         | 49/758 [00:31<07:32,  1.57it/s][A[A[A[A[A[A[A






Upserting:   7%|▋         | 50/758 [00:31<07:15,  1.62it/s][A[A[A[A[A[A[A






Upserting:   7%|▋         | 51/758 [00:32<07:17,  1.62it/s][A[A[A[A[A[A[A






Upserting:   7%|▋         | 52/758 [00:32<07:12,  1.63it/s][A[A[A[A[A[A[A






Upserting:   7%|▋         | 53/758 [00:33<07:08,  1.65it/s][A[A[A[A[A[A[A






Upserting:   7%|▋         | 54/758 [00:33<07:07, 

Upserted 75763 total vectors to Pinecone.





In [76]:
index.describe_index_stats()


{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 175763}},
 'total_vector_count': 175763,
 'vector_type': 'dense'}