In [None]:
import pandas as pd
from openai import OpenAI
from qdrant_client import QdrantClient
from qdrant_client.http import models
import os
from tqdm import tqdm
from dotenv import load_dotenv  # Add this import

# Load environment variables from .env file
load_dotenv()

# Endpoint used when QDRANT_URL is not provided through the environment.
DEFAULT_QDRANT_URL = 'https://00c28ff2-8890-425a-84d0-6e06dcbcfc11.us-east4-0.gcp.cloud.qdrant.io'

# Get environment variables
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
# The endpoint can be overridden next to the API key so that both always refer
# to the same cluster; an unset or empty QDRANT_URL falls back to the default.
# NOTE(review): a key that does not belong to this endpoint is one plausible
# cause of a "403 (Forbidden)" response from Qdrant -- confirm against the
# cluster's access settings.
QDRANT_URL = os.getenv('QDRANT_URL') or DEFAULT_QDRANT_URL


# Validate environment variables: fail early, before any client is created,
# when a required secret is missing or empty.
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in environment variables")
if not QDRANT_API_KEY:
    raise ValueError("QDRANT_API_KEY not found in environment variables")

# OpenAI client, used for the embedding requests.
client = OpenAI(api_key=OPENAI_API_KEY)

# Qdrant client, pointed at the configured endpoint and authenticated with
# the API key read from the environment.
qdrant_client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# Function to create embedding for text
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text`` from the OpenAI embeddings API.

    Args:
        text: The input to embed. Only the first embedding of the response is
            returned, so this is expected to be a single string.
        model: Name of the embedding model to request. The vector size
            configured for the Qdrant collection must match the output size
            of the chosen model.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    response = client.embeddings.create(input=text, model=model)
    return response.data[0].embedding

# Function to prepare patient data for embedding
def prepare_patient_text(row):
    """Build the text that gets embedded for one patient record.

    The first line is always ``Patient ID: <patient_uid>``. Every other
    non-missing field of the row follows as a ``<column>: <value>`` line, so
    the embedding reflects the actual record content rather than template
    placeholder text.

    Args:
        row: A mapping-like record (for example a pandas Series or a dict)
            that contains a ``'patient_uid'`` entry and supports ``.items()``.

    Returns:
        A newline-separated string describing the record.

    Raises:
        KeyError: If the row has no ``'patient_uid'`` entry.
    """
    lines = [f"Patient ID: {row['patient_uid']}"]
    for column, value in row.items():
        # The ID is already the header line; missing values (None/NaN) would
        # only add noise such as "nan" to the embedded text.
        if column == 'patient_uid' or pd.isna(value):
            continue
        lines.append(f"{column}: {value}")
    return "\n".join(lines)

# Read your dataset
df = pd.read_csv('dataset1/allergies.csv')  # Update with your dataset path

# Create collection in Qdrant.
# `recreate_collection` is deprecated in qdrant-client, so the same
# drop-and-create behaviour is spelled out explicitly: any existing collection
# with this name (and all points stored in it) is deleted first.
collection_name = "patients"
if qdrant_client.collection_exists(collection_name=collection_name):
    qdrant_client.delete_collection(collection_name=collection_name)
qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        # 1536 is the default output size of text-embedding-3-small and also
        # of text-embedding-ada-002; it must match the embedding model in use.
        size=1536,
        distance=models.Distance.COSINE,
    ),
)

# Process each patient and upload embeddings
batch_size = 100  # Adjust based on your needs
for i in tqdm(range(0, len(df), batch_size)):
    batch = df.iloc[i:i + batch_size]

    # Read the IDs from the column rather than from the per-row Series:
    # `iterrows()` does not preserve dtypes across a row (an integer ID can be
    # upcast, e.g. to float, when other columns are floats), and `.tolist()`
    # yields plain Python scalars instead of NumPy ones.
    patient_uids = batch['patient_uid'].tolist()

    # One point per row: the embedding of the prepared patient text, keyed by
    # and tagged with the patient_uid.
    points = [
        models.PointStruct(
            id=patient_uid,  # Using patient_uid as the point ID
            vector=get_embedding(prepare_patient_text(row)),
            payload={
                'patient_uid': patient_uid,
                # Add other relevant fields you want to store
            },
        )
        for patient_uid, (_, row) in zip(patient_uids, batch.iterrows())
    ]

    # Upload batch to Qdrant
    qdrant_client.upsert(
        collection_name=collection_name,
        points=points,
    )

print("Upload complete!")

  qdrant_client = QdrantClient(
  qdrant_client.recreate_collection(


UnexpectedResponse: Unexpected Response: 403 (Forbidden)
Raw response content:
b'{"error":"forbidden"}'