In [2]:
from qdrant_client import QdrantClient
from qdrant_client.http import models
import numpy as np 
import os 

In [3]:
client = QdrantClient(url=os.getenv("Qdrant_cluster_url"), 
                      api_key=os.getenv("Qdrant_API_KEY"))
client

<qdrant_client.qdrant_client.QdrantClient at 0x2a611a478f0>

In [4]:
client.create_collection(
    collection_name="Fed_Speeches",
    vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE)
)

True

In [120]:
import json

# Starting ID (set to 1 initially or the last ID used if you're continuing)
starting_id = 6053

# Load the JSON data
with open(r"C:\Users\Navid\Desktop\LLM\mjson\fomcminutes20240320_embedding.json", 'r') as file:
    data = json.load(file)

# Extract the vectors and texts
vectors = [d['embeddings'] for d in data]
texts = [d['text'] for d in data]

# Generate sequential IDs starting from `starting_id`
ids = [starting_id + i for i in range(len(vectors))]

# Update `starting_id` for the next file
starting_id += len(vectors)


In [121]:
texts[0:3]

['Minutes of the Federal Open Market Committee March 19–20, 2024 A joint meeting of the Federal Open Market Committee and the Board of Governors of the Federal Reserve Sys-tem was held in the offices of the Board of Governors on Tuesday, March 19, 2024, at 9:00 a.m',
 'and continued on Wednesday, March 20, 2024, at 9:00 a.m.1 Attendance Jerome H',
 'Powell, Chair John C']

In [122]:
import ast

vectors = [[float(v) for v in ast.literal_eval(vector)] for vector in vectors]

In [123]:
points = [models.PointStruct(id=id, vector=vector, payload={'text': text}) for id, vector, text in zip(ids, vectors, texts)]

In [124]:
len(points)

331

In [125]:
len(vectors)

331

In [126]:
points[0].payload

{'text': 'Minutes of the Federal Open Market Committee March 19–20, 2024 A joint meeting of the Federal Open Market Committee and the Board of Governors of the Federal Reserve Sys-tem was held in the offices of the Board of Governors on Tuesday, March 19, 2024, at 9:00 a.m'}

In [127]:
client.upsert(
    collection_name="Fed_Speeches",
    points=points,)

UpdateResult(operation_id=33, status=<UpdateStatus.COMPLETED: 'completed'>)

In [171]:
client.retrieve(
    collection_name="Fed_Speeches",
    ids=[1000]
    )


[Record(id=1000, payload={'text': ' Labor market condi-tions improved further in December, and indicators of labor compensation continued to show robust increases'}, vector=None, shard_key=None)]

## Semantic Search

In [26]:
import openai

In [27]:
def get_embeddings(text):
    try:
        response = openai.Embedding.create(
            input=text,
            engine="text-embedding-3-small"  # Check the latest documentation for available engines
        )
        return response['data'][0]['embedding']  # Extracting the embedding vector
    except Exception as e:
        return str(e)  # Return error as string for debugging

In [28]:
search_text = 'inflation'
search_vector = get_embeddings(search_text)
client.search(
    collection_name="Fed_Speeches",
    query_vector=search_vector,
    limit = 10)


ValidationError: 3 validation errors for SearchRequest
vector.list[float]
  Input should be a valid list [type=list_type, input_value='\n\nYou tried to access ...ython/discussions/742\n', input_type=str]
    For further information visit https://errors.pydantic.dev/2.7/v/list_type
vector.NamedVector
  Input should be a valid dictionary or instance of NamedVector [type=model_type, input_value='\n\nYou tried to access ...ython/discussions/742\n', input_type=str]
    For further information visit https://errors.pydantic.dev/2.7/v/model_type
vector.NamedSparseVector
  Input should be a valid dictionary or instance of NamedSparseVector [type=model_type, input_value='\n\nYou tried to access ...ython/discussions/742\n', input_type=str]
    For further information visit https://errors.pydantic.dev/2.7/v/model_type

In [31]:
import openai
import qdrant_client

# Function to generate embeddings using OpenAI API
def get_embeddings(text):
    try:
        # Create an embedding using the specified engine
        response = openai.Embedding.create(
            input=text,
            engine="text-embedding-ada-002"  # Use the correct embedding engine for OpenAI version 1.31.0
        )
        # Extract and return the embedding vector (a list of floats)
        return response['data'][0]['embedding']
    except Exception as e:
        return str(e)  # Return error as string for debugging

# Initialize Qdrant client
qdrant = qdrant_client.QdrantClient(host="localhost", port=6333)  # Adjust host/port as needed

# Search query text
search_text = 'inflation'
search_vector = get_embeddings(search_text)

if isinstance(search_vector, list):  # Ensure the embedding is a valid list
    # Perform search using Qdrant
    try:
        search_results = qdrant.search(
            collection_name="Fed_Speeches",  # The name of your Qdrant collection
            query_vector=search_vector,      # The list of floats (embedding vector)
            limit=10                         # Number of top results to return
        )
        print(search_results)
    except Exception as e:
        print(f"Error during search: {str(e)}")
else:
    print(f"Error in embedding generation: {search_vector}")


Error in embedding generation: 

You tried to access openai.Embedding, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742

