In [None]:
import csv
from langchain.docstore.document import Document 
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Columns whose values are joined into the text that gets embedded, and
# columns that are carried along as per-document metadata instead.
# NOTE: the misspelled name `columns_to_emebd` is kept on purpose so that any
# other cell referring to it keeps working.
columns_to_emebd = ["<ARRAY OF FIELDS TO EMBED>"]
columns_to_metadata = ["<ARRAY OF FIELDS FOR METADATA>"]


# Convert every CSV row into one Document so it can be chunked later:
#   - page_content: "column: value" lines for each embed column found in the row
#   - metadata:     the metadata columns found in the row
docs = []
with open('<PATH TO CSV>', newline="", encoding='utf-8-sig') as csvfile:
    csv_reader = csv.DictReader(csvfile)
    for row in csv_reader:
        to_metadata = {col: row[col] for col in columns_to_metadata if col in row}
        # csv.DictReader fills the missing trailing cells of a short row with
        # None (its default `restval`), so fall back to "" before stripping
        # instead of crashing with AttributeError.
        to_embed = "\n".join(
            f"{col.strip()}: {(row[col] or '').strip()}"
            for col in columns_to_emebd
            if col in row
        )
        docs.append(Document(page_content=to_embed, metadata=to_metadata))

# Split the row Documents with a character-based splitter: text is cut on
# newlines, the target chunk size is 500 as measured by len(), and
# consecutive chunks do not overlap.
splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=500,
    chunk_overlap=0,
    length_function=len,
)
documents = splitter.split_documents(docs)

# Generate embeddings from the chunked documents and store them in a Chroma
# vector database. The single `embeddings_model` instance is reused here
# rather than constructing a second, identical OpenAIEmbeddings object.
embeddings_model = OpenAIEmbeddings()
db = Chroma.from_documents(documents, embeddings_model)

# Query the vector database and show the best hit.
# NOTE: this rebinds `docs` (previously the list of CSV row Documents) to the
# search hits; the name is kept so existing references continue to work.
query = "<QUERY FOR DATA>"
docs = db.similarity_search(query)
if docs:
    print(docs[0].page_content)
    print(docs[0].metadata)
else:
    # Guard against an empty result list so that indexing docs[0] cannot
    # raise IndexError.
    print("No documents matched the query.")

In [28]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Free-text description of what the stored documents contain.
document_content_description = "<DESCRIPTION FOR DATA SET>"

# One AttributeInfo entry per metadata field; left empty here as a template.
metadata_field_info = [
    # Add attribute descriptions in the following format:
    # AttributeInfo(
    #     name="<METADATA NAME>",
    #     description="<METADATA DESCRIPTION>",
    #     type="<TYPE>",
    # ),
]

# Build the self-query retriever from the LLM, the vector store, the content
# description and the metadata attribute list.
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=db,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)

In [None]:
# Run a sample natural-language query through the self-query retriever; as
# the last expression in the cell, the returned documents are displayed.
retriever.get_relevant_documents(
    "expensive heart monitor"
)