# Create simple vector storage

In [1]:
# Import modules 
from langchain.docstore.document import Document
import os 
from uuid import uuid4
import numpy as np
from langchain_openai import ChatOpenAI, OpenAIEmbeddings 
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain.llms import HuggingFaceHub
# from langchain.vectorstores import Chroma

In [2]:
simple_document_text = """ 
Best Practices for Graph Structure and State

When building LangGraph workflows, keep these best practices in mind:

    Design a clear State schema: Define your state’s structure up front (using a TypedDict, Pydantic model, or dataclass) to list all fields that nodes will use. This makes it clear what data flows through the graph and helps avoid key collisions or missing data.

    Return state updates from nodes: Each node function should return a dictionary of updates to the state. Only the keys returned will be updated or added to the shared state​
    stackoverflow.com
    . For example, if your state has a field "progress_step", and a node updates it, return {"progress_step": new_value} from that node​
    stackoverflow.com
    . This ensures the intended data is persisted in the global state. (If a node returns END or a string instead of a dict, LangGraph interprets that as a routing instruction rather than a state update.)

    Name nodes and state fields descriptively: Use meaningful names for node keys (e.g., "llm" for a language model step, "summarize_tool" for a summarization tool) and for state fields (e.g., "query", "answer", "analysis"). This makes the graph easier to read and maintain. Some developers suffix node names with _node and routing functions with _router (or similar) to avoid confusion between nodes versus routing logic​
    stackoverflow.com
    .

    Set an entry point and a termination: Always specify an entry point so it’s clear where execution begins. Likewise, ensure your graph can terminate. For linear flows or simple one-step graphs, if a node has no outgoing edges, the execution will end after that node. In loops or conditional flows, use END (or workflow.set_finish_point(node)) for at least one branch so the agent can stop. This prevents infinite loops and clearly defines an endpoint​
    blog.langchain.dev
    .

    Keep state updates independent if possible: Each node should ideally produce its piece of state without requiring internal knowledge of how other nodes store data (beyond the agreed-upon state schema). This modular approach makes it easier to add or swap out nodes without breaking the graph’s state logic.
"""

In [3]:
doc = Document(simple_document_text, metadata={"title": "Langgraph Best Practices"})

In [4]:
def get_openai_models():
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    assert OPENAI_API_KEY, "Please set OPENAI_API_KEY environment variable"
    llm = ChatOpenAI(model="gpt-4o-mini")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    return llm, embeddings

def get_opensource_models():
    HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    assert HUGGINGFACEHUB_API_TOKEN, "Please set HUGGINGFACEHUB_API_TOKEN environment variable"
    llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.2",
        huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
        model_kwargs={"temperature": 0.2}
    )
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return llm, embeddings

In [5]:
llm, embeddings = get_openai_models()

In [6]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)

In [7]:
chunked_doc = text_splitter.split_documents([doc])
chunked_doc

[Document(metadata={'title': 'Langgraph Best Practices', 'start_index': 2}, page_content='Best Practices for Graph Structure and State\n\nWhen building LangGraph workflows, keep these best practices in mind:\n\n    Design a clear State schema: Define your state’s structure up front (using a TypedDict, Pydantic model, or dataclass) to list all fields that nodes will use. This makes it clear what data flows through the graph and helps avoid key collisions or missing data.\n\n    Return state updates from nodes: Each node function should return a dictionary of updates to the state. Only the keys returned will be updated or added to the shared state\u200b\n    stackoverflow.com\n    . For example, if your state has a field "progress_step", and a node updates it, return {"progress_step": new_value} from that node\u200b\n    stackoverflow.com\n    . This ensures the intended data is persisted in the global state. (If a node returns END or a string instead of a dict, LangGraph interprets that

In [8]:
vector_store = Chroma(
    collection_name='openai_embeddings',
    embedding_function=embeddings,
    persist_directory='./openai_chroma_langchain_db',
)

In [9]:
uuids = [str(uuid4()) for _ in range(len(chunked_doc))]

vector_store.add_documents(documents=chunked_doc, ids=uuids)

['762e4b68-8094-4477-ac8f-ed84d2c9db04',
 '81fec0b6-cf87-4f80-b003-015fd4c8d0a8',
 '1a289fa9-0850-438d-b536-cb31673d67b3']