In [5]:
from dotenv import load_dotenv
import os
# Merge variables declared in a .env file into the process environment.
load_dotenv()

# Fail fast with a readable message when a credential is absent.
# The previous `os.environ[X] = os.getenv(X)` assignments did nothing for
# variables that exist and raised an opaque
# "TypeError: str expected, not NoneType" for variables that do not.
_missing_env_vars = [
    name
    for name in ("ASTRA_DB_API_ENDPOINT", "ASTRA_DB_APPLICATION_TOKEN")
    if os.getenv(name) is None
]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Define them in the environment or in a .env file."
    )

In [2]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model object; it is handed to the vector store as `embedding`.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from langchain_astradb import AstraDBVectorStore



# Vector store backed by the "astra_vector_langchain" collection.
# Embeddings are computed by the `embeddings` object supplied here, and the
# endpoint and token are read from the environment rather than hardcoded.
vector_store_explicit_embeddings = AstraDBVectorStore(
    embedding=embeddings,
    collection_name="astra_vector_langchain",
    token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"),
    api_endpoint=os.getenv("ASTRA_DB_API_ENDPOINT"),
    namespace=None,  # no explicit namespace is requested
)

In [None]:
# `vector_store` is the name the remaining cells operate on. Only one store
# is defined in this notebook, so it simply aliases that instance; rebind it
# here if a differently configured store is added.

vector_store = vector_store_explicit_embeddings

In [10]:
from langchain_core.documents import Document

# Sample corpus as (text, source) pairs. The position in this list determines
# the document id ("entry_00" ... "entry_10"), so keep the order stable.
_sample_entries = [
    (
        "ZYX, just another tool in the world, is actually my agent-based superhero",
        "tweet",
    ),
    (
        "I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
        "tweet",
    ),
    (
        "The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
        "news",
    ),
    (
        "Building an exciting new project with LangChain - come check it out!",
        "tweet",
    ),
    (
        "Robbers broke into the city bank and stole $1 million in cash.",
        "news",
    ),
    (
        "Thanks to her sophisticated language skills, the agent managed to extract strategic information all right.",
        "tweet",
    ),
    (
        "Is the new iPhone worth the price? Read this review to find out.",
        "website",
    ),
    (
        "The top 10 soccer players in the world right now.",
        "website",
    ),
    (
        "LangGraph is the best framework for building stateful, agentic applications!",
        "tweet",
    ),
    (
        "The stock market is down 500 points today due to fears of a recession.",
        "news",
    ),
    (
        "I have a bad feeling I am going to get deleted :(",
        "tweet",
    ),
]

# One Document per entry, each with its own metadata dict and a zero-padded id.
documents_to_insert = [
    Document(
        page_content=text,
        metadata={"source": source},
        id=f"entry_{position:02d}",
    )
    for position, (text, source) in enumerate(_sample_entries)
]

# Left as the last expression so the cell displays the call's return value.
vector_store.add_documents(documents=documents_to_insert)

['entry_00',
 'entry_01',
 'entry_02',
 'entry_03',
 'entry_04',
 'entry_05',
 'entry_06',
 'entry_07',
 'entry_08',
 'entry_09',
 'entry_10']

In [11]:
# Remove two of the sample entries by id.
ids_to_remove = ["entry_10", "entry_02"]
vector_store.delete(ids=ids_to_remove)

True

In [12]:
# Similarity search with k=3, filtered on metadata source == "tweet".
query_text = "LangChain provides abstractions to make working with LLMs easy"
tweet_filter = {"source": "tweet"}
results = vector_store.similarity_search(query_text, k=3, filter=tweet_filter)

# One bullet per hit: the text followed by its metadata.
for res in results:
    print(f'* "{res.page_content}", metadata={res.metadata}')

* "Building an exciting new project with LangChain - come check it out!", metadata={'source': 'tweet'}
* "LangGraph is the best framework for building stateful, agentic applications!", metadata={'source': 'tweet'}
* "Thanks to her sophisticated language skills, the agent managed to extract strategic information all right.", metadata={'source': 'tweet'}


In [13]:
# Display the value currently bound to `results` (assigned by the preceding
# similarity search) using the notebook's default repr.
results

[Document(id='entry_03', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='entry_08', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='entry_05', metadata={'source': 'tweet'}, page_content='Thanks to her sophisticated language skills, the agent managed to extract strategic information all right.')]

In [14]:
# Same query and filter as the plain similarity search, but each hit is
# unpacked below as a (document, score) pair.
query_text = "LangChain provides abstractions to make working with LLMs easy"
tweet_filter = {"source": "tweet"}
results = vector_store.similarity_search_with_score(
    query_text,
    k=3,
    filter=tweet_filter,
)

# Prefix each bullet with the score, rounded to two decimals.
for res, score in results:
    print(f'* [SIM={score:.2f}] "{res.page_content}", metadata={res.metadata}')

* [SIM=0.82] "Building an exciting new project with LangChain - come check it out!", metadata={'source': 'tweet'}
* [SIM=0.77] "LangGraph is the best framework for building stateful, agentic applications!", metadata={'source': 'tweet'}
* [SIM=0.63] "Thanks to her sophisticated language skills, the agent managed to extract strategic information all right.", metadata={'source': 'tweet'}


In [17]:
# Wrap the store as a retriever configured with k=1 and a 0.5 score threshold.
threshold_search_kwargs = {"k": 1, "score_threshold": 0.5}
retriever = vector_store.as_retriever(
    search_kwargs=threshold_search_kwargs,
    search_type="similarity_score_threshold",
)

# The metadata filter is supplied per call; the result is the cell's output.
retriever.invoke("Stealing from the bank is a crime", filter={"source": "news"})

[Document(id='entry_04', metadata={'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.')]