In [None]:
! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain

In [None]:
import os

# LangChain tracing settings (not secret): always applied.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_PROJECT'] = "RAG_test"

# API keys: setdefault keeps a key that is already exported in the environment
# instead of overwriting it with the placeholder. Fill in the placeholders only
# when the keys are not provided by the environment.
os.environ.setdefault('LANGCHAIN_API_KEY', "<langchain_api_key>")
os.environ.setdefault('OPENAI_API_KEY', "<openai_api_key>")
# os.environ['GOOGLE_API_KEY'] = ""

In [None]:
# Sample query / document pair used by the token-count and embedding cells.
question, document = (
    "What kinds of pets do I like?",
    "My favorite pet is a cat.",
)

In [None]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` produces under a named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Encoding name handed to ``tiktoken.get_encoding``.

    Returns:
        Length of the token sequence returned by the encoding's ``encode``.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

# Token count of the sample question under the cl100k_base encoding.
num_tokens_from_string(string=question, encoding_name="cl100k_base")

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import OllamaEmbeddings
# Embedding model served through Ollama; reused later when building the index.
embd = OllamaEmbeddings(model="snowflake-arctic-embed:33m")

# The Embeddings interface has separate entry points for queries and for
# documents, and an implementation may treat the two differently (e.g. by
# prepending different instructions). Embed the question as a query and the
# document as a document so the comparison mirrors real retrieval.
query_result = embd.embed_query(question)
document_result = embd.embed_documents([document])[0]

# Dimensionality of the embedding vector.
len(query_result)

In [None]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector (any array-like accepted by NumPy).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. If either vector has
        zero norm the similarity is undefined; ``0.0`` is returned instead of
        dividing by zero (which would yield NaN and a RuntimeWarning).
    """
    a = np.asarray(vec1)
    b = np.asarray(vec2)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard against zero-length vectors: no direction, so report no similarity.
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

# Compare the query embedding against the document embedding.
similarity = cosine_similarity(vec1=query_result, vec2=document_result)
print("Cosine Similarity:", similarity)

In [None]:
#### INDEXING ####

# Load blog
import bs4
from langchain_community.document_loaders import WebBaseLoader
# Parse only the elements carrying the post-content, post-title or
# post-header CSS class from the blog page.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    },
)

# Fetch the page and turn it into documents.
blog_docs = loader.load()

In [None]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Recursive splitter built via the tiktoken-encoder constructor:
# chunk size 300 with an overlap of 50 between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=50,
    chunk_size=300,
)

# Chunk the loaded blog documents.
splits = text_splitter.split_documents(blog_docs)

In [None]:
# Index
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

# Embed the chunks with `embd` and store them in a Chroma vector store.
vectorstore = Chroma.from_documents(embedding=embd, documents=splits)

# Retriever over the store with default search settings.
retriever = vectorstore.as_retriever()

## Retrieval

In [None]:
# Retriever configured with k=1, i.e. only the single best match is requested.
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

# `invoke` is the current entry point for retrievers; it replaces the
# deprecated `get_relevant_documents`.
docs = retriever.invoke("What is Task Decomposition?")

# Number of documents returned.
len(docs)