In [7]:
import getpass
import os


def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")


_set_env("ANTHROPIC_API_KEY")
_set_env("COHERE_API_KEY")

In [14]:

from langchain_community.document_loaders import TextLoader

root_dir = "./codebases/py-llm-core"
docs = []
for dirpath, dirnames, filenames in os.walk(root_dir):
    for file in filenames:
        try:
            loader = TextLoader(os.path.join(dirpath, file), encoding="utf-8")
            docs.extend(loader.load_and_split())
        except Exception:
            pass

In [8]:
from langchain_cohere import CohereEmbeddings

# Set embeddings
embd = CohereEmbeddings(model="embed-english-v3.0")

In [15]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(docs)

Created a chunk of size 2643, which is longer than the specified 1000
Created a chunk of size 1020, which is longer than the specified 1000
Created a chunk of size 1540, which is longer than the specified 1000
Created a chunk of size 1369, which is longer than the specified 1000


In [14]:
from PIL import Image

In [2]:
import deeplake

# Create a local dataset
# ds = deeplake.dataset("/datasets/my_dataset", overwrite=True)

/datasets/my_dataset loaded successfully.





In [16]:
from langchain_community.vectorstores import DeepLake

# username = "<USERNAME_OR_ORG>"  # replace with your username from app.activeloop.ai
db = DeepLake(
    dataset_path=f"langchain_store",
    # dataset_path=f"hub://{username}/twitter-algorithm",
    embedding=embd,
)
db.add_documents(texts)

Deep Lake Dataset in langchain_store already exists, loading from the storage


Creating 153 embeddings in 1 batches of size 153:: 100%|██████████| 1/1 [00:04<00:00,  4.97s/it]

Dataset(path='langchain_store', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype       shape      dtype  compression
  -------    -------     -------    -------  ------- 
 embedding  embedding  (153, 1024)  float32   None   
    id        text      (153, 1)      str     None   
 metadata     json      (153, 1)      str     None   
   text       text      (153, 1)      str     None   





['b807eef2-cafd-11ef-abdd-88b111616788',
 'b807eef3-cafd-11ef-a79b-88b111616788',
 'b807eef4-cafd-11ef-8e39-88b111616788',
 'b807eef5-cafd-11ef-81c9-88b111616788',
 'b807eef6-cafd-11ef-a1d1-88b111616788',
 'b807eef7-cafd-11ef-8ab0-88b111616788',
 'b807eef8-cafd-11ef-9aec-88b111616788',
 'b807eef9-cafd-11ef-91d0-88b111616788',
 'b807eefa-cafd-11ef-932c-88b111616788',
 'b807eefb-cafd-11ef-87dc-88b111616788',
 'b807eefc-cafd-11ef-b50f-88b111616788',
 'b807eefd-cafd-11ef-8351-88b111616788',
 'b807eefe-cafd-11ef-98c1-88b111616788',
 'b807eeff-cafd-11ef-b07b-88b111616788',
 'b807ef00-cafd-11ef-8335-88b111616788',
 'b807ef01-cafd-11ef-a275-88b111616788',
 'b807ef02-cafd-11ef-b645-88b111616788',
 'b807ef03-cafd-11ef-a2d0-88b111616788',
 'b807ef04-cafd-11ef-ba77-88b111616788',
 'b807ef05-cafd-11ef-930e-88b111616788',
 'b807ef06-cafd-11ef-bf35-88b111616788',
 'b807ef07-cafd-11ef-9d19-88b111616788',
 'b807ef08-cafd-11ef-84bf-88b111616788',
 'b807ef09-cafd-11ef-a416-88b111616788',
 'b807ef0a-cafd-

In [10]:
db

<langchain_community.vectorstores.deeplake.DeepLake at 0x24e0634bb60>

In [34]:
retriever = db.as_retriever()
retriever.search_kwargs["distance_metric"] = "cos"
retriever.search_kwargs["fetch_k"] = 100
# retriever.search_kwargs["use_maximal_marginal_relevance"] = True
retriever.search_kwargs["k"] = 10

In [35]:
def filter(x):
    # filter based on source code
    if "com.google" in x["text"].data()["value"]:
        return False

    # filter based on path e.g. extension
    metadata = x["metadata"].data()["value"]
    return "scala" in metadata["source"] or "py" in metadata["source"]


### turn on below for custom filtering
# retriever.search_kwargs['filter'] = filter

In [36]:
from langchain.chains import ConversationalRetrievalChain
# from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic

# model = ChatOpenAI(model="gpt-3.5-turbo-0613")  # switch to 'gpt-4'
model = ChatAnthropic(
  model="claude-3-haiku-20240307",
    temperature=0,
    max_tokens=1024,
    timeout=None,
    max_retries=2,
)
qa = ConversationalRetrievalChain.from_llm(model, retriever=retriever)

In [39]:
questions = [
    "What does the class BaseParser do?",
    # "What is the chain of density prompting technique and how is it used here?",
    # "Which LLM are used in the example?",
    # "is it Likes + Bookmarks, or not clear from the code?",
    # "What are the major negative modifiers that lower your linear ranking parameters?",
    # "How do you get assigned to SimClusters?",
    # "What is needed to migrate from one SimClusters to another SimClusters?",
    # "How much do I get boosted within my cluster?",
    # "How does Heavy ranker work. what are it’s main inputs?",
    # "How can one influence Heavy ranker?",
    # "why threads and long tweets do so well on the platform?",
    # "Are thread and long tweet creators building a following that reacts to only threads?",
    # "Do you need to follow different strategies to get most followers vs to get most likes and bookmarks per tweet?",
    # "Content meta data and how it impacts virality (e.g. ALT in images).",
    # "What are some unexpected fingerprints for spam factors?",
    # "Is there any difference between company verified checkmarks and blue verified individual checkmarks?",
]
chat_history = []

for question in questions:
    result = qa({"question": question, "chat_history": chat_history})
    chat_history.append((question, result["answer"]))
    print(f"-> **Question**: {question} \n")
    print(f"**Answer**: {result['answer']} \n")

-> **Question**: What does the class BaseParser do? 

**Answer**: The `BaseParser` class in the `py-llm-core` library serves as a base class for different parser implementations that use various Large Language Models (LLMs) to parse and process information from unstructured content.

The key things the `BaseParser` class does:

1. **Holds configuration**: It defines the target class, the LLM model to use, the specific model class, and optional loader and loader kwargs.

2. **Generates JSON schema**: In the `__post_init__` method, it generates a JSON schema for the target class using the `to_json_schema` function from the `schema` module.

3. **Provides a common interface**: The `BaseParser` class serves as a common interface for different parser implementations, such as `OpenAIParser`, `AzureOpenAIParser`, `MistralAIParser`, etc. These derived classes inherit from `BaseParser` and provide specific implementations for their respective LLM models.

4. **Handles LLM-specific configuration