In [1]:
from dotenv import load_dotenv

load_dotenv('../../.env')

True

# Set Environment

In [2]:
import os
import getpass
from langchain.vectorstores import DeepLake
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2',
                                       model_kwargs = {'device':'cpu'} )

# Indexing the Twitter Algorithm Code Base

In [4]:
import os
from langchain.document_loaders import TextLoader

root_dir = './the-algorithm'
docs = []
for dirpath, dirnames, filenames in os.walk(root_dir):
    for file in filenames:
        try: 
            loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
            docs.extend(loader.load_and_split())
        except Exception as e: 
            pass

## Divide to Chunks

In [5]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(docs)

Created a chunk of size 2549, which is longer than the specified 1000
Created a chunk of size 2095, which is longer than the specified 1000
Created a chunk of size 1983, which is longer than the specified 1000
Created a chunk of size 1020, which is longer than the specified 1000
Created a chunk of size 1540, which is longer than the specified 1000
Created a chunk of size 1245, which is longer than the specified 1000
Created a chunk of size 1257, which is longer than the specified 1000
Created a chunk of size 2273, which is longer than the specified 1000
Created a chunk of size 1411, which is longer than the specified 1000
Created a chunk of size 1263, which is longer than the specified 1000
Created a chunk of size 1672, which is longer than the specified 1000
Created a chunk of size 1794, which is longer than the specified 1000
Created a chunk of size 1034, which is longer than the specified 1000
Created a chunk of size 1201, which is longer than the specified 1000
Created a chunk of s

In [8]:
username = "thapabibek1129" # replace with your username from app.activeloop.ai
db = DeepLake(dataset_path=f"hub://{username}/twitter-algorithm", embedding_function=embeddings)
db.add_documents(texts)

Your Deep Lake dataset has been successfully created!


Creating 2799 embeddings in 6 batches of size 500:: 100%|██████████| 6/6 [08:08<00:00, 81.49s/it]

Dataset(path='hub://thapabibek1129/twitter-algorithm', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype       shape      dtype  compression
  -------    -------     -------    -------  ------- 
   text       text      (2799, 1)     str     None   
 metadata     json      (2799, 1)     str     None   
 embedding  embedding  (2799, 384)  float32   None   
    id        text      (2799, 1)     str     None   





['4f60f69f-c9b6-11ee-9644-a434d9523559',
 '4f60f6a0-c9b6-11ee-b5a0-a434d9523559',
 '4f60f6a1-c9b6-11ee-bddb-a434d9523559',
 '4f60f6a2-c9b6-11ee-8a14-a434d9523559',
 '4f60f6a3-c9b6-11ee-a348-a434d9523559',
 '4f60f6a4-c9b6-11ee-997a-a434d9523559',
 '4f60f6a5-c9b6-11ee-abba-a434d9523559',
 '4f60f6a6-c9b6-11ee-b6d9-a434d9523559',
 '4f60f6a7-c9b6-11ee-bc5b-a434d9523559',
 '4f60f6a8-c9b6-11ee-9ebe-a434d9523559',
 '4f60f6a9-c9b6-11ee-a8ae-a434d9523559',
 '4f60f6aa-c9b6-11ee-8cff-a434d9523559',
 '4f60f6ab-c9b6-11ee-aa8b-a434d9523559',
 '4f60f6ac-c9b6-11ee-99fb-a434d9523559',
 '4f60f6ad-c9b6-11ee-b6e5-a434d9523559',
 '4f60f6ae-c9b6-11ee-aa3f-a434d9523559',
 '4f60f6af-c9b6-11ee-a7e1-a434d9523559',
 '4f60f6b0-c9b6-11ee-b1f0-a434d9523559',
 '4f60f6b1-c9b6-11ee-aca5-a434d9523559',
 '4f611f0d-c9b6-11ee-9f74-a434d9523559',
 '4f611f0e-c9b6-11ee-9bc6-a434d9523559',
 '4f611f0f-c9b6-11ee-8aea-a434d9523559',
 '4f611f10-c9b6-11ee-9d86-a434d9523559',
 '4f611f11-c9b6-11ee-9384-a434d9523559',
 '4f611f12-c9b6-

# Conversational Retriever Chain

In [9]:
db = DeepLake(dataset_path="hub://thapabibek1129/twitter-algorithm", read_only=True, embedding_function=embeddings)

Deep Lake Dataset in hub://thapabibek1129/twitter-algorithm already exists, loading from the storage


In [11]:
retriever = db.as_retriever()
retriever.search_kwargs['distance_metric'] = 'cos'
retriever.search_kwargs['fetch_k'] = 100
retriever.search_kwargs['k'] = 10

# Using DeepLake Filters

In [12]:
def filter(x):
    if 'com.google' in x['text'].data()['value']:
        return False
    metadata = x['metadata'].data()['value']
    return 'scala' in metadata['source'] or 'py' in metadata['source']

# Connect to HugginFace models

In [13]:
from langchain import HuggingFaceHub

llm_mistral = HuggingFaceHub(
    repo_id='mistralai/Mistral-7B-Instruct-v0.2',
    model_kwargs={'temperature':0.5,"max_length": 64,"max_new_tokens":512}
)



In [14]:
from langchain.chains import ConversationalRetrievalChain

qa_mistral = ConversationalRetrievalChain.from_llm(llm_mistral,retriever=retriever)


# Ask Questions

In [15]:
questions = [
    "What does favCountParams do?",
    "is it Likes + Bookmarks, or not clear from the code?",
    "What are the major negative modifiers that lower your linear ranking parameters?",   
    "How do you get assigned to SimClusters?",
    "What is needed to migrate from one SimClusters to another SimClusters?",
    "How much do I get boosted within my cluster?",   
    "How does Heavy ranker work. what are it’s main inputs?",
    "How can one influence Heavy ranker?",
    "why threads and long tweets do so well on the platform?",
    "Are thread and long tweet creators building a following that reacts to only threads?",
    "Do you need to follow different strategies to get most followers vs to get most likes and bookmarks per tweet?",
    "Content meta data and how it impacts virality (e.g. ALT in images).",
    "What are some unexpected fingerprints for spam factors?",
    "Is there any difference between company verified checkmarks and blue verified individual checkmarks?",
] 

In [18]:
chat_history = []

for question in questions:  
    result = qa_mistral({"question": question, "chat_history": chat_history})
    chat_history.append((question, result['answer']))
    print(f"-> **Question**: {question} \n")
    print(f"**Answer**: {result['answer']} \n")


-> **Question**: What does favCountParams do? 

**Answer**:  The context does not provide enough information to answer the question directly. The code snippet includes several FSParam and FSBoundedParam definitions with names that include "favCount", but it's not clear what those parameters are used for or how they relate to each other. Without additional context or documentation, it's impossible to determine the exact purpose of "favCountParams". 

-> **Question**: is it Likes + Bookmarks, or not clear from the code? 

**Answer**: 
Based on the code snippet provided, it is not clear whether 'favCountParams' represents the total of 'Likes' and 'Bookmarks'. The code snippet only shows that 'favCountParams' is being added to 'interestedInCandidates.size'. However, it does not provide any context about what 'interestedInCandidates' represents or whether it includes 'Likes' and 'Bookmarks'. To determine if 'favCountParams' represents the total of 'Likes' and 'Bookmarks', more context about