In [82]:
import os
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
import re
import chromadb
from langchain.vectorstores import Chroma
import openai
import streamlit as st
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
from streamlit_chat import message
from dotenv import load_dotenv


In [83]:
import warnings
warnings.filterwarnings("ignore")

In [84]:
import torch

# Report whether PyTorch can see a CUDA GPU and how many devices are visible.
cuda_available = torch.cuda.is_available()
print(cuda_available)
print(torch.cuda.device_count())
# current_device() initialises CUDA and raises on a CPU-only machine, so it is
# only queried when a GPU is actually available.
if cuda_available:
    print(torch.cuda.current_device())
else:
    print("No CUDA device available; running on CPU")

True
1
0


In [85]:
import os

# Credentials the notebook expects to find in the process environment.
# Real values must be supplied from outside the notebook (shell export, a .env
# file, a secrets manager) and never be pasted here as literals.
_EXPECTED_ENV_VARS = (
    "OPENAI_API_KEY",
    "ACTIVELOOP_TOKEN",
    "GOOGLE_API_KEY",
    "GOOGLE_CSE_ID",
    "HUGGINGFACEHUB_API_TOKEN",
    "COHERE_API_KEY",
    "WOLFRAM_ALPHA_APPID",
    "SERPAPI_API_KEY",
)

for _env_name in _EXPECTED_ENV_VARS:
    # setdefault keeps a key that is already exported; a plain assignment would
    # overwrite a working credential with an empty string.
    os.environ.setdefault(_env_name, "")

In [86]:
# Embedding model used when the scraped chunks are written to the vector store.
embed_ada = OpenAIEmbeddings(model="text-embedding-ada-002")

# Name of the Chroma collection that holds the embedded documentation chunks.
chromadb_collection_name = "ada_hf_hub_doc_collection"

# Directory the scraped pages are written to. The original location stays the
# default, but it can be overridden with the HF_DOC_SAVE_PATH environment
# variable so the notebook also runs outside the author's machine.
# os.path.join(path, "") guarantees a trailing separator, because the path is
# later concatenated directly with a file name.
scraped_content_save_path = os.path.join(
    os.environ.get(
        "HF_DOC_SAVE_PATH",
        "/home/jupyter/self_learning/Langchain/code/JarvisBase/data/hf_hub_doc_crawled/",
    ),
    "",
)

In [87]:
def get_documentation_urls():
    """Return the relative URLs of the Hugging Face Hub guide pages to scrape.

    Only the guides named below are crawled; append further slugs to the tuple
    to extend the crawl.
    """
    guides_prefix = '/docs/huggingface_hub/guides/'
    guide_slugs = (
        'overview',
        'download',
        'upload',
        'hf_file_system',
        'repository',
        'search',
        'inference',
        'community',
        'manage-cache',
        'model-cards',
        'manage-spaces',
        'integrations',
        'webhooks_server',
    )
    return [guides_prefix + slug for slug in guide_slugs]

In [88]:
def construct_full_url(base_url, relative_url):
    """Join a site root and a relative path with exactly one slash between them.

    Args:
        base_url: Scheme and host, with or without a trailing slash.
        relative_url: Path on that host, with or without a leading slash.

    Returns:
        The combined URL. If either argument is empty, the other one is
        returned unchanged.
    """
    if not relative_url:
        return base_url
    if not base_url:
        return relative_url
    # Plain concatenation yields "host//path" or "hostpath" when the slashes of
    # the two parts do not line up, so normalise the seam explicitly.
    return base_url.rstrip('/') + '/' + relative_url.lstrip('/')


In [89]:
def scrape_page_content(url, timeout=30):
    """Download a page and return its body text as a single normalised line.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before the request is aborted.

    Returns:
        The text of the page's <body> with the filtered characters removed and
        every run of whitespace collapsed to one space. An empty string is
        returned when the parsed document has no <body>.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            that error pages are not saved and indexed as documentation.
        requests.RequestException: On connection failures or timeouts.
    """
    # Without a timeout a stalled server would block the notebook indefinitely.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    body = soup.body
    if body is None:
        return ''
    text = body.text.strip()
    # Remove control characters (tab, LF and CR are left for the whitespace
    # pass below) and the code points U+007F-U+00FF. Characters above U+00FF,
    # such as emoji, are not touched by this pattern.
    text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\xff]', '', text)
    # Collapse newlines, tabs and repeated spaces into single spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


In [90]:
def scrape_all_content(base_url, relative_url, filepath):
    """Scrape one documentation page and save its text as a .txt file.

    Args:
        base_url: Site root passed to construct_full_url.
        relative_url: Path of the page; its last segment becomes the file name.
        filepath: Directory the text file is written to. It is created when
            it does not exist yet.

    Returns:
        A ``(scraped_content, filename)`` tuple: the page text and the path of
        the file it was written to.
    """
    full_url = construct_full_url(base_url, relative_url)
    scraped_content = scrape_page_content(full_url)
    scraped_content = scraped_content.rstrip('\n')
    # Strip a trailing slash first so '/guides/download/' still maps to
    # 'download.txt' instead of '.txt'.
    page_name = relative_url.rstrip('/').split('/')[-1]
    # os.path.join works whether or not filepath ends with a separator.
    filename = os.path.join(filepath, page_name + ".txt")
    if filepath:
        # open() does not create missing directories.
        os.makedirs(filepath, exist_ok=True)
    # Write the scraped content to a file
    with open(filename, 'w', encoding='utf-8') as file:
        file.write("%s\n" % scraped_content)

    return scraped_content, filename

In [91]:
# Load one saved text file as a list of documents
def load_docs(filename):
    """Load a UTF-8 text file and return it as a list of split documents.

    Loading is best-effort: any failure is logged and an empty list is returned,
    so a single unreadable file does not abort the whole run.

    Args:
        filename: Path of the text file to load.

    Returns:
        The documents produced by ``TextLoader.load_and_split()``, or ``[]``
        when loading failed.
    """
    import logging

    try:
        loader = TextLoader(filename, encoding='utf-8')
        return list(loader.load_and_split())
    except Exception as exc:
        # Keep the original "return nothing on error" contract, but report the
        # problem; a silent pass made missing pages impossible to notice.
        logging.getLogger(__name__).warning("Could not load %s: %s", filename, exc)
        return []

In [92]:
def split_docs(docs, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping character chunks.

    Args:
        docs: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len``. Defaults to
            the previously hard-coded 500.
        chunk_overlap: Length shared between consecutive chunks. Defaults to
            the previously hard-coded 20.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(docs)``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_documents(docs)

In [93]:
def load_vectors_into_db(documents):
    """Embed ``documents`` and add them to the notebook's Chroma collection.

    Uses the module-level ``embed_ada`` embedding function and
    ``chromadb_collection_name``, prints the collection size afterwards and
    returns the resulting ``Chroma`` vector store.
    """
    vector_store = Chroma.from_documents(
        documents,
        embed_ada,
        client=chromadb.Client(),
        collection_name=chromadb_collection_name,
    )
    print("There are", vector_store._collection.count(), "in the collection")
    return vector_store

In [94]:
# Start from an empty collection so that re-running the indexing cell does not
# pile up duplicate chunks.
client = chromadb.Client()
# delete_collection raises when the collection does not exist, which is always
# the case on a fresh kernel, so only delete it when it is present.
# list_collections yields collection objects or plain names depending on the
# chromadb version; getattr covers both.
_existing_collections = {getattr(c, "name", c) for c in client.list_collections()}
if chromadb_collection_name in _existing_collections:
    client.delete_collection(name=chromadb_collection_name)
def main():
    """Scrape every configured guide page, chunk it and index it in Chroma.

    For each relative URL the page is scraped to a text file under
    ``scraped_content_save_path``, loaded back, split into chunks and embedded
    into the Chroma collection.

    Returns:
        The vector store returned by the last ``load_vectors_into_db`` call, or
        ``None`` when no page produced any chunks.
    """
    base_url = 'https://huggingface.co'
    vector_db = None
    for url in get_documentation_urls():
        # Scrape the page and save its text; only the file name is needed here.
        _, filename = scrape_all_content(base_url, url, scraped_content_save_path)
        # Load the saved text and split it into chunks for embedding.
        docs = split_docs(load_docs(filename))
        if not docs:
            # load_docs returns [] when the file could not be read; there is
            # nothing to embed for this page.
            print("No content loaded from", filename, "- skipping")
            continue
        # Embed the chunks and add them to the Chroma collection.
        vector_db = load_vectors_into_db(docs)
    return vector_db

# Call the main function if this script is being run as the main program
if __name__ == '__main__':
    main()

/home/jupyter/self_learning/Langchain/code/JarvisBase/data/hf_hub_doc_crawled/overview.txt
There are 6 in the collection
/home/jupyter/self_learning/Langchain/code/JarvisBase/data/hf_hub_doc_crawled/download.txt
There are 32 in the collection
/home/jupyter/self_learning/Langchain/code/JarvisBase/data/hf_hub_doc_crawled/upload.txt
There are 97 in the collection
/home/jupyter/self_learning/Langchain/code/JarvisBase/data/hf_hub_doc_crawled/hf_file_system.txt
There are 110 in the collection
/home/jupyter/self_learning/Langchain/code/JarvisBase/data/hf_hub_doc_crawled/repository.txt
There are 136 in the collection
/home/jupyter/self_learning/Langchain/code/JarvisBase/data/hf_hub_doc_crawled/search.txt
There are 144 in the collection
/home/jupyter/self_learning/Langchain/code/JarvisBase/data/hf_hub_doc_crawled/inference.txt
There are 178 in the collection
/home/jupyter/self_learning/Langchain/code/JarvisBase/data/hf_hub_doc_crawled/community.txt
There are 194 in the collection
/home/jupyter/

In [95]:
# client = chromadb.Client()
# collection = client.get_collection(name=chromadb_collection_name, embedding_function=embed_ada)
# print("There are", collection.count(), "in the collection")

In [96]:
# collection.get(include=['documents'])

In [97]:
# Open the Chroma collection together with its embedding function
def load_embeddings_and_database(collection_name):
    """Return a ``Chroma`` vector store bound to ``collection_name``.

    A fresh ``OpenAIEmbeddings`` instance (text-embedding-ada-002) is attached
    as the embedding function, and the number of items in the collection is
    printed before the store is returned.
    """
    embedding_fn = OpenAIEmbeddings(model="text-embedding-ada-002")
    chroma_client = chromadb.Client()
    vector_store = Chroma(
        client=chroma_client,
        collection_name=collection_name,
        embedding_function=embedding_fn,
    )
    print("There are", vector_store._collection.count(), "in the collection")
    return vector_store

In [98]:
# Get user input from Streamlit text input field
def get_user_input():
    """Render the question text box and return its current value.

    The widget is stored under the session-state key "input". Its initial
    value is whatever is already stored there, falling back to
    "Hello, how are you?".
    """
    # Streamlit warns about empty labels for accessibility reasons. Give the
    # widget a real label and hide it visually instead.
    return st.text_input(
        "Your question",
        value=st.session_state.get("input", "Hello, how are you?"),
        key="input",
        label_visibility="collapsed",
    )

In [99]:
# Answer a question from the indexed documentation
def search_db(user_input, db):
    """Run a retrieval QA-with-sources chain for ``user_input`` against ``db``.

    The retriever is built from ``db`` with ``k`` set to 10 and the chain uses
    the gpt-3.5-turbo chat model. Returns the chain's output for the question
    unchanged.
    """
    doc_retriever = db.as_retriever(search_kwargs={"k": 10})
    chat_model = ChatOpenAI(model='gpt-3.5-turbo')
    chain = RetrievalQAWithSourcesChain.from_llm(chat_model, retriever=doc_retriever)
    return chain({'question': user_input})

In [100]:
# Display conversation history using Streamlit messages
def display_conversation(history):
    """Render each stored turn as a user message followed by the generated reply.

    ``history`` is indexed with the keys "past" (user messages) and "generated"
    (replies). One turn is rendered per entry in "generated", and every widget
    key is derived from the turn index.
    """
    for turn, reply in enumerate(history["generated"]):
        turn_key = str(turn)
        message(history["past"][turn], is_user=True, key=turn_key + "_user")
        message(reply, key=turn_key)


In [109]:
# Main function to run the app
def main():
    """Ask one fixed question against the indexed docs and print the chain output."""
    # Open the Chroma collection that the indexing step populated.
    vector_store = load_embeddings_and_database(chromadb_collection_name)
    question = "Write a code to Load a Model Card from the Hub?"
    print(search_db(question, vector_store))

# Run the main function when the script is executed
if __name__ == "__main__":
    main()

There are 366 in the collection
{'question': 'Write a code to Load a Model Card from the Hub?', 'answer': "To load a Model Card from the Hub, you can use the following code:\n\n```python\nfrom huggingface_hub import ModelCard\n\nmodel_card = ModelCard.load_model_card_from_hub(repo_id='username/repository_name')\n```\n", 'sources': '/home/jupyter/self_learning/Langchain/code/JarvisBase/data/hf_hub_doc_crawled/model-cards.txt'}


In [74]:
# # Main function to run the app
# def main():
#     # Initialize Streamlit app with a title
#     st.write("# JarvisBase 🧙")
   
#     # Load embeddings and the DeepLake database
#     db = load_embeddings_and_database(chromadb_collection_name)

#     # Record and transcribe audio
# #     transcription = record_and_transcribe_audio()

#     # Get user input from text input or audio transcription
#     user_input = get_user_input()

#     # Initialize session state for generated responses and past messages
#     if "generated" not in st.session_state:
#         st.session_state["generated"] = ["I am ready to help you"]
#     if "past" not in st.session_state:
#         st.session_state["past"] = ["Hey there!"]
        
#     # Search the database for a response based on user input and update session state
#     if user_input:
#         output = search_db(user_input, db)
#         print(output['source_documents'])
# #         st.session_state.past.append(user_input)
#         response = str(output["result"])
# #         st.session_state.generated.append(response)

#     # Display conversation history using Streamlit messages
# #     if st.session_state["generated"]:
# #         display_conversation(st.session_state)

# # Run the main function when the script is executed
# if __name__ == "__main__":
#     main()

2024-03-18 18:41:32.391 `label` got an empty value. This is discouraged for accessibility reasons and may be disallowed in the future by raising an exception. Please provide a non-empty label and hide it with label_visibility if needed.


There are 3 in the collection
Hello, how are you?
[Document(page_content='Hugging Face Models Datasets Spaces Posts Docs Solutions Pricing Log In Sign Up Hub Python Library documentation How-to guides Hub Python Library 🏡 View all docsAWS Trainium & InferentiaAccelerateAmazon SageMakerAutoTrainCompetitionsDatasetsDatasets-serverDiffusersEvaluateGradioHubHub Python LibraryHuggingface.jsInference API (serverless)Inference Endpoints (dedicated)OptimumPEFTSafetensorsTRLTasksText Embeddings InferenceText Generation InferenceTokenizersTransformersTransformers.jstimm Search documentation mainv0.21.4v0.20.3v0.19.3v0.18.0.rc0v0.17.3v0.16.3v0.15.1v0.14.1v0.13.4v0.12.1v0.11.0v0.10.1v0.9.1v0.8.1v0.7.0.rc0v0.6.0.rc0v0.5.1 CNDEENFRHIKO Get started Home Quickstart Installation How-to guides Overview Download files Upload files Use the CLI HfFileSystem Repository Search Inference Inference Endpoints Community Tab Collections Cache Model Cards Manage your Space Integrate a library Webhooks server Conce