Colab_Link: https://colab.research.google.com/github/ArtisanLabs/vocode-python/blob/main/apps/rag/manual_pinecone_ingestor.ipynb


In [None]:
# Install the Python Requests library via pip
# python 
# langchain 
# spacy 
# unstructured = {extras = ["local-inference"]}
# layoutparser = {extras = ["layoutmodels", "tesseract"]}
# pinecone-client
# openai 
# torch 
# tiktoken
# git

%pip install langchain spacy unstructured[local-inference] layoutparser[layoutmodels,tesseract] pinecone-client openai torch tiktoken git


In [None]:
'''
Importing necessary modules and functions:
- os module to interact with the OS
- Pinecone module for vector database operations
- OpenAIEmbeddings from langchain.embeddings.openai for generating embeddings
- SpacyTextSplitter from langchain.text_splitter for splitting text into chunks
- Pinecone from langchain.vectorstores for storing and retrieving vectors
- DirectoryLoader and UnstructuredFileLoader from langchain.document_loaders for loading documents from directories and unstructured files
'''
import os
import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import SpacyTextSplitter
from langchain.vectorstores import Pinecone
from langchain.document_loaders import DirectoryLoader, UnstructuredFileLoader


In [None]:
'''
We are using Google Colab's secret manager to securely input our API keys.
This ensures that the keys are not visible in the notebook and are not stored in the notebook's history.
We also retrieve the Pinecone index name from the secret manager.
Then, we add these keys to the environment variables.
'''
import os

from google.colab import userdata

# Each secret is fetched once from Colab's secret manager. The chained
# assignment binds the value left to right: first to the notebook-level
# constant, then to the environment variable of the same name.
PINECONE_API_KEY = os.environ['PINECONE_API_KEY'] = userdata.get('PINECONE_API_KEY')

PINECONE_ENVIRONMENT = os.environ['PINECONE_ENVIRONMENT'] = userdata.get('PINECONE_ENVIRONMENT')

PINECONE_INDEX = os.environ['PINECONE_INDEX'] = userdata.get('PINECONE_INDEX')

OPENAI_API_KEY = os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

In [None]:
'''
We are defining a list of libraries from which we want to fetch the code.
Each library is represented as a dictionary with three keys:
  - 'name': the name of the library
  - 'code': the URL of the library's GitHub repository
  - 'documentation_path': the path to the documentation within the repository
'''
# One entry per library: (name, GitHub repository URL, docs path inside the repo).
# The comprehension expands each tuple into the dict shape the rest of the
# notebook reads: keys 'name', 'code' and 'documentation_path', in that order.
libraries = [
    {
        'name': lib_name,
        'code': repo_url,
        'documentation_path': docs_path,
    }
    for lib_name, repo_url, docs_path in (
        ('langchain', 'https://github.com/langchain-ai/langchain', 'docs'),
        ('supabase', 'https://github.com/supabase/supabase', 'apps/docs'),
        ('next.js', 'https://github.com/vercel/next.js', 'docs'),
        ('fastapi', 'https://github.com/tiangolo/fastapi', 'docs/en/docs'),
        ('vocode-python', 'https://github.com/vocodedev/vocode-python', 'docs'),
    )
]

'''
The function get_code_from_github is defined to fetch the code from the GitHub repositories.
It iterates over the libraries list and for each library, it clones the repository to a temporary directory.
Then, it moves the documentation folder into the directory 'libraries_documentation/{name}', so it ends up at 'libraries_documentation/{name}/<documentation folder name>'.
'''
import os
import shutil
import tempfile
import subprocess

def get_code_from_github(libraries_to_fetch=None, dest_root='libraries_documentation'):
    """Clone each library's repository and collect its documentation folder.

    For every entry, the repository at ``entry['code']`` is cloned into a
    temporary directory and the folder at ``entry['documentation_path']`` is
    moved to ``<dest_root>/<entry['name']>/<documentation folder name>``.
    The temporary clones are deleted when the function returns.

    The function is safe to re-run: a documentation copy left behind by a
    previous run is replaced instead of making ``shutil.move`` fail.

    Args:
        libraries_to_fetch: Iterable of dicts with the keys ``'name'``,
            ``'code'`` and ``'documentation_path'``. Defaults to the
            notebook-level ``libraries`` list.
        dest_root: Directory under which the documentation is collected.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits with a non-zero status.
        FileNotFoundError: If ``documentation_path`` does not exist in the clone.
    """
    if libraries_to_fetch is None:
        # Keep the original zero-argument behaviour: use the notebook's list.
        libraries_to_fetch = libraries

    # Everything cloned here is discarded on exit; only what is moved out survives.
    with tempfile.TemporaryDirectory() as temp_dir:
        for library in libraries_to_fetch:
            clone_dir = os.path.join(temp_dir, library['name'])

            # A shallow clone is sufficient: only the current docs tree is
            # used, never the repository history.
            subprocess.run(
                ['git', 'clone', '--depth', '1', library['code'], clone_dir],
                check=True,
            )

            doc_source = os.path.join(clone_dir, library['documentation_path'])
            if not os.path.exists(doc_source):
                raise FileNotFoundError(
                    f"Documentation path {library['documentation_path']!r} "
                    f"not found in {library['code']}"
                )

            doc_dest = os.path.join(dest_root, library['name'])
            os.makedirs(doc_dest, exist_ok=True)

            # Because doc_dest already exists, shutil.move() puts doc_source
            # *inside* it and raises shutil.Error when an entry of the same
            # name is already there. Remove such a leftover from an earlier run.
            previous_copy = os.path.join(
                doc_dest, os.path.basename(os.path.normpath(doc_source))
            )
            if os.path.isdir(previous_copy) and not os.path.islink(previous_copy):
                shutil.rmtree(previous_copy)
            elif os.path.lexists(previous_copy):
                os.remove(previous_copy)

            shutil.move(doc_source, doc_dest)

# Run the fetch now: clones every repository listed in `libraries` and fills
# the 'libraries_documentation' directory on disk.
get_code_from_github()

In [None]:
# Pipeline for this cell: load the downloaded documentation, split it into
# chunks, embed the chunks and send them to Pinecone.

# Walk ./libraries_documentation recursively. The pattern "**/*.md*" matches
# ".md" and ".mdx" files (and any other name containing ".md" followed by more
# characters); every match is read through UnstructuredFileLoader.
loader = DirectoryLoader(
    './libraries_documentation',
    glob="**/*.md*",
    show_progress=True,
    loader_cls=UnstructuredFileLoader,
)

print("Loading documents...")
documents = loader.load()

# spaCy-backed splitter configured with chunk_size=1000
# (presumably measured in characters, the splitter's default - confirm).
text_splitter = SpacyTextSplitter(chunk_size=1000)

print("Splitting documents...")
docs = text_splitter.split_documents(documents)

# Embedding client built with no explicit arguments; OPENAI_API_KEY was
# exported to the environment earlier in the notebook.
embeddings = OpenAIEmbeddings()

# Configure the Pinecone client with the credentials loaded from the secrets.
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONMENT,
)

# Name of the target Pinecone index.
index_name = PINECONE_INDEX

# Embed the chunks and store them in the index called `index_name`.
# NOTE(review): this appears to write into an index that must already exist
# rather than creating one - confirm against the LangChain Pinecone docs.
print("Creating index...")
docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name)