In [1]:
import warnings
warnings.filterwarnings("ignore")
%load_ext watermark
%watermark

Last updated: 2023-11-21T11:06:49.696080+05:30

Python implementation: CPython
Python version       : 3.11.5
IPython version      : 8.15.0

Compiler    : MSC v.1916 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : Intel64 Family 6 Model 140 Stepping 1, GenuineIntel
CPU cores   : 8
Architecture: 64bit



In [2]:
# Either you can store the  OpenAI key in the “OPENAI_API_KEY” environment variable.
# or pass it here as below from a config.ini
import configparser
workingFolder='C:\\Users\\jfrancis\\OneDrive - GalaxE. Solutions, Inc\\GalaxE D Drive\\AI Journey\\Gen AI\\'
# Read the configuration file
config = configparser.ConfigParser()
config.read(workingFolder+'\\config.ini')
OPENAI_API_KEY=config.get('General','OPENAI_API_KEY')
ACTIVELOOP_TOKEN=config.get('General','ACTIVELOOP_TOKEN')
ACTIVELOOP_ORG_ID=config.get('General','ACTIVELOOP_ORG_ID')
HUGGINGFACEHUB_API_TOKEN=config.get('General','HUGGINGFACEHUB_API_TOKEN')
GOOGLE_API_KEY=config.get('General','GOOGLE_API_KEY')
GOOGLE_CSE_ID=config.get('General','GOOGLE_CSE_ID')
COHERE_API_KEY=config.get('General','COHERE_API_KEY')

In [3]:
# Get the token from OPENAI/Active loop website before this. Now we are taking from the config.ini
import os
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
os.environ["ACTIVELOOP_TOKEN"] = ACTIVELOOP_TOKEN
# create Deep Lake dataset
# TODO: use your organization id here. (by default, org id is your username)
my_activeloop_org_id = ACTIVELOOP_ORG_ID

In [11]:
import langchain
import streamlit
import streamlit_chat
%watermark --iversions 

langchain     : 0.0.336
streamlit     : 1.28.2
streamlit_chat: 0.1.1



## Chat with a GitHub Repository

### Introduction

Large language models (LLMs) accomplish a remarkable level of language comprehension during their training process. It enables them to generate human-like text and creates powerful representations from textual data. We already covered leveraging LangChain to use LLMs for writing content with hands-on projects.

This will focus on using the language models for generating embeddings from corpora. The mentioned representation will power a chat application that can answer questions from any text by finding the closest data point to an inquiry. This project focuses on finding answers from a GitHub repository’s text files like .md and .txt. So, we will start by capturing data from a GitHub repository and converting it to embeddings. These embeddings will be saved on the Activeloop’s Deep Lake vector database for fast and easy access. The Deep Lake’s retriever object will find the related files based on the user’s query and provide them as the context to the model. Lastly, the model leverages the provided information to the best of its ability to answer the question.

1) Processing the Files 

2) Saving the Embedding 

3) Retrieving from Database 

4) Creating an Interface.

### Processing the Repository Files

In order to access the files in the target repository, the script will clone the desired repository onto your computer, placing the files in a folder named "repos". Once we download the files, it is a matter of looping through the directory to create a list of files. It is possible to filter out specific extensions or environmental items.

Run git clone in gitbash to clone your targetted repo and note down the root directory.

git clone https://github.com/peterw/Chat-with-Git-Repo.git


In [4]:
from langchain.document_loaders import TextLoader

root_dir = workingFolder + "Chat-with-Github-Repo"
docs = []
file_extensions = ['.md', '.txt']

for dirpath, dirnames, filenames in os.walk(root_dir):    
    for file in filenames:
        file_path = os.path.join(dirpath, file)
        if file_extensions and os.path.splitext(file)[1] not in file_extensions:
            continue
        loader = TextLoader(file_path, encoding="utf-8")
        docs.extend(loader.load_and_split())

print(docs)

[Document(page_content='black\nflake8', metadata={'source': 'C:\\Users\\jfrancis\\OneDrive - GalaxE. Solutions, Inc\\GalaxE D Drive\\AI Journey\\Gen AI\\Chat-with-Github-Repo\\dev-requirements.txt'}), Document(page_content="# Chat-with-Github-Repo\n\nThis repository contains Python scripts that demonstrate how to create a chatbot using Streamlit, OpenAI GPT-3.5-turbo, and Activeloop's Deep Lake.\n\nThe chatbot searches a dataset stored in Deep Lake to find relevant information from any Git repository and generates responses based on the user's input.\n\n## Files\n\n- `src/utils/process.py`: This script clones a Git repository, processes the text documents, computes embeddings using OpenAIEmbeddings, and stores the embeddings in a DeepLake instance.\n\n- `src/utils/chat.py`: This script creates a Streamlit web application that interacts with the user and the DeepLake instance to generate chatbot responses using OpenAI GPT-3.5-turbo.\n\n- `src/main.py`: This script contains the command l

The sample code above creates a list of all the files in a repository. It is possible to filter each item by extension types like file_extensions=['.md', '.txt'] which only focus on markdown and text files. 

Now that the list of files are created, the split_documents method from the CharacterTextSplitter class in the LangChain library will read the files and split their contents into chunks of 1000 characters.

In [5]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
splitted_text = text_splitter.split_documents(docs)

print(splitted_text)

[Document(page_content='black\nflake8', metadata={'source': 'C:\\Users\\jfrancis\\OneDrive - GalaxE. Solutions, Inc\\GalaxE D Drive\\AI Journey\\Gen AI\\Chat-with-Github-Repo\\dev-requirements.txt'}), Document(page_content="# Chat-with-Github-Repo\n\nThis repository contains Python scripts that demonstrate how to create a chatbot using Streamlit, OpenAI GPT-3.5-turbo, and Activeloop's Deep Lake.\n\nThe chatbot searches a dataset stored in Deep Lake to find relevant information from any Git repository and generates responses based on the user's input.\n\n## Files\n\n- `src/utils/process.py`: This script clones a Git repository, processes the text documents, computes embeddings using OpenAIEmbeddings, and stores the embeddings in a DeepLake instance.\n\n- `src/utils/chat.py`: This script creates a Streamlit web application that interacts with the user and the DeepLake instance to generate chatbot responses using OpenAI GPT-3.5-turbo.\n\n- `src/main.py`: This script contains the command l

The splitted_text variable holds the textual content which is ready to be converted to embedding representations.

### Saving the Embeddings

Let’s create the database before going through the process of converting texts to embeddings. It is where the integration between LangChain and Deep Lake comes in handy! We initialize the database in cloud using the hub://... format and the OpenAIEmbeddings() from LangChain as the embedding function. The Deep Lake library will iterate through the content and generate the embedding automatically.

In [6]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake

# Before executing the following code, make sure to have
# your OpenAI key saved in the “OPENAI_API_KEY” environment variable.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# TODO: use your organization id here. (by default, org id is your username)
my_activeloop_org_id = ACTIVELOOP_ORG_ID
my_activeloop_dataset_name = "langchain_course_chat_with_gh"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"

db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings)
db.add_documents(splitted_text)

Using embedding function is deprecated and will be removed in the future. Please use embedding instead.


Your Deep Lake dataset has been successfully created!


Creating 5 embeddings in 1 batches of size 5:: 100%|█████████████████████████████████████| 1/1 [00:53<00:00, 53.74s/it]

Dataset(path='hub://jfrancis/langchain_course_chat_with_gh', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape     dtype  compression
  -------    -------    -------   -------  ------- 
   text       text      (5, 1)      str     None   
 metadata     json      (5, 1)      str     None   
 embedding  embedding  (5, 1536)  float32   None   
    id        text      (5, 1)      str     None   





['c4f6a1f8-8835-11ee-b60b-401c83da435e',
 'c4f6a1f9-8835-11ee-98a7-401c83da435e',
 'c4f6a1fa-8835-11ee-b082-401c83da435e',
 'c4f6a1fb-8835-11ee-b8f9-401c83da435e',
 'c4f6a1fc-8835-11ee-9aea-401c83da435e']

### Retrieving from Database

The last step is to code the process to answer the user’s question based on the database’s information. Once again, the integration of LangChain and Deep Lake simplifies the process significantly, making it exceptionally easy. We need 1) a retriever object from the Deep Lake database using the .as_retriever() method, and 2) a conversational model like ChatGPT using the ChatOpenAI() class.

Finally, LangChain’s RetrievalQA class ties everything together! It uses the user’s input as the prompt while including the results from the database as the context. So, the ChatGPT model can find the correct one from the provided context. It is worth noting that the database retriever is configured to gather instances closely related to the user’s query by utilizing cosine similarities.

In [9]:
# Create a retriever from the DeepLake instance
retriever = db.as_retriever()

# Set the search parameters for the retriever
retriever.search_kwargs["distance_metric"] = "cos"
retriever.search_kwargs["fetch_k"] = 100
retriever.search_kwargs["maximal_marginal_relevance"] = True
retriever.search_kwargs["k"] = 10

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
# Create a ChatOpenAI model instance
model = ChatOpenAI()

# Create a RetrievalQA instance from the model and retriever
qa = RetrievalQA.from_llm(model, retriever=retriever)

# Return the result of the query
qa.run("What is the repository's name?")

'The repository\'s name is "Chat-with-Github-Repo".'

### Create an Interface

Creating a user interface (UI) for the bot to be accessed through a web browser is an optional yet crucial step. This addition will elevate your ideas to new heights, allowing users to engage with the application effortlessly, even without any programming expertise.

In [None]:
import streamlit as st
from streamlit_chat import message

# Set the title for the Streamlit app
st.title(f"Chat with GitHub Repository")

# Initialize the session state for placeholder messages.
if "generated" not in st.session_state:
    st.session_state["generated"] = ["i am ready to help you ser"]

if "past" not in st.session_state:
    st.session_state["past"] = ["hello"]

# A field input to receive user queries
input_text = st.text_input("", key="input")

# Search the databse and add the responses to state
if input_text:
    output = qa.run(input_text)
    st.session_state.past.append(input_text)
    st.session_state.generated.append(output)

# Create the conversational UI using the previous states
if st.session_state["generated"]:
    for i in range(len(st.session_state["generated"])):
        message(st.session_state["past"][i], is_user=True, key=str(i) + "_user")
        message(st.session_state["generated"][i], key=str(i))

The code above is straightforward. We call st.text_input() to create text input for users queries. The query will be passed to the previously declared RetrievalQA object, and the results will be shown using the message component. You should store the mentioned code in a Python file (for example, chat.py) and run the following command to see the interface locally.

Please read the documentation https://docs.streamlit.io/library/get-started on how to deploy https://docs.streamlit.io/library/get-started/create-an-app#share-your-app the application on the web so anyone can access it.

### Putting Everything Together

Below is the summing up with all code. As we already created the the vector database in active lake. This code will just read it.

In [None]:
import warnings
warnings.filterwarnings("ignore")
import streamlit as st
from streamlit_chat import message
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
import configparser
import os

@st.cache_resource  # This will run only once
def get_llm_qa():
    workingFolder='C:\\Users\\jfrancis\\OneDrive - GalaxE. Solutions, Inc\\GalaxE D Drive\\AI Journey\\Gen AI\\'
    # Read the configuration file
    config = configparser.ConfigParser()
    config.read(workingFolder+'\\config.ini')
    OPENAI_API_KEY=config.get('General','OPENAI_API_KEY')
    ACTIVELOOP_TOKEN=config.get('General','ACTIVELOOP_TOKEN')
    ACTIVELOOP_ORG_ID=config.get('General','ACTIVELOOP_ORG_ID')
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
    os.environ["ACTIVELOOP_TOKEN"] = ACTIVELOOP_TOKEN
    my_activeloop_org_id = ACTIVELOOP_ORG_ID

    # Read from activeloop vector store
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    my_activeloop_org_id = ACTIVELOOP_ORG_ID
    my_activeloop_dataset_name = "langchain_course_chat_with_gh"
    dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"
    db = DeepLake(dataset_path=dataset_path, read_only=True, embedding=embeddings)

    # Retrieval queue from activeloop
    retriever = db.as_retriever()
    retriever.search_kwargs["distance_metric"] = "cos"
    retriever.search_kwargs["fetch_k"] = 100
    retriever.search_kwargs["maximal_marginal_relevance"] = True
    retriever.search_kwargs["k"] = 10
    model = ChatOpenAI()
    qa = RetrievalQA.from_llm(model, retriever=retriever)
    st.success("Loaded RetrievalQA")  # 👈 Show a success message
    return qa

qa = get_llm_qa()

# Design the front end chat app
st.title(f"Chat with GitHub Repository")
if "generated" not in st.session_state:
    st.session_state["generated"] = ["i am ready to help you ser"]
if "past" not in st.session_state:
    st.session_state["past"] = ["hello"]
input_text = st.text_input("", key="input")
if input_text:
    output = qa.run(input_text)
    st.session_state.past.append(input_text)
    st.session_state.generated.append(output)
if st.session_state["generated"]:
    for i in range(len(st.session_state["generated"])):
        message(st.session_state["past"][i], is_user=True, key=str(i) + "_user")
        message(st.session_state["generated"][i], key=str(i))