In [2]:
## Reading through a code base is important but daunting: you need to understand how everything works and which functions and classes are used.
## Hence, we will build a RAG-based pipeline over a code base / code repo.

## GitHub Copilot and Codeium are essentially supercharged versions of this idea.

In [6]:
## Same vanilla RAG pipeline, but we use context-aware splitting in RecursiveCharacterTextSplitter by specifying the language of the code base

In [7]:
# How context aware works 

# In particular, we can employ a splitting strategy that does a few things:

# Keeps each top-level function and class in the code in its own separate document.
# Puts the remaining code into a separate document.
# Retains metadata about where each split comes from

In [8]:
# !pip install langchain tiktoken chromadb  openai

In [10]:
# Clone the ChatAI project once and use it as the code base for the RAG pipeline.
import os

from git import Repo

code_repo_path = r"/mnt/e/tinkering/github/tinkering_RAG/code_repo"

# `git clone` refuses to write into an existing non-empty directory, so a plain
# clone_from call fails when this cell is re-run. Clone only on the first run;
# afterwards just open the checkout that is already on disk.
if os.path.isdir(os.path.join(code_repo_path, ".git")):
    repo = Repo(code_repo_path)
else:
    repo = Repo.clone_from("https://github.com/MANISH007700/ChatAI.git", to_path = code_repo_path)

# Last expression of the cell: display the Repo object, as the original cell did.
repo

<git.repo.base.Repo '/mnt/e/tinkering/github/tinkering_RAG/code_repo/.git'>

### Import libraries and packages

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter # this will help us split data 
from langchain.text_splitter import Language   # this will help us decide the language

from langchain.document_loaders.generic import GenericLoader    # kind of like AutoModel from HF, helps in loading generic stuff by passing additional kwargs
from langchain.document_loaders.parsers import LanguageParser   # kind of parser you wanna use in GenericLoader 

### Parse the code repo and load it

In [14]:
# Load every Python file of the cloned repo, including files inside sub-directories.
loader = GenericLoader.from_filesystem(
    code_repo_path,      # path defined in the clone cell; avoids repeating the literal here
    glob = "**/*",       # recursive pattern: a bare "*" only matches files directly under the repo root
    suffixes = [".py"],  # only python files [ as of now, python and js is supported by LanguageParser],
    parser = LanguageParser(language=Language.PYTHON, parser_threshold=30)   # parser_threshold : min number of code lines a file needs before the parser segments it
)

docs = loader.load()

In [15]:
# Inspect the Document objects returned by the loader.
docs

[Document(page_content='def main():\n\n    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")\n    st.write(css, unsafe_allow_html=True)\n\n    if "conversation" not in st.session_state:\n        st.session_state.conversation = None\n    if "chat_history" not in st.session_state:\n        st.session_state.chat_history = None\n\n    with st.sidebar:\n        st.subheader("Your documents")\n        pdf_docs = st.file_uploader("Upload your PDFs, API KEY here and click on \'Process\'", accept_multiple_files=True)\n        openai_api_key = st.text_input("OPENAI API KEY", key="file_qa_api_key", type=\'password\')\n        \n        if st.button("Process"):\n            with st.spinner("Processing"):\n                \n                # get pdf text\n                raw_text = get_pdf_text(pdf_docs)\n                st.text("Extracting Texts Done ✅")\n\n                # get the text chunks\n                text_chunks = get_text_chunks(raw_text)\n                st

### Split the docs based on language using RecursiveCharacterTextSplitter

In [18]:
# Build a splitter configured for Python source (1000-character chunks with a
# 200-character overlap), then cut the loaded documents into chunks.
text_splitter = RecursiveCharacterTextSplitter.from_language(
    chunk_overlap=200,
    chunk_size=1000,
    language=Language.PYTHON,
)
split_docs = text_splitter.split_documents(documents=docs)


In [20]:
# Show the chunks produced by the splitter together with how many there are.
split_docs, len(split_docs)

([Document(page_content='def main():\n\n    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")\n    st.write(css, unsafe_allow_html=True)\n\n    if "conversation" not in st.session_state:\n        st.session_state.conversation = None\n    if "chat_history" not in st.session_state:\n        st.session_state.chat_history = None\n\n    with st.sidebar:\n        st.subheader("Your documents")\n        pdf_docs = st.file_uploader("Upload your PDFs, API KEY here and click on \'Process\'", accept_multiple_files=True)\n        openai_api_key = st.text_input("OPENAI API KEY", key="file_qa_api_key", type=\'password\')\n        \n        if st.button("Process"):\n            with st.spinner("Processing"):\n                \n                # get pdf text\n                raw_text = get_pdf_text(pdf_docs)\n                st.text("Extracting Texts Done ✅")', metadata={'source': '/mnt/e/tinkering/github/tinkering_RAG/code_repo/app.py', 'content_type': 'functions_classes',

In [21]:
# Text of the first chunk only.
split_docs[0].page_content

'def main():\n\n    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")\n    st.write(css, unsafe_allow_html=True)\n\n    if "conversation" not in st.session_state:\n        st.session_state.conversation = None\n    if "chat_history" not in st.session_state:\n        st.session_state.chat_history = None\n\n    with st.sidebar:\n        st.subheader("Your documents")\n        pdf_docs = st.file_uploader("Upload your PDFs, API KEY here and click on \'Process\'", accept_multiple_files=True)\n        openai_api_key = st.text_input("OPENAI API KEY", key="file_qa_api_key", type=\'password\')\n        \n        if st.button("Process"):\n            with st.spinner("Processing"):\n                \n                # get pdf text\n                raw_text = get_pdf_text(pdf_docs)\n                st.text("Extracting Texts Done ✅")'

### Now set up the RetrievalQA part

In [25]:
# we will use search_type 'mmr' (maximal marginal relevance), which favours diverse results and so cuts down on near-duplicate chunks, with top_k = 5

# we need embeddings to create embeds 
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceBgeEmbeddings

# we need some v-db
from langchain.vectorstores import Chroma


In [29]:
# Load the BGE embedding model: run it on CPU and ask the encoder to
# normalize the vectors it returns.

model_name = "BAAI/bge-small-en-v1.5"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=True)
hf = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

Downloading .gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [31]:
## Sanity-check the BGE model by embedding one sample query
embeds = hf.embed_query("how to create a function")

In [32]:
# Number of values in the embedding returned for the sample query.
len(embeds)

384

In [34]:
import getpass
import hashlib
import os

# Prompt for the OpenAI key only when it is not configured yet, so re-running
# this cell neither asks again nor overwrites a key already in the environment.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass()    # prompt user to add openai key

# Give every chunk a deterministic id. Without ids, each run of this cell appends
# a fresh copy of every chunk to the same default collection, which appears to be
# what produces identical hits repeated in similarity-search results. The position
# prefix keeps ids unique even when two chunks happen to contain the same text.
chunk_ids = [
    f"{position}-{hashlib.sha256(chunk.page_content.encode('utf-8')).hexdigest()}"
    for position, chunk in enumerate(split_docs)
]
vector_db = Chroma.from_documents(split_docs, embedding=OpenAIEmbeddings(), ids=chunk_ids)

# Retriever over the vector store: 'mmr' search type, returning 5 chunks per query.
retriever = vector_db.as_retriever(
    search_type = 'mmr',
    search_kwargs = {'k': 5},
)

In [35]:
# Query the vector store directly; each hit comes back as a (Document, score) pair.
vector_db.similarity_search_with_score('which function will help me to extract text from pdf ? ')

[(Document(page_content='def get_pdf_text(pdf_docs):\n    text = ""\n    for pdf in pdf_docs:\n        pdf_reader = PdfReader(pdf)\n        for page in pdf_reader.pages:\n            text += page.extract_text()\n    logger.info("Text Extracted...")\n    return text', metadata={'content_type': 'functions_classes', 'language': 'python', 'source': '/mnt/e/tinkering/github/tinkering_RAG/code_repo/utils.py'}),
  0.34997424483299255),
 (Document(page_content='def get_pdf_text(pdf_docs):\n    text = ""\n    for pdf in pdf_docs:\n        pdf_reader = PdfReader(pdf)\n        for page in pdf_reader.pages:\n            text += page.extract_text()\n    logger.info("Text Extracted...")\n    return text', metadata={'content_type': 'functions_classes', 'language': 'python', 'source': '/mnt/e/tinkering/github/tinkering_RAG/code_repo/utils.py'}),
  0.34997424483299255),
 (Document(page_content='# get the text chunks\n                text_chunks = get_text_chunks(raw_text)\n                st.text("Proc

In [37]:
# Same query through the relevance-score variant of the search, for comparison with the scores above.
vector_db.similarity_search_with_relevance_scores('which function will help me to extract text from pdf ? ')

[(Document(page_content='def get_pdf_text(pdf_docs):\n    text = ""\n    for pdf in pdf_docs:\n        pdf_reader = PdfReader(pdf)\n        for page in pdf_reader.pages:\n            text += page.extract_text()\n    logger.info("Text Extracted...")\n    return text', metadata={'content_type': 'functions_classes', 'language': 'python', 'source': '/mnt/e/tinkering/github/tinkering_RAG/code_repo/utils.py'}),
  0.75253083823795),
 (Document(page_content='def get_pdf_text(pdf_docs):\n    text = ""\n    for pdf in pdf_docs:\n        pdf_reader = PdfReader(pdf)\n        for page in pdf_reader.pages:\n            text += page.extract_text()\n    logger.info("Text Extracted...")\n    return text', metadata={'content_type': 'functions_classes', 'language': 'python', 'source': '/mnt/e/tinkering/github/tinkering_RAG/code_repo/utils.py'}),
  0.75253083823795),
 (Document(page_content='# get the text chunks\n                text_chunks = get_text_chunks(raw_text)\n                st.text("Processing

### Create the RAG pipeline

In [38]:
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# Chat model used by the conversational chain to generate answers.
llm = ChatOpenAI(model_name = 'gpt-3.5-turbo')

In [39]:
# Buffer memory keeps the message history under the 'chat_history' key and hands
# it back as message objects. ConversationBufferMemory has no `llm` field (an LLM
# is only needed by the summarising memory variants), so none is passed here.
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# Conversational RAG chain: the retriever supplies code chunks, the LLM answers,
# and the memory carries earlier turns into follow-up questions.
qa = ConversationalRetrievalChain.from_llm(llm=llm, retriever = retriever, memory = memory)

In [40]:
# First question to the chain; the result holds the question, the chat history and the answer.
qa('which function will help me to extract text from pdf ? ')

{'question': 'which function will help me to extract text from pdf ? ',
 'chat_history': [HumanMessage(content='which function will help me to extract text from pdf ? '),
  AIMessage(content='The function that will help you extract text from PDF is the `get_pdf_text` function.')],
 'answer': 'The function that will help you extract text from PDF is the `get_pdf_text` function.'}

In [41]:
# Follow-up question; keep only the 'answer' field of the result.
qa('Can you share me which file and the code for get_pdf_text function')['answer']

'Sure! Here\'s the code for the `get_pdf_text` function:\n\n```python\ndef get_pdf_text(pdf_docs):\n    text = ""\n    for pdf in pdf_docs:\n        pdf_reader = PdfReader(pdf)\n        for page in pdf_reader.pages:\n            text += page.extract_text()\n    logger.info("Text Extracted...")\n    return text\n```\n\nThis function takes a list of PDF documents as input (`pdf_docs`) and extracts the text from each page of each PDF document using the `extract_text` method from the `PdfReader` class in the `PyPDF2` library. The extracted text is then concatenated and returned as a single string.\n\nPlease note that you will need to have the `PyPDF2` library installed in order to use this function.'

In [53]:
import pprint

# Pretty-print the most recent answer straight from the chain's memory instead of
# a hand-pasted copy of the previous cell's output: the pasted literal carried a
# stray leading quote and would go stale whenever the question is asked again.
pprint.pprint(qa.memory.buffer[-1].content)

(" 'Sure! Here's the code for the `get_pdf_text` function:\n"
 '\n'
 '```python\n'
 'def get_pdf_text(pdf_docs):\n'
 '    text = ""\n'
 '    for pdf in pdf_docs:\n'
 '        pdf_reader = PdfReader(pdf)\n'
 '        for page in pdf_reader.pages:\n'
 '            text += page.extract_text()\n'
 '    logger.info("Text Extracted...")\n'
 '    return text\n'
 '```\n'
 '\n'
 'This function takes a list of PDF documents as input (`pdf_docs`) and '
 'extracts the text from each page of each PDF document using the '
 '`extract_text` method from the `PdfReader` class in the `PyPDF2` library. '
 'The extracted text is then concatenated and returned as a single string.\n'
 '\n'
 'Please note that you will need to have the `PyPDF2` library installed in '
 'order to use this function.')


In [56]:
# Inspect the human/AI messages accumulated in the chain's memory so far.
qa.memory.buffer

[HumanMessage(content='which function will help me to extract text from pdf ? '),
 AIMessage(content='The function that will help you extract text from PDF is the `get_pdf_text` function.'),
 HumanMessage(content='Can you share me which file and the code for get_pdf_text function'),
 AIMessage(content='Sure! Here\'s the code for the `get_pdf_text` function:\n\n```python\ndef get_pdf_text(pdf_docs):\n    text = ""\n    for pdf in pdf_docs:\n        pdf_reader = PdfReader(pdf)\n        for page in pdf_reader.pages:\n            text += page.extract_text()\n    logger.info("Text Extracted...")\n    return text\n```\n\nThis function takes a list of PDF documents as input (`pdf_docs`) and extracts the text from each page of each PDF document using the `extract_text` method from the `PdfReader` class in the `PyPDF2` library. The extracted text is then concatenated and returned as a single string.\n\nPlease note that you will need to have the `PyPDF2` library installed in order to use this fu

In [57]:
# Third question in the same conversation; the earlier turns are returned in 'chat_history'.
qa('Which embeddings and llm model am I using to generate response ? ')

{'question': 'Which embeddings and llm model am I using to generate response ? ',
 'chat_history': [HumanMessage(content='which function will help me to extract text from pdf ? '),
  AIMessage(content='The function that will help you extract text from PDF is the `get_pdf_text` function.'),
  HumanMessage(content='Can you share me which file and the code for get_pdf_text function'),
  AIMessage(content='Sure! Here\'s the code for the `get_pdf_text` function:\n\n```python\ndef get_pdf_text(pdf_docs):\n    text = ""\n    for pdf in pdf_docs:\n        pdf_reader = PdfReader(pdf)\n        for page in pdf_reader.pages:\n            text += page.extract_text()\n    logger.info("Text Extracted...")\n    return text\n```\n\nThis function takes a list of PDF documents as input (`pdf_docs`) and extracts the text from each page of each PDF document using the `extract_text` method from the `PdfReader` class in the `PyPDF2` library. The extracted text is then concatenated and returned as a single st