# Base example: How to Langchain
This is a simple example of how to use Langchain to split python files.

In [None]:
from langchain_community.document_loaders import DirectoryLoader, PythonLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaLLM
from langchain.retrievers import BM25Retriever
# from langchain_community.llms import CTransformers
from langchain_core.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.text_splitter import NLTKTextSplitter
import torch

: 

In [None]:
loader = DirectoryLoader("../../ast_tokenizer",
                         glob="*.py", loader_cls=PythonLoader, recursive=True)
# interpret information in the documents
documents = loader.load()

for i, v in enumerate(documents):
    print(i, v)
# splitter = RecursiveCharacterTextSplitter(chunk_size=256,
#                                           chunk_overlap=20)
# texts = splitter.split_documents(documents)

# text_splitter = NLTKTextSplitter()
# texts = text_splitter.split_documents(documents)


embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': "cuda"})

# create and save the local database
db = FAISS.from_documents(documents, embeddings)

# do prompt engineering here
template = """Use the following context to answer the user's question. 
Context: {context}
Question: {input}
"""

# load the language model
llm = OllamaLLM(model="llama3.1:8b",
                num_predict=-1,
                temperature=0.1)
# llm = CTransformers(model='./llama-2-7b-chat.ggmlv3.q8_0.bin',
#                     model_type='llama',
#                     gpu_layers=50,
#                     config={'max_new_tokens': 1024, 'temperature': 0.05})


# prepare a version of the llm pre-loaded with the local content
retriever = db.as_retriever(search_kwargs={'k': 5})
prompt = PromptTemplate(
    template=template,
    input_variables=['context', 'input'])

combine_docs_chain = create_stuff_documents_chain(llm, prompt)
qa_llm = create_retrieval_chain(retriever, combine_docs_chain)

# qa_llm = RetrievalQA.from_chain_type(llm=llm,
#                                      chain_type='stuff',
#                                      retriever=retriever,
#                                      return_source_documents=True,
#                                      chain_type_kwargs={'prompt': prompt})

# ask the AI chat about information in our local files

while True:
    inputP = input("What do you want to ask?\n")
    output = qa_llm.invoke({"input": inputP})
    print(output["answer"])


# Modified example: Custom Retriever

Testing if using a custom AST based retriever to split makes it better.

In [6]:
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(os.path.abspath('')), '../ast_tokenizer/languages')))
print(sys.path)

['/Users/javier/miniforge3/envs/rag-codebase/lib/python313.zip', '/Users/javier/miniforge3/envs/rag-codebase/lib/python3.13', '/Users/javier/miniforge3/envs/rag-codebase/lib/python3.13/lib-dynload', '', '/Users/javier/miniforge3/envs/rag-codebase/lib/python3.13/site-packages', '/Users/javier/Documents/info_retrieval_codebase_rag/ast_tokenizer/languages/python_ast', '/Users/javier/Documents/info_retrieval_codebase_rag/ast_tokenizer/languages/python_ast', '/Users/javier/Documents/info_retrieval_codebase_rag/ast_tokenizer/languages/python_ast', '/Users/javier/Documents/info_retrieval_codebase_rag/ast_tokenizer/languages', '/Users/javier/Documents/info_retrieval_codebase_rag/ast_tokenizer/languages']


In [7]:
import sys
import os
from python_ast import PythonASTDocumentLoader
from javascript_ast import JavascriptASTDocumentLoader

In [8]:
loader = DirectoryLoader("../",
                         glob="*.py", loader_cls=PythonASTDocumentLoader, recursive=True)
# interpret information in the documents
documents = loader.load()

In [10]:
for i, v in enumerate(documents):
    print(i, v.metadata[relative)

0 {'relative_path': '../testing/test_splitter.py', 'start_offset': 0, 'end_offset': 1162, 'block_type': 'others', 'block_name': 'Global Scope', 'block_args': [], 'parent_type': 'root', 'parent_name': 'root', 'functions_called': ['open("../../frontend/ui/ui.py")', 'f.read()', 'RecursiveCharacterTextSplitter.from_language(\n    language=Language.PYTHON, chunk_size=50, chunk_overlap=0\n)', 'python_splitter.create_documents([PYTHON_CODE])', 'print("\\n\\n\\n")', 'print("TESTINGTESTINGTESTING")', 'print("\\n\\n\\n")', 'PythonLoader("mainFile.py")', 'loader.load()', 'PythonSegmenter(PYTHON_CODE)', 'print(segmenter.is_valid())', 'segmenter.is_valid()', 'print("\\n Class Func\\n")', 'print(segmenter.extract_functions_classes())', 'segmenter.extract_functions_classes()', 'print("\\n Simplified \\n")', 'print(segmenter.simplify_code())', 'segmenter.simplify_code()'], 'docstrings': [], 'comments': ['for doc in python_docs:', 'Splitter splits per line or statement naively', 'print(type(doc), doc)'

# Chat messages example

For integrating chat messages into Langchain - use ChatPromptTemplate with placeholders

In [None]:
# Using OpenAI templates instead - system, assistant, user
template = ChatPromptTemplate([
    {
        "role": "user",
        "content": "Hello, how are you?",
    },
    {
        "role": "assistant",
        "content": "I'm doing well, thank you for asking.",
    },
    {
        "role": "user",
        "content": "{user_input}",
    },
    {
        "role": "placeholder",  # Not sure if this will work
        "content": "{chat_history}"
    }
])

prompt_value = template.invoke(
    {
        "user_input": "What is your name?",
        "conversation": [  # This is a normal template
            ("human", "Hi!"),
            ("ai", "How can I assist you today?"),
            ("human", "Can you make me an ice cream sundae?"),
            ("ai", "No.")
        ]
    }
)

# Hybrid Retrieval

Combine a sparse search with a dense search - BM25 + Embeddings
Reference Article: https://medium.com/etoai/hybrid-search-combining-bm25-and-semantic-search-for-better-results-with-lan-1358038fe7e6

# Metadata Taggging - OpenAI

For getting code metadata - e.g summaries and other info via an LLM - it enhances the metadata.
https://python.langchain.com/docs/integrations/document_transformers/openai_metadata_tagger/

Unfortunately this needs OpenAI.

In [11]:
from langchain_community.document_transformers.openai_functions import (
    create_metadata_tagger,
)
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI

ModuleNotFoundError: No module named 'langchain_openai'

In [15]:
from typing import Literal, List
from pydantic import BaseModel, Field
class LLMGeneratedMetadata(BaseModel): 
    """
    Returns the blank schema for Lagnchain MetadataTagger to fill in via LLM. This is used to generate some parts of the metadata that cannot be made just by Langchain alone.

    Metadata tagging can only be generated by OpenAI models in Langchain. Local models that supports OpenAI function calling is only somewhat supported by local-llm-function-calling.

    Some of these fields are redundant, and already detected by AST. This is just kept as a potential use for cross validation, or just for filling in the gaps.
    """
    block_type_llm: str = Field(description="Type of code block, either class/method/function/others. other block refers to when the code has no clear blocks such as when it exists in the root of the code file.")
    block_name_llm:  str = Field(description="name of block e.g class name, function name")
    block_args_llm: List[str] = Field(description="All argument variable names and types if any")
    return_var_llm:  str = Field(description="Returns name of return variable if clearcut, else if the return statement is complex such as an expression, return a variable name that sufficiently represents the return variable")
    functions_called_llm: List[str] = Field(description="List of other functions called, from internal code or external libraries")
    code_summary: str = Field(description="Summary of the code and its purpose")  #  Code summary


In [None]:
# Must be an OpenAI model that supports functions
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")

document_transformer = create_metadata_tagger(metadata_schema=LLMGeneratedMetadata, llm=llm)

# Add on to documents
enhanced_documents = document_transformer.transform_documents(documents)

# Metadata Tagging - Local
Based on this https://local-llm-function-calling.readthedocs.io/en/latest/generation.html
Uses the same function calling technique but with ollama models

https://local-llm-function-calling.readthedocs.io/en/latest/constraining.html

In [None]:
from local_llm_function_calling import Generator
from local_llm_function_calling import Constrainer, JsonSchemaConstraint

functions = [
    {
        "name": "llm_generated_metadata",
        "description": "Generates metadata about the code document.",
        "parameters": {
            "type": "object",
            "properties": {
                "block_type_llm": {
                    "type": "string",
                    "description": "Type of code block, either class/function/other. other block refers to when the code has no clear blocks such as when it exists in the root of the code file.",
                    "enum": ["class", "function", "other"],
                },
                "block_name_llm": {
                    "type": "string", 
                    "description": "",
                    "maxLength": 30
                },
                "block_args_llm": {
                    "type": "array",
                    "description": "All argument variable names and types if any",
                },
                "return_var_llm": {
                    "type": "string",
                    "description": "Returns name of return variable if clearcut, else if the return statement is complex such as an expression, return a variable name that sufficiently represents the return variable"
                    "maxLength": 30
                },
                "functions_called": {
                    "type": "array",
                    "description": "List of other functions called, from internal code or external libraries"
                },
                "code_summary": {
                    "type": "string",
                    "description": "Summary of the code and its purpose"
                }
            },
            "required": ["code_summary"],
        },
    },   
]

constraint = JsonSchemaConstraint(schema)
constrainer = Constrainer(HuggingfaceModel("gpt2"))
raw_json = constrainer.generate("Prefix.\n", constraint, max_len=100)
truncated_json = raw_json[:constraint.validate(raw_json).end_index]  # Remove the prefix

# Now the metadata can be added manually to the metadata of the documents - similar to transform_documents(documents)