In [37]:
import ast
from pathlib import Path
from typing import Iterator, Any, Optional, List, NotRequired, Union
import re

from langchain.prompts import load_prompt
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter

import dotenv
dotenv.load_dotenv()
import os
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from langchain.docstore.document import Document

from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model_name="gpt-4o",

)

embeddings = OpenAIEmbeddings( 
                              model="text-embedding-3-large", 

)

In [42]:
import os
import glob

def list_files(directory):
    # Expand user home directory symbol '~' if present
    directory = os.path.expanduser(directory)
    
    # Get the absolute path of the directory
    abs_directory = os.path.abspath(directory)
    
    # Check if the provided path is a directory
    if not os.path.isdir(abs_directory):
        return f"The path {abs_directory} is not a valid directory."

    # Define the file extensions to track
    file_extensions = ['.py', '.md', 'rst']
    
    # Use glob to list all files in the directory and subdirectories
    files = [file for file in glob.glob(os.path.join(abs_directory, '**', '*'), recursive=True) if os.path.isfile(file) and os.path.splitext(file)[1] in file_extensions]
    
    return files


In [43]:
from typing import Tuple

def read_file(file_path) -> Tuple[str, str]:
    file_extension = file_path[-3:]
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read(), file_extension

def chunk_text(text: str, language: str, max_length=1000):
    if ".py" == language.casefold().strip():
        splitter = RecursiveCharacterTextSplitter.from_language(
        language=Language.PYTHON, chunk_size=1000, chunk_overlap=100
        )
    elif ".md" == language.casefold().strip():
        splitter = RecursiveCharacterTextSplitter.from_language(
        language=Language.MARKDOWN, chunk_size=1000, chunk_overlap=100
        )
    else:
        splitter = RecursiveCharacterTextSplitter(chunk_size=max_length, chunk_overlap=200)
    return splitter.split_text(text)

In [44]:
file_list = list_files("./resources")

list_docs = []
for file in file_list:
    content, language = read_file(file)
    chunks = chunk_text(content, language, max_length=1000)
    docs = [Document(page_content=chunk, metadata={'filePath': file}) for chunk in chunks]
    # docs = [Document(page_content=chunk, metadata={'filePath': file}) for chunk in chunks]
    list_docs.extend(docs)

In [54]:
len(list_docs)

826

In [53]:
vector_store = Chroma.from_documents(
    documents=list_docs,
    embedding=embeddings
)

In [48]:
retriever = vector_store.as_retriever(search_type="similarity", kwargs={"k": 6})

In [49]:
prompt = PromptTemplate.from_template(
    template="""
    You're a helpful assistant that informs the user about the code base.

    Only use the context and if the context is not sufficient, please respond with:
    'Sorry I can't answer your question'.
    Use the following context:
    {context}


    The question from the user:
    ```
    {question}
    ```
    Your answer should always contain your sources, which is the FilePath. Never forget to mention the path of the file in your answer.
    Your answer:
    """
)

In [50]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import format_document

def format_docs(docs): 
    template = PromptTemplate.from_template(
    """
    <DocumentChunk>
        <Content>{page_content}</Content>
        <FilePath>{filePath}</FilePath>
    </DocumentChunk>
    """)
    res = '```\r\n' + format_document(docs[0], template)
    return res + '\r\n'.join(format_document(doc, template) for doc in docs) + '\r\n```'


rag_chain = (
    {"context" : retriever | format_docs, "question" : RunnablePassthrough()}
    | prompt 
    | llm 
    | StrOutputParser() 
)

In [51]:
rag_chain.invoke("What is the code of conduct about?")

"The Code of Conduct for the Cookiecutter project's codebases and documentation expects everyone interacting with the project to follow the [PyPA Code of Conduct](https://www.pypa.io/en/latest/code-of-conduct/). This applies to all forms of communication, including issue trackers, chat rooms, mailing lists, and both virtual and in-person interactions.\n\nSource: \n- `c:\\Users\\WilliamRuan\\Documents\\KI_reply_training\\workshop-chat-with-your-sw\\resources\\cookiecutter\\docs\\CODE_OF_CONDUCT.md`\n- `c:\\Users\\WilliamRuan\\Documents\\KI_reply_training\\workshop-chat-with-your-sw\\resources\\cookiecutter\\CODE_OF_CONDUCT.md`"