In [2]:
from pydantic import BaseModel, Field
import ast
from pathlib import Path
from typing import Iterator, Any, Optional, List, NotRequired, Union
import re

from langchain.prompts import load_prompt
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

import dotenv
dotenv.load_dotenv()
import os

In [50]:
import os
import glob

def list_files(directory, file_extensions=('.py', '.md', '.rst')):
    """Recursively list files under ``directory`` that have a tracked extension.

    Args:
        directory: Path to scan. A leading ``~`` is expanded to the user's home
            directory and the result is made absolute.
        file_extensions: Iterable of extensions to keep. Entries written without
            a leading dot (e.g. ``'rst'``) are normalised to ``'.rst'`` so they
            compare correctly against ``os.path.splitext`` output.

    Returns:
        A list of absolute file paths whose extension is in ``file_extensions``.
        If ``directory`` is not an existing directory, a human-readable error
        message string is returned instead (kept for backward compatibility;
        callers should check for ``str`` before iterating).
    """
    # Expand '~' and resolve to an absolute path in one step.
    abs_directory = os.path.abspath(os.path.expanduser(directory))

    # Check if the provided path is a directory
    if not os.path.isdir(abs_directory):
        return f"The path {abs_directory} is not a valid directory."

    # os.path.splitext yields extensions WITH the leading dot, so every entry
    # must carry one too; otherwise it silently never matches.
    allowed = {
        ext if ext.startswith('.') else f'.{ext}'
        for ext in file_extensions
    }

    # '**' with recursive=True walks the directory and all of its subdirectories.
    pattern = os.path.join(abs_directory, '**', '*')
    return [
        path
        for path in glob.glob(pattern, recursive=True)
        if os.path.isfile(path) and os.path.splitext(path)[1] in allowed
    ]


In [54]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def read_file(file_path):
    """Return the full text of ``file_path``, decoded as UTF-8."""
    with open(file_path, encoding='utf-8', mode='r') as handle:
        contents = handle.read()
    return contents

def chunk_text(text, max_length=1000, chunk_overlap=200):
    """Split ``text`` into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        text: The string to split.
        max_length: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``. Defaults to
            200, the value that was previously hard-coded.

    Returns:
        Whatever ``splitter.split_text(text)`` returns (a list of chunk strings).

    Note:
        NOTE(review): the splitter is expected to reject an overlap larger than
        the chunk size, so lower ``chunk_overlap`` together with ``max_length``
        when using small chunks -- confirm against the LangChain docs.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_length,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)

In [85]:
# Collect every tracked file under ./resources and split each one into chunks.
file_list = list_files("./resources")

# list_files signals a bad path by returning a message string rather than a
# list. Iterating that string would call read_file once per character and fail
# with a confusing error, so stop here with the real reason instead.
if isinstance(file_list, str):
    raise NotADirectoryError(file_list)

# Flat list of text chunks across all files; this is what gets embedded later.
list_chunk = []
for file in file_list:
    content = read_file(file)
    chunks = chunk_text(content, max_length=1000)
    list_chunk.extend(chunks)

In [22]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

In [84]:
# Read credentials from the environment (populated by dotenv.load_dotenv() in
# the first cell) instead of embedding them in the notebook, where they would
# be committed and shared along with the outputs.
embeddings = OpenAIEmbeddings(
    # Fail fast with a KeyError if the key is not configured.
    openai_api_key=os.environ["OPENAI_API_KEY"],
    # The organization is optional; None leaves it unset.
    openai_organization=os.environ.get("OPENAI_ORG_ID"),
)

In [86]:
# Embed every chunk with `embeddings` and index the results in a Chroma store.
vector_store = Chroma.from_texts(list_chunk, embedding=embeddings)

In [87]:
# Search parameters must be passed as `search_kwargs`; the previous `kwargs=`
# keyword is not the retriever's search-parameter field, so k=6 never took effect.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6},
)

In [88]:
from langchain_core.prompts import PromptTemplate

# Template for the answering step. It has two placeholders: {context} and
# {question}, which the chain fills in before the text is sent to the model.
QA_TEMPLATE = """
    You're a helpful assistant that informs the user about the code base.

    Only use the context and if the context is not sufficient, please respond with:
    'Sorry I can't answer your question'.
    Use the following context:
    ```
    {context}
    ```

    The question from the user:
    ```
    {question}
    ```

    Your answer:
    """

prompt = PromptTemplate.from_template(QA_TEMPLATE)

In [89]:
from langchain_openai import ChatOpenAI

# Chat model used to generate answers. Credentials are read from the
# environment (populated by dotenv.load_dotenv() in the first cell) rather
# than being written into the notebook.
llm = ChatOpenAI(
    model_name="gpt-4o",
    # Fail fast with a KeyError if the key is not configured.
    openai_api_key=os.environ["OPENAI_API_KEY"],
    # The organization is optional; None leaves it unset.
    openai_organization=os.environ.get("OPENAI_ORG_ID"),
)

In [90]:
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Join the ``page_content`` of each item in ``docs`` with a blank line between them."""
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)

# Inputs for the prompt: "context" pipes the retriever into format_docs, and
# "question" is a RunnablePassthrough.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: build prompt inputs -> fill the template -> call the model ->
# parse the model output with StrOutputParser.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [92]:
# Run the chain on a sample question; the bare last expression displays the answer.
question = "What is the code of conduct about?"
rag_chain.invoke(question)

"The code of conduct for the Cookiecutter project's codebases and documentation expects everyone interacting in the project to follow the [PyPA Code of Conduct](https://www.pypa.io/en/latest/code-of-conduct/). This applies to various forms of communication, including issue trackers, chat rooms, mailing lists, and both virtual and in-person interactions."