In [None]:

import os
from pathlib import Path
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.callbacks import get_openai_callback
from langchain_groq import ChatGroq
from langchain_community.embeddings import OllamaEmbeddings
from langchain.prompts import PromptTemplate
from langchain.llms import Ollama
from langchain.vectorstores import Chroma
from groq import Groq
from langchain_text_splitters import CharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from llama_index.core.node_parser import HierarchicalNodeParser
from llama_index.core import SimpleDirectoryReader
import os
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
from llama_index.core.node_parser import HierarchicalNodeParser, SimpleNodeParser, get_leaf_nodes
from llama_index.core import Prompt
from llama_index.core import VectorStoreIndex,StorageContext, load_index_from_storage
from llama_index.core.query_engine import RetrieverQueryEngine
import openai
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from typing import Callable, Optional
from llama_index.core.utils import globals_helper, get_tokenizer
from llama_index.core.schema import MetadataMode
import os
from langchain_community.document_loaders import DiffbotLoader
from langchain.embeddings.openai import OpenAIEmbeddings


In [None]:
# Evaluation questions for the RAG pipeline. QA_chain (defined later in this
# notebook) pairs each question with a row of the ground-truth spreadsheet by
# position, so the order of this list matters.
queries = [
       "What are the steps required to set up a boilerplate project using the Emumba Plugin?",
        "Can you list some of the key features provided by the Emumba Plugin for React applications?",
        "What is the purpose of the generateFiles function in the project setup generator, and how does it use the options provided?",
        "Describe the role of addDependenciesToPackageJson in the project setup process.",
        "How does the ProjectSetupGeneratorSchema interface influence the behavior of the project setup generator?",
        "Explain how the project configuration is added to the workspace using addProjectConfiguration in the context of the setup process.",
        "Describe the process and the purpose of creating a test project in the beforeAll setup of the emumba-plugin tests.",
        "How does the test for emumba-plugin ensure that the plugin is properly installed and functional within a generated project?",
        "there's a function used to create a test project directory. Output the code snippet that showcases how this directory is created and initialized.",
        # Code-completion style question; the triple-quoted literal keeps its
        # embedded newlines and indentation as part of the query text.
        '''Given the following incomplete snippet, complete the function to add a specific dependency to the project's package.json. 
            Assume the function addDependenciesToPackageJson is already imported.
            function enhancePackageJson(tree: Tree, projectName: string) {
            // Add 'react-redux' as a dependency
            addDependenciesToPackageJson(tree, projectName, {
                'react-redux': '^7.2.0'
            }, {});
            // Complete the function to also add 'redux' as a dependency
        }'''
]

In [None]:
# Credentials are taken from the environment so real keys never have to be
# written into the notebook. The "your-key" placeholder is only a fallback for
# when a variable is not exported; an already-exported key is never overwritten.
groq_api_key = os.environ.get("GROQ_API_KEY", "your-key")

# Raw Groq SDK client, sharing the same key as the LangChain chat model below.
client = Groq(
    api_key=groq_api_key,
)

# setdefault keeps any key already present in the environment.
os.environ.setdefault("OPENAI_API_KEY", "your-key")
os.environ.setdefault("COHERE_API_KEY", "your-key")

# Shared embedding model and chat LLM used by the experiment cells below.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
llm = ChatGroq(groq_api_key=groq_api_key, model_name='llama3-8b-8192')


### E 0

In [None]:
class Document:
    """Plain holder for a file's text (`page_content`) plus an initially empty `metadata` dict."""

    def __init__(self, content):
        # Each instance receives its own fresh metadata dictionary.
        self.page_content, self.metadata = content, {}

def read_files(directory_path):
    """Recursively load every file under `directory_path` as a `Document`.

    Files are opened as UTF-8 text with undecodable bytes dropped. A file that
    cannot be read is reported with a printed message and skipped.

    Returns:
        list: one `Document` per successfully read file, in `os.walk` order.
    """
    loaded = []
    for current_dir, _subdirs, filenames in os.walk(directory_path):
        base = Path(current_dir)
        for filename in filenames:
            file_path = base / filename
            try:
                with file_path.open('r', encoding='utf-8', errors='ignore') as handle:
                    loaded.append(Document(handle.read()))
            except Exception as exc:
                # Best effort: report the failure and continue with the next file.
                print(f"Error reading {file_path}: {exc}")
    return loaded

# E0 pipeline: read the repository, split it with the recursive character
# splitter, embed the chunks into Chroma and expose a retriever.
repo_path = "./data/Boilerplate"
documents = read_files(repo_path)
docs_texts = [d.page_content for d in documents]

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)

docs = text_splitter.split_documents(documents)

# Use a collection name unique to this experiment. Chroma adds to an existing
# collection of the same name instead of replacing it, so sharing one name
# across experiments in a single kernel mixes chunks produced by different
# chunking strategies into the same index.
# NOTE: re-running this cell in the same kernel still appends the chunks again;
# restart the kernel for a clean index.
vectorstore = Chroma.from_documents(
    documents=docs,
    collection_name="openai_embeds_e0_recursive",
    embedding=OpenAIEmbeddings(model="text-embedding-3-large"),
)
retriever = vectorstore.as_retriever()


### E 1 : Metadata Attachment

In [None]:
class Document:
    """Text container carrying `page_content` and a `metadata` dictionary."""

    def __init__(self, content, metadata=None):
        self.page_content = content
        # A missing or empty `metadata` argument is replaced by a new empty dict.
        self.metadata = metadata or {}

    def add_metadata(self, key, value):
        """Store `value` under `key` in this document's metadata."""
        self.metadata.update({key: value})

def read_files(directory_path):
    """Recursively load every file under `directory_path` as a `Document` with metadata.

    Each document's metadata holds `file_name`, `file_path` (as a string) and
    `file_size` (from `os.path.getsize`). Files are opened as UTF-8 text with
    undecodable bytes dropped; a file that cannot be processed is reported
    with a printed message and skipped.

    Returns:
        list: one `Document` per successfully read file, in `os.walk` order.
    """
    collected = []
    for folder, _subdirs, names in os.walk(directory_path):
        for name in names:
            file_path = Path(folder) / name
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
                    text = handle.read()
                    # Per-file metadata attached to the document.
                    file_info = dict(
                        file_name=name,
                        file_path=str(file_path),
                        file_size=os.path.getsize(file_path),
                    )
                    collected.append(Document(text, file_info))
            except Exception as exc:
                # Best effort: report the failure and continue with the next file.
                print(f"Error reading {file_path}: {exc}")
    return collected


# E1 pipeline: load files with metadata -> chunk -> embed into FAISS -> retriever.
repo_path = "./data/Boilerplate"
documents = read_files(repo_path)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
metadocs = text_splitter.split_documents(documents)

# Parallel lists: texts[i] is described by metadatas[i].
texts = [chunk.page_content for chunk in metadocs]
metadatas = [chunk.metadata for chunk in metadocs]

# Sanity check before indexing: both lists must line up one-to-one.
if len(metadatas) != len(texts):
    raise ValueError(f"Number of texts ({len(texts)}) and metadata entries ({len(metadatas)}) do not match.")

vectorstore = FAISS.from_texts(
    texts,
    embedding=OpenAIEmbeddings(model="text-embedding-3-large"),
    metadatas=metadatas,
)
retriever = vectorstore.as_retriever()

### E2-1 : Chunking - Character text splitting

In [None]:
class Document:
    """Simple record of a file's text (`page_content`) with an empty `metadata` dict."""

    def __init__(self, content):
        # A new dict per instance, so metadata is never shared between documents.
        self.page_content, self.metadata = content, {}

def read_files(directory_path):
    """Walk `directory_path` and wrap the text of every readable file in a `Document`.

    Files are decoded as UTF-8 with undecodable bytes ignored. Failures are
    printed and the offending file is skipped.

    Returns:
        list: the `Document` objects, in the order `os.walk` yields the files.
    """
    loaded = []
    for current_dir, _subdirs, filenames in os.walk(directory_path):
        base = Path(current_dir)
        for filename in filenames:
            file_path = base / filename
            try:
                with file_path.open('r', encoding='utf-8', errors='ignore') as handle:
                    loaded.append(Document(handle.read()))
            except Exception as exc:
                # Keep going: one bad file should not stop the whole load.
                print(f"Error reading {file_path}: {exc}")
    return loaded

# E2-1 pipeline: split raw file texts on blank lines with CharacterTextSplitter,
# embed the chunks into Chroma and expose a retriever.
repo_path = "./data/Boilerplate"
documents = read_files(repo_path)
docs_texts = [d.page_content for d in documents]
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1500,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

docs = text_splitter.create_documents(docs_texts)

# Use a collection name unique to this experiment. Chroma adds to an existing
# collection of the same name instead of replacing it, so sharing one name
# across experiments in a single kernel mixes chunks produced by different
# chunking strategies into the same index.
# NOTE: re-running this cell in the same kernel still appends the chunks again;
# restart the kernel for a clean index.
vectorstore = Chroma.from_documents(
    documents=docs,
    collection_name="openai_embeds_e2_1_character",
    embedding=OpenAIEmbeddings(model="text-embedding-3-large"),
)
retriever = vectorstore.as_retriever()

### E2-2 : Chunking : Recursive splitting

In [None]:
# Recursive splitting is the E0 pipeline above (RecursiveCharacterTextSplitter); run the E0 cell for this experiment.

### E2-3 : Chunking : tiktoken based splitting

In [None]:
class Document:
    """Holds one file's text as `page_content` alongside an empty `metadata` dict."""

    def __init__(self, content):
        # Metadata starts empty and is private to this instance.
        self.page_content, self.metadata = content, {}

def read_files(directory_path):
    """Collect a `Document` for each readable file found anywhere under `directory_path`.

    Text is read as UTF-8, ignoring undecodable bytes. Any error while reading
    a file is printed and that file is left out of the result.

    Returns:
        list: `Document` objects in `os.walk` traversal order.
    """
    loaded = []
    for current_dir, _subdirs, filenames in os.walk(directory_path):
        base = Path(current_dir)
        for filename in filenames:
            file_path = base / filename
            try:
                with file_path.open('r', encoding='utf-8', errors='ignore') as handle:
                    loaded.append(Document(handle.read()))
            except Exception as exc:
                # Report and skip; the remaining files are still loaded.
                print(f"Error reading {file_path}: {exc}")
    return loaded

# E2-3 pipeline: chunk raw file texts with the tiktoken-based length function,
# embed the chunks into Chroma and expose a retriever.
repo_path = "./data/Boilerplate"

documents = read_files(repo_path)
docs_texts = [d.page_content for d in documents]

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=200
)

docs = text_splitter.create_documents(docs_texts)

# Use a collection name unique to this experiment. Chroma adds to an existing
# collection of the same name instead of replacing it, so sharing one name
# across experiments in a single kernel mixes chunks produced by different
# chunking strategies into the same index.
# NOTE: re-running this cell in the same kernel still appends the chunks again;
# restart the kernel for a clean index.
vectorstore = Chroma.from_documents(
    documents=docs,
    collection_name="openai_embeds_e2_3_tiktoken",
    embedding=OpenAIEmbeddings(model="text-embedding-3-large"),
)
retriever = vectorstore.as_retriever()

### E2-4 : Chunking : Semantic based splitting

In [None]:
class Document:
    """Lightweight wrapper storing text in `page_content` and an empty `metadata` dict."""

    def __init__(self, content):
        # The metadata dict is created per instance.
        self.page_content, self.metadata = content, {}

def read_files(directory_path):
    """Read all files below `directory_path` into `Document` objects.

    Each file is opened as UTF-8 text with undecodable bytes ignored. When a
    file cannot be read, the error is printed and the file is skipped.

    Returns:
        list: one `Document` per file read, following `os.walk` order.
    """
    loaded = []
    for current_dir, _subdirs, filenames in os.walk(directory_path):
        base = Path(current_dir)
        for filename in filenames:
            file_path = base / filename
            try:
                with file_path.open('r', encoding='utf-8', errors='ignore') as handle:
                    loaded.append(Document(handle.read()))
            except Exception as exc:
                # Non-fatal: log the problem and move on.
                print(f"Error reading {file_path}: {exc}")
    return loaded


# E2-4 pipeline: chunk raw file texts with SemanticChunker (percentile
# breakpoints, using the shared `embeddings` model), embed the chunks into
# Chroma and expose a retriever.
repo_path = "./data/Boilerplate"
documents = read_files(repo_path)
docs_texts = [d.page_content for d in documents]
text_splitter = SemanticChunker(embeddings, breakpoint_threshold_type="percentile")
docs = text_splitter.create_documents(docs_texts)

# Use a collection name unique to this experiment. Chroma adds to an existing
# collection of the same name instead of replacing it, so sharing one name
# across experiments in a single kernel mixes chunks produced by different
# chunking strategies into the same index.
# NOTE: re-running this cell in the same kernel still appends the chunks again;
# restart the kernel for a clean index.
vectorstore = Chroma.from_documents(
    documents=docs,
    collection_name="openai_embeds_e2_4_semantic",
    embedding=OpenAIEmbeddings(model="text-embedding-3-large"),
)
retriever = vectorstore.as_retriever()


### E3 : Hierarchical Indexing

In [None]:
class LimitRetrievedNodesLength:
    """Node post-processor that keeps a leading run of nodes within a token budget.

    Args:
        limit: maximum cumulative token count of the nodes that are kept.
        tokenizer: callable whose result is measured with `len()`; when not
            supplied, `get_tokenizer()` provides one.
    """

    def __init__(self, limit: int = 2500, tokenizer: Optional[Callable] = None):
        self.limit = limit
        self._tokenizer = tokenizer if tokenizer else get_tokenizer()

    def postprocess_nodes(self, nodes, query_bundle):
        """Return the leading nodes whose running token total stays within `limit`.

        Iteration stops at the first node that pushes the total past the
        limit; that node and everything after it are dropped. `query_bundle`
        is accepted but not used.
        """
        kept = []
        tokens_used = 0
        for candidate in nodes:
            node_text = candidate.node.get_content(metadata_mode=MetadataMode.LLM)
            tokens_used += len(self._tokenizer(node_text))
            if tokens_used > self.limit:
                break
            kept.append(candidate)
        return kept
    
# Question-answering prompt with {context_str} and {query_str} placeholders.
# The adjacent string literals are concatenated with no separator between them.
# NOTE(review): "<s>[INST]" appears twice (before the system block and again
# before the context) — confirm this matches the chat format the target model expects.
LLAMA_PROMPT_TEMPLATE = (
 "<s>[INST] <<SYS>>"
 "Use the following context to answer the user's question. If you don't know the answer, just say that you don't know, don't try to make up an answer."
 "<</SYS>>"
 "<s>[INST] Context: {context_str} Question: {query_str} Only return the helpful answer below and nothing else. Helpful answer:[/INST]"
)
# NOTE(review): qa_template is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
qa_template = Prompt(LLAMA_PROMPT_TEMPLATE)

# Parser configured with three chunk sizes and no overlap; the next cell uses it
# to turn the loaded documents into nodes.
# presumably the sizes are token counts — TODO confirm against the LlamaIndex docs.
node_parser = HierarchicalNodeParser.from_defaults(
    chunk_sizes=[2048, 512, 128],
    chunk_overlap=0
)

In [None]:
# E3 pipeline: load the repository with LlamaIndex, parse it into a node
# hierarchy, index the leaf nodes and persist the storage context.
directory_path = "./data/Boilerplate"

# Collect regular files only, walking the whole tree. os.listdir() also returns
# sub-directories, which SimpleDirectoryReader(input_files=...) rejects, and it
# never descends into them, unlike the os.walk-based loaders used by the other
# experiments. Hidden files and hidden directories are skipped, and names are
# sorted so the load order is reproducible.
input_files = []
for root, dirs, files in os.walk(directory_path):
    # Pruning in place stops os.walk from descending into hidden directories.
    dirs[:] = sorted(d for d in dirs if not d.startswith('.'))
    input_files.extend(
        os.path.join(root, file) for file in sorted(files) if not file.startswith('.')
    )

documents = SimpleDirectoryReader(input_files=input_files).load_data()
nodes = node_parser.get_nodes_from_documents(documents)
leaf_nodes = get_leaf_nodes(nodes)

# The docstore receives every node (all levels); only the leaves are indexed.
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)
storage_context = StorageContext.from_defaults(docstore=docstore)

# Only the basename of `directory` is used, to name the persist directory
# ("./data_Boilerplate").
directory = "./llama_docs/docs/Boilerplate"
index = VectorStoreIndex(leaf_nodes, storage_context=storage_context)
index.storage_context.persist(persist_dir=f"./data_{os.path.basename(directory)}")

### E4 : Knowledge Graph Indexing

### RAG retrieval


In [None]:
def QA_chain(llm, retriever, queries, ground_truth_path='positive_ground_truths.xlsx'):
    """Answer each query with a "stuff" RetrievalQA chain and pair it with its ground truth.

    Args:
        llm: language model handed to `RetrievalQA.from_chain_type`.
        retriever: retriever supplying the context documents.
        queries: iterable of question strings. The i-th query is matched with
            the i-th row of the ground-truth spreadsheet.
        ground_truth_path: Excel file containing a 'ground truth' column.
            Defaults to the file name originally hard-coded here.

    Returns:
        list[dict]: one record per query with the keys 'question', 'answer',
        'ground_truths' and 'contexts' (the retrieved documents' texts).

    Raises:
        KeyError: the spreadsheet has no 'ground truth' column.
        IndexError: the spreadsheet has fewer rows than there are queries.
    """
    rag_template = """
    The following data comes from various files in a GitHub repository, which may contain information of any file extension. Your task is to search for an answer to a specific question within this data. Do not attempt to create an answer on your own. If you cannot find any reference to the query within the provided data, simply respond with, "There is no such reference to this."

    Data Context:
    {context}

    Question: {question}

    Answer:"""

    def process_context(source_documents):
        """Return each retrieved document's text, or a marker for objects lacking `page_content`."""
        return [
            doc.page_content if hasattr(doc, 'page_content') else "Invalid document format"
            for doc in source_documents
        ]

    # Materialise the queries so they can be counted and iterated.
    queries = list(queries)

    # Load and validate the ground truths up front, so a missing column or a
    # short spreadsheet fails before any LLM call is made rather than mid-run.
    df = pd.read_excel(ground_truth_path)
    if 'ground truth' not in df.columns:
        raise KeyError('ground truth')
    if len(df) < len(queries):
        raise IndexError(
            f"{ground_truth_path} has {len(df)} ground-truth rows but {len(queries)} queries were given."
        )

    rag_prompt = ChatPromptTemplate.from_template(rag_template)
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": rag_prompt}
    )

    result = []
    for i, query in enumerate(queries):
        # `invoke` replaces the deprecated direct call `qa({...})`.
        response = qa.invoke({"query": query})
        result.append({
            'question': query,
            'answer': response['result'],
            'ground_truths': df.iloc[i]['ground truth'],
            'contexts': process_context(response['source_documents']),
        })

    return result

In [None]:
# Evaluate every query with whichever `retriever` the most recently run
# experiment cell assigned.
result = QA_chain(llm, retriever, queries)