In [None]:
import os
from pathlib import Path
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.callbacks import get_openai_callback
from langchain_groq import ChatGroq
from langchain_community.embeddings import OllamaEmbeddings
from langchain.vectorstores import Chroma
from groq import Groq
import openai
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
import voyageai
from langchain_voyageai import VoyageAIEmbeddings
from llama_index.embeddings.jinaai import JinaEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.response.notebook_utils import display_source_node
from langchain_cohere import CohereEmbeddings


In [None]:

# Evaluation questions about the Emumba Plugin boilerplate repository.
# The RAG helpers below pair the question at position i with row i of the
# ground-truth spreadsheet (df.iloc[i]), so the order of this list matters.
queries = [
       "What are the steps required to set up a boilerplate project using the Emumba Plugin?",
        "Can you list some of the key features provided by the Emumba Plugin for React applications?",
        "What is the purpose of the generateFiles function in the project setup generator, and how does it use the options provided?",
        "Describe the role of addDependenciesToPackageJson in the project setup process.",
        "How does the ProjectSetupGeneratorSchema interface influence the behavior of the project setup generator?",
        "Explain how the project configuration is added to the workspace using addProjectConfiguration in the context of the setup process.",
        "Describe the process and the purpose of creating a test project in the beforeAll setup of the emumba-plugin tests.",
        "How does the test for emumba-plugin ensure that the plugin is properly installed and functional within a generated project?",
        "there's a function used to create a test project directory. Output the code snippet that showcases how this directory is created and initialized.",
        '''Given the following incomplete snippet, complete the function to add a specific dependency to the project's package.json. 
            Assume the function addDependenciesToPackageJson is already imported.
            function enhancePackageJson(tree: Tree, projectName: string) {
            // Add 'react-redux' as a dependency
            addDependenciesToPackageJson(tree, projectName, {
                'react-redux': '^7.2.0'
            }, {});
            // Complete the function to also add 'redux' as a dependency
        }'''
]

### Document Loading

In [None]:
class Document:
    """Lightweight document record exposing ``page_content`` and ``metadata``.

    Instances are what ``read_files`` produces and what is handed to
    ``text_splitter.split_documents``.
    """

    def __init__(self, content, metadata=None):
        """Store the text and an optional metadata mapping.

        Args:
            content: Text of the document.
            metadata: Optional mapping of extra attributes (for example the
                source path). It is copied, so the caller's mapping is never
                shared between documents. Defaults to an empty dict.
        """
        self.page_content = content
        self.metadata = dict(metadata) if metadata is not None else {}

def read_files(directory_path):
    """Read every file under ``directory_path`` into a list of ``Document`` objects.

    The tree is walked recursively. Files are decoded as UTF-8 with undecodable
    bytes dropped. A file that cannot be read is reported on stdout and skipped,
    so one bad file does not abort the whole load.

    Args:
        directory_path: Root directory to walk.

    Returns:
        A list with one ``Document`` per successfully read file.
    """
    loaded = []
    for current_dir, _subdirs, filenames in os.walk(directory_path):
        base = Path(current_dir)
        for filename in filenames:
            file_path = base / filename
            try:
                with file_path.open('r', encoding='utf-8', errors='ignore') as handle:
                    text = handle.read()
                    loaded.append(Document(text))
            except Exception as e:
                # Best-effort load: report the failure and carry on.
                print(f"Error reading {file_path}: {e}")
    return loaded

# Load the repository files, then cut them into overlapping chunks that the
# vector-store cells below embed.
repo_path = 'data/Boilerplate'
documents = read_files(repo_path)
docs_texts = [document.page_content for document in documents]

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = text_splitter.split_documents(documents)

### Init

In [None]:
# Credentials come from the environment when they are already set there. The
# "your-key" placeholders are only a fallback, so a real key exported in the
# shell is never overwritten by a dummy value and never has to be written
# into the notebook.
groq_api_key = os.environ.get("GROQ_API_KEY", "your-key")
os.environ.setdefault("OPENAI_API_KEY", "your-key")
os.environ.setdefault("VOYAGE_API_KEY", "your-key")
os.environ.setdefault("COHERE_API_KEY", "your-key")

# LangChain chat model used by QA_chain.
llm = ChatGroq(groq_api_key=groq_api_key, model_name='llama3-70b-8192')
# Raw Groq client used by the pipelines that call the chat API directly;
# it shares the same key as the LangChain wrapper.
client = Groq(api_key=groq_api_key)
jinaai_api_key = os.environ.get("JINAAI_API_KEY", "your-key")

### Vector Store : Cohere

In [None]:
# Embed the chunks with CohereEmbeddings and index them in a Chroma collection.
cohere_embeddings = CohereEmbeddings()

vectorstore = Chroma.from_documents(
    collection_name="chroma_embeds",
    embedding=cohere_embeddings,
    documents=docs,
)
retriever = vectorstore.as_retriever()

### Vector Store : Ollama Embeddings : mxbai-embed-large

In [None]:
# Embed the chunks with the Ollama model "mxbai-embed-large" and index them in Chroma.
embedding = OllamaEmbeddings(model='mxbai-embed-large')
vectorstore = Chroma.from_documents(
    collection_name="ollama_embeds_mxbai",
    embedding=embedding,
    documents=docs,
)
retriever = vectorstore.as_retriever()

### Vector Store : Ollama Embeddings : all-minilm

In [None]:
# Embed the chunks with the Ollama model "all-minilm" and index them in Chroma.
vectorstore = Chroma.from_documents(
    collection_name="ollama_embeds_minillm",
    embedding=OllamaEmbeddings(model='all-minilm'),
    documents=docs,
)
retriever = vectorstore.as_retriever()

### Vector Store : OpenAI Embeddings : text-embedding-3-large

In [None]:
# Embed the chunks with OpenAI "text-embedding-3-large" and index them in Chroma.
vectorstore = Chroma.from_documents(
    collection_name="openai_embeds",
    embedding=OpenAIEmbeddings(model="text-embedding-3-large"),
    documents=docs,
)
retriever = vectorstore.as_retriever()
    

### Vector Store : Voyage Embedding : Large-2-instruct

In [None]:

# The Voyage key is read from the environment instead of being written into
# the notebook. A key that was ever committed to source control should be
# treated as leaked and revoked.
# NOTE(review): the section heading says "Large-2-instruct" but the model
# requested here is "voyage-law-2" — confirm which one is intended.
vectorstore = Chroma.from_documents(
    documents=docs,
    collection_name="voyage_embeds",
    embedding=VoyageAIEmbeddings(
        voyage_api_key=os.environ["VOYAGE_API_KEY"],
        model="voyage-law-2",
    ),
)
retriever = vectorstore.as_retriever()

### Vector Store :  Jina-AI  : Has its Own pipeline!!!

In [None]:
# recursive=True makes the reader descend into sub-directories, so this
# pipeline indexes the same file tree that read_files() walks for the other
# embedding models (the reader only scans the top-level directory by default).
documents = SimpleDirectoryReader("data/Boilerplate", recursive=True).load_data()
index = VectorStoreIndex.from_documents(
    documents=documents,
    embed_model=JinaEmbedding(
        api_key=jinaai_api_key,
        model="jina-embeddings-v2-base-en",
    ),
)
retriever = index.as_retriever()

In [None]:
def jina_rag(client, retriever, queries, ground_truth_path='ground_truths.xlsx'):
    """Answer each query with retrieved context and pair it with its ground truth.

    Args:
        client: Chat client exposing ``chat.completions.create``.
        retriever: Object exposing ``retrieve(query)`` that returns nodes with
            a ``get_content()`` method.
        queries: Questions to answer. Question ``i`` is matched with row ``i``
            of the ground-truth spreadsheet.
        ground_truth_path: Excel file containing a ``'ground truth'`` column.

    Returns:
        A list of dicts with keys ``question``, ``answer``, ``ground_truths``
        and ``contexts`` (the retrieved passages as a list of strings).
    """
    df = pd.read_excel(ground_truth_path)
    result = []
    for i, query in enumerate(queries):
        retrieved_nodes = retriever.retrieve(query)

        # Keep only the text of each node: interpolating the node objects
        # themselves would put their full repr (scores, ids, metadata) into
        # the prompt and into the returned contexts.
        context = [node.get_content() for node in retrieved_nodes]
        context_text = "\n\n".join(context)
        prompt = (
            "Answer the question based only on the following context:\n"
            f"{context_text}\n\n"
            f"Question: {query}"
        )
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model='llama3-70b-8192',
        )
        response = chat_completion.choices[0].message.content
        ground_truth = df.iloc[i]['ground truth']
        result.append({
            'question': query,
            'answer': response,
            'ground_truths': ground_truth,
            'contexts': context,
        })

    return result
    

In [None]:
# Run the Jina pipeline; `retriever` must be the LlamaIndex retriever built in the Jina cell.
result = jina_rag(client, retriever, queries)


### Main RAG Chain

In [None]:
import time
def QA_chain(llm, retriever, queries, ground_truth_path='positive_ground_truths.xlsx', delay_seconds=6):
    """Run every query through a "stuff" RetrievalQA chain and collect the results.

    Args:
        llm: Chat model handed to ``RetrievalQA.from_chain_type``.
        retriever: Retriever handed to ``RetrievalQA.from_chain_type``.
        queries: Questions to answer. Question ``i`` is matched with row ``i``
            of the ground-truth spreadsheet.
        ground_truth_path: Excel file containing a ``'ground truth'`` column.
        delay_seconds: Pause before each request, used to throttle calls to
            the LLM. Pass 0 to disable.

    Returns:
        A list of dicts with keys ``question``, ``answer``, ``ground_truths``
        and ``contexts`` (page contents of the retrieved source documents).
    """
    rag_template = """
    The following data comes from various files in a GitHub repository, which may contain information of any file extension. Your task is to search for an answer to a specific question within this data. Do not attempt to create an answer on your own. If you cannot find any reference to the query within the provided data, simply respond with, "There is no such reference to this."

    Data Context:
    {context}

    Question: {question}

    Answer:"""

    def process_context(source_documents):
        """Return the text of each source document, with a marker for objects lacking ``page_content``."""
        return [
            doc.page_content if hasattr(doc, 'page_content') else "Invalid document format"
            for doc in source_documents
        ]

    rag_prompt = ChatPromptTemplate.from_template(rag_template)
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": rag_prompt},
    )

    df = pd.read_excel(ground_truth_path)
    result = []
    for i, query in enumerate(queries):
        if delay_seconds > 0:
            time.sleep(delay_seconds)
        # invoke() replaces the deprecated direct call qa({...}).
        response = qa.invoke({"query": query})

        ground_truth = df.iloc[i]['ground truth']
        contexts = process_context(response['source_documents'])
        result.append({
            'question': query,
            'answer': response['result'],
            'ground_truths': ground_truth,
            'contexts': contexts,
        })

    return result

In [None]:
# NOTE(review): `retriever` is whichever vector-store cell ran last; RetrievalQA
# presumably needs one of the LangChain (Chroma) retrievers rather than the
# LlamaIndex one from the Jina cell — confirm before a full Run All.
result = QA_chain(llm, retriever, queries)

### BGE-M3 (Run on GPU)

In [None]:
class Document:
    """Lightweight document record exposing ``page_content`` and ``metadata``.

    Instances are what ``read_files`` produces and what is handed to
    ``text_splitter.split_documents``.
    """

    def __init__(self, content, metadata=None):
        """Store the text and an optional metadata mapping.

        Args:
            content: Text of the document.
            metadata: Optional mapping of extra attributes (for example the
                source path). It is copied, so the caller's mapping is never
                shared between documents. Defaults to an empty dict.
        """
        self.page_content = content
        self.metadata = dict(metadata) if metadata is not None else {}

def read_files(directory_path):
    """Collect the text of every file below ``directory_path`` as ``Document`` objects.

    Files are opened as UTF-8 with undecodable bytes ignored. Any file that
    raises while being read is reported on stdout and left out of the result.

    Args:
        directory_path: Root directory to walk recursively.

    Returns:
        A list with one ``Document`` per file that could be read.
    """
    collected = []
    for folder, _dirs, names in os.walk(directory_path):
        for name in names:
            file_path = Path(folder) / name
            try:
                with open(file_path, mode='r', encoding='utf-8', errors='ignore') as stream:
                    body = stream.read()
                    collected.append(Document(body))
            except Exception as e:
                # Best-effort load: report and continue with the next file.
                print(f"Error reading {file_path}: {e}")
    return collected

# Load and chunk the repository again for the GPU section; the path here is
# relative to that section's working directory.
repo_path = 'Boilerplate'
documents = read_files(repo_path)
docs_texts = [document.page_content for document in documents]

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = text_splitter.split_documents(documents)

In [None]:
# BGEM3FlagModel is imported here because this is the first cell that uses it;
# previously the import only appeared in the following cell, so a fresh
# Restart & Run All stopped with a NameError on the last line of this cell.
from FlagEmbedding import BGEM3FlagModel
from transformers import RagTokenizer, RagTokenForGeneration

# NOTE(review): `tokenizer` and `model` are not referenced by the retrieval
# code visible in this section — confirm they are still needed before paying
# for the download.
tokenizer = RagTokenizer.from_pretrained('facebook/rag-token-nq')
model = RagTokenForGeneration.from_pretrained('facebook/rag-token-nq')
model1 = BGEM3FlagModel('BAAI/bge-m3', use_fp16=False)

In [None]:
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import faiss
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index over a matrix of embeddings.

    Args:
        embeddings: Array-like of shape ``(n_vectors, dim)``.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is empty or not two-dimensional.
    """
    # FAISS works on C-contiguous float32 data; coercing here lets callers
    # pass float16/float64 arrays or plain nested lists.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError(
            f"embeddings must be a non-empty 2-D array, got shape {vectors.shape}"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

def bge_m3_embed(query: str):
    """Return the dense vector that ``model1`` produces for a single text."""
    encoded = model1.encode([query])
    # The text is encoded as a batch of one; unwrap the only row.
    return encoded['dense_vecs'][0]

def embed_docs(docs):
    """Embed the ``page_content`` of every document and stack the vectors into one array."""
    texts = [item.page_content for item in docs]
    vectors = list(map(bge_m3_embed, texts))
    return np.array(vectors)

# Embed every chunk with BGE-M3, then index the vectors for nearest-neighbour search.
embeddings = embed_docs(docs)
index = create_faiss_index(embeddings)

In [None]:
def retrieve_documents(query, index, k=5):
    """Return the positions of the ``k`` indexed vectors closest to ``query``.

    Args:
        query: Text to search for.
        index: Index exposing ``search(vectors, k)``.
        k: Number of neighbours to request.

    Returns:
        1-D array of positions. It is shorter than ``k`` when the index holds
        fewer than ``k`` vectors.
    """
    query_embedding = np.asarray(bge_m3_embed(query), dtype=np.float32).reshape(1, -1)
    _distances, indices = index.search(query_embedding, k)
    hits = indices.flatten()
    # FAISS pads missing neighbours with -1; used as a list index that would
    # silently select the last document, so drop the padding here.
    return hits[hits >= 0]

def rag_answer(questions, index, docs, ground_truth_path='/content/positive_ground_truths.xlsx'):
    """Answer each question from its retrieved chunks and pair it with its ground truth.

    Args:
        questions: Questions to answer. Question ``i`` is matched with row
            ``i`` of the ground-truth spreadsheet.
        index: Vector index passed through to ``retrieve_documents``.
        docs: Chunk list in the same order the index was built from.
        ground_truth_path: Excel file containing a ``'ground truth'`` column.

    Returns:
        A list with one dict per question, holding ``question``, ``answer``,
        ``ground_truths`` and ``contexts`` (the retrieved chunk texts).
    """
    # Read the spreadsheet once, not once per question.
    df = pd.read_excel(ground_truth_path)
    # Accumulate across the whole loop. Re-creating this list inside the loop
    # kept only the last question's record.
    result = []
    for i, question in enumerate(questions):
        retrieved_indices = retrieve_documents(question, index)
        # Use only the retrieved chunks, skipping any out-of-range position
        # (for example the -1 padding FAISS returns for missing neighbours).
        context = [
            docs[int(idx)].page_content
            for idx in retrieved_indices
            if 0 <= idx < len(docs)
        ]
        context_text = "\n\n".join(context)
        prompt = (
            "Answer the question based only on the following context:\n"
            f"{context_text}\n\n"
            f"Question: {question}"
        )
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model='llama3-70b-8192',
        )
        response = chat_completion.choices[0].message.content
        ground_truth = df.iloc[i]['ground truth']
        result.append({
            'question': question,
            'answer': response,
            'ground_truths': ground_truth,
            'contexts': context,
        })

    return result

# Run the BGE-M3 pipeline over the evaluation questions.
result = rag_answer(queries, index, docs)

### LLM EMBEDDER (Run on GPU)

In [None]:
from FlagEmbedding import LLMEmbedder

# `model1` is rebound to the LLM-Embedder model for this section.
model1 = LLMEmbedder("BAAI/llm-embedder", use_fp16=False)
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq")

In [None]:
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import faiss

task = "qa"
def bge_m3_embed(query: str):
    """Embed one text as a retrieval key with ``model1`` for the configured ``task``.

    The function keeps its historical name so existing callers still work.
    """
    # Return the vector that was just computed. The previous code returned the
    # name `embeddings`, which is either undefined here or a stale global
    # left over from an earlier cell.
    key_embeddings = model1.encode_keys(query, task=task)
    return key_embeddings

def embed_docs(docs):
    """Embed the ``page_content`` of each document and return the vectors as one array."""
    texts = [entry.page_content for entry in docs]
    per_doc_vectors = [bge_m3_embed(text) for text in texts]
    return np.array(per_doc_vectors)

# Embed every chunk with the LLM-Embedder model loaded above.
embeddings = embed_docs(docs)


In [None]:
def create_faiss_index(embeddings):
    """Create a flat L2 FAISS index sized to the embedding width and add every row to it."""
    n_dims = embeddings.shape[1]
    flat_index = faiss.IndexFlatL2(n_dims)
    flat_index.add(embeddings)
    return flat_index

# Collapse any leading axes so the index receives a 2-D (n_vectors, dim) matrix.
embedding_dim = embeddings.shape[-1]
new_embeddings = embeddings.reshape(-1, embedding_dim)
index = create_faiss_index(new_embeddings)

In [None]:
def retrieve_documents(query, index, k=2):
    """Return the flattened positions of the ``k`` nearest indexed vectors for ``query``.

    NOTE: the next cell defines ``retrieve_documents`` again with ``k=5``;
    once that cell runs, it replaces this version.
    """
    query_vector = bge_m3_embed(query).reshape(1, -1)
    _unused_distances, neighbour_ids = index.search(query_vector, k)
    return neighbour_ids.flatten()

In [None]:
def retrieve_documents(query, index, k=5):
    """Return the positions of the ``k`` indexed vectors closest to ``query``.

    Args:
        query: Question text to search for.
        index: Index exposing ``search(vectors, k)``.
        k: Number of neighbours to request.

    Returns:
        1-D array of positions. It is shorter than ``k`` when the index holds
        fewer than ``k`` vectors.
    """
    # Questions go through the query encoder; the indexed chunks were encoded
    # as keys, and LLM-Embedder uses different instructions for each side.
    query_vector = model1.encode_queries(query, task=task)
    # Reshape to a single row without consulting the global `embeddings`
    # array, so the function only depends on its own inputs.
    query_embedding = np.asarray(query_vector, dtype=np.float32).reshape(1, -1)
    _distances, indices = index.search(query_embedding, k)
    hits = indices.flatten()
    # FAISS pads missing neighbours with -1; drop the padding.
    return hits[hits >= 0]

def rag_answer(questions, index, docs, ground_truth_path='/content/positive_ground_truths.xlsx'):
    """Answer each question from its retrieved chunks and pair it with its ground truth.

    Args:
        questions: Questions to answer. Question ``i`` is matched with row
            ``i`` of the ground-truth spreadsheet.
        index: Vector index passed through to ``retrieve_documents``.
        docs: Chunk list in the same order the index was built from.
        ground_truth_path: Excel file containing a ``'ground truth'`` column.

    Returns:
        A list with one dict per question, holding ``question``, ``answer``,
        ``ground_truths`` and ``contexts`` (the retrieved chunk texts).
    """
    # Read the spreadsheet once, not once per question.
    df = pd.read_excel(ground_truth_path)
    result = []
    for i, question in enumerate(questions):
        retrieved_indices = retrieve_documents(question, index)
        # Skip out-of-range positions instead of wrapping them with a modulo:
        # wrapping maps an invalid hit (such as FAISS's -1 padding) onto an
        # unrelated chunk and hides the problem.
        context = [
            docs[int(idx)].page_content
            for idx in retrieved_indices
            if 0 <= idx < len(docs)
        ]
        # Put the chunk text, not the Document repr, into the prompt.
        context_text = "\n\n".join(context)
        prompt = (
            "Answer the question based only on the following context:\n"
            f"{context_text}\n\n"
            f"Question: {question}"
        )
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model='llama3-70b-8192',
        )
        response = chat_completion.choices[0].message.content
        ground_truth = df.iloc[i]['ground truth']
        result.append({
            'question': question,
            'answer': response,
            'ground_truths': ground_truth,
            'contexts': context,
        })

    return result

# Run the LLM-Embedder pipeline over the evaluation questions.
result = rag_answer(queries, index, docs)