<a href="https://colab.research.google.com/github/JoshuaM195/DIY_Cursor/blob/main/Codebase_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

![Screenshot 2024-11-25 at 7 12 58 PM](https://github.com/user-attachments/assets/48dd9de1-b4d2-4318-8f52-85ec209d8ebc)

# Install Necessary Libraries

In [None]:
! pip install pygithub langchain langchain-community openai tiktoken pinecone-client langchain_pinecone sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from google.colab import userdata
from pinecone import Pinecone
import os
import tempfile
from github import Github, Repository
from git import Repo
from openai import OpenAI
from pathlib import Path
from langchain.schema import Document
from pinecone import Pinecone

# Clone a GitHub Repo locally

In [None]:
# URL of the repository whose code will be cloned and indexed
github_repo = "https://github.com/evanng07/CodeInsight"
# Preview the repository name, i.e. the last path segment of the URL
github_repo.rsplit("/", 1)[-1]

In [None]:
def clone_repository(repo_url):
    """Clone a GitHub repository into the ``/content`` directory.

    The clone is placed in ``/content/<name>``, where ``<name>`` is the last
    path segment of ``repo_url``.

    Args:
        repo_url: The URL of the GitHub repository.

    Returns:
        The path to the cloned repository.
    """
    # Derive the directory name from the argument rather than from a global,
    # so the function works for whichever URL the caller passes in. A trailing
    # slash is dropped first; otherwise the last segment would be empty.
    repo_name = repo_url.rstrip("/").split("/")[-1]
    repo_path = f"/content/{repo_name}"
    Repo.clone_from(repo_url, repo_path)
    return repo_path


In [None]:
# Clone the target repository and remember where it was checked out
path = clone_repository(github_repo)

# Define which types of files to parse and which files / folders to ignore

In [None]:
# File extensions (including the leading dot) that are read and indexed
SUPPORTED_EXTENSIONS = {
    '.c',
    '.cpp',
    '.go',
    '.h',
    '.ipynb',
    '.java',
    '.js',
    '.jsx',
    '.py',
    '.rs',
    '.swift',
    '.ts',
    '.tsx',
    '.vue',
}

# Directory names that are excluded when collecting files
IGNORED_DIRS = {
    '.git',
    '.next',
    '.vscode',
    '__pycache__',
    'build',
    'dist',
    'env',
    'node_modules',
    'vendor',
    'venv',
}

In [None]:
def get_file_content(file_path, repo_path):
    """
    Read one file and pair its text with its repository-relative path.

    Args:
        file_path (str): Path to the file; it is read as UTF-8 text
        repo_path (str): Repository root that the returned name is relative to

    Returns:
        Optional[Dict[str, str]]: {"name": relative path, "content": file text},
        or None when reading or path resolution fails (the error is printed)
    """
    try:
        with open(file_path, mode="r", encoding="utf-8") as source:
            text = source.read()

        record = {
            "name": os.path.relpath(file_path, repo_path),
            "content": text,
        }
    except Exception as e:
        # Best effort: report the problem and let the caller skip this file
        print(f"Error processing file {file_path}: {str(e)}")
        return None

    return record



def get_main_files_content(repo_path: str):
    """
    Get content of supported code files from the local repository.

    Walks the tree under ``repo_path``, skips every directory whose name is in
    IGNORED_DIRS, and reads each file whose extension is in
    SUPPORTED_EXTENSIONS via get_file_content.

    Args:
        repo_path: Path to the local repository

    Returns:
        List of dictionaries containing file names and contents; files that
        could not be read are left out
    """
    files_content = []

    try:
        for root, dirs, files in os.walk(repo_path):
            # Prune ignored directories in place so os.walk never descends
            # into them. Comparing exact directory names (instead of looking
            # for a substring anywhere in the path) keeps folders such as
            # "environments" or "builder" from being skipped by accident.
            dirs[:] = [d for d in dirs if d not in IGNORED_DIRS]

            for file in files:
                if os.path.splitext(file)[1] not in SUPPORTED_EXTENSIONS:
                    continue

                file_content = get_file_content(os.path.join(root, file), repo_path)

                # get_file_content returns None for files it could not read
                if file_content:
                    files_content.append(file_content)

    except Exception as e:
        # Best effort: report the failure and return whatever was collected
        print(e)

    return files_content


In [None]:
# Collect the name and content of every supported file in the cloned repo
files_content = get_main_files_content(path)

In [None]:
# Display the collected files for inspection
files_content

# Embeddings

In [None]:
# Loaded SentenceTransformer models keyed by model name, so repeated calls
# reuse the same instance instead of constructing it again each time.
_EMBEDDING_MODELS = {}


def get_huggingface_embeddings(text, model_name="sentence-transformers/all-mpnet-base-v2"):
    """Encode ``text`` with a SentenceTransformer model.

    Args:
        text: Input passed straight to the model's ``encode`` method.
        model_name: Name of the SentenceTransformer model to use.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``text``.
    """
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        # First use of this model name: construct it once and keep it
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model
    return model.encode(text)

In [None]:
# Smoke test: embed a short sample sentence
text = "I am learning"
embeddings = get_huggingface_embeddings(text)

In [None]:
# Display the sample embedding
embeddings

In [None]:
# Read the Pinecone API key from the Colab secret store once and also expose
# it as an environment variable
# (presumably so langchain_pinecone can pick it up — TODO confirm)
pinecone_api_key = userdata.get("PINECONE_API_KEY")
os.environ['PINECONE_API_KEY'] = pinecone_api_key

# Initialize Pinecone, reusing the key fetched above instead of a second lookup
pc = Pinecone(api_key=pinecone_api_key)

# Connect to your Pinecone index
pinecone_index = pc.Index("codebase-rag")

In [None]:
# Wrap the "codebase-rag" index in a LangChain vector store that embeds with
# the default HuggingFaceEmbeddings model
vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())

In [None]:
# Insert the codebase embeddings into Pinecone

# One Document per file: the relative path is prepended to the content and
# also stored as the "source" metadata field.
# NOTE(review): each file is embedded whole, with no chunking — long files
# presumably exceed the embedding model's input window; confirm and consider
# splitting them.
documents = [
    Document(
        page_content=f"{file['name']}\n\n{file['content']}",
        metadata={"source": file['name']},
    )
    for file in files_content
]

vectorstore = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=HuggingFaceEmbeddings(),
    index_name="codebase-rag",
    # Namespace by repository URL, taken from `github_repo` so it cannot
    # drift from the repository that was actually cloned
    namespace=github_repo,
)

In [None]:
# Display the documents that were sent to the vector store
documents

In [None]:
# OpenAI SDK client pointed at the OpenRouter endpoint; the API key comes from
# the Colab secret store
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=userdata.get("OPENROUTER_API_KEY")
)

In [None]:
# Confirm the key is configured without echoing the secret itself, so the key
# does not end up in the notebook's cell output.
# NOTE(review): the OpenAI client cell reads this key through Colab `userdata`,
# not the process environment — confirm this check looks at the intended source.
openrouter_key_configured = bool(os.getenv("OPENROUTER_API_KEY"))
print(f"OPENROUTER_API_KEY configured: {openrouter_key_configured}")  # Should be True

In [None]:
# Question to ask about the indexed codebase
query = "What is happening in this codebase?"

In [None]:
# Embed the question with the helper's default model
raw_query_embedding = get_huggingface_embeddings(query)

In [None]:
# Display the query embedding
raw_query_embedding

In [None]:
# Fetch the five closest matches (with metadata) for the query embedding.
# The namespace comes from `github_repo` so it always matches the repository
# that was cloned. The variable name is kept as-is because later cells read it.
top_mathces = pinecone_index.query(
    vector=raw_query_embedding.tolist(),
    top_k=5,
    include_metadata=True,
    namespace=github_repo,
)

In [None]:
# Display the raw query result
top_mathces

In [None]:
# Pull the 'text' metadata field out of every match
context = [item['metadata']['text'] for item in top_mathces['matches']]

In [None]:
# Display the retrieved context snippets
context

In [None]:
# Join the retrieved snippets with a dashed separator, wrap them in
# <CONTEXT> tags, and append the user's question after them
context_separator = "\n\n--------\n\n"
joined_context = context_separator.join(context)
augmented_query = (
    "<CONTEXT>\n"
    + joined_context
    + "\n---------\n</CONTEXT>\n\n\n\nMY QUESTION:\n"
    + query
)

In [None]:
# Show the full prompt that will be sent to the model
print(augmented_query)

In [None]:
# System message sent ahead of the augmented query
system_prompt = """You are a Senior Software Engineer, who is an expert in Python and fullstack development.

Answer the question I have about the codebase based on the context provided.
Always consider all of the context provided to answer my questions.
"""

# Send the system prompt and the context-augmented question as one chat
# completion request
llm_response = client.chat.completions.create(
    model="deepseek/deepseek-r1:free",
    messages=[
        {"role":"system", "content": system_prompt},
        {"role":"user", "content": augmented_query}
    ]
)

# Keep only the text of the first returned choice
response = llm_response.choices[0].message.content

In [None]:
# response = perform_rag("Question")
# NOTE(review): perform_rag is not defined in the cells visible here

# Show the model's answer
print(response)