![Img](https://app.theheadstarter.com/static/hs-logo-opengraph.png)

# Headstarter Codebase RAG Project

![Screenshot 2024-11-25 at 7 12 58 PM](https://github.com/user-attachments/assets/48dd9de1-b4d2-4318-8f52-85ec209d8ebc)

# Install Necessary Libraries

In [1]:
! pip install pygithub langchain langchain-community openai tiktoken pinecone-client langchain_pinecone sentence-transformers

Collecting pygithub
  Downloading PyGithub-2.6.1-py3-none-any.whl.metadata (3.9 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting pinecone-client
  Downloading pinecone_client-6.0.0-py3-none-any.whl.metadata (3.4 kB)
Collecting langchain_pinecone
  Downloading langchain_pinecone-0.2.3-py3-none-any.whl.metadata (1.3 kB)
Collecting pynacl>=1.4.0 (from pygithub)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Collecting langchain
  Downloading langchain-0.3.21-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydanti

In [7]:
# Force pip to ignore hash mismatches
!pip install --no-cache-dir --no-deps --ignore-installed --force-reinstall langchain langchain-community langchain_pinecone
!pip install --no-cache-dir openai tiktoken pinecone==5.4.0 pygithub sentence-transformers

Collecting langchain
  Downloading langchain-0.3.21-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain_pinecone
  Downloading langchain_pinecone-0.2.3-py3-none-any.whl.metadata (1.3 kB)
Downloading langchain-0.3.21-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langchain_community-0.3.20-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m146.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langchain_pinecone-0.2.3-py3-none-any.whl (11 kB)
Installing collected packages: langchain_pinecone, langchain-community, langchain
Successfully installed langchain-0.3.21 langchain-community-0.3.20 langchain_pinecone-0.2.3
Collecting pinecone==5.4.0
  Downloading pinecone-5.4.0-py3-none-any.whl.metadata (19 kB)
Collectin

In [8]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from google.colab import userdata
from pinecone import Pinecone
import os
import tempfile
from github import Github, Repository
from git import Repo
from openai import OpenAI
from pathlib import Path
from langchain.schema import Document
from pinecone import Pinecone

# Clone a GitHub Repo locally

In [9]:
github_repo = "https://github.com/Moe1177/SOEN341_Winter2025"

In [10]:
def clone_repository(repo_url):
    """Clone a GitHub repository into ``/content`` and return its local path.

    Args:
        repo_url: The URL of the GitHub repository.

    Returns:
        The path to the cloned repository (``/content/<repository name>``).
    """
    # Derive the directory name from the URL that was actually passed in
    # (not from a notebook-level global), so any repository can be cloned.
    # A trailing slash or ".git" suffix is dropped so the directory is named
    # after the repository itself.
    repo_name = repo_url.rstrip('/').split('/')[-1]
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]

    repo_path = f"/content/{repo_name}"
    Repo.clone_from(repo_url, repo_path)
    return repo_path

In [11]:
path = clone_repository(github_repo)

# Define which types of files to parse and which files / folders to ignore

In [18]:
# File extensions (including the leading dot) that are read and indexed.
SUPPORTED_EXTENSIONS = {
    ".js",
    ".jsx",
    ".py",
    ".tsx",
}

# Directory names that are excluded when collecting files.
IGNORED_DIRS = {
    ".env",
    ".git",
    ".idea",
    ".next",
    ".venv",
    ".vscode",
    "__pycache__",
    "build",
    "dist",
    "env",
    "node_modules",
    "vendor",
    "venv",
}

In [19]:
def get_file_content(file_path, repo_path):
    """
    Get content of a single file.

    Args:
        file_path (str): Path to the file
        repo_path (str): Root directory of the repository; the returned
            file name is expressed relative to this directory

    Returns:
        Optional[Dict[str, str]]: Dictionary with the relative file name
        ("name") and the file text ("content"), or None if the file could
        not be read or decoded as UTF-8
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

        rel_path = os.path.relpath(file_path, repo_path)
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers
        # UnicodeDecodeError (non UTF-8 content) and relpath failures.
        # Reading is best-effort: report the problem and skip the file.
        print(f"Error reading file: {file_path}: {str(e)}")
        return None

    return {
        "name": rel_path,
        "content": content
    }


def get_main_files_content(repo_path: str):
    """
    Get content of supported code files from the local repository.

    Walks the repository tree, skips every directory whose name is in
    IGNORED_DIRS, and reads each file whose extension is in
    SUPPORTED_EXTENSIONS.

    Args:
        repo_path: Path to the local repository

    Returns:
        List of dictionaries containing file names and contents, or None if
        walking the repository failed
    """

    files_content = []

    try:
        for root, dirs, files in os.walk(repo_path):
            # Prune ignored directories in place so os.walk never descends
            # into them. Matching is on the exact directory name; a substring
            # test on the full path would also skip unrelated folders such as
            # "environment" (contains "env") or "builders" (contains "build").
            dirs[:] = [d for d in dirs if d not in IGNORED_DIRS]

            for file in files:
                file_path = os.path.join(root, file)
                if os.path.splitext(file_path)[1] not in SUPPORTED_EXTENSIONS:
                    continue

                file_content = get_file_content(file_path, repo_path)

                # get_file_content returns None for unreadable files.
                if file_content:
                    files_content.append(file_content)
    except Exception as e:
        # Best-effort boundary: report the failure and signal it with None,
        # as callers of the original implementation expect.
        print(f"Error reading files: {str(e)}")
        return None

    return files_content


In [20]:
files_content = get_main_files_content(path)

In [21]:
files_content

[{'name': 'soen341-frontend/app/layout.tsx',
  'content': 'import type { Metadata } from "next";\nimport { Toaster } from "react-hot-toast";\n\nimport "./globals.css";\nimport React from "react";\n\nexport const metadata: Metadata = {\n  title: "SOEN 341",\n  description: "SOEN 341 project",\n};\n\nexport default function RootLayout({\n  children,\n}: {\n  children: React.ReactNode;\n}) {\n  return (\n    <html lang="en" className="dark">\n      <body className="bg-background">\n        <main className="relative overflow-hidden">{children}</main>\n        <Toaster position="top-center" />\n      </body>\n    </html>\n  );\n}\n'},
 {'name': 'soen341-frontend/app/page.tsx',
  'content': '"use client";\n\nimport React from "react";\nimport Features from "@/Components/Features";\nimport Hero from "@/Components/Hero";\nimport About from "@/Components/About";\nimport SignUp from "@/Components/SignUp";\nimport Navbar from "@/Components/Navbar";\n\nexport default function Home() {\n  return (\

# Embeddings

In [22]:
from functools import lru_cache


@lru_cache(maxsize=4)
def _load_sentence_transformer(model_name):
    """Build the SentenceTransformer for ``model_name`` once and reuse it."""
    return SentenceTransformer(model_name)


def get_huggingface_embeddings(text, model_name="sentence-transformers/all-mpnet-base-v2"):
    """Embed ``text`` with a SentenceTransformer model.

    Args:
        text: The input passed to ``SentenceTransformer.encode``.
        model_name: Name of the sentence-transformers model to use.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``text``.
    """
    # The model object is cached per model name so that repeated calls
    # (e.g. one per query) do not construct the model again each time.
    model = _load_sentence_transformer(model_name)
    return model.encode(text)

# Setting up Pinecone
**1. Create an account on [Pinecone.io](https://app.pinecone.io/)**

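**2. Create a new index called "codebase-rag" and set the dimensions to 768 (the size of the vectors produced by the `all-mpnet-base-v2` embedding model used in this notebook). Leave the rest of the settings as they are.**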

![Screenshot 2024-11-24 at 10 58 50 PM](https://github.com/user-attachments/assets/f5fda046-4087-432a-a8c2-86e061005238)



**3. Create an API Key for Pinecone**

![Screenshot 2024-11-24 at 10 44 37 PM](https://github.com/user-attachments/assets/e7feacc6-2bd1-472a-82e5-659f65624a88)


**4. Store your Pinecone API Key within Google Colab's secrets section under the name `PINECONE_API_KEY`, and then enable notebook access to it (see the blue checkmark)**

![Screenshot 2024-11-24 at 10 45 25 PM](https://github.com/user-attachments/assets/eaf73083-0b5f-4d17-9e0c-eab84f91b0bc)



In [23]:
# Read the Pinecone API key once from the Colab secrets and also set it as an
# environment variable (presumably so libraries that look up PINECONE_API_KEY
# from the environment can find it -- confirm against langchain_pinecone).
pinecone_api_key = userdata.get("PINECONE_API_KEY")
os.environ['PINECONE_API_KEY'] = pinecone_api_key

# Initialize Pinecone, reusing the key fetched above instead of reading the
# secret a second time
pc = Pinecone(api_key=pinecone_api_key)

# Connect to your Pinecone index
pinecone_index = pc.Index("codebase-rag")

In [24]:
# Vector store bound to the "codebase-rag" index, using HuggingFaceEmbeddings
# constructed with its default arguments as the embedding function.
vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())

  vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())
  vectorstore = PineconeVectorStore(index_name="codebase-rag", embedding=HuggingFaceEmbeddings())
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [28]:
# Insert the codebase embeddings into Pinecone

# One Document per source file. The relative file path is prepended to the
# file text so it is part of the page content, and it is also stored as the
# "source" metadata field.
documents = [
    Document(
        page_content=f"{file['name']}\n\n{file['content']}",
        metadata={"source": file['name']},
    )
    for file in files_content
]

# The repository URL is used as the namespace. Taking it from `github_repo`
# (instead of repeating the URL as a literal) keeps the namespace in sync
# with the repository that was actually cloned.
vectorstore = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=HuggingFaceEmbeddings(),
    index_name="codebase-rag",
    namespace=github_repo,
)




  embedding=HuggingFaceEmbeddings(),


# Perform RAG

1. Get your OpenRouter API Key [here](https://openrouter.ai/settings/keys)

2. Paste your OpenRouter key into your Google Colab secrets under the name `OPENROUTER_API_KEY`, and make sure to enable notebook access for it

![Image](https://github.com/user-attachments/assets/bd64c5aa-952e-4a1e-9ac0-01d8fe93aaa1)


In [29]:
# OpenAI client pointed at the OpenRouter API endpoint; the API key is read
# from the Colab secret named "OPENROUTER_API_KEY".
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=userdata.get("OPENROUTER_API_KEY")
)

In [55]:
query = """How can you tell me that technologies or configurations files we used
  for the UI in the website?
"""

In [76]:
def perform_rag(
    query,
    model="deepseek/deepseek-r1-distill-llama-70b:free",
    *,
    namespace="https://github.com/Moe1177/SOEN341_Winter2025",
    top_k=3,
):
    """Answer ``query`` using context retrieved from the Pinecone index.

    The query is embedded, the closest chunks are fetched from
    ``pinecone_index``, their text is prepended to the question, and the
    combined prompt is sent to the chat model.

    Args:
        query: The user's question.
        model: Model identifier passed to the chat completions endpoint.
        namespace: Pinecone namespace to search. The default is the namespace
            under which this notebook inserts the codebase documents; querying
            any other namespace will not return those documents.
        top_k: Number of matches to retrieve and include as context.

    Returns:
        The content of the first choice in the model's response.
    """
    raw_query_embedding = get_huggingface_embeddings(query)

    top_matches = pinecone_index.query(
        vector=raw_query_embedding.tolist(),
        top_k=top_k,
        include_metadata=True,
        namespace=namespace,
    )

    # Get the list of retrieved texts
    contexts = [item['metadata']['text'] for item in top_matches['matches']]

    # Retrieved chunks first, separated by rules, then the actual question.
    augmented_query = "\n" + "\n\n-------\n\n".join(contexts) + "\n-------\n\n\n\n\nMY QUESTION:\n" + query

    # Modify the prompt below as needed to improve the response quality
    system_prompt = """You have ultimate knowledge over this codebase.
    You are an AI agent that helps users navigate the website to help them do
    what they need to do.
    You do not answer any questions about the backend, about any configuration
    file,
    or anything that has nothing to do with a functionality from the website.
    A good user request would be for example how can I create a channel or how
    can I log in.
    A bad user request would be how was this component of the website built,
    or what technology was used in this part of the website.
    Also do not release any sensitive information. Do not hallucinate. Answer
    the user's question by following
    the previous instructions. Consider the entire context provided to answer
    the user's question.
    If any invasive questions are asked, ONLY reply that this information
    cannot be given out due to privacy, and security reasons. You should not
    even give out one word of information on tech stack or anything else. Make
    all answers clear, and concise and easy to understand for the user. Also,
    make the answers straight to the point as well as be polite.
    """

    llm_response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": augmented_query}
        ]
    )

    return llm_response.choices[0].message.content

In [77]:
perform_rag(query)

"I'm sorry, but I cannot provide information about the technologies or configuration files used for the website. If you have any questions about how to use the website or need assistance with specific functionalities, feel free to ask, and I’ll be happy to help!"