In [2]:
import os
from langchain_huggingface import HuggingFaceEmbeddings  # Updated import for HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from transformers import pipeline
from langchain_huggingface.llms import HuggingFacePipeline
from langchain_core.output_parsers import StrOutputParser

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch

# Pick the compute device once for the whole notebook.
# Prefer Apple's Metal (MPS) backend when PyTorch reports it as available and
# fall back to the CPU otherwise. The previous version raised before the
# fallback expression could ever evaluate to "cpu", so that branch was dead.
if torch.backends.mps.is_available():
    device = "mps"
else:
    print("MPS backend is not available. Falling back to CPU.")
    device = "cpu"

In [4]:
# Set to True to reuse the persisted vector database, False to re-index PR_FOLDER
USE_LOCAL_DB = False

# Folder containing the PR text files (*.txt) to index
PR_FOLDER = "processed_docs"

# Embedding model used both when indexing and when querying the store
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Directory to save/load the vector database. Derived from MODEL_NAME so that
# switching models cannot silently reuse a store built with other embeddings.
# With the model above this evaluates to "final_all-MiniLM-L6-v2".
VECTOR_DB_DIR = f"final_{MODEL_NAME.split('/')[-1]}"

# Step 1: Load PR Data
def load_pr_data(pr_folder):
    """Load every ``.txt`` file in ``pr_folder`` as a ``Document``.

    Args:
        pr_folder: Path of the directory that holds the PR text files.

    Returns:
        A list with one ``Document`` per ``.txt`` file, ordered by file name.
        Each document carries the file's text as ``page_content`` and
        ``{"file_name": <name>}`` as metadata. Files with other extensions
        are ignored.

    Raises:
        FileNotFoundError: If ``pr_folder`` does not exist (raised by
            ``os.listdir``).
    """
    pr_documents = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # indexing order identical across runs and machines.
    for file_name in sorted(os.listdir(pr_folder)):
        if not file_name.endswith(".txt"):
            continue
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(os.path.join(pr_folder, file_name), "r", encoding="utf-8") as f:
            content = f.read()
        pr_documents.append(
            Document(
                page_content=content,
                metadata={"file_name": file_name},
            )
        )
    return pr_documents

# Step 2: Create (or reload) the vector database.
# The embedding model is needed by both branches, so it is built once here.
# Queries must be embedded with the same model that was used for indexing.
embedding_model = HuggingFaceEmbeddings(
    model_name=MODEL_NAME,
    model_kwargs={"device": device},  # device chosen in the device-selection cell
    encode_kwargs={"normalize_embeddings": True},
)

if not USE_LOCAL_DB:
    print("Loading and processing PR data...")
    pr_documents = load_pr_data(PR_FOLDER)

    print("Generating embeddings and storing in vector database...")
    # NOTE(review): if VECTOR_DB_DIR already holds a collection, this likely
    # appends to it instead of replacing it, so re-running may duplicate
    # documents -- confirm and clear the directory first if that is unwanted.
    vectorstore = Chroma.from_documents(
        documents=pr_documents,
        embedding=embedding_model,
        persist_directory=VECTOR_DB_DIR,  # Save vector database locally
    )

    # Chroma >= 0.4 writes to persist_directory automatically, and the
    # langchain_chroma.Chroma class has no persist() method at all. Only call
    # it when the imported Chroma class still provides one.
    if hasattr(vectorstore, "persist"):
        vectorstore.persist()
else:
    print("Using existing local vector database...")
    # Re-open the store that an earlier run saved in VECTOR_DB_DIR.
    vectorstore = Chroma(
        persist_directory=VECTOR_DB_DIR,
        embedding_function=embedding_model,
    )

Loading and processing PR data...
Generating embeddings and storing in vector database...


  embedding_model = HuggingFaceEmbeddings(
  vectorstore.persist()


In [5]:
# Step 4: Create a Retriever
# Similarity search over the vector store, returning 10 documents per query.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 10},
    search_type="similarity",
)

# Step 5: Define RAG Prompt Template
# The template is spelled out line by line so its exact whitespace is visible:
# it starts with a newline and the first sentence ends with a trailing space.
# {context} and {question} are the two placeholders the prompt expects.
template = (
    "\n"
    "Use the following pieces of context to summarize the pull request. \n"
    "Always include the PR number, title, and any key changes or labels mentioned.\n"
    "{context}\n"
    "Question: {question}\n"
    "Helpful Answer:"
)

rag_prompt = PromptTemplate.from_template(template)

In [6]:
# Display the embedding object attached to the vector store (sanity check of
# the model name, device and encode settings actually in use).
# NOTE(review): the saved output for this cell reports
# model_name='thenlper/gte-small', which differs from MODEL_NAME; it appears
# to come from an earlier run -- re-run the notebook to refresh it.
vectorstore.embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='thenlper/gte-small', cache_folder=None, model_kwargs={'device': 'mps'}, encode_kwargs={'normalize_embeddings': True}, multi_process=False, show_progress=False)

In [None]:
import re

# Free-text query pasted verbatim (booking cancellation-policy copy). It is kept
# in a named variable so the retrieval call below stays readable.
cancellation_policy_query = """Markdown Live Preview
Reset
Copy

You can check your booking and cancel it through the [landing page] <URL to landing page>. Key in the Travel ID <paste kaligo booking ref> and the guest’s last name to access your booking.


Location

Location screenshot

Copy

Hotel details page, in the room selection

Non-refundable

Free cancellation (except a service fee) Before <Mon, 4 Nov>

Cancellation fee applies

Checkout page

This booking is non-refundable

This booking is non-refundable from 4 Nov 2024 12:00 AM onwards

All times indicated are based on UTC time.

Service fee of 5% of the refundable amount applies.

There will be a cancellation fee of if you cancel the booking between Aug 14, 2017 12:00 AM and Aug 17, 2017 11:59 PM.

All times indicated are based on UTC time.

Service fee of 5% of the refundable amount applies.

Instruction to cancel (below the policy):

This is a pre-paid rate. To change the dates, number of rooms etc. you will need to cancel this reservation subject to the existing cancellation policy and make a new booking based on the prevailing rates and availability. You can check your booking and cancel it through the [landing page] . Key in the Travel ID and the guest’s last name to access your booking.

Confirmation page

Same cancelation policy as checkout page.

Instruction to cancel (below the policy):

This is a pre-paid rate. To change the dates, number of rooms etc. you will need to cancel this reservation subject to the existing cancellation policy and make a new booking based on the prevailing rates and availability. You can check your booking and cancel it through the [landing page] . Key in the Travel ID and the guest’s last name to access your booking.

Confirmation email

Same cancelation policy as checkout page.

Instruction to cancel (below the policy):

This is a pre-paid rate. To change the dates, number of rooms etc. you will need to cancel this reservation subject to the existing cancellation policy and make a new booking based on the prevailing rates and availability. You can check your booking and cancel it through the [landing page] . Key in the Travel ID and the guest’s last name to access your booking.
"""

# Fetch the documents the retriever considers most similar to the query text.
result = retriever.invoke(cancellation_policy_query)

# Report how many documents the retriever returned
print(len(result))

# Show the PR number and title of every hit whose text contains both header
# fields; hits missing either field are skipped.
pr_number_pattern = re.compile(r"Pull Request Number: (\d+)")
title_pattern = re.compile(r"Title: (.+)")
for hit in result:
    number_found = pr_number_pattern.search(hit.page_content)
    title_found = title_pattern.search(hit.page_content)
    if not (number_found and title_found):
        continue
    print(f"Pull Request Number: {number_found.group(1)}")
    print(f"Title: {title_found.group(1)}")

In [None]:
# Step 6: Load the LLM
# Build the transformers text-generation pipeline first, then wrap it so it can
# be used as a LangChain LLM.
text_generation_pipeline = pipeline(
    task="text-generation",
    model="Qwen/Qwen2.5-0.5B-Instruct",
    device=device,  # device selected earlier in the notebook
    max_new_tokens=400,
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
)
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

In [None]:
# Step 7: Create the RAG Chain
def format_docs(docs):
    """Join the text of retrieved documents into a single context string.

    Args:
        docs: Iterable of document objects exposing a ``page_content``
            attribute (what the retriever returns).

    Returns:
        The documents' ``page_content`` values separated by blank lines;
        an empty string when ``docs`` is empty.
    """
    # Documents are objects, not dicts: read the text via the attribute.
    # Subscripting them (doc["content"]) raises a TypeError.
    return "\n\n".join(doc.page_content for doc in docs)

# The chain takes the raw question string as input:
#   1. the retriever fetches documents for it and format_docs joins them into
#      the {context} slot, while the question itself is passed through;
#   2. rag_prompt fills the template;
#   3. llm generates the answer (this step was missing, so the parser was fed
#      the rendered prompt instead of model output);
#   4. StrOutputParser returns the answer as a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
    | StrOutputParser()
)

# Step 8: Example Query
# Only the retrieval stage is exercised here, so the printed result is the
# list of retrieved documents rather than a generated summary.
query = "Summarize the most recent changes to checkout logic."
print("Running query through RAG pipeline...")
result = retriever.invoke(query)
# result = rag_chain.invoke(query)  # the chain expects the question string itself, not a dict
print("Query Result:", result, sep="\n")