In [1]:
import os
from pathlib import Path

import pandas as pd

In [2]:
# Review data lives under `<parent of the working directory>/data_pipeline/notebooks/data`.
# `Path.cwd()` is a classmethod, so it is called on the class directly instead of
# on a throwaway `Path()` instance.
DATA_DIR = Path.cwd().parent / "data_pipeline" / "notebooks" / "data"
REVIEWS_DATA_FILE = DATA_DIR / "reviews.csv"

In [3]:
# Load the reviews CSV into a DataFrame and display it as the cell output.
reviews_df = pd.read_csv(REVIEWS_DATA_FILE)
reviews_df

Unnamed: 0,CRN,Course Name,Instructor,Subject,Course Number,Question,Review
0,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,Timing and Canvas are Organized.
1,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,very responsive professor
2,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,"Passionate, knowledgeable, extremely accommoda..."
3,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,Scott was a pleasure to have as a professor! I...
4,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,"Assignments (homework, labs), and live lecture..."
...,...,...,...,...,...,...,...
12657,35062,Object-Oriented Design (Spring 2024),"Domino, Molly",CS,5004,What I could have done to make this course bet...,Studied more before class so that I could foll...
12658,35062,Object-Oriented Design (Spring 2024),"Domino, Molly",CS,5004,What I could have done to make this course bet...,watch vedios and books ahead may be better.
12659,35062,Object-Oriented Design (Spring 2024),"Domino, Molly",CS,5004,What I could have done to make this course bet...,I can utilize the additional resources provide...
12660,35062,Object-Oriented Design (Spring 2024),"Domino, Molly",CS,5004,What I could have done to make this course bet...,Spent more time outside of course hours review...


In [4]:
# Inspect the first row to see every column together with a sample value.
reviews_df.iloc[0, :]

CRN                                                          35056
Course Name             Data Str, Algo App in CmpSys (Spring 2024)
Instructor                                         Valcourt, Scott
Subject                                                         CS
Course Number                                                 5008
Question         What were the strengths of this course and/or ...
Review                            Timing and Canvas are Organized.
Name: 0, dtype: object

In [5]:
def stringify_review_instance(row: pd.Series) -> str:
    """Render one review row as a plain-text document.

    Args:
        row: A row of the reviews table, read through the labels 'CRN',
            'Course Name', 'Instructor', 'Subject', 'Course Number',
            'Question' and 'Review'.

    Returns:
        A multi-line string with a metadata header, the survey question and
        the review text. Every non-empty line after the first is indented by
        four spaces, and the string ends with a newline followed by four
        spaces (no trailing newline).
    """
    indent = "    "
    metadata_line = (
        f"CRN: {row['CRN']}, Course Name: {row['Course Name']}, "
        f"Instructor: {row['Instructor']},"
    )
    # Subject and course number are concatenated without a separator, e.g. "CS5008".
    course_code = f"{row['Subject']}{row['Course Number']}"

    lines = [
        "Metadata:",
        indent + metadata_line,
        indent + f"Course Number: {course_code}",
        "",
        indent + "Question:",
        indent + f"{row['Question']}",
        "",
        indent + "Review:",
        indent + f"{row['Review']}",
        indent,  # the text ends with an indented, unterminated last line
    ]
    return "\n".join(lines)

In [6]:
# Turn every review row into its text representation (row-wise apply, one string per row).
stringified_reviews = reviews_df.apply(stringify_review_instance, axis=1)
stringified_reviews

0        Metadata:\n    CRN: 35056, Course Name: Data S...
1        Metadata:\n    CRN: 35056, Course Name: Data S...
2        Metadata:\n    CRN: 35056, Course Name: Data S...
3        Metadata:\n    CRN: 35056, Course Name: Data S...
4        Metadata:\n    CRN: 35056, Course Name: Data S...
                               ...                        
12657    Metadata:\n    CRN: 35062, Course Name: Object...
12658    Metadata:\n    CRN: 35062, Course Name: Object...
12659    Metadata:\n    CRN: 35062, Course Name: Object...
12660    Metadata:\n    CRN: 35062, Course Name: Object...
12661    Metadata:\n    CRN: 35062, Course Name: Object...
Length: 12662, dtype: object

In [7]:
# Show the complete text generated for the review labelled 0.
stringified_reviews[0]

'Metadata:\n    CRN: 35056, Course Name: Data Str, Algo App in CmpSys (Spring 2024), Instructor: Valcourt, Scott,\n    Course Number: CS5008\n\n    Question:\n    What were the strengths of this course and/or this instructor?\n\n    Review:\n    Timing and Canvas are Organized.\n    '

In [9]:
from time import time

import numpy as np
import pandas as pd

import chromadb
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# Step 1: Prepare Corpus
# Convert the Series of review strings to a plain list for the encoder
stringified_reviews_list = stringified_reviews.tolist()

# The measured interval covers both constructing the SentenceTransformer and
# encoding the whole corpus.
start_time = time()
# Step 2: Embed Texts
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(stringified_reviews_list)
end_time = time()
print(f"Took {end_time - start_time} secs.")

In [11]:
from tqdm import tqdm

client = chromadb.PersistentClient(path="./chromadb")

# Create the collection, or reopen it if it already exists on disk.
collection = client.get_or_create_collection("naive_rag_embeddings")

# Index the reviews only when the persisted collection is missing documents, so
# the notebook still works on a fresh checkout (Restart & Run All) without
# re-inserting everything on every run. `upsert` keeps the step idempotent if a
# previous run was interrupted part-way through.
CHROMA_BATCH_SIZE = 1000  # documents sent per request
n_reviews = len(stringified_reviews_list)
if collection.count() < n_reviews:
    for start in tqdm(range(0, n_reviews, CHROMA_BATCH_SIZE), desc="Indexing reviews"):
        stop = min(start + CHROMA_BATCH_SIZE, n_reviews)
        collection.upsert(
            # ids/metadata mirror each review's position in the corpus list
            ids=[str(idx) for idx in range(start, stop)],
            documents=stringified_reviews_list[start:stop],
            metadatas=[{"index": idx} for idx in range(start, stop)],
            embeddings=embeddings[start:stop].tolist(),
        )

In [13]:
import weave
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize Weave for this project.
weave.init(project_name="Naive_RAG_Reviews")

# Step 2: Load the Qwen2.5-3B-Instruct model and its tokenizer
model_name = "Qwen/Qwen2.5-3B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 3: RAG Pipeline
class RAGPipeline:
    """Naive retrieve-then-generate pipeline over the review collection.

    The query is embedded, the nearest documents are fetched from the vector
    collection, and the chat model answers with those documents as context.
    """

    # System prompt sent as the first chat message of every request.
    SYSTEM_INSTRUCTION = """
    You are Course Compass, a chatbot dedicated to assisting Northeastern University graduate students with course registration each semester.
    You have access to the latest information on available graduate courses, faculty profiles, and summarized student feedback from previous semesters.
 
    Your goals are:
    1. To provide accurate, up-to-date information without speculating. If you lack information about a course or question, clearly communicate that to the student.
    2. To maintain a positive, professional tone. If past student feedback includes criticism, you should still respond diplomatically, focusing on constructive or neutral aspects.
    3. To be concise and relevant in your responses, helping students make informed decisions about their course choices.

    Important Guidelines to be followed:
    1. The context is provided to you after retrieving reviews similar to the query being asked using a RAG pipeline.
    Sometimes, the context is not relevant to the particular query being asked. You should always check if the context is related to the query, else reply that you don't have enough information to reply.
    For example, the query could be about a particular course or professor, while the context would be of some other courses or professors, you should reply that you don't have enough information to these cases.
    2. Avoid negative or speculative responses, and prioritize factual information over assumption.
     
    Answer the questions comprehensively using the reviews from the context by summarizing them to help the student.
    """

    def __init__(self, embedding_model, collection, model, tokenizer):
        """Keep references to the components the pipeline drives.

        Args:
            embedding_model: Encoder whose ``encode`` turns a list of texts into vectors.
            collection: Vector collection queried for similar documents.
            model: Causal language model used for generation.
            tokenizer: Tokenizer paired with ``model``.
        """
        self.embedding_model = embedding_model
        self.collection = collection
        self.model = model
        self.tokenizer = tokenizer

    @weave.op
    def retrieve(self, query, top_k=5):
        """Return the documents stored closest to ``query``.

        Args:
            query: Question text to search for.
            top_k: Number of results requested from the collection.

        Returns:
            The ``"documents"`` entry of the collection's query result.
        """
        query_vector = self.embedding_model.encode([query])[0].tolist()
        hits = self.collection.query(query_embeddings=[query_vector], n_results=top_k)
        return hits["documents"]

    @weave.op
    def generate_response(self, query, retrieved_docs):
        """Answer ``query`` using the retrieved documents as context.

        Args:
            query: Question text.
            retrieved_docs: Nested sequence of document strings (one inner
                sequence per query), as returned by ``retrieve``.

        Returns:
            The decoded text generated by the model, without the prompt.
        """
        # One document per line, in retrieval order.
        context = "\n".join(doc for group in retrieved_docs for doc in group)
        user_prompt = f"Context:\n{context}\n\nQuery: {query}\n\nAnswer:"
        chat = [
            {"role": "system", "content": self.SYSTEM_INSTRUCTION},
            {"role": "user", "content": user_prompt},
        ]

        prompt_text = self.tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )
        model_inputs = self.tokenizer([prompt_text], return_tensors="pt").to(self.model.device)
        output_sequences = self.model.generate(
            **model_inputs, max_new_tokens=4098, temperature=0.1
        )

        # Each output sequence starts with its prompt tokens; drop them so only
        # the newly generated tokens are decoded.
        completions = []
        for prompt_ids, sequence in zip(model_inputs.input_ids, output_sequences):
            completions.append(sequence[len(prompt_ids):])

        return self.tokenizer.batch_decode(completions, skip_special_tokens=True)[0]

    @weave.op
    def __call__(self, query, top_k=5):
        """Run retrieval followed by generation and return the answer text."""
        print("Retrieving")
        documents = self.retrieve(query, top_k)

        print("Generating Response")
        answer = self.generate_response(query, documents)
        return answer

# Step 4: Use the RAG Pipeline
# Wire the pipeline to the embedding model, vector collection, and chat model
# created in the earlier cells.
rag_pipeline = RAGPipeline(embedding_model, collection, model, tokenizer)

# NOTE(review): presumably these attributes are attached to the Weave calls
# recorded inside the block — confirm against the Weave documentation.
with weave.attributes({'user_id': 's-kishore', 'env': 'testing'}):
    # Example Query
    query = "How difficult is Algorithms under Prof. Raj Venkat?"
    response = rag_pipeline(query, top_k=5)
    print(response)

Logged in as Weights & Biases user: s-kishore.
View Weave data at https://wandb.ai/s-kishore/naive_rag_reviews/weave


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving
Generating Response
🍩 https://wandb.ai/s-kishore/naive_rag_reviews/r/call/01940f5c-f109-7112-a8f6-0b182f81e2bc
Based on the context provided, there isn't any specific review mentioning Prof. Raj Venkat teaching Algorithms. Therefore, I don't have enough information to provide insights into how difficult the course might be under his instruction.


In [14]:
import weave
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Initialize Weave for this project.
weave.init(project_name="Naive_RAG_Reviews")

# Step 2: Load the Qwen2.5-3B-Instruct model and its tokenizer
# NOTE(review): this repeats the model load already performed in the earlier
# naive-RAG cell; it is only needed when this cell is run on its own.
model_name = "Qwen/Qwen2.5-3B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reranker Class
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

class Reranker:
    """Orders documents by the relevance score a cross-encoder gives each (query, document) pair."""

    def __init__(self, reranker_model_name="cross-encoder/ms-marco-MiniLM-L-12-v2", device="cpu"):
        """
        Initialize the Reranker with a cross-encoder model.
        Args:
            reranker_model_name: Name of the Hugging Face model for reranking.
            device: Device to run the model on ("cpu" or "cuda").
        """
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(reranker_model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(reranker_model_name).to(device)

    def rerank(self, query, documents, top_k=None):
        """
        Rerank the documents based on their relevance to the query.
        Args:
            query: The input query string.
            documents: Documents to rerank (any iterable of strings; it is not modified).
            top_k: Number of top documents to return (default: all).
        Returns:
            List of reranked documents, most relevant first. An empty input
            yields an empty list without invoking the tokenizer or the model.
        """
        documents = list(documents)
        # Nothing to score: skip tokenization and the forward pass entirely.
        if not documents:
            return []

        # Prepare query-document pairs
        pairs = [[query, doc] for doc in documents]

        # Tokenize inputs for the cross-encoder
        inputs = self.tokenizer(pairs, padding=True, truncation=True, return_tensors="pt", max_length=512)
        inputs = {key: value.to(self.device) for key, value in inputs.items()}

        # Predict relevance scores
        with torch.no_grad():
            outputs = self.model(**inputs)
            # assumes one logit per pair, i.e. logits of shape (n_pairs, 1) — TODO confirm for other models
            scores = outputs.logits.squeeze(-1)

        # Sort by score, highest first. Converting to plain ints once avoids
        # indexing the Python list with 0-d tensors one element at a time.
        ranked_indices = scores.argsort(descending=True).tolist()
        ranked_documents = [documents[idx] for idx in ranked_indices]

        # Return the top_k documents if specified
        return ranked_documents[:top_k] if top_k else ranked_documents


# Step 3: RAG Pipeline with Reranking
class RAGPipeline:
    """Retrieve, rerank, then generate: a RAG pipeline over the review collection.

    The query is embedded and matched against the vector collection, the
    candidates are reordered by the reranker, and the chat model answers with
    the best candidates as context.
    """

    # System prompt sent as the first chat message of every request.
    SYSTEM_INSTRUCTION = """
    You are Course Compass, a chatbot dedicated to assisting Northeastern University graduate students with course registration each semester.
    You have access to the latest information on available graduate courses, faculty profiles, and summarized student feedback from previous semesters.
 
    Your goals are:
    1. To provide accurate, up-to-date information without speculating. If you lack information about a course or question, clearly communicate that to the student.
    2. To maintain a positive, professional tone. If past student feedback includes criticism, you should still respond diplomatically, focusing on constructive or neutral aspects.
    3. To be concise and relevant in your responses, helping students make informed decisions about their course choices.

    Important Guidelines to be followed:
    1. The context is provided to you after retrieving reviews similar to the query being asked using a RAG pipeline.
    Sometimes, the context is not relevant to the particular query being asked. You should always check if the context is related to the query, else reply that you don't have enough information to reply.
    For example, the query could be about a particular course or professor, while the context would be of some other courses or professors, you should reply that you don't have enough information to these cases.
    2. Avoid negative or speculative responses, and prioritize factual information over assumption.
     
    Answer the questions comprehensively using the reviews from the context by summarizing them to help the student.
    """

    # Generation settings, named so they can be tuned in one place.
    # NOTE(review): 4098 may have been intended as 4096 — confirm.
    MAX_NEW_TOKENS = 4098
    # NOTE(review): temperature only influences decoding when sampling is
    # enabled; this presumably relies on the model's own generation config — confirm.
    TEMPERATURE = 0.1

    def __init__(self, embedding_model, collection, model, tokenizer, reranker):
        """Keep references to the components the pipeline drives.

        Args:
            embedding_model: Encoder whose ``encode`` turns a list of texts into vectors.
            collection: Vector collection queried for similar documents.
            model: Causal language model used for generation.
            tokenizer: Tokenizer paired with ``model``.
            reranker: Object exposing ``rerank(query, documents, top_k=...)``.
        """
        self.embedding_model = embedding_model
        self.collection = collection
        self.model = model
        self.tokenizer = tokenizer
        self.reranker = reranker

    @weave.op
    def retrieve(self, query, top_k=5):
        """Return the documents stored closest to ``query``.

        Args:
            query: Question text to search for.
            top_k: Number of results requested from the collection.

        Returns:
            The ``"documents"`` entry of the collection's query result
            (a nested list that ``rerank_documents`` flattens).
        """
        # Embed the query
        query_embedding = self.embedding_model.encode([query])[0]

        # Search in ChromaDB
        results = self.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=top_k
        )

        return results["documents"]

    @weave.op
    def rerank_documents(self, query, retrieved_docs, top_k):
        """Flatten the retrieved documents and keep the ``top_k`` best per the reranker.

        Args:
            query: Question text.
            retrieved_docs: Nested sequence of document strings from ``retrieve``.
            top_k: Number of documents to keep after reranking.

        Returns:
            Flat list of documents ordered by the reranker.
        """
        # Flatten the list of retrieved documents
        flattened_docs = [doc for sublist in retrieved_docs for doc in sublist]

        # Rerank using the reranker
        return self.reranker.rerank(query, flattened_docs, top_k=top_k)

    @weave.op
    def generate_response(self, query, retrieved_docs):
        """Answer ``query`` using the (already flat) reranked documents as context.

        Args:
            query: Question text.
            retrieved_docs: Flat sequence of document strings.

        Returns:
            The decoded text generated by the model, without the prompt.
        """
        # Join reranked documents into a context string
        context = "\n".join(retrieved_docs)

        # Prepare messages
        messages = [
            {"role": "system", "content": self.SYSTEM_INSTRUCTION},
            {"role": "user", "content": f"Context:\n{context}\n\nQuery: {query}\n\nAnswer:"}
        ]

        # Tokenize and generate
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
        generated_ids = self.model.generate(
            **model_inputs,
            max_new_tokens=self.MAX_NEW_TOKENS,
            temperature=self.TEMPERATURE
        )
        # Remove input tokens from output to isolate generated text
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        return self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    @weave.op
    def __call__(self, query, top_k=5, fetch_k=None):
        """Run retrieval, reranking and generation, and return the answer text.

        Args:
            query: Question text.
            top_k: Number of documents passed to the model as context.
            fetch_k: Number of candidates retrieved before reranking. When
                omitted it equals ``top_k``, in which case the reranker can
                only reorder the retrieved documents; pass a larger value to
                let it select the best ``top_k`` from a wider candidate pool.
                Values below ``top_k`` are raised to ``top_k``.

        Returns:
            The generated answer text.
        """
        candidate_k = top_k if fetch_k is None else max(fetch_k, top_k)

        print("Retrieving")
        # Step 1: Retrieve relevant documents
        retrieved_docs = self.retrieve(query, candidate_k)

        print("Reranking")
        # Step 2: Rerank the retrieved documents
        reranked_docs = self.rerank_documents(query, retrieved_docs, top_k)

        print("Generating Response")
        # Step 3: Generate a response
        return self.generate_response(query, reranked_docs)

# Step 4: Use the RAG Pipeline with Reranking
# Run the cross-encoder on CUDA when torch reports it as available, otherwise on the CPU.
reranker = Reranker(device="cuda" if torch.cuda.is_available() else "cpu")
rag_pipeline = RAGPipeline(embedding_model, collection, model, tokenizer, reranker)

# NOTE(review): presumably these attributes are attached to the Weave calls
# recorded inside the block — confirm against the Weave documentation.
with weave.attributes({'user_id': 's-kishore', 'env': 'testing'}):
    # Example Query
    query = "Can you summarize the reviews for the Foundations of Artificial Intelligence under Prof. Raj Venkat?"
    response = rag_pipeline(query, top_k=5)
    print(response)

Logged in as Weights & Biases user: s-kishore.
View Weave data at https://wandb.ai/s-kishore/naive_rag_reviews/weave


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Retrieving
Reranking
Generating Response
🍩 https://wandb.ai/s-kishore/naive_rag_reviews/r/call/01940f5e-1f1b-7480-91b9-7b5ed86bb8b7
Based on the reviews, the strengths of the Foundations of Artificial Intelligence course taught by Prof. Raj Venkat include:

- **Engagement and Expertise**: Prof. Venkat is described as intelligent and engaging, and he navigates the balance between providing adequate and detailed information well.
- **Caring and Supportive**: He is noted for his willingness to spend extra time explaining concepts and helping students understand the material.
- **Course Content**: While specific details about assignments and projects are mentioned, the general sentiment is positive regarding the content covered in the course.

Overall, students appreciate Prof. Venkat's approach to teaching and his dedication to student understanding.
