In [1]:
from pathlib import Path

import pandas as pd

In [2]:
DATA_DIR = Path().cwd().parent / "data_pipeline" / "notebooks" / "data"
REVIEWS_DATA_FILE = DATA_DIR / "reviews.csv"

In [3]:
reviews_df = pd.read_csv(REVIEWS_DATA_FILE)
reviews_df

Unnamed: 0,CRN,Course Name,Instructor,Subject,Course Number,Question,Review
0,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,Timing and Canvas are Organized.
1,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,very responsive professor
2,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,"Passionate, knowledgeable, extremely accommoda..."
3,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,Scott was a pleasure to have as a professor! I...
4,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,"Assignments (homework, labs), and live lecture..."
...,...,...,...,...,...,...,...
12657,35062,Object-Oriented Design (Spring 2024),"Domino, Molly",CS,5004,What I could have done to make this course bet...,Studied more before class so that I could foll...
12658,35062,Object-Oriented Design (Spring 2024),"Domino, Molly",CS,5004,What I could have done to make this course bet...,watch vedios and books ahead may be better.
12659,35062,Object-Oriented Design (Spring 2024),"Domino, Molly",CS,5004,What I could have done to make this course bet...,I can utilize the additional resources provide...
12660,35062,Object-Oriented Design (Spring 2024),"Domino, Molly",CS,5004,What I could have done to make this course bet...,Spent more time outside of course hours review...


In [4]:
reviews_df.iloc[0, :]

CRN                                                          35056
Course Name             Data Str, Algo App in CmpSys (Spring 2024)
Instructor                                         Valcourt, Scott
Subject                                                         CS
Course Number                                                 5008
Question         What were the strengths of this course and/or ...
Review                            Timing and Canvas are Organized.
Name: 0, dtype: object

In [5]:
def stringify_review_instance(row: pd.Series) -> str:
    template = f"""Metadata:
    CRN: {row['CRN']}, Course Name: {row['Course Name']}, Instructor: {row['Instructor']},
    Course Number: {row['Subject']}{row['Course Number']}

    Question:
    {row['Question']}

    Review:
    {row['Review']}
    """

    return template

In [6]:
stringified_reviews = reviews_df.apply(stringify_review_instance, axis=1)
stringified_reviews

0        Metadata:\n    CRN: 35056, Course Name: Data S...
1        Metadata:\n    CRN: 35056, Course Name: Data S...
2        Metadata:\n    CRN: 35056, Course Name: Data S...
3        Metadata:\n    CRN: 35056, Course Name: Data S...
4        Metadata:\n    CRN: 35056, Course Name: Data S...
                               ...                        
12657    Metadata:\n    CRN: 35062, Course Name: Object...
12658    Metadata:\n    CRN: 35062, Course Name: Object...
12659    Metadata:\n    CRN: 35062, Course Name: Object...
12660    Metadata:\n    CRN: 35062, Course Name: Object...
12661    Metadata:\n    CRN: 35062, Course Name: Object...
Length: 12662, dtype: object

In [7]:
stringified_reviews[0]

'Metadata:\n    CRN: 35056, Course Name: Data Str, Algo App in CmpSys (Spring 2024), Instructor: Valcourt, Scott,\n    Course Number: CS5008\n\n    Question:\n    What were the strengths of this course and/or this instructor?\n\n    Review:\n    Timing and Canvas are Organized.\n    '

In [8]:
from time import time

import numpy as np
import pandas as pd

import chromadb
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Step 1: Prepare Corpus
# Convert Series to list
stringified_reviews_list = stringified_reviews.tolist()

start_time = time()
# Step 2: Embed Texts
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(stringified_reviews_list)
end_time = time()
print(f"Took {end_time - start_time} secs.")



Took 43.27331733703613 secs.


In [10]:
from tqdm import tqdm

client = chromadb.PersistentClient(path="./chromadb")

# Create a collection
collection = client.get_or_create_collection("naive_rag_embeddings")

# Add texts and embeddings to ChromaDB
for idx, (review, embedding) in tqdm(enumerate(zip(stringified_reviews_list, embeddings))):
    collection.add(
        documents=[review],
        metadatas=[{"index": idx}],
        ids=[str(idx)],
        embeddings=[embedding.tolist()]
    )

12662it [21:34,  9.78it/s]


In [20]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Step 2: Load Qwen-2.5-1.5B-Instruct Model
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 3: RAG Pipeline
class RAGPipeline:
    SYSTEM_INSTRUCTION = """You are Course Compass, a chatbot dedicated to assisting Northeastern University graduate students with course registration each semester. You have access to the latest information on available graduate courses, faculty profiles, and summarized student feedback from previous semesters.
 
    Your goals are:
    1. To provide accurate, up-to-date information without speculating. If you lack information about a course or question, clearly communicate that to the student.
    2. To maintain a positive, professional tone. If past student feedback includes criticism, you should still respond diplomatically, focusing on constructive or neutral aspects.
    3. To be concise and relevant in your responses, helping students make informed decisions about their course choices.
     
    Avoid negative or speculative responses, and prioritize factual information over assumption.
    
    The response should:
    1. Highlight the main topics and unique aspects of the course content.
    2. Summarize the instructor's teaching style and notable strengths or weaknesses.
    3. Clearly address potential benefits and challenges of the course, providing a straightforward recommendation as needed.
    Ensure the answer is direct, informative, and relevant to the question.
     
    Answer the questions comprehensively using the reviews from the context by summarizing them to help the student."""
    
    def __init__(self, embedding_model, collection, model, tokenizer):
        self.embedding_model = embedding_model
        self.collection = collection
        self.model = model
        self.tokenizer = tokenizer

    def retrieve(self, query, top_k=50):
        # Embed the query
        query_embedding = self.embedding_model.encode([query])[0]
        
        # Search in ChromaDB
        results = self.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=top_k
        )
        return results["documents"]

    def generate_response(self, query, retrieved_docs):
        # Flatten the list of retrieved documents
        flattened_docs = [doc for sublist in retrieved_docs for doc in sublist]
        context = "\n".join(flattened_docs)
        
        # Prepare messages
        messages = [
            {"role": "system", "content": self.SYSTEM_INSTRUCTION},
            {"role": "user", "content": f"Context:\n{context}\n\nQuery: {query}\n\nAnswer:"}
        ]
        
        # Tokenize and generate
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
        generated_ids = self.model.generate(
            **model_inputs,
            max_new_tokens=4098
        )
        # Remove input tokens from output to isolate generated text
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        return self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]


    def __call__(self, query, top_k=50):
        print("Retrieving")
        # Step 1: Retrieve relevant documents
        retrieved_docs = self.retrieve(query, top_k)

        print("Generating Response")
        # Step 2: Generate a response
        return self.generate_response(query, retrieved_docs)

# Step 4: Use the RAG Pipeline
rag_pipeline = RAGPipeline(embedding_model, collection, model, tokenizer)

# Example Query
query = "How difficult is Algorithms under Prof. Raj Venkat?"
response = rag_pipeline(query, top_k=3)
print(response)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Retrieving
Generating Response
Algorithms taught by Professor Raj Venkat is challenging but rewarding. While it covers a lot of material quickly, it provides an excellent opportunity to understand complex algorithms thoroughly. The course is well-structured, making it easier to follow along with lectures and assignments. However, the pace can be intense at times, so staying focused is key. Overall, it offers significant value for anyone looking to deepen their understanding of algorithms and prepare for technical interviews.
