In [3]:
import pandas as pd
! pip install chromadb sentence_transformers transformers numpy pandas


In [1]:
# Mount Google Drive at /content/drive so files under MyDrive can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the course CSV from the mounted Drive folder into a DataFrame for inspection.
df = pd.read_csv('/content/drive/MyDrive/course_registration/courses.csv')

In [5]:
# Preview the first rows to check column names and spot missing values (e.g. NaN begin times).
print(df.head())

     CRN Campus Description                           Course Title  \
0  34154             Online  Computer Science and Its Applications   
1  32828             Boston                        Lab for CS 1100   
2  32829             Boston                        Lab for CS 1100   
3  32830             Boston                        Lab for CS 1100   
4  34155             Online                     First Year Seminar   

  Subject Course       Faculty Name  \
0         CS1100   Lieberherr, Karl   
1         CS1101   Lieberherr, Karl   
2         CS1101   Lieberherr, Karl   
3         CS1101   Lieberherr, Karl   
4         CS1200  Wassinger, Claire   

                                  Course Description         Term  Begin Time  \
0  Introduces students to the field of computer s...  Spring 2025         NaN   
1  Accompanies CS 1100. Involves experiments and ...  Spring 2025       800.0   
2  Accompanies CS 1100. Involves experiments and ...  Spring 2025       915.0   
3  Accompanies CS 11

In [6]:
# (rows, columns) of the loaded course table.
df.shape

(500, 11)

In [4]:
import pandas as pd

def clean_time(time_val):
    """Normalise a raw schedule time into a zero-padded ``HHMM`` string.

    Args:
        time_val: Raw cell value such as ``800.0``, ``1330`` or ``'915'``.

    Returns:
        The integer part of the value left-padded with zeros to at least
        four characters (``800.0`` -> ``"0800"``), or ``None`` when the value
        is missing (NaN/None), the empty string, or zero.
    """
    has_no_time = pd.isna(time_val) or time_val == '' or time_val == 0
    if has_no_time:
        return None
    whole_number = int(time_val)
    return f"{whole_number}".zfill(4)


def format_time(time_str):
    """Convert a four-character 24-hour ``HHMM`` string to 12-hour clock text.

    Args:
        time_str: Time such as ``"1300"``; anything falsy or not exactly four
            characters long is rejected.

    Returns:
        A string like ``"1:00 PM"``, or ``None`` for rejected input.
    """
    if time_str and len(time_str) == 4:
        hour_24, minute_part = int(time_str[:2]), time_str[2:]
        suffix = "AM" if hour_24 < 12 else "PM"
        if hour_24 == 0:
            # Midnight hour is shown as 12 on a 12-hour clock.
            hour_12 = 12
        elif hour_24 > 12:
            hour_12 = hour_24 - 12
        else:
            hour_12 = hour_24
        return f"{hour_12}:{minute_part} {suffix}"
    return None



def course_to_sentence(row):
    """Render one course record as a short natural-language description.

    Missing values are treated as absent. pandas represents an empty CSV cell
    as NaN, which is a truthy float with no ``.lower()``/``.strip()``; reading
    the fields raw therefore either raised ``AttributeError`` or leaked the
    literal text "nan" into the sentence. Every text field is normalised to a
    plain string first, and clauses whose field is empty are left out.

    Args:
        row: Course record exposing ``.get(key, default)`` (a pandas Series
            row or a dict) with the course CSV column names as keys.

    Returns:
        str: The description with all runs of whitespace collapsed to single
        spaces.
    """
    def _text(key, default=''):
        """Return the field as a string, mapping None/NaN to ''."""
        value = row.get(key, default)
        if value is None:
            return ''
        if isinstance(value, str):
            return value
        return '' if pd.isna(value) else str(value)

    crn = _text('CRN')
    campus = _text('Campus Description')
    title = _text('Course Title')
    subject_course = _text('Subject Course')
    faculty = _text('Faculty Name')
    description = _text('Course Description')
    term = _text('Term')
    days = _text('Days')
    prerequisites = _text('Prerequisites', '[]')

    # Times stay raw here: clean_time does its own missing-value handling.
    begin_time = clean_time(row.get('Begin Time', ''))
    end_time = clean_time(row.get('End Time', ''))

    sentence_parts = [f"{subject_course} (CRN: {crn}), {title},"]

    campus_key = campus.lower()
    if campus_key == 'online':
        sentence_parts.append("is an online course")
    elif campus_key == 'no campus, no room needed':
        sentence_parts.append("is a self-paced course")
    elif campus:
        # Skip the clause entirely when the campus is unknown rather than
        # emitting a dangling "is offered at".
        sentence_parts.append(f"is offered at {campus}")

    if term:
        sentence_parts.append(f"for {term}")

    # A schedule is only stated when days and both times are known.
    if begin_time and end_time and days:
        formatted_begin = format_time(begin_time)
        formatted_end = format_time(end_time)
        sentence_parts.append(
            f"with classes scheduled {days} from {formatted_begin} to {formatted_end}"
        )

    if faculty:
        sentence_parts.append(f"taught by Professor {faculty}")

    sentence = " ".join(sentence_parts) + "."

    if description:
        sentence += f" {' '.join(description.split())}"

    # Prerequisites are stored as the text of a list, e.g. "['CS5010']".
    if prerequisites == '[]' or not prerequisites or prerequisites.strip() == '':
        sentence += " This course is open to all students with no prerequisites required."
    else:
        prereqs = prerequisites.strip('[]').replace("'", "").replace('"', '')
        if prereqs:
            sentence += f" Prerequisites for this course: {prereqs}."

    return ' '.join(sentence.split())


def course_to_structured_text(row):
    """Convert course data to structured, searchable format.

    The result is six lower-cased sections (metadata, location, schedule,
    instructor, course details, description), each introduced by a
    ``=== name ===`` marker and separated by a blank line.

    Missing values are treated as absent. pandas represents an empty CSV cell
    as NaN, which is a truthy float with no ``.lower()``/``.strip()``; reading
    the fields raw either raised ``AttributeError`` or wrote the literal text
    "nan" into the document. Every text field is normalised to a plain string
    before use.

    Args:
        row: Course record exposing ``.get(key, default)`` (a pandas Series
            row or a dict) with the course CSV column names as keys.

    Returns:
        str: The lower-cased structured document.
    """
    def _field(key, default=''):
        """Return the field as a string, mapping None/NaN to ''."""
        value = row.get(key, default)
        if value is None:
            return ''
        if isinstance(value, str):
            return value
        return '' if pd.isna(value) else str(value)

    # Course identification
    metadata_section = (
        "=== COURSE METADATA ===\n"
        f"Course Code: {_field('Subject Course')}\n"
        f"CRN: {_field('CRN')}\n"
        f"Title: {_field('Course Title')}\n"
    )

    # Location information: anything that is not online/self-paced is
    # reported as in-person.
    campus = _field('Campus Description')
    campus_key = campus.lower()
    if campus_key == 'online':
        format_type = "Online"
    elif campus_key == 'no campus, no room needed':
        format_type = "Self-paced"
    else:
        format_type = "In-Person"
    location_section = (
        "=== LOCATION ===\n"
        f"Campus: {campus}\n"
        f"Format: {format_type}\n"
    )

    # Schedule information: clean_time does its own missing-value handling,
    # so the time cells are passed through raw.
    begin_time = clean_time(row.get('Begin Time', ''))
    end_time = clean_time(row.get('End Time', ''))
    days = _field('Days')
    schedule_section = "=== SCHEDULE ===\n"
    if begin_time and end_time and days:
        schedule_section += (f"Days: {days}\n"
                             f"Time: {format_time(begin_time)} to {format_time(end_time)}\n")
    else:
        schedule_section += "Schedule: Flexible/Self-paced\n"

    # Instructor information
    faculty = _field('Faculty Name')
    instructor_section = (
        "=== INSTRUCTOR ===\n"
        f"Professor: {faculty if faculty else 'Not specified'}\n"
    )

    # Course details including prerequisites, which are stored as the text of
    # a list, e.g. "['CS5010']".
    prerequisites = _field('Prerequisites', '[]')
    if prerequisites == '[]' or not prerequisites or prerequisites.strip() == '':
        prereq_text = "None required"
    else:
        prereq_text = prerequisites.strip('[]').replace("'", "").replace('"', '')
    details_section = (
        "=== COURSE DETAILS ===\n"
        f"Term: {_field('Term')}\n"
        f"Prerequisites: {prereq_text}\n"
    )

    # Course description
    description = _field('Course Description')
    description_section = (
        "=== DESCRIPTION ===\n"
        f"{description if description else 'No description available'}\n"
    )

    # Combine all sections; every section already ends with a newline, so the
    # joining newline yields a blank line between sections.
    sections = (
        metadata_section,
        location_section,
        schedule_section,
        instructor_section,
        details_section,
        description_section,
    )
    return "\n".join(section.lower() for section in sections)

def process_course_data(file_path):
    """Read the course CSV and render every row as a structured text document.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        list: One ``course_to_structured_text`` result per CSV row, in file
        order.
    """
    courses = pd.read_csv(file_path)
    structured_docs = []
    # course_to_sentence(course_row) is the prose-style alternative renderer.
    for _, course_row in courses.iterrows():
        structured_docs.append(course_to_structured_text(course_row))
    return structured_docs




In [None]:
# Render every CSV row as a structured text document (one string per course).
course_sentences = process_course_data('/content/drive/MyDrive/course_registration/courses.csv')

In [91]:
# Spot-check one rendered document; index 300 appears to be an arbitrary sample.
print(course_sentences[300])

=== course metadata ===
course code: cs5540
crn: 40567
title: game programming

=== location ===
campus: portland, maine
format: in-person

=== schedule ===
days: monday
time: 1:00 pm to 4:20 pm

=== instructor ===
professor: talaei khoei, tala

=== course details ===
term: spring 2025
prerequisites: none required

=== description ===
covers the skills needed to develop easily scalable and modifiable scripts that can be used to implement various game mechanics common to most game genres. programming is an integral part of the digital game design and development life cycle. designed as a foundational game programming course covering numerous aspects of game programming.



In [None]:
import pandas as pd
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import re

# Utility functions remain the same
def clean_time(time_val):
    """Turn a raw time cell into a zero-padded string, or ``None`` if absent.

    NaN/None, the empty string and zero all count as "no time". Any other
    value is truncated to an integer and left-padded with zeros to at least
    four characters (``915.0`` -> ``"0915"``).
    """
    missing = pd.isna(time_val) or time_val == '' or time_val == 0
    return None if missing else str(int(time_val)).zfill(4)

def format_time(time_str):
    """Format a four-character ``HHMM`` string as 12-hour clock text.

    Returns ``None`` when the input is falsy or not exactly four characters;
    otherwise a string such as ``"4:20 PM"``.
    """
    if not time_str or len(time_str) != 4:
        return None
    hh = int(time_str[:2])
    mm = time_str[2:]
    meridiem = "AM" if hh < 12 else "PM"
    # Hours 13-23 wrap to 1-11, hour 0 is shown as 12, everything else is kept.
    display_hour = hh - 12 if hh > 12 else (12 if hh == 0 else hh)
    return "{}:{} {}".format(display_hour, mm, meridiem)

def course_to_structured_text(row):
    """Render one course record as a lower-cased, sectioned text document.

    Sections (metadata, location, schedule, instructor, course details,
    description) are each introduced by a ``=== name ===`` marker and
    separated by a blank line.

    Missing values are treated as absent. pandas represents an empty CSV cell
    as NaN, which is a truthy float with no ``.lower()``/``.strip()``; reading
    the fields raw either raised ``AttributeError`` or wrote the literal text
    "nan" into the document. Every text field is normalised to a plain string
    before use.

    Args:
        row: Course record exposing ``.get(key, default)`` (a pandas Series
            row or a dict) with the course CSV column names as keys.

    Returns:
        str: The lower-cased structured document.
    """
    def _field(key, default=''):
        """Return the field as a string, mapping None/NaN to ''."""
        value = row.get(key, default)
        if value is None:
            return ''
        if isinstance(value, str):
            return value
        return '' if pd.isna(value) else str(value)

    metadata_section = (
        "=== COURSE METADATA ===\n"
        f"Course Code: {_field('Subject Course')}\n"
        f"CRN: {_field('CRN')}\n"
        f"Title: {_field('Course Title')}\n"
    )

    # Anything that is not online/self-paced is reported as in-person.
    campus = _field('Campus Description')
    campus_key = campus.lower()
    if campus_key == 'online':
        format_type = "Online"
    elif campus_key == 'no campus, no room needed':
        format_type = "Self-paced"
    else:
        format_type = "In-Person"
    location_section = (
        "=== LOCATION ===\n"
        f"Campus: {campus}\n"
        f"Format: {format_type}\n"
    )

    # clean_time does its own missing-value handling, so the time cells are
    # passed through raw.
    begin_time = clean_time(row.get('Begin Time', ''))
    end_time = clean_time(row.get('End Time', ''))
    days = _field('Days')
    schedule_section = "=== SCHEDULE ===\n"
    if begin_time and end_time and days:
        schedule_section += (f"Days: {days}\n"
                             f"Time: {format_time(begin_time)} to {format_time(end_time)}\n")
    else:
        schedule_section += "Schedule: Flexible/Self-paced\n"

    faculty = _field('Faculty Name')
    instructor_section = (
        "=== INSTRUCTOR ===\n"
        f"Professor: {faculty if faculty else 'Not specified'}\n"
    )

    # Prerequisites are stored as the text of a list, e.g. "['CS5010']".
    prerequisites = _field('Prerequisites', '[]')
    if prerequisites == '[]' or not prerequisites or prerequisites.strip() == '':
        prereq_text = "None required"
    else:
        prereq_text = prerequisites.strip('[]').replace("'", "").replace('"', '')
    details_section = (
        "=== COURSE DETAILS ===\n"
        f"Term: {_field('Term')}\n"
        f"Prerequisites: {prereq_text}\n"
    )

    description = _field('Course Description')
    description_section = (
        "=== DESCRIPTION ===\n"
        f"{description if description else 'No description available'}\n"
    )

    # Every section already ends with a newline, so the joining newline
    # yields a blank line between sections.
    sections = (
        metadata_section,
        location_section,
        schedule_section,
        instructor_section,
        details_section,
        description_section,
    )
    return "\n".join(section.lower() for section in sections)

def process_course_data(file_path):
    """Load the course CSV at ``file_path`` and return one structured text
    document per row, in file order."""
    course_table = pd.read_csv(file_path)
    documents = []
    for _, record in course_table.iterrows():
        documents.append(course_to_structured_text(record))
    return documents

class CourseSearchSystem:
    """TF-IDF retriever over structured course documents.

    Documents are indexed with ``add_course_sentences_to_db`` and searched
    with ``query_courses``. A query is expanded with section markers for any
    recognised course/professor/term/campus keyword, ranked by TF-IDF dot
    product, and then filtered so that every recognised course, professor and
    term string literally occurs in each returned document.
    """

    def __init__(self):
        self.documents = None
        # NOTE(review): the marker alternative contains a single ``\w+``, so
        # it can only match one-word markers such as "=== location ===";
        # two-word markers ("=== course metadata ===") fall back to ordinary
        # word tokens. Confirm whether that is intended.
        self.vectorizer = TfidfVectorizer(
            lowercase=True,
            token_pattern=r'(?u)\b\w+\b|===\s*\w+\s*===',  # Include section markers
            ngram_range=(1, 2)  # Include bigrams for better matching
        )
        self.tfidf_matrix = None

    def preprocess_query(self, query):
        """Extract structured information from query.

        Only a fixed set of keywords is recognised (two course topics, one
        professor, one term, one campus).

        Returns:
            dict: Keys ``course``, ``professor``, ``term`` and ``campus``;
            each value is the canonical lower-case string or ``None``.
        """
        query_parts = {
            'course': None,
            'professor': None,
            'term': None,
            'campus': None
        }

        query = query.lower()

        # Extract course information
        if 'algorithms' in query:
            query_parts['course'] = 'algorithms'
        elif 'artificial intelligence' in query:
            query_parts['course'] = 'artificial intelligence'

        # Extract professor name (stored as "last, first" in the documents)
        if 'rajagopal' in query or 'venkatesaramani' in query:
            query_parts['professor'] = 'venkatesaramani, rajagopal'

        # Extract term
        if 'spring 2025' in query:
            query_parts['term'] = 'spring 2025'

        # Extract campus
        if 'boston' in query:
            query_parts['campus'] = 'boston'

        return query_parts

    def enhance_query(self, query):
        """Enhance query with structural information.

        Returns:
            tuple: ``(enhanced_query, query_parts)`` where ``enhanced_query``
            is the lower-cased query followed by one section-marker phrase per
            recognised part, and ``query_parts`` is the dict from
            ``preprocess_query``.
        """
        query_parts = self.preprocess_query(query)
        enhanced_query = query.lower()

        if query_parts['course']:
            enhanced_query += f" === course metadata === title: {query_parts['course']}"
        if query_parts['professor']:
            enhanced_query += f" === instructor === professor: {query_parts['professor']}"
        if query_parts['term']:
            enhanced_query += f" === course details === term: {query_parts['term']}"
        if query_parts['campus']:
            enhanced_query += f" === location === campus: {query_parts['campus']}"

        return enhanced_query, query_parts

    def add_course_sentences_to_db(self, course_data):
        """Index the given documents, ignoring ``None`` entries.

        With no documents left to index the system is put in the
        "nothing indexed" state instead of handing an empty corpus to the
        vectorizer.
        """
        self.documents = [doc for doc in course_data if doc is not None]
        if not self.documents:
            self.tfidf_matrix = None
            return
        self.tfidf_matrix = self.vectorizer.fit_transform(self.documents)

    def query_courses(self, query_text, n_results=5):
        """Return up to ``n_results`` documents for the query, best first.

        The whole ranking is walked in descending score order and the first
        ``n_results`` documents passing the course/professor/term filters are
        kept. Filtering only the top ``n_results`` would drop matching
        documents ranked just below non-matching ones and could return fewer
        results than are available, or none.

        NOTE(review): the recognised campus is used to expand the query but
        is not applied as a filter; confirm whether that is intentional.

        Returns:
            dict: ``{"documents": [list_of_documents]}``. The inner list holds
            a single placeholder message when nothing is indexed or when the
            search raised an exception.
        """
        enhanced_query, query_parts = self.enhance_query(query_text)

        try:
            if self.tfidf_matrix is None:
                return {"documents": [["No documents indexed"]]}

            query_vec = self.vectorizer.transform([enhanced_query])
            scores = (query_vec @ self.tfidf_matrix.T).toarray()[0]

            required_terms = [
                query_parts[key]
                for key in ('course', 'professor', 'term')
                if query_parts[key]
            ]

            filtered_results = []
            if n_results > 0:
                # Descending score order over all documents.
                for idx in np.argsort(scores)[::-1]:
                    doc = self.documents[idx]
                    doc_lower = doc.lower()
                    if all(term in doc_lower for term in required_terms):
                        filtered_results.append(doc)
                        if len(filtered_results) >= n_results:
                            break

            return {"documents": [filtered_results]}

        except Exception as e:
            # Best effort: report the failure and hand back a placeholder.
            print(f"Error during search: {e}")
            return {"documents": [["Error occurred during search"]]}

class RAGPipeline:
    """Retrieval-augmented answering: fetch course documents for a query and
    let a chat model answer with those documents as context."""

    SYSTEM_INSTRUCTION = """
    You are Curriculum compass, a chatbot which helps Northeastern University students find course offerings of their choice for the Spring 2025 semester.

    You have access to all the course offerings for the Spring 2025 semester, your objective is to use this context to answer student questions.

    Instructions:
    1. Students may not mention the names of the courses properly. Their input could have typo's, mistakes. For example, students could input 'PDP' instead of
    'Programming Design Paradigm' or they could mention 'Raj Venkat' instead of the full name of the professor 'Rajagopal Venkatesaramani'
    """

    def __init__(self, course_search_system):
        """Keep the retriever and load the chat model followed by its tokenizer."""
        self.course_search_system = course_search_system
        checkpoint = "Qwen/Qwen2.5-3B-Instruct"
        self.model = AutoModelForCausalLM.from_pretrained(
            checkpoint, torch_dtype="auto", device_map="auto"
        )
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def generate_response(self, query, retrieved_docs):
        """Answer ``query`` using the retrieved documents as context.

        Args:
            query: The user's question.
            retrieved_docs: Iterable of document lists, as found under the
                ``"documents"`` key of a search result.

        Returns:
            The decoded text of the first generated completion.
        """
        # Flatten the list of lists into one blank-line separated context.
        flat_docs = []
        for group in retrieved_docs:
            for doc in group:
                flat_docs.append(doc)
        context = "\n\n".join(flat_docs)

        chat = [
            {"role": "system", "content": self.SYSTEM_INSTRUCTION},
            {"role": "user", "content": f"Context:{context}, Query: {query}"},
        ]
        chat_text = self.tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )

        encoded = self.tokenizer([chat_text], return_tensors="pt").to(self.model.device)
        full_sequences = self.model.generate(
            **encoded, max_new_tokens=4500, temperature=0.1
        )

        # Each returned sequence starts with its prompt tokens; drop them so
        # only the newly generated tokens are decoded.
        completions = []
        for prompt_ids, sequence in zip(encoded.input_ids, full_sequences):
            completions.append(sequence[len(prompt_ids):])

        decoded = self.tokenizer.batch_decode(completions, skip_special_tokens=True)
        return decoded[0]

    def __call__(self, query, top_k=10):
        """Retrieve up to ``top_k`` documents for ``query`` and return the
        generated answer, printing progress messages along the way."""
        print(f"Processing query: {query}")
        print("Retrieving relevant course information...")
        search_results = self.course_search_system.query_courses(query, top_k)

        print("Generating response...")
        return self.generate_response(query, search_results["documents"])

def main():
    """Build the retriever and RAG pipeline, then answer the demo queries.

    For every query a banner, the query, and either the generated response or
    an ``Error: ...`` line is printed.
    """
    # Load the course documents and index them for TF-IDF retrieval.
    documents = process_course_data('/content/drive/MyDrive/course_registration/courses.csv')
    retriever = CourseSearchSystem()
    retriever.add_course_sentences_to_db(documents)

    # Wrap the retriever in the generation pipeline.
    pipeline = RAGPipeline(retriever)

    demo_queries = [
        "Are there any prerequisite courses for Artificial Intelligence for Human Computer Interaction?"
    ]

    divider = "=" * 50
    for question in demo_queries:
        print("\n" + divider)
        print(f"Query: {question}")
        try:
            answer = pipeline(question)
            print("Response:")
            print(answer)
        except Exception as e:
            # Keep going with the remaining queries if one of them fails.
            print(f"Error: {e}")
        print(divider)

# Run the demo only when this code is the top-level program (a notebook cell or script), not on import.
if __name__ == "__main__":
    main()

Experimentation to find the best RAG retriever

In [92]:
# import chromadb
# import torch
# import pandas as pd
# from sentence_transformers import SentenceTransformer
# import numpy as np
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import re

# class CustomEmbeddingFunction:
#     def __init__(self, model_name="all-MiniLM-L6-v2"):
#         self.model = SentenceTransformer(model_name)

#     def __call__(self, input):
#         if isinstance(input, str):
#             input = [input]
#         embeddings = self.model.encode(input, convert_to_numpy=True)
#         if len(embeddings.shape) == 1:
#             embeddings = np.expand_dims(embeddings, axis=0)
#         return embeddings.tolist()

# class CourseSearchSystem:
#     def __init__(
#         self,
#         embedding_model_name: str = "all-MiniLM-L6-v2",
#         device: str = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
#     ):
#         self.device = device
#         self.embedding_function = CustomEmbeddingFunction(embedding_model_name)
#         self.chroma_client = chromadb.Client()

#         try:
#             self.chroma_client.delete_collection(name="course_embeddings")
#         except:
#             pass

#         self.collection = self.chroma_client.create_collection(
#             name="course_embeddings",
#             embedding_function=self.embedding_function
#         )


#     def add_course_sentences_to_db(self, course_data):
#         """Process and add structured course descriptions to the database"""

#         documents = [doc for doc in course_data if doc is not None]
#         ids = [f"course_{i}" for i in range(len(documents))]

#         # print(f"Sample metadata: {metadatas[0]}\n\n Sample document: {documents[0]}\n\n Sample ID: {ids[0]}")

#         self.collection.add(
#             documents=documents,
#             ids=ids
#         )

#     def query_courses(self, query_text, n_results=5):
#         """Enhanced query function for structured text"""
#         # Clean and normalize query
#         query_text = query_text.lower()

#         return self.collection.query(
#             query_texts=[query_text],
#             n_results=n_results,
#             include=["documents"]
#         )








In [93]:
# course_search_system = CourseSearchSystem()
# course_search_system.add_course_sentences_to_db(course_sentences)

In [52]:
# course_search_system.query_courses("who teaches algorithms offered for the spring?", n_results=5)["documents"]

In [123]:
# #Implementing the RAG pipeline
# class RAGPipeline:
#     SYSTEM_INSTRUCTION = """
#     You are 'Course Compass', an AI assistant for a university's course registration system. Your primary role is to provide precise and accurate course information based STRICTLY on the provided context, with specific focus on matching query details to context entries accurately.

#     Key guidelines:
#     1. Closely match each detail in the user's query to the context—course codes, professor names, course timings, and locations must be checked for exact matches or very close approximations.
#     2. Extract and provide comprehensive details about:
#       - Course format (online/in-person)
#       - Schedule and timing
#       - Prerequisites
#       - Course description
#     3. If a query references details seemingly not present in the context, double-check for synonyms or variations before stating absence.
#     4. Clarify any mismatches or ambiguities without making assumptions about unmentioned details.

#     Response protocol:
#     1. Directly address each specific detail mentioned in the query.
#     2. Clearly state when a detail from the query does not exactly match but is closely related, providing the related information.
#     3. Indicate when no matching information is found but also mention similar available information.
#     4. Structure responses to clearly separate confirmed details from those that are not available or ambiguous.

#     Response format:
#     1. Course Information: Code, title, format, campus
#     2. Professor Details: Names, teaching schedules, and any specific notes on their courses
#     3. Schedule Information: Detailed day and time listings
#     4. Course Content/Description: Detailed summary from the provided context
#     5. Prerequisites: Listed if specifically mentioned
#     """

#     def __init__(self, course_search_system):
#         self.course_search_system = course_search_system
#         model_name = "Qwen/Qwen2.5-1.5B-Instruct"
#         self.model = AutoModelForCausalLM.from_pretrained(
#             model_name,
#             torch_dtype="auto",
#             device_map="auto"
#         )
#         self.tokenizer = AutoTokenizer.from_pretrained(model_name)

#     def generate_response(self, query, retrieved_docs):
#         # Prepare context by joining all retrieved documents
#         context = "\n\n".join([doc for sublist in retrieved_docs for doc in sublist])

#         # Create a detailed prompt for the model
#         prompt = f"""Based on the provided course information, please answer the following query.

#         Query: {query}

#         Context:
#         {context}

#         Instructions:
#         1. Verify the presence of the course or professor mentioned in the query against the context.
#         2. If there is a mismatch or absence of information, explicitly state that the information is not available.
#         3. Provide details only if they are specifically mentioned and confirmed in the context.
#         4. Ignore and clarify the absence of unrelated or unconfirmed details.

#         Response:
#         """

#         # Prepare messages for the model
#         messages = [
#             {"role": "system", "content": self.SYSTEM_INSTRUCTION},
#             {"role": "user", "content": prompt}
#         ]

#         # Generate response using the model
#         text = self.tokenizer.apply_chat_template(
#             messages,
#             tokenize=False,
#             add_generation_prompt=True
#         )

#         model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
#         generated_ids = self.model.generate(
#             **model_inputs,
#             max_new_tokens=4500,
#             temperature=0.1
#         )

#         # Process the generated response
#         generated_ids = [
#             output_ids[len(input_ids):]
#             for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
#         ]

#         return self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

#     def __call__(self, query, top_k=5):
#         print(f"Processing query: {query}")
#         print("Retrieving relevant course information...")
#         results = self.course_search_system.query_courses(query, top_k)

#         print("Generating response...")
#         response = self.generate_response(query, results["documents"])
#         return response

# def main():
#     print("Initializing RAG pipeline...")
#     rag_pipeline = RAGPipeline(course_search_system)


#     example_queries = [
#         "What are the courses professor shesh amit is teaching?"
#     ]

#     # Process each query
#     for query in example_queries:
#         print("\n" + "="*50)
#         print(f"Query: {query}")
#         response = rag_pipeline(query,5)
#         print("\nResponse:")
#         print(response)
#         print("="*50)

# if __name__ == "__main__":
#     main()


In [121]:
# course_search_system.query_courses("who offers Foundations of Artificial Intelligence offered for Spring 2025?")["documents"]

In [122]:
# course_search_system.query_courses("shesh amit", n_results=5)["documents"]

In [120]:
# import chromadb
# import torch
# import pandas as pd
# from sentence_transformers import SentenceTransformer
# import numpy as np
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from rank_bm25 import BM25Okapi
# from sklearn.feature_extraction.text import TfidfVectorizer
# import re

# # Utility functions
# def clean_time(time_val):
#     if pd.isna(time_val) or time_val == '' or time_val == 0:
#         return None
#     return str(int(time_val)).zfill(4)

# def format_time(time_str):
#     if not time_str or len(time_str) != 4:
#         return None
#     hours = int(time_str[:2])
#     minutes = time_str[2:]
#     period = "AM" if hours < 12 else "PM"
#     if hours > 12:
#         hours -= 12
#     elif hours == 0:
#         hours = 12
#     return f"{hours}:{minutes} {period}"

# def course_to_structured_text(row):
#     metadata_section = (
#         "=== COURSE METADATA ===\n"
#         f"Course Code: {row.get('Subject Course', '')}\n"
#         f"CRN: {str(row.get('CRN', ''))}\n"
#         f"Title: {row.get('Course Title', '')}\n"
#     )

#     campus = row.get('Campus Description', '')
#     format_type = ("Online" if campus.lower() == 'online'
#                   else "Self-paced" if campus.lower() == 'no campus, no room needed'
#                   else "In-Person")
#     location_section = (
#         "=== LOCATION ===\n"
#         f"Campus: {campus}\n"
#         f"Format: {format_type}\n"
#     )

#     begin_time = clean_time(row.get('Begin Time', ''))
#     end_time = clean_time(row.get('End Time', ''))
#     days = row.get('Days', '')
#     schedule_section = "=== SCHEDULE ===\n"
#     if begin_time and end_time and days:
#         schedule_section += (f"Days: {days}\n"
#                            f"Time: {format_time(begin_time)} to {format_time(end_time)}\n")
#     else:
#         schedule_section += "Schedule: Flexible/Self-paced\n"

#     faculty = row.get('Faculty Name', '')
#     instructor_section = (
#         "=== INSTRUCTOR ===\n"
#         f"Professor: {faculty if faculty else 'Not specified'}\n"
#     )

#     prerequisites = row.get('Prerequisites', '[]')
#     prereq_text = ("None required" if prerequisites == '[]' or not prerequisites or prerequisites.strip() == ''
#                   else prerequisites.strip('[]').replace("'", "").replace('"', ''))
#     details_section = (
#         "=== COURSE DETAILS ===\n"
#         f"Term: {row.get('Term', '')}\n"
#         f"Prerequisites: {prereq_text}\n"
#     )

#     description = row.get('Course Description', '')
#     description_section = (
#         "=== DESCRIPTION ===\n"
#         f"{description if description else 'No description available'}\n"
#     )

#     return (f"{metadata_section.lower()}\n"
#             f"{location_section.lower()}\n"
#             f"{schedule_section.lower()}\n"
#             f"{instructor_section.lower()}\n"
#             f"{details_section.lower()}\n"
#             f"{description_section.lower()}")

# def process_course_data(file_path):
#     df = pd.read_csv(file_path)
#     return [course_to_structured_text(row) for _, row in df.iterrows()]

# # Embedding Function
# class CustomEmbeddingFunction:
#     def __init__(self, model_name="all-MiniLM-L6-v2"):
#         self.model = SentenceTransformer(model_name)

#     def __call__(self, input):
#         if isinstance(input, str):
#             input = [input]
#         embeddings = self.model.encode(input, convert_to_numpy=True)
#         if len(embeddings.shape) == 1:
#             embeddings = np.expand_dims(embeddings, axis=0)
#         return embeddings.tolist()



# class CourseSearchSystem:
#     def __init__(
#         self,
#         retriever_type: str = "dense",
#         embedding_model_name: str = "all-MiniLM-L6-v2",
#         device: str = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
#     ):
#         self.device = device
#         self.retriever_type = retriever_type
#         self.documents = None

#         if retriever_type == "dense":
#             self.embedding_function = CustomEmbeddingFunction(embedding_model_name)
#             self.chroma_client = chromadb.Client()
#             try:
#                 self.chroma_client.delete_collection(name="course_embeddings")
#             except:
#                 pass
#             self.collection = self.chroma_client.create_collection(
#                 name="course_embeddings",
#                 embedding_function=self.embedding_function
#             )
#         elif retriever_type == "bm25":
#             self.bm25 = None
#             self.section_weights = {
#                 "metadata": 2.0,  # Higher weight for course title/code matches
#                 "instructor": 2.0,  # Higher weight for professor matches
#                 "details": 1.5,  # Medium weight for term matches
#                 "location": 1.5,  # Medium weight for campus matches
#                 "description": 0.5  # Lower weight for general content
#             }
#         elif retriever_type == "tfidf":
#             self.vectorizer = TfidfVectorizer(
#                 lowercase=True,
#                 token_pattern=r'(?u)\b\w+\b|===\s*\w+\s*===',  # Include section markers
#                 ngram_range=(1, 2)  # Include bigrams for better matching
#             )
#             self.tfidf_matrix = None

#     def preprocess_query(self, query):
#         """Extract structured information from query"""
#         query_parts = {
#             'course': None,
#             'professor': None,
#             'term': None,
#             'campus': None
#         }

#         # Convert to lowercase for consistent matching
#         query = query.lower()

#         # Extract course information
#         if 'algorithms' in query:
#             query_parts['course'] = 'algorithms'
#         elif 'artificial intelligence' in query:
#             query_parts['course'] = 'artificial intelligence'

#         # Extract professor name
#         if 'rajagopal' in query or 'venkatesaramani' in query:
#             query_parts['professor'] = 'venkatesaramani, rajagopal'

#         # Extract term
#         if 'spring 2025' in query:
#             query_parts['term'] = 'spring 2025'

#         # Extract campus
#         if 'boston' in query:
#             query_parts['campus'] = 'boston'

#         return query_parts

#     def enhance_query(self, query):
#         """Enhance query with structural information"""
#         query_parts = self.preprocess_query(query)
#         enhanced_query = query.lower()

#         if query_parts['course']:
#             enhanced_query += f" === course metadata === title: {query_parts['course']}"
#         if query_parts['professor']:
#             enhanced_query += f" === instructor === professor: {query_parts['professor']}"
#         if query_parts['term']:
#             enhanced_query += f" === course details === term: {query_parts['term']}"
#         if query_parts['campus']:
#             enhanced_query += f" === location === campus: {query_parts['campus']}"

#         return enhanced_query, query_parts

#     def add_course_sentences_to_db(self, course_data):
#         self.documents = [doc for doc in course_data if doc is not None]

#         if self.retriever_type == "dense":
#             ids = [f"course_{i}" for i in range(len(self.documents))]
#             self.collection.add(
#                 documents=self.documents,
#                 ids=ids
#             )
#         elif self.retriever_type == "bm25":
#             # Create weighted document representations
#             tokenized_docs = []
#             for doc in self.documents:
#                 tokens = []
#                 for section in ['metadata', 'instructor', 'details', 'location']:
#                     if f"=== {section} ===" in doc.lower():
#                         # Repeat this section's tokens int(weight) times so higher-weighted sections count more
#                         weight = self.section_weights.get(section, 1.0)
#                         section_tokens = doc.lower().split(f"=== {section} ===")[1].split("===")[0].split()
#                         tokens.extend(section_tokens * int(weight))
#                 tokenized_docs.append(tokens)
#             self.bm25 = BM25Okapi(tokenized_docs)

#         elif self.retriever_type == "tfidf":
#             # Include section markers in vectorization
#             self.tfidf_matrix = self.vectorizer.fit_transform(self.documents)

#     def query_courses(self, query_text, n_results=5):
#         enhanced_query, query_parts = self.enhance_query(query_text)

#         try:
#             if self.retriever_type == "dense":
#                 return self.collection.query(
#                     query_texts=[enhanced_query],
#                     n_results=n_results,
#                     include=["documents"]
#                 )

#             elif self.retriever_type == "bm25":
#                 if not self.bm25:
#                     return {"documents": [["No documents indexed"]]}

#                 # Create weighted query tokens
#                 query_tokens = []
#                 # Add query parts with appropriate weights
#                 for part_type, part_value in query_parts.items():
#                     if part_value:
#                         weight = self.section_weights.get(part_type, 1.0)
#                         tokens = part_value.split()
#                         query_tokens.extend(tokens * int(weight))

#                 # Add original query terms
#                 query_tokens.extend(query_text.lower().split())

#                 # Get scores and rank documents
#                 scores = self.bm25.get_scores(query_tokens)
#                 top_n = np.argsort(scores)[-n_results:][::-1]

#                 # Filter results based on exact matches for critical fields
#                 filtered_results = []
#                 for idx in top_n:
#                     doc = self.documents[idx]
#                     doc_lower = doc.lower()

#                     # Check for exact matches on critical fields
#                     matches_all = True
#                     if query_parts['course'] and query_parts['course'] not in doc_lower:
#                         matches_all = False
#                     if query_parts['professor'] and query_parts['professor'] not in doc_lower:
#                         matches_all = False
#                     if query_parts['term'] and query_parts['term'] not in doc_lower:
#                         matches_all = False

#                     if matches_all:
#                         filtered_results.append(doc)

#                 return {"documents": [filtered_results[:n_results]]}

#             elif self.retriever_type == "tfidf":
#                 if self.tfidf_matrix is None:
#                     return {"documents": [["No documents indexed"]]}

#                 # Transform enhanced query
#                 query_vec = self.vectorizer.transform([enhanced_query])
#                 scores = (query_vec @ self.tfidf_matrix.T).toarray()[0]
#                 top_n = np.argsort(scores)[-n_results:][::-1]

#                 # Filter results similarly to BM25
#                 filtered_results = []
#                 for idx in top_n:
#                     doc = self.documents[idx]
#                     doc_lower = doc.lower()

#                     matches_all = True
#                     if query_parts['course'] and query_parts['course'] not in doc_lower:
#                         matches_all = False
#                     if query_parts['professor'] and query_parts['professor'] not in doc_lower:
#                         matches_all = False
#                     if query_parts['term'] and query_parts['term'] not in doc_lower:
#                         matches_all = False

#                     if matches_all:
#                         filtered_results.append(doc)

#                 return {"documents": [filtered_results[:n_results]]}

#         except Exception as e:
#             print(f"Error during {self.retriever_type} search: {e}")
#             return {"documents": [["Error occurred during search"]]}

# # Enhanced RAG Pipeline
# class RAGPipeline:
#     SYSTEM_INSTRUCTION = """
#         You are Curriculum Compass, a chatbot that helps Northeastern University students find course offerings of their choice for the Spring 2025 semester.

#         You have access to all the course offerings for the Spring 2025 semester; your objective is to use this context to answer student questions.

#         Instructions:
#         1. Students may not mention the names of the courses properly. Their input could have typos or other mistakes. For example, students could input 'PDP' instead of
#         'Programming Design Paradigm', or they could mention 'Raj Venkat' instead of the full name of the professor, 'Rajagopal Venkatesaramani'.
#         """

#     def __init__(self, course_search_system):
#         self.course_search_system = course_search_system
#         model_name = "Qwen/Qwen2.5-3B-Instruct"
#         self.model = AutoModelForCausalLM.from_pretrained(
#             model_name,
#             torch_dtype="auto",
#             device_map="auto"
#         )
#         self.tokenizer = AutoTokenizer.from_pretrained(model_name)

#     def generate_response(self, query, retrieved_docs):
#         context = "\n\n".join([doc for sublist in retrieved_docs for doc in sublist])
#         prompt = f"""Context:{context}, Query: {query}"""

#         messages = [
#             {"role": "system", "content": self.SYSTEM_INSTRUCTION},
#             {"role": "user", "content": prompt}
#         ]

#         text = self.tokenizer.apply_chat_template(
#             messages,
#             tokenize=False,
#             add_generation_prompt=True
#         )

#         model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
#         generated_ids = self.model.generate(
#             **model_inputs,
#             max_new_tokens=4500,
#             temperature=0.1
#         )

#         generated_ids = [
#             output_ids[len(input_ids):]
#             for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
#         ]

#         return self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

#     def __call__(self, query, top_k=10):
#         print(f"Processing query: {query}")
#         print("Retrieving relevant course information...")
#         results = self.course_search_system.query_courses(query, top_k)

#         print("Results:")
#         print(results["documents"])

#         print("Generating response...")
#         response = self.generate_response(query, results["documents"])
#         return response

# def main():
#     # Load course data
#     course_sentences = process_course_data('/content/drive/MyDrive/course_registration/courses.csv')

#     # Initialize retrievers
#     retrievers = {
#         "dense": CourseSearchSystem(retriever_type="dense"),
#         "bm25": CourseSearchSystem(retriever_type="bm25"),
#         "tfidf": CourseSearchSystem(retriever_type="tfidf")
#     }

#     # Initialize and index documents for each retriever
#     for system in retrievers.values():
#         system.add_course_sentences_to_db(course_sentences)

#     # Create RAG pipelines
#     rag_pipelines = {name: RAGPipeline(system) for name, system in retrievers.items()}

#     # Test queries
#     example_queries = [
#         "Are there any prerequisite courses for Artificial Intelligence for Human Computer Interaction?"
#     ]

#     # Process queries with each retriever
#     for query in example_queries:
#         print("\n" + "="*50)
#         print(f"Query: {query}")

#         for name, pipeline in rag_pipelines.items():
#             print(f"\n{name.upper()} Retriever Results:")
#             print("-" * 30)
#             try:
#                 response = pipeline(query)
#                 print(response)
#             except Exception as e:
#                 print(f"Error with {name} retriever: {e}")
#             print("-" * 30)

# if __name__ == "__main__":
#     main()