In [1]:
import pandas as pd
! pip install chromadb sentence_transformers transformers numpy pandas


In [2]:
# Colab-only step: mount Google Drive at /content/drive so files stored there
# (the course CSV read below) are reachable through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

In [3]:
# Load the course catalog CSV from the mounted Drive folder into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/course_registration/courses.csv')

     CRN Campus Description                           Course Title  \
0  34154             Online  Computer Science and Its Applications   
1  32828             Boston                        Lab for CS 1100   
2  32829             Boston                        Lab for CS 1100   
3  32830             Boston                        Lab for CS 1100   
4  34155             Online                     First Year Seminar   

  Subject Course       Faculty Name  \
0         CS1100   Lieberherr, Karl   
1         CS1101   Lieberherr, Karl   
2         CS1101   Lieberherr, Karl   
3         CS1101   Lieberherr, Karl   
4         CS1200  Wassinger, Claire   

                                  Course Description         Term  Begin Time  \
0  Introduces students to the field of computer s...  Spring 2025         NaN   
1  Accompanies CS 1100. Involves experiments and ...  Spring 2025       800.0   
2  Accompanies CS 1100. Involves experiments and ...  Spring 2025       915.0   
3  Accompanies CS 11

In [4]:
# Preview the first rows. Leaving the frame as the cell's last expression lets
# the notebook render it with the rich DataFrame display instead of the
# wrapped plain-text output that print() produces.
df.head()

(500, 11)

In [None]:
# (rows, columns) of the loaded course catalog.
df.shape

In [120]:
import pandas as pd

def clean_time(time_val):
    """Normalize a raw begin/end time cell to a zero-padded ``HHMM`` string.

    Args:
        time_val: Raw value such as ``800.0``, ``915`` or ``"1330.0"``.

    Returns:
        A string like ``"0800"``, or ``None`` when the value is missing
        (NaN/None), empty, zero, or cannot be parsed as a number.
    """
    if pd.isna(time_val) or time_val == '' or time_val == 0:
        return None
    if isinstance(time_val, str):
        time_val = time_val.strip()
        if not time_val:
            return None
    try:
        # Parse through float() so numeric strings such as "800.0" are
        # accepted; int("800.0") would raise ValueError.
        hhmm = int(float(time_val))
    except (TypeError, ValueError, OverflowError):
        # Non-numeric placeholders are treated as "no time".
        return None
    return str(hhmm).zfill(4)


def format_time(time_str):
    """Convert a 4-character 24-hour ``HHMM`` string to ``H:MM AM/PM``.

    Returns ``None`` when ``time_str`` is empty/None or is not exactly four
    characters long.
    """
    if not time_str:
        return None
    if len(time_str) != 4:
        return None

    hour_24 = int(time_str[:2])
    minute_part = time_str[2:]

    if hour_24 < 12:
        meridiem = "AM"
    else:
        meridiem = "PM"

    # Map the 24-hour value onto the 12-hour clock face (0 -> 12, 13 -> 1, ...).
    if hour_24 == 0:
        hour_12 = 12
    elif hour_24 > 12:
        hour_12 = hour_24 - 12
    else:
        hour_12 = hour_24

    return f"{hour_12}:{minute_part} {meridiem}"



def course_to_sentence(row):
    """Render one course record as a natural-language description.

    Args:
        row: Mapping-like record exposing ``.get`` (e.g. a pandas Series from
            ``DataFrame.iterrows()`` or a plain dict) with the catalog columns
            read below.

    Returns:
        A whitespace-normalized string: a summary sentence (code, CRN, title,
        campus, term, schedule, instructor), followed by the course
        description and a prerequisites sentence.
    """
    def field(key, default=''):
        # Empty CSV cells arrive from pandas as NaN, which is truthy and has no
        # string methods; collapse NaN/None to '' so the checks below behave.
        value = row.get(key, default)
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ''
        return str(value)

    crn = field('CRN')
    campus = field('Campus Description').strip()
    title = field('Course Title')
    subject_course = field('Subject Course')
    faculty = field('Faculty Name').strip()
    description = field('Course Description')
    term = field('Term').strip()
    days = field('Days').strip()
    prerequisites = field('Prerequisites', '[]')

    begin_time = clean_time(row.get('Begin Time', ''))
    end_time = clean_time(row.get('End Time', ''))

    sentence_parts = [f"{subject_course} (CRN: {crn}), {title},"]

    # Delivery format / location; skipped entirely when the campus is unknown.
    campus_key = campus.lower()
    if campus_key == 'online':
        sentence_parts.append("is an online course")
    elif campus_key == 'no campus, no room needed':
        sentence_parts.append("is a self-paced course")
    elif campus:
        sentence_parts.append(f"is offered at {campus}")

    if term:
        sentence_parts.append(f"for {term}")

    # Only describe the schedule when days and both formatted times exist, so
    # the sentence never contains "None" or "nan".
    formatted_begin = format_time(begin_time)
    formatted_end = format_time(end_time)
    if days and formatted_begin and formatted_end:
        sentence_parts.append(
            f"with classes scheduled {days} from {formatted_begin} to {formatted_end}"
        )

    if faculty:
        sentence_parts.append(f"taught by Professor {faculty}")

    sentence = " ".join(sentence_parts) + "."

    if description.strip():
        sentence += f" {' '.join(description.split())}"

    # Prerequisites are stored as a stringified list such as "['CS1100']";
    # drop the brackets and quotes to get a readable, comma-separated list.
    prereqs = (
        prerequisites.strip().strip('[]').replace("'", "").replace('"', '').strip()
    )
    if prereqs:
        sentence += f" Prerequisites for this course: {prereqs}."
    else:
        sentence += " This course is open to all students with no prerequisites required."

    return ' '.join(sentence.split())


def course_to_structured_text(row):
    """Convert one course record to a sectioned, lower-cased text document.

    Args:
        row: Mapping-like record exposing ``.get`` (e.g. a pandas Series from
            ``DataFrame.iterrows()`` or a plain dict) with the catalog columns
            read below.

    Returns:
        A string made of ``=== SECTION ===`` blocks (metadata, location,
        schedule, instructor, course details, description) separated by blank
        lines. Every section is lower-cased before being joined.
    """
    def field(key, default=''):
        # Empty CSV cells arrive from pandas as NaN, which is truthy and has no
        # string methods; collapse NaN/None to '' so the fallbacks below apply
        # instead of crashing or emitting the literal text "nan".
        value = row.get(key, default)
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ''
        return str(value)

    # Course identification
    metadata_section = (
        "=== COURSE METADATA ===\n"
        f"Course Code: {field('Subject Course')}\n"
        f"CRN: {field('CRN')}\n"
        f"Title: {field('Course Title')}\n"
    )

    # Location information
    campus = field('Campus Description')
    campus_key = campus.lower()
    if campus_key == 'online':
        format_type = "Online"
    elif campus_key == 'no campus, no room needed':
        format_type = "Self-paced"
    elif campus.strip():
        format_type = "In-Person"
    else:
        # Unknown campus: do not claim the course meets in person.
        format_type = "Not specified"
    location_section = (
        "=== LOCATION ===\n"
        f"Campus: {campus if campus.strip() else 'Not specified'}\n"
        f"Format: {format_type}\n"
    )

    # Schedule information
    begin_time = clean_time(row.get('Begin Time', ''))
    end_time = clean_time(row.get('End Time', ''))
    days = field('Days')
    formatted_begin = format_time(begin_time)
    formatted_end = format_time(end_time)
    schedule_section = "=== SCHEDULE ===\n"
    if days.strip() and formatted_begin and formatted_end:
        schedule_section += (f"Days: {days}\n"
                             f"Time: {formatted_begin} to {formatted_end}\n")
    else:
        schedule_section += "Schedule: Flexible/Self-paced\n"

    # Instructor information
    faculty = field('Faculty Name')
    instructor_section = (
        "=== INSTRUCTOR ===\n"
        f"Professor: {faculty if faculty.strip() else 'Not specified'}\n"
    )

    # Course details including prerequisites. Prerequisites are stored as a
    # stringified list such as "['CS1100']"; strip brackets and quotes.
    prerequisites = field('Prerequisites', '[]')
    parsed_prereqs = prerequisites.strip('[]').replace("'", "").replace('"', '')
    prereq_text = parsed_prereqs if parsed_prereqs.strip() else "None required"
    details_section = (
        "=== COURSE DETAILS ===\n"
        f"Term: {field('Term')}\n"
        f"Prerequisites: {prereq_text}\n"
    )

    # Course description
    description = field('Course Description')
    description_section = (
        "=== DESCRIPTION ===\n"
        f"{description if description.strip() else 'No description available'}\n"
    )

    # Combine all sections; each already ends in a newline, so the extra "\n"
    # leaves one blank line between consecutive sections.
    return (f"{metadata_section.lower()}\n"
            f"{location_section.lower()}\n"
            f"{schedule_section.lower()}\n"
            f"{instructor_section.lower()}\n"
            f"{details_section.lower()}\n"
            f"{description_section.lower()}")

def process_course_data(file_path):
    """Read the course CSV at ``file_path`` and render every row as text.

    Each row is rendered with ``course_to_structured_text``; swap in
    ``course_to_sentence`` to get the prose form instead.

    Returns:
        A list with one document string per CSV row, in file order.
    """
    courses = pd.read_csv(file_path)
    documents = []
    for _, course_row in courses.iterrows():
        documents.append(course_to_structured_text(course_row))
    return documents




In [121]:
# Build the text corpus: one rendered document per row of the course CSV.
course_sentences = process_course_data('/content/drive/MyDrive/course_registration/courses.csv')

In [130]:
# Spot-check a single rendered document; print() is used so the embedded
# newlines are shown as line breaks rather than as "\n" in a repr.
print(course_sentences[300])

['=== COURSE METADATA ===\nCourse Code: CS1100\nCRN: 34154\nTitle: Computer Science and Its Applications\n\n=== LOCATION ===\nCampus: Online\nFormat: Online\n\n=== SCHEDULE ===\nSchedule: Flexible/Self-paced\n\n=== INSTRUCTOR ===\nProfessor: Lieberherr, Karl\n\n=== COURSE DETAILS ===\nTerm: Spring 2025\nPrerequisites: None required\n\n=== DESCRIPTION ===\nIntroduces students to the field of computer science and the patterns of thinking that enable them to become intelligent users of software tools in a problem-solving setting. Examines several important software applications so that students may develop the skills necessary to use computers effectively in their own disciplines.\n', '=== COURSE METADATA ===\nCourse Code: CS1101\nCRN: 32828\nTitle: Lab for CS 1100\n\n=== LOCATION ===\nCampus: Boston\nFormat: In-Person\n\n=== SCHEDULE ===\nDays: Friday\nTime: 8:00 AM to 9:05 AM\n\n=== INSTRUCTOR ===\nProfessor: Lieberherr, Karl\n\n=== COURSE DETAILS ===\nTerm: Spring 2025\nPrerequisites: 

In [34]:
import chromadb
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
import re

class CustomEmbeddingFunction:
    """Callable that embeds text with a SentenceTransformer model.

    Instances are handed to the Chroma collection as its ``embedding_function``.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        # Load the model once; every call reuses it.
        self.model = SentenceTransformer(model_name)

    def __call__(self, input):
        """Embed a single string or a list of strings.

        Returns:
            A list of embedding vectors (list of lists of floats), one per
            input text.
        """
        texts = [input] if isinstance(input, str) else input
        vectors = self.model.encode(texts, convert_to_numpy=True)
        # Guarantee a 2-D (n_texts, dim) layout even if the encoder hands back
        # a single flat vector.
        if vectors.ndim == 1:
            vectors = vectors[np.newaxis, :]
        return vectors.tolist()

class CourseSearchSystem:
    """Semantic search over course documents backed by a Chroma collection.

    On construction a fresh ``course_embeddings`` collection is created on a
    default ``chromadb.Client()``, using ``CustomEmbeddingFunction`` to embed
    both the stored documents and the queries.
    """

    def __init__(
        self, 
        embedding_model_name: str = "all-MiniLM-L6-v2",
        device: str = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
    ):
        """Create the embedding function and an empty collection.

        Args:
            embedding_model_name: SentenceTransformer model used for embeddings.
            device: Preferred torch device name. It is stored on the instance
                but not otherwise used by this class.
        """
        self.device = device
        self.embedding_function = CustomEmbeddingFunction(embedding_model_name)
        self.chroma_client = chromadb.Client()

        # Re-running the cell should start from an empty collection. Deleting
        # is best-effort: it is expected to fail when the collection does not
        # exist yet. Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        try:
            self.chroma_client.delete_collection(name="course_embeddings")
        except Exception:
            pass

        self.collection = self.chroma_client.create_collection(
            name="course_embeddings",
            embedding_function=self.embedding_function
        )

    def add_course_sentences_to_db(self, course_data):
        """Add rendered course documents to the collection.

        ``None`` entries are skipped. Each stored document gets the id
        ``course_<i>`` where ``i`` is its position in ``course_data``, so ids
        keep pointing at the original row even when entries are skipped.

        Args:
            course_data: Iterable of document strings (or ``None``).
        """
        indexed_documents = [
            (f"course_{position}", doc)
            for position, doc in enumerate(course_data)
            if doc is not None
        ]
        if not indexed_documents:
            # Nothing to store; skip the call instead of sending empty lists
            # to the collection (presumably rejected by Chroma - TODO confirm).
            return

        ids = [doc_id for doc_id, _ in indexed_documents]
        documents = [doc for _, doc in indexed_documents]

        self.collection.add(
            documents=documents,
            ids=ids
        )

    def query_courses(self, query_text, n_results=5):
        """Return the ``n_results`` closest documents for ``query_text``.

        The query is lower-cased before it is sent to the collection.

        Returns:
            The raw result of ``collection.query``, restricted to documents.
        """
        query_text = query_text.lower()

        return self.collection.query(
            query_texts=[query_text],
            n_results=n_results,
            include=["documents"]
        )

In [35]:
# Build the vector store and index every rendered course document.
course_search_system = CourseSearchSystem()
course_search_system.add_course_sentences_to_db(course_sentences)

In [196]:
# RAG pipeline: retrieve the most relevant course documents, then have the LLM answer from them.
class RAGPipeline:
    """Retrieval-augmented question answering over the course collection.

    A query is first sent to the course search system; the retrieved documents
    are placed in a prompt and a causal language model generates the answer.

    Attributes:
        max_new_tokens: Upper bound on generated tokens per answer.
        temperature: Sampling temperature passed to ``model.generate``.
    """

    # Class-level defaults; __init__ may override them per instance.
    max_new_tokens = 4500
    temperature = 0.1

    SYSTEM_INSTRUCTION = """You are Course Compass, an AI assistant for Northeastern University's course registration system. Your role is to provide accurate course information based STRICTLY on the provided context.

    When analyzing the context and generating responses:
    1. Pay special attention to exact course codes (e.g., CS4100)
    2. Match professor names exactly as they appear
    3. Include specific details about:
       - Course format (online/in-person)
       - Schedule and timing
       - Prerequisites
       - Course description
    
    If the query asks about a specific professor or course:
    1. First look for exact matches in the course codes or professor names
    2. Include ALL relevant sections taught by that professor
    3. Mention specific scheduling details for each section
    
    If information is not found in the context:
    1. Explicitly state that the specific information is not available
    2. Do not make assumptions or provide information not present in the context
    
    Format your response with clear sections:
    1. Course Information (code, title, format)
    2. Professor Details
    3. Schedule Information
    4. Course Content/Description
    5. Prerequisites (if any)"""

    def __init__(
        self,
        course_search_system,
        model_name="Qwen/Qwen2.5-1.5B-Instruct",
        max_new_tokens=4500,
        temperature=0.1,
    ):
        """Load the language model and tokenizer.

        Args:
            course_search_system: Object exposing ``query_courses(query, n)``
                that returns a mapping with a ``"documents"`` entry.
            model_name: Hugging Face model id for the generator and tokenizer.
            max_new_tokens: Upper bound on generated tokens per answer.
            temperature: Sampling temperature passed to ``model.generate``.
        """
        self.course_search_system = course_search_system
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype="auto",
            device_map="auto"
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def generate_response(self, query, retrieved_docs):
        """Generate an answer to ``query`` grounded in ``retrieved_docs``.

        Args:
            query: The user's question.
            retrieved_docs: List of document lists, as found under
                ``"documents"`` in the search result. ``None`` and empty
                entries are tolerated and yield an empty context.

        Returns:
            The decoded model output, without the prompt tokens.
        """
        # Flatten the per-query document lists into one context string.
        documents = [
            doc
            for sublist in (retrieved_docs or [])
            for doc in (sublist or [])
            if doc is not None
        ]
        context = "\n\n".join(documents)

        # Create a detailed prompt for the model
        prompt = f"""Based on the provided course information, please answer the following query.
        
        Query: {query}

        Context:
        {context}

        Instructions:
        1. Only provide information that is explicitly mentioned in the context
        2. If any specific detail (professor, schedule, prerequisites) is not in the context, clearly state it's not available
        3. For professor queries, list all sections they teach
        4. Include exact course codes, schedules, and locations as they appear

        Response:"""

        # Prepare messages for the model
        messages = [
            {"role": "system", "content": self.SYSTEM_INSTRUCTION},
            {"role": "user", "content": prompt}
        ]

        # Render the chat messages into the model's prompt format.
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
        generated_ids = self.model.generate(
            **model_inputs,
            max_new_tokens=self.max_new_tokens,
            temperature=self.temperature
        )

        # generate() returns prompt + completion; drop the prompt tokens so
        # only the newly generated answer is decoded.
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        return self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    def __call__(self, query, top_k=5):
        """Retrieve ``top_k`` documents for ``query`` and return the answer."""
        print(f"Processing query: {query}")
        print("Retrieving relevant course information...")
        results = self.course_search_system.query_courses(query, top_k)

        print("Generating response...")
        response = self.generate_response(query, results["documents"])
        return response

def main():
    """Build the RAG pipeline and print the answer to each example query.

    Relies on the module-level ``course_search_system`` created earlier in the
    notebook.
    """
    print("Initializing RAG pipeline...")
    pipeline = RAGPipeline(course_search_system)

    queries = [
        "Who teaches Foundations of Database management systems in spring 2025 in Boston?"
    ]
    divider = "=" * 50

    # Answer every query in turn, framing each with divider lines.
    for question in queries:
        print("\n" + divider)
        print(f"Query: {question}")
        answer = pipeline(question, 5)
        print("\nResponse:")
        print(answer)
        print(divider)

# Run the demo when this cell/script is executed directly.
if __name__ == "__main__":
    main()
