In [1]:
import os
import json
from langchain_community.vectorstores import Chroma
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai.chat_models import ChatOpenAI
from dotenv import load_dotenv
from langchain_core.messages import SystemMessage,HumanMessage
from langchain.prompts import ChatPromptTemplate,ChatMessagePromptTemplate
from langchain_core.documents import Document
import pandas as pd
import re

load_dotenv()

True

In [2]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [3]:
data = []

DATASET_PATH = "./Dataset/"
files = os.listdir(path=DATASET_PATH)
for file in files:
    with open (f"{DATASET_PATH}/{file}","r") as f: 
        data.extend(json.load(f))

In [4]:
data[0]

{'data_id': '22503',
 'course_number': '5200',
 'title': 'Cardiopulmonary Physiology',
 'college': 'Bouve College of Hlth Sciences BV',
 'department': 'Public Health and Health Sci HSCI',
 'credit_hours': '3',
 'description': 'Offers students an opportunity to gain an understanding of physiological principles of the cardiopulmonary system. Covers the structure and functional operation and regulation of the cardiopulmonary system, disease-associated physiological changes and cardiopulmonary dysfunction, and exercise-induced acute responses and physiological adaptations of the system and their applications to chronic cardiopulmonary diseases. Integrates knowledge of exercise and physical activity with cardiopulmonary health and fitness, as well as cardiopulmonary disease prevention and treatment.',
 'prerequisites': '( ( Biology 2217 Undergraduate C- And Biology 2219 Undergraduate C- ) Or Cardiopulmonary & Exercise Sci 4500 Undergraduate C- ) Or Graduate Admission REQ'}

In [5]:
len(data)

3351

In [6]:
special_courses = []
regular_courses = []

for idx,info in enumerate(data):
    dept_code = info["department"].split(" ")[-1]
    
    # Create modified copy with new keys
    temp = {
        **info,
        "dept_code": dept_code,
        "course_number": f"{dept_code}{info['course_number']}"
    }
    
    if re.search(r"(Special|Topics)", info["title"], re.IGNORECASE):
        special_courses.append(temp)
    else:
        regular_courses.append(temp)

In [7]:
len(regular_courses) , len(special_courses)

(3221, 130)

In [8]:
sys_msg = """
You are a specialist in retrieving the most relevant chunks from a vector database using cosine similarity search.

Your task is to **rephrase vague or general user queries** into **information-rich, semantically detailed prompts** that enhance the quality of similarity-based retrieval from a course or subject database.

Your goal: Expand and enrich user queries by adding domain-relevant subtopics, tools, frameworks, techniques, or applications — to improve the precision of similarity search and align with structured educational content.

Very Important: You will also be provided with the `department` and `college` information related to the course database. Ensure that your rephrased query explicitly **mentions the department and college** to help the system match with the correct institutional context.

### Examples:

1.  
**User Query:** I want to become a data scientist.  
**Rephrased Query:** Find courses and subjects offered by the Department of Data Science within the College of Computing and Information Sciences that include hands-on training in Python and R programming, statistical modeling, data wrangling, data visualization with tools like Tableau and Seaborn, machine learning algorithms, deep learning fundamentals, SQL for structured data analysis, big data tools such as Apache Spark and Hadoop, and cloud-based analytics platforms (e.g., AWS, Google Cloud). Also include practical projects with real-world datasets and exposure to ethical use of data.

2.  
**User Query:** I’m interested in law and emerging technologies.  
**Rephrased Query:** Retrieve interdisciplinary courses offered by the Department of Law and Technology within the College of Legal Studies that focus on the intersection of law and modern technologies, covering digital privacy laws, cybersecurity regulation, intellectual property rights in AI and software, blockchain legal frameworks, smart contract enforceability, data protection acts like GDPR and CCPA, AI governance and ethical policy, and case law involving technology-driven legal disputes.

3.  
**User Query:** I want to study climate change.  
**Rephrased Query:** Search for courses offered by the Department of Environmental Science within the College of Earth and Atmospheric Sciences that focus on climate science, climate policy, and environmental sustainability. Topics should include greenhouse gas modeling, climate risk assessment, renewable energy systems (solar, wind, geothermal), carbon offset strategies, global climate treaties (e.g., Paris Agreement), sustainability metrics, green finance, environmental economics, and the use of data and technology in climate monitoring and mitigation.

4.  
**User Query:** I want to learn about art and design.  
**Rephrased Query:** Retrieve subjects offered by the Department of Art and Design under the College of Fine Arts that explore traditional and digital design, including visual composition principles, art history, color theory, drawing, graphic design, UI/UX, digital tools like Photoshop and Illustrator, 3D modeling software, typography, and development of creative portfolios for professional applications.

5.  
**User Query:** I’m interested in healthcare and medicine.  
**Rephrased Query:** Find courses provided by the Department of Biomedical Sciences in the College of Health and Life Sciences that cover healthcare and medical topics, including anatomy and physiology, pathology, pharmacology, diagnostic imaging, clinical communication, medical ethics, epidemiology, public health policy, healthcare data systems, telemedicine, and AI in modern diagnostics and wearable health devices.

---

Now rephrase the following user query and just return the rephrased query:
"""

In [9]:
regular_courses[-1],special_courses[0]

({'data_id': '49085',
  'course_number': 'LAW7976',
  'title': 'Directed Study',
  'college': 'School of Law LW',
  'department': 'Law LAW',
  'credit_hours': '1 TO 6',
  'description': 'Offers independent work under the direction of members of the department on a chosen topic. Course content depends on instructor. May be repeated two times for a maximum of six semester hours.',
  'prerequisites': 'None',
  'dept_code': 'LAW'},
 {'data_id': '56288',
  'course_number': 'HLTH5963',
  'title': 'Topics',
  'college': 'Bouve College of Hlth Sciences BV',
  'department': 'Health Sci - Interdisciplinary HLTH',
  'credit_hours': '1 OR 2',
  'description': 'Offers students an opportunity to learn about timely issues, develop new skills, or explore areas of broad interest in an immersive, short-course format. Content and instructors vary by offering. May be repeated three times.',
  'prerequisites': 'None',
  'dept_code': 'HLTH'})

In [10]:
llm =  ChatOpenAI(model="gpt-4.1-nano")

In [11]:
def get_documents(data):
    documents = [
        Document(
            page_content=(
                f"Title: {course['title']}\n"
                f"{course['description']}\n"
                f"Prerequisites: {course['prerequisites']}\n"
                f"College: {course['college']}\n"
                f"Department: {course['department']}"
            ),
            metadata={
                "data_id": course["data_id"],
                "course_number": course["course_number"],
                "department": course["department"],
                "title": course["title"],
                "college": course["college"],
                "credit_hours": course["credit_hours"],
                "prerequisites": course["prerequisites"],
                "dept_code":course["dept_code"]
            }
        )
        for course in data
    ]
    return documents


regular_courses_documents = get_documents(regular_courses)
special_topics_documents = get_documents(special_courses)

In [12]:
regular_courses_database = Chroma(collection_name="RegularCourses",embedding_function=embeddings,persist_directory="./VectorDB")
special_topics_database = Chroma(collection_name="SpecialCourses",embedding_function=embeddings,persist_directory="./VectorDB")

  regular_courses_database = Chroma(collection_name="RegularCourses",embedding_function=embeddings,persist_directory="./VectorDB")


In [13]:
regular_courses_ids = regular_courses_database.add_documents(documents=regular_courses_documents)
special_courses_ids = special_topics_database.add_documents(documents=special_topics_documents)

In [14]:
regular_courses_retriver = regular_courses_database.as_retriever(search_kwargs={'k': 15})
special_courses_retriver = special_topics_database.as_retriever(search_kwargs={'k': 2})

In [15]:
def get_df(courses):
    titles= []
    dept = []
    college = []
    data_ids = []
    cns = []
    disc = []

    for course in courses:
        disc.append(course.page_content.split("\n")[1])
        cns.append(course.metadata["course_number"])
        data_ids.append(course.metadata["data_id"])
        titles.append(course.metadata["title"])
        dept.append(course.metadata["dept_code"])
        college.append(course.metadata["college"])

    df = pd.DataFrame(data={
        # "Data ID":data_ids,
        "Course Number":cns,
        "College":college,
        "Titles":titles,
        "Dept":dept,
        "Description":disc
    })

    df.reset_index(drop=True, inplace=True)

    df.sort_values(by=["Dept"])

    return df

def get_courses(role):
    prompt = ChatPromptTemplate.from_messages([
        ("system",sys_msg),
        ("user",f"I want to become a {role}")
    ])

    prompt = prompt.format_prompt(query=str(role))

    resposne = llm.invoke(prompt).content
    print(f"Rephrased Query: {resposne}")

    regular_courses_list = regular_courses_retriver.get_relevant_documents(resposne)    
    special_courses_list = special_courses_retriver.get_relevant_documents(resposne)
    
    df1 = get_df(courses=regular_courses_list)
    df1.sort_values(by=["Dept"],inplace=True)
    df2 = get_df(courses=special_courses_list)
    df2.sort_values(by=["Dept"],inplace=True)
    df =  pd.concat((df1,df2),axis=0,ignore_index=True)
    return df

In [16]:
role = "i want to become a data scientist an i am in khoury in MSDS and wanted to be a ML OPS enginerri."

In [17]:
df = get_courses(role=role)

Rephrased Query: Retrieve courses offered by the Department of Data Science and Machine Learning within the Khoury College of Computer Sciences, specifically focusing on MSDS programs, covering topics such as data analysis, statistical modeling, machine learning algorithms, deep learning, and MLOps engineering practices. Include training on deploying machine learning models at scale, model monitoring, automation with tools like TensorFlow Extended (TFX), Docker, Kubernetes, CI/CD pipelines for ML, cloud platforms (AWS, GCP, Azure), and practical projects involving real-world datasets. Emphasize skills in version control, reproducibility, and ethical considerations in AI/ML deployment.


  regular_courses_list = regular_courses_retriver.get_relevant_documents(resposne)


In [18]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)  # or 200
# df.iloc[:,[1,2,-1]]
df

Unnamed: 0,Course Number,College,Titles,Dept,Description
0,CS6140,Khoury Coll of Comp Sciences CS,Machine Learning,CS,"Provides a broad look at a variety of techniques used in machine learning and data mining, and also examines issues associated with their use. Topics include algorithms for supervised learning including decision tree induction, artificial neural networks, instance-based learning, probabilistic methods, and support vector machines; unsupervised learning; and reinforcement learning. Also covers computational learning theory and other methods for analyzing and measuring the performance of learning algorithms. Course work includes a programming term project."
1,CS7150,Khoury Coll of Comp Sciences CS,Deep Learning,CS,"Introduces deep learning, including the statistical learning framework, empirical risk minimization, loss function selection, fully connected layers, convolutional layers, pooling layers, batch normalization, multilayer perceptrons, convolutional neural networks, autoencoders, U-nets, residual networks, gradient descent, stochastic gradient descent, backpropagation, autograd, visualization of neural network features, robustness and adversarial examples, interpretability, continual learning, and applications in computer vision and natural language processing. Assumes students already have a basic knowledge of machine learning, optimization, linear algebra, and statistics."
2,CS5130,Khoury Coll of Comp Sciences CS,Applied Programming and Data Processing for AI,CS,"Presents an in-depth exploration of programming paradigms, mathematical foundations, and computational techniques essential for scientific computing and data-driven applications. Integrates linear algebra, Python programming principles, algorithmic design, and data science methodologies to offer students an opportunity to develop the technical expertise necessary for handling complex numerical computations and large-scale data processing. Strongly emphasizes Pythonic programming, object-oriented and functional programming, data preprocessing, visualization, and numerical stability, with the aim of developing the skills needed to write clean, efficient, and maintainable code for scientific and engineering applications."
3,DS5220,Khoury Coll of Comp Sciences CS,Supervised Machine Learning and Learning Theory,DS,"Introduces supervised machine learning, which is the study and design of algorithms that enable computers/machines to learn from experience or data, given examples of data with a known outcome of interest. Offers a broad view of models and algorithms for supervised decision making. Discusses the methodological foundations behind the models and the algorithms, as well as issues of practical implementation and use, and techniques for assessing the performance. Includes a term project involving programming and/or work with real-world data sets. Requires proficiency in a programming language such as Python, R, or MATLAB."
4,DS5110,Khoury Coll of Comp Sciences CS,Essentials of Data Science,DS,"Introduces students to the core tasks in data science, including data collection, storage, tidying, transformation, processing, management, and modeling for the purpose of extracting knowledge from raw observations. Programming is a cross-cutting aspect of the course. Offers students an opportunity to gain experience with data science tasks and tools through short assignments. Includes a term project based on real-world data."
5,DS5010,Khoury Coll of Comp Sciences CS,Introduction to Programming for Data Science,DS,"Offers an introductory course on fundamentals of programming and data structures. Covers lists, arrays, trees, hash tables, etc.; program design, programming practices, testing, debugging, maintainability, data collection techniques, and data cleaning and preprocessing. Includes a class project, where students use the concepts covered to collect data from the web, clean and preprocess the data, and make it ready for analysis."
6,DS5500,Khoury Coll of Comp Sciences CS,Data Science Capstone,DS,"Offers students a capstone opportunity to practice data science skills learned in previous courses and to build a portfolio. Students practice visualization, data wrangling, and machine learning skills by applying them to semester-long term projects on real-world data. Students may either propose their own projects or choose from a selection of industry options. Emphasizes the overall data science process, including identification of the scientific problem, selection of appropriate machine learning methods, and visualization and communication of results. Lectures may include additional topics, including visualization, communication, and data science ethics."
7,DS5020,Khoury Coll of Comp Sciences CS,Introduction to Linear Algebra and Probability for Data Science,DS,"Offers an introductory course on the basics of statistics, probability, and linear algebra. Covers random variables, frequency distributions, measures of central tendency, measures of dispersion, moments of a distribution, discrete and continuous probability distributions, chain rule, Bayes&#8217; rule, correlation theory, basic sampling, matrix operations, trace of a matrix, norms, linear independence and ranks, inverse of a matrix, orthogonal matrices, range and null-space of a matrix, the determinant of a matrix, positive semidefinite matrices, eigenvalues, and eigenvectors."
8,GSEN6105,College of Engineering EN,Data Science Engineering Methods and Tools,GSEN,"Introduces the fundamental techniques for machine learning and data science engineering. Discusses a variety of machine learning algorithms, along with examples of their implementation, evaluation, and best practices. Lays the foundation of how learning models are derived from complex data pipelines, both algorithmically and practically. Topics include supervised learning (parametric/nonparametric algorithms, support vector machines, kernels, neural networks, deep learning) and unsupervised learning (clustering, dimensionality reduction, recommender systems). Based on numerous real-world case studies."
9,INPR6610,Office of the Provost PR,Applied Machine Learning,INPR,"Analyzes machine learning techniques, methodologies, and best practices tailored for advanced artificial intelligence applications. Covers advanced supervised and unsupervised learning, deep learning architectures, reinforcement learning, and domain-specific applications. Explores crucial topics of ethics, privacy, and fairness in machine learning."
