In [None]:
%pip install -r requirements.txt

# Imports

In [None]:
import os
from dotenv import load_dotenv

In [2]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [4]:
from langchain_chroma import Chroma

In [5]:
from langchain_openai import AzureChatOpenAI

In [6]:
from langchain_openai import AzureOpenAIEmbeddings

# Load Config

In [None]:
# Pull settings from the local .env file into the process environment
load_dotenv()

# Read the Azure OpenAI credentials (each is None when its variable is not set)
azure_api_key = os.environ.get("AZURE_OPENAI_API_KEY")
azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")

In [8]:
# Set the environment variables
# os.environ only accepts strings, so a variable that was not found would
# otherwise surface as an opaque "str expected, not NoneType" TypeError.
# Fail early and name the missing setting(s) instead.
_missing_settings = [
    name
    for name, value in (
        ("AZURE_OPENAI_API_KEY", azure_api_key),
        ("AZURE_OPENAI_ENDPOINT", azure_endpoint),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Define them in your .env file or shell before running this notebook."
    )

# The clients created later are not given credentials explicitly; presumably
# they read them from these variables.
os.environ['AZURE_OPENAI_API_KEY'] = azure_api_key
os.environ['AZURE_OPENAI_ENDPOINT'] = azure_endpoint

In [9]:
# Name passed as the Azure chat deployment when the LLM client is created
MODEL_NAME = "gpt-4o-mini"
# Embedding model used to vectorize the course text for the vector store
EMBEDDING_MODEL_NAME = "text-embedding-ada-002"

# Load CSV data

In [10]:
# Path to the course CSV file (a single file, despite the variable's name)
dir_path = "assignment2dataset.csv"

In [11]:
def load_documents(dir_path):
    """
    Load the rows of a CSV file as documents using CSVLoader.

    The loader is configured with ``source_column="course_id"``, so each
    document's ``source`` metadata holds that row's course id.

    Args:
        dir_path (str): Path to the CSV file. The parameter keeps its
            historical name, but it must point to a file.

    Returns:
        list: A list of loaded documents.

    Raises:
        FileNotFoundError: If nothing exists at ``dir_path``.
        Exception: Any error raised by CSVLoader while reading the file
            propagates unchanged.
    """
    if not os.path.exists(dir_path):
        raise FileNotFoundError(f"CSV file not found: {dir_path}")
    # Loader errors are left to propagate: catching and re-raising them
    # unchanged added nothing.
    loader = CSVLoader(file_path=dir_path, source_column="course_id")
    return loader.load()

def split_documents(documents, chunk_size=500, chunk_overlap=200):
    """
    Split documents into smaller chunks using RecursiveCharacterTextSplitter.

    Args:
        documents (list): List of documents to split.
        chunk_size (int): Maximum size of each chunk, forwarded to the
            splitter. Defaults to 500, the value that used to be hard-coded.
        chunk_overlap (int): Overlap between consecutive chunks, forwarded to
            the splitter. Defaults to 200, the value that used to be hard-coded.

    Returns:
        list: A list of document chunks. Returns an empty list if there are no
            documents, or if splitting fails (the error is printed, not raised).
    """
    try:
        if not documents:
            return []
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        return text_splitter.split_documents(documents)
    except Exception as e:
        # Best-effort by design: report the problem and hand back an empty
        # list so the calling cell can continue.
        print(f"Error splitting documents: {str(e)}")
        return []

In [12]:
# Load the CSV rows as documents (splitting into chunks happens in a later cell)
documents = load_documents(dir_path)
# NOTE(review): this prints the whole list of documents; consider previewing a slice instead
print(documents)

[Document(metadata={'source': 'C001', 'row': 0}, page_content='course_id: C001\ntitle: Foundations of Machine Learning\ndescription: Understand foundational machine learning algorithms including regression, classification, clustering, and dimensionality reduction. This course covers data pre-processing, feature engineering, model selection, hyperparameter tuning, and evaluation metrics. Hands-on labs use scikit-learn and Python to implement end-to-end workflows on real-world datasets, preparing learners for practical machine learning applications with interactive engaging exercises.'), Document(metadata={'source': 'C002', 'row': 1}, page_content='course_id: C002\ntitle: Deep Learning with TensorFlow and Keras\ndescription: Explore neural network architectures using TensorFlow and Keras frameworks. This course covers feedforward networks, convolutional neural networks, recurrent neural networks, and transfer learning. Learn to build, train, evaluate, and optimize deep learning models fo

In [13]:
# Preview the first five loaded documents: the row text and its source id
for doc in documents[:5]:
    print("Row Content:", doc.page_content)  # Text content of the row
    print("Source:", doc.metadata.get("source"))  # Taken from the course_id column (source_column in load_documents)

Row Content: course_id: C001
title: Foundations of Machine Learning
description: Understand foundational machine learning algorithms including regression, classification, clustering, and dimensionality reduction. This course covers data pre-processing, feature engineering, model selection, hyperparameter tuning, and evaluation metrics. Hands-on labs use scikit-learn and Python to implement end-to-end workflows on real-world datasets, preparing learners for practical machine learning applications with interactive engaging exercises.
Source: C001
Row Content: course_id: C002
title: Deep Learning with TensorFlow and Keras
description: Explore neural network architectures using TensorFlow and Keras frameworks. This course covers feedforward networks, convolutional neural networks, recurrent neural networks, and transfer learning. Learn to build, train, evaluate, and optimize deep learning models for image classification, sequence modeling, and text processing. Includes hands-on labs and re

In [14]:
# Rebind `documents` to the chunked version; later cells that read `documents`
# (e.g. vector store creation) therefore operate on chunks, not whole rows.
documents = split_documents(documents)
print(f"Loaded and split {len(documents)} document chunks.")

Loaded and split 28 document chunks.


# Initialize Azure OpenAI

In [15]:
# Azure OpenAI chat client (GPT-4o or GPT-4o-mini, selected through MODEL_NAME).
# MODEL_NAME drives both the Azure deployment that is called and the model name
# recorded on the client, so the two stay in step; a hard-coded "gpt-4o" here
# would disagree with a "gpt-4o-mini" deployment.
# NOTE(review): assumes the Azure deployment is named after the model it hosts — confirm.
llm = AzureChatOpenAI(
    deployment_name=MODEL_NAME,
    model_name=MODEL_NAME,
    temperature=0.3,
    api_version="2024-05-01-preview",
)


In [16]:
# Embedding client handed to the Chroma vector store when it is created and loaded.
# No credentials are passed here; presumably they are read from the
# AZURE_OPENAI_* environment variables set earlier.
embeddings = AzureOpenAIEmbeddings(model=EMBEDDING_MODEL_NAME)

# Vector Store

In [17]:
# Directory used as Chroma's persist_directory (relative path)
index_path = r"RAG/chroma_store/"
# Name of the Chroma collection that stores the course chunks
collection_name = "courses"

In [18]:
def create_vectorstore(documents, index_path, collection_name):
    """
    Create a Chroma vector store from documents, persisted under ``index_path``.

    The documents are embedded with the module-level ``embeddings`` object.
    Because ``persist_directory`` is supplied, Chroma writes the collection to
    disk itself; no separate save step is performed (Chroma has no
    ``save_local`` method, so the former extra save call could only fail).

    Args:
        documents (list): List of document objects to convert to vectors.
        index_path (str): Directory in which the collection is persisted.
            Created if it does not exist.
        collection_name (str): Name of the Chroma collection to create.

    Returns:
        None: If successful. On failure the caught Exception is printed and
            returned (not raised).
    """
    try:
        os.makedirs(index_path, exist_ok=True)
        Chroma.from_documents(
            documents,
            embeddings,
            persist_directory=index_path,
            collection_name=collection_name,
        )
        print("Vector store created successfully.")
        return None
    except Exception as e:
        print(e)
        return e

def save_vectorstore(vectorstore, index_path):
    """
    Save a vector store to the specified path when it needs an explicit save.

    Stores that expose a ``save_local`` method (FAISS-style) are written to
    ``index_path``. Stores without one, such as a Chroma store created with
    ``persist_directory``, need no explicit save, so nothing is done and the
    call counts as successful instead of failing with an AttributeError.

    Args:
        vectorstore: The vector store to save.
        index_path (str): Destination path passed to ``save_local``.

    Returns:
        None: If successful (or nothing needed saving). If saving fails, the
            caught Exception is returned (not raised).
    """
    try:
        save_local = getattr(vectorstore, "save_local", None)
        if callable(save_local):
            save_local(index_path)
        return None
    except Exception as e:
        return e

def load_vectorstore(embeddings, index_path, collection_name):
    """
    Build a Chroma vector store bound to a persisted collection.

    Args:
        embeddings: Embedding function handed to Chroma.
        index_path (str): Directory passed to Chroma as ``persist_directory``.
        collection_name (str): Name of the collection to open.

    Returns:
        Chroma: The vector store. If construction fails, the caught Exception
            object is returned instead of being raised.
    """
    chroma_options = {
        "embedding_function": embeddings,
        "persist_directory": index_path,
        "collection_name": collection_name,
    }
    try:
        store = Chroma(**chroma_options)
    except Exception as failure:
        return failure
    return store

In [19]:
# Load or create vector store
# Build the collection only when nothing has been persisted yet; either way the
# store is then opened from disk and wrapped in a top-5 retriever.
if os.path.exists(index_path) and len(os.listdir(index_path)) > 0:
    _status_message = "Vector store loaded successfully."
else:
    create_vectorstore(documents, index_path, collection_name)
    _status_message = "Created and loaded new vector store."

vectorstore = load_vectorstore(embeddings, index_path, collection_name)
vectorstore_retriever = vectorstore.as_retriever(search_kwargs={'k': 5})
print(_status_message)

Vector store created successfully.
Created and loaded new vector store.


In [20]:
# --- Query the vector store for the five chunks closest to a course title ---
user_input_text = "Foundations of Machine Learning"
results = vectorstore.similarity_search(query=user_input_text, k=5)

# --- Print each hit as "- <source id>: <chunk text>" ---
print("\nRecommended Courses:")
for doc in results:
    name, desc = doc.metadata.get("source"), doc.page_content
    print(f"- {name}: {desc}")


Recommended Courses:
- C001: course_id: C001
title: Foundations of Machine Learning
- C001: description: Understand foundational machine learning algorithms including regression, classification, clustering, and dimensionality reduction. This course covers data pre-processing, feature engineering, model selection, hyperparameter tuning, and evaluation metrics. Hands-on labs use scikit-learn and Python to implement end-to-end workflows on real-world datasets, preparing learners for practical machine learning applications with interactive engaging exercises.
- C005: course_id: C005
title: Reinforcement Learning Basics
description: Get introduced to reinforcement learning paradigms, including Markov decision processes, Q-learning, policy gradients, and actor-critic methods. Learn to formulate environments, design reward functions, and implement agents using OpenAI Gym and TensorFlow. Through guided labs you’ll train agents for classic control tasks and grid-world scenarios, exploring expl

# RAG Chain

In [24]:
def validate_query(query, min_length=15):
    """
    Validates a user's query by ensuring it is not empty and is long enough.

    Surrounding whitespace is ignored when judging the query, so a query made
    only of spaces or newlines counts as empty and padding does not count
    towards the minimum length. A valid query is returned unchanged (it is not
    stripped).

    Args:
        query (str): The input query.
        min_length (int): Minimum number of non-padding characters required.
            Defaults to 15.

    Returns:
        str: The query if valid, or an error message if invalid.
    """
    try:
        # Only strings are stripped; other inputs fall through to the same
        # truthiness/len checks as before and any error becomes the message.
        text = query.strip() if isinstance(query, str) else query
        if not text:
            return "Query cannot be empty, enter a valid query."
        if len(text) < min_length:
            return "Query is too short, enter a valid query."
        return query
    except Exception as e:
        return str(e)

def create_rag_chain(query, relevant_documents):
    """
    Creates and executes a RAG chain to answer a query using retrieved documents.

    The query is validated first. An invalid query is never sent to the model;
    the validation message is returned directly. Otherwise the prompt is
    filled with the query and the documents and run through the module-level
    ``llm`` followed by a string output parser.

    Args:
        query (str): The user query.
        relevant_documents (list): List of retrieved document chunks.

    Returns:
        str: The generated response, or an error message (validation failure
            or the text of any exception raised while running the chain).
    """
    try:
        # validate_query hands back the query itself when it is acceptable and
        # an explanatory message otherwise; a differing result therefore means
        # validation failed and the message must not be used as the question.
        valid_query = validate_query(query)
        if valid_query != query:
            return valid_query

        prompt_template = """
        Only based on the provided documents, answer the question in points. Do not mention from which document the answer is derived.
        Your answer should be based on the documents a few courses that the user can take. Recommend the courses available in the documents.
        DO NOT go outside the document for course suggestions. If no suggestions, politely say "No suggested course available".
        Question: {query}
        Documents: {relevant_documents}
        """
        prompt = ChatPromptTemplate.from_template(prompt_template)
        rag_chain = prompt | llm | StrOutputParser()
        return rag_chain.invoke({"query": valid_query, "relevant_documents": relevant_documents})
    except Exception as e:
        return str(e)

In [25]:
# Test the RAG chain
# Sample learner request; the retriever (configured with k=5) fetches the
# chunks most similar to it from the vector store.
query = """
“I know Azure basics and want to manage containers and build CI/CD pipelines.
Recommend courses.”
"""
relevant_documents = vectorstore_retriever.invoke(query)
# NOTE(review): prints the raw Document objects, including ids and metadata
print(relevant_documents)

[Document(id='d0400fa1-9b5b-4172-828f-94b3924a6a7b', metadata={'row': 6, 'source': 'C007'}, page_content='course_id: C007\ntitle: Cloud Computing with Azure\ndescription: Master Microsoft Azure’s core services: virtual machines, Azure Functions, Azure SQL Database, Cosmos DB, and Azure Kubernetes Service. Learn to deploy scalable web applications, configure networking and security, and implement infrastructure-as-code with ARM templates. Hands-on labs guide you through resource provisioning, cost management, and best practices for high availability and disaster recovery in Azure.'), Document(id='c9c93cdd-1ae3-4048-9f58-bd393026496f', metadata={'row': 8, 'source': 'C009'}, page_content='course_id: C009\ntitle: Containerization with Docker and Kubernetes'), Document(id='7a5e45eb-441e-4186-8457-0cf2037a48a8', metadata={'row': 7, 'source': 'C008'}, page_content='course_id: C008\ntitle: DevOps Practices and CI/CD'), Document(id='1de8cdaf-c181-44ff-ac00-6e892afa72b2', metadata={'row': 8, 'so

In [26]:
# Run the chain on the sample query with the retrieved chunks and show the
# result (create_rag_chain returns either the answer or an error message string)
response = create_rag_chain(query, relevant_documents)
print("RAG Chain Response:")
print(response)

RAG Chain Response:
Here are a few recommended courses based on your interest in managing containers and building CI/CD pipelines:

1. **Containerization with Docker and Kubernetes**
   - Learn container fundamentals with Docker and advance to Kubernetes for orchestration.
   - Covers cluster provisioning, autoscaling, rolling updates, and Helm chart packaging.
   - Hands-on labs deploy microservices architectures on a local or cloud-based Kubernetes cluster.

2. **DevOps Practices and CI/CD**
   - Adopt DevOps methodologies to accelerate software delivery.
   - Explore version control with Git, continuous integration with Jenkins or GitHub Actions, and infrastructure-as-code with Terraform.
   - Implement CI/CD pipelines, container registry integration, and blue-green deployments with practical labs.
