In [48]:
# Install the packages listed in requirements.txt via the %pip line magic.
%pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Imports

In [49]:
import os
import json

from typing import List
from dotenv import load_dotenv

In [50]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [51]:
from langchain_openai import AzureOpenAIEmbeddings
from langchain_openai import AzureChatOpenAI

In [52]:
from langchain.vectorstores import Chroma

In [53]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Load Config

In [54]:
# Pull settings from the local .env file into the process environment.
load_dotenv()

# Read the Azure OpenAI and Langfuse settings; a missing variable yields None.
azure_api_key = os.environ.get("AZURE_OPENAI_API_KEY")
azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
deployment_name = os.environ.get("DEPLOYMENT_NAME")
langfuse_public = os.environ.get("LANGFUSE_PUBLIC_KEY")
langfuse_secret = os.environ.get("LANGFUSE_SECRET_KEY")

In [55]:
# Set the environment variables.
# os.environ only accepts strings, so assigning a missing (None) value would fail
# with an opaque TypeError; report which variable is absent instead.
for _env_name, _env_value in (
    ("AZURE_OPENAI_API_KEY", azure_api_key),
    ("AZURE_OPENAI_ENDPOINT", azure_endpoint),
):
    if not _env_value:
        raise RuntimeError(
            f"Missing required environment variable: {_env_name} "
            "(set it in the environment or in the .env file)"
        )
    os.environ[_env_name] = _env_value

In [56]:
# Passed as the deployment name when the AzureChatOpenAI client is constructed.
MODEL_NAME = "gpt-4.1-mini"
# Model identifier handed to AzureOpenAIEmbeddings.
EMBEDDING_MODEL_NAME = "text-embedding-ada-002"

# Load CSV data

In [57]:
# Path to the CSV dataset; despite the name, this is a file path (it is passed
# to CSVLoader as file_path), not a directory.
dir_path = "assignment2dataset.csv"

In [58]:
def load_documents(dir_path):
    """
    Load the rows of a CSV file as documents using CSVLoader.

    Args:
        dir_path (str): Path to the CSV file. The parameter name is kept for
            backward compatibility; the value is passed to CSVLoader as
            ``file_path``.

    Returns:
        list: The documents returned by ``CSVLoader.load()``.

    Raises:
        FileNotFoundError: If the path does not point to an existing file.
        Exception: Any error raised by CSVLoader propagates unchanged.
    """
    # isfile (rather than exists) also rejects a directory path up front,
    # instead of letting the loader fail later with a less obvious error.
    if not os.path.isfile(dir_path):
        raise FileNotFoundError(f"CSV file not found: {dir_path}")
    return CSVLoader(file_path=dir_path).load()

def split_documents(documents, chunk_size=500, chunk_overlap=200):
    """
    Split documents into smaller chunks using RecursiveCharacterTextSplitter.

    Args:
        documents (list): List of documents to split.
        chunk_size (int): Value forwarded to the splitter's ``chunk_size``.
            Defaults to 500.
        chunk_overlap (int): Value forwarded to the splitter's
            ``chunk_overlap``. Defaults to 200.

    Returns:
        list: A list of document chunks. Returns an empty list if there are
        no documents, or if splitting fails (the error is printed).
    """
    try:
        if not documents:
            return []
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        return text_splitter.split_documents(documents)
    except Exception as e:
        # Best-effort by design: report the problem and return no chunks.
        print(f"Error splitting documents: {str(e)}")
        return []

# Read the CSV rows and chunk them in one step; `documents` holds the chunks.
documents = split_documents(load_documents(dir_path))
print(f"Loaded and split {len(documents)} document chunks.")

Loaded and split 28 document chunks.


# Initialize OpenAI

In [59]:
# Azure OpenAI chat model used to generate answers.
# The deployment and the model name are both taken from MODEL_NAME so the two
# settings cannot contradict each other (previously the model name was
# hard-coded to a different model than the deployment).
llm = AzureChatOpenAI(
    deployment_name=MODEL_NAME,
    model_name=MODEL_NAME,
    temperature=0.3,
    api_version="2024-05-01-preview",
)


In [60]:
# Embedding client shared by the vector-store helpers: it is passed to Chroma
# both when the collection is created and when it is reopened.
embeddings = AzureOpenAIEmbeddings(model=EMBEDDING_MODEL_NAME)

# Vector Store

In [61]:
# Directory used as Chroma's persist_directory.
index_path = r"RAG/chroma_store/"
# Name of the Chroma collection that is created and reopened.
collection_name = "courses"

In [62]:
def create_vectorstore(documents):
    """
    Create a new Chroma vector store from documents, persisted under index_path.

    Args:
        documents (list): List of document objects to embed and index.

    Returns:
        None: If successful. On failure the caught exception is printed and
        returned (not raised), so callers should check the return value.
    """
    try:
        os.makedirs(index_path, exist_ok=True)
        # No explicit save call is needed here: the store is built with
        # persist_directory, so Chroma writes the collection to disk itself.
        Chroma.from_documents(
            documents,
            embeddings,
            persist_directory=index_path,
            collection_name=collection_name,
        )
        print("Vector store created successfully.")
    except Exception as e:
        print(e)
        return e

def save_vectorstore(vectorstore):
    """
    Persist a vector store to disk.

    Stores exposing ``save_local`` (the FAISS-style API) are saved to
    index_path. Otherwise the store's ``persist()`` method is called, which is
    what Chroma provides instead of ``save_local``.

    Args:
        vectorstore: The vector store to save.

    Returns:
        None: If successful. On failure the caught exception is returned
        (not raised).
    """
    try:
        save_local = getattr(vectorstore, "save_local", None)
        if callable(save_local):
            save_local(index_path)
        else:
            # Raises AttributeError (returned below) when the store offers
            # neither API, matching the previous failure mode.
            vectorstore.persist()
    except Exception as e:
        return e

def load_vectorstore():
    """
    Open the Chroma collection stored under index_path.

    Returns:
        Chroma: The vector store bound to collection_name and embeddings.
        If construction fails, the caught exception is returned instead of
        being raised, so callers should check the result's type.
    """
    try:
        store = Chroma(
            embedding_function=embeddings,
            persist_directory=index_path,
            collection_name=collection_name,
        )
    except Exception as err:
        return err
    return store

In [63]:
# Load or create vector store.
# A non-empty persist directory is treated as an existing store.
store_exists = os.path.exists(index_path) and any(os.listdir(index_path))
if not store_exists:
    # create_vectorstore reports failure by returning the exception; surface it
    # here rather than continuing with an empty store.
    creation_error = create_vectorstore(documents)
    if isinstance(creation_error, Exception):
        raise creation_error

vectorstore = load_vectorstore()
# load_vectorstore also returns (not raises) its error; fail here with the real
# cause instead of an AttributeError on .as_retriever below.
if isinstance(vectorstore, Exception):
    raise vectorstore

vectorstore_retriever = vectorstore.as_retriever(search_kwargs={'k': 5})
if store_exists:
    print("Vector store loaded successfully.")
else:
    print("Created and loaded new vector store.")

Vector store loaded successfully.


In [None]:
# --- Search top-5 similar courses ---
user_input_text = """
I’ve completed the ‘Python Programming for Data Science’ course and enjoy data visualization. What should I take next?
"""
results = vectorstore.similarity_search(
    query=user_input_text,
    k=5
)

# --- Display recommendations ---
print("\nRecommended Courses:")
for doc in results:
    # The documents are loaded with CSVLoader(file_path=...) and no
    # metadata_columns, so metadata may not contain "course_name"; fall back to
    # the row reference instead of raising KeyError.
    name = doc.metadata.get("course_name") or f"row {doc.metadata.get('row', '?')}"
    desc = doc.page_content
    print(f"- {name}: {desc}")

# RAG Chain

In [None]:
def validate_query(query, min_length=15):
    """
    Validate a user's query: it must be non-blank and long enough.

    Leading and trailing whitespace is ignored when checking emptiness and
    length, so padding (for example the newlines of a triple-quoted string)
    cannot make a blank or too-short query pass.

    Args:
        query (str): The input query.
        min_length (int): Minimum number of characters required after
            stripping surrounding whitespace. Defaults to 15.

    Returns:
        str: The original query (unmodified) if valid, or an error message
        if invalid.
    """
    try:
        if not query:
            return "Query cannot be empty, enter a valid query."
        stripped = query.strip()
        if not stripped:
            return "Query cannot be empty, enter a valid query."
        if len(stripped) < min_length:
            return "Query is too short, enter a valid query."
        return query
    except Exception as e:
        # Non-string input ends up here; report the problem as text.
        return str(e)

def create_rag_chain(query, relevant_documents):
    """
    Creates and executes a RAG chain to answer a query using retrieved documents.

    The query is validated first; if validation fails, the validation message
    is returned directly and the LLM is not called.

    Args:
        query (str): The user query.
        relevant_documents (list): List of retrieved document chunks. Each
            item's ``page_content`` is used when present, otherwise its
            string form.

    Returns:
        str: The generated response, a validation message, or an error message.
    """
    try:
        valid_query = validate_query(query)
        # validate_query returns the query itself when it is valid and an
        # error message otherwise; don't send that message to the model.
        if valid_query != query:
            return valid_query

        # Pass the chunk text rather than the Document reprs into the prompt.
        docs_text = "\n\n".join(
            getattr(doc, "page_content", str(doc))
            for doc in (relevant_documents or [])
        )

        prompt_template = """
        Only based on the provided documents, answer the question in points. Do not mention from which document the answer is derived.
        Question: {query}
        Documents: {docs}
        Note: You are a course recommendation assistant. If the query is not related to courses or learning paths, or the documents do not provide the necessary information, return "Invalid Query".
        """
        prompt = ChatPromptTemplate.from_template(prompt_template)
        rag_chain = prompt | llm | StrOutputParser()
        return rag_chain.invoke({"query": valid_query, "docs": docs_text})
    except Exception as e:
        return str(e)

In [None]:
# Exercise the chain end to end: retrieve chunks for a sample question, then
# generate an answer from them.
query = """
I’ve completed the ‘Python Programming for Data Science’ course and enjoy data visualization. What should I take next?
"""
relevant_documents = vectorstore_retriever.invoke(query)
response = create_rag_chain(query, relevant_documents)
print("RAG Chain Response:", response, sep="\n")