In [1]:
import os
from dotenv import load_dotenv

# Get the current file's directory (all_rag_techniques)
current_dir = os.path.dirname(os.path.abspath('__file__'))

# Get the parent directory (RAG_Techniques)
parent_dir = os.path.dirname(current_dir)

# Construct the path to the .env file in the parent directory
dotenv_path = os.path.join(parent_dir, '.env')

# Load the .env file
load_dotenv(dotenv_path)

# Now you can use os.getenv to get your environment variables
api_key = os.getenv('OPENAI_API_KEY')

if api_key:
    print(f"API key loaded successfully. First 5 characters: {api_key[:5]}")
else:
    print("Failed to load API key from environment.")

print(f"Current working directory: {os.getcwd()}")
print(f"Parent directory (where .env should be): {parent_dir}")
print(f".env file path: {dotenv_path}")
print(f".env file exists: {os.path.exists(dotenv_path)}")

# If the .env file exists, print its contents (excluding the actual API key)
if os.path.exists(dotenv_path):
    with open(dotenv_path, 'r') as file:
        contents = file.read()
        print("\nContents of .env file (API key redacted):")
        for line in contents.split('\n'):
            if line.startswith('OPENAI_API_KEY='):
                print('OPENAI_API_KEY=[REDACTED]')
            else:
                print(line)

API key loaded successfully. First 5 characters: sk-pr
Current working directory: /Users/lasyaedunuri/Documents/ApplOfLLMs/t-c
Parent directory (where .env should be): /Users/lasyaedunuri/Documents/ApplOfLLMs
.env file path: /Users/lasyaedunuri/Documents/ApplOfLLMs/.env
.env file exists: False


In [2]:
import os
import sys
from dotenv import load_dotenv

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..'))) # Add the parent directory to the path since we work with notebooks
from helper_functions import *
from evaluate_rag import *

# Check if the API key is loaded correctly
api_key = os.getenv('OPENAI_API_KEY')
if api_key:
    print("API key loaded successfully")
    print(f"API key: {api_key[:5]}...{api_key[-5:]}")  # Print first and last 5 characters
else:
    print("Failed to load API key")

# Load environment variables from a .env file
load_dotenv()

# Set the OpenAI API key environment variable
os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from helper_functions import *


API key loaded successfully
API key: sk-pr...EIBQA




In [3]:
folders = ["src/cars", "src/E-commerce", "src/food", "src/electronics"]

In [4]:
import os
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

# def encode_documents(folders, chunk_size=1000, chunk_overlap=200):
#     """
#     Encodes all text files from multiple folders into a FAISS vector store using OpenAI embeddings.

#     Args:
#         folders: A list of paths to directories containing text files.
#         chunk_size: The desired size of each text chunk.
#         chunk_overlap: The amount of overlap between consecutive chunks.

#     Returns:
#         A FAISS vector store containing the encoded content of the files.
#     """
#     # Initialize text splitter and embeddings
#     text_splitter = RecursiveCharacterTextSplitter(
#         chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
#     )
#     embeddings = OpenAIEmbeddings()

#     # Initialize an empty list to store all text chunks
#     all_texts = []
#     metadata = []

#     # Iterate through each folder
#     for folder in folders:
#         for filename in os.listdir(folder):
#             if filename.endswith('.txt'):
#                 file_path = os.path.join(folder, filename)
                
#                 # Load text document
#                 with open(file_path, 'r', encoding='utf-8') as file:
#                     text = file.read()
                
#                 # Split text into chunks
#                 texts = text_splitter.split_text(text)
#                 all_texts.extend(texts)

#                 # Add metadata for each chunk
#                 metadata.extend([{"file_name": filename, "folder": folder}] * len(texts))
    
#     # Create vector store from all texts and their metadata
#     vectorstore = FAISS.from_texts(all_texts, embeddings, metadatas=metadata)
    
#     return vectorstore

def encode_documents(folders, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
    )
    embeddings = OpenAIEmbeddings()
    all_texts = []
    metadata = []

    for folder in folders:
        for filename in os.listdir(folder):
            if filename.endswith(".txt"):
                file_path = os.path.join(folder, filename)
                with open(file_path, "r", encoding="utf-8") as file:
                    text = file.read()
                texts = text_splitter.split_text(text)
                all_texts.extend(texts)
                metadata.extend([{"file_name": filename, "folder": folder}] * len(texts))
    
    vectorstore = FAISS.from_texts(all_texts, embeddings, metadatas=metadata)
    return vectorstore

In [5]:
vectorstore = encode_documents(folders)
print("FAISS vector store created successfully!")

  embeddings = OpenAIEmbeddings()


FAISS vector store created successfully!


In [6]:
chunks_query_retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

In [7]:
test_query = "Does METTLER TOLEDO's waive any provision?"
context = retrieve_context_per_question(test_query, chunks_query_retriever)
show_context(context)

  docs = chunks_query_retriever.get_relevant_documents(question)


Context 1:
Warranties; Disclaimers


Context 2:
(c) Class Action Waiver




In [8]:
evaluate_rag(chunks_query_retriever)

Answering the question from the retrieved context...
Answering the question from the retrieved context...
Answering the question from the retrieved context...
Answering the question from the retrieved context...
Answering the question from the retrieved context...
Answering the question from the retrieved context...
Answering the question from the retrieved context...
Answering the question from the retrieved context...
Answering the question from the retrieved context...
Answering the question from the retrieved context...


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 10 test case(s) in parallel: |          |  0% (0/10) [Time Taken: 00:00, ?test case/s]ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceed



Metrics Summary

  - ❌ Correctness (GEval) (score: 0.2627560891579086, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The actual output is not factually correct; it does not directly answer the question of whether METTLER TOLEDO waives any provision., error: None)
  - ✅ Faithfulness (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4, reason: None, error: None)
  - ❌ Contextual Relevancy (score: 0.5, threshold: 1.0, strict: False, evaluation model: gpt-4, reason: The score is 0.50 because although the 'Class Action Waiver' statement does provide some relevance to the input, the 'Warranties; Disclaimers' statement does not contribute any meaningful information about METTLER TOLEDO's waiving any provision., error: None)

For test case:

  - input: Does METTLER TOLEDO's waive any provision?
  - actual output: The context provided does not specify whether METTLER TOLEDO waives any provision related to warranties, disclaimers, or class action waivers. Mor


