## openai_rag_multi_query

In [1]:
import os
from dotenv import load_dotenv
import requests
import json
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import PyPDF2
from bs4 import BeautifulSoup

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()
# API key sent as the Bearer token in the request headers below; None if the variable is unset.
api_key = os.getenv("OPENAI_API_KEY")

In [3]:
def read_documents_from_directory(directory_path):
    """
    Reads all .txt and .pdf documents from a specified directory.

    Files are visited in sorted filename order so the returned list is
    deterministic (os.listdir makes no ordering guarantee). Subdirectories
    are not traversed and files with any other extension are ignored.

    Parameters:
    directory_path (str): Path to the directory containing the documents.

    Returns:
    list: List of strings, each representing the content of a document.
    """
    documents = []

    for filename in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, filename)
        if filename.endswith(".txt"):
            with open(file_path, 'r', encoding='utf-8') as file:
                documents.append(file.read())
        elif filename.endswith(".pdf"):
            with open(file_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # Defensive: treat a page that yields no text (None/empty)
                # as an empty string instead of failing on concatenation.
                text = "".join(page.extract_text() or "" for page in reader.pages)
                documents.append(text)

    return documents

def generate_multi_queries(question, num_queries=3):
    """
    Generates multiple queries from an initial question.

    Sends a single chat-completion request to the OpenAI API and splits the
    model's reply into one query per non-blank line.

    Parameters:
    question (str): The initial question.
    num_queries (int): The number of queries to generate.

    Returns:
    list: List of generated queries (blank lines in the reply are dropped).

    Raises:
    Exception: If the API responds with a non-200 status code.
    requests.exceptions.RequestException: On network failure or timeout.
    """
    prompt = f"Generate {num_queries} different but related queries based on the following question: '{question}'. Ensure that each query focuses on different aspects relevant to the topic."

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": "gpt-4",
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.7
    }

    # Without a timeout, requests waits indefinitely on a stalled connection.
    # (connect timeout, read timeout) in seconds.
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=(10, 120),
    )

    if response.status_code != 200:
        raise Exception(f"Error {response.status_code}: {response.text}")

    content = response.json()["choices"][0]["message"]["content"]
    # The model often separates items with blank lines; an empty query would
    # produce an all-zero TF-IDF vector and retrieve arbitrary documents.
    return [line.strip() for line in content.splitlines() if line.strip()]

def retrieve_documents(query, documents, vectorizer, top_k=3):
    """
    Retrieves documents relevant to a query using a TF-IDF vectorizer.

    Documents are ranked by cosine similarity between the query vector and
    each document vector, most similar first.

    Parameters:
    query (str): The query.
    documents (list): List of all available documents.
    vectorizer (TfidfVectorizer): The TF-IDF vectorizer fitted on the documents.
    top_k (int): Maximum number of documents to return (default 3).

    Returns:
    list: Up to top_k documents, ordered from most to least similar.
    """
    # A non-positive top_k would make the [-top_k:] slice below return
    # everything (or the wrong end), so short-circuit explicitly.
    if top_k <= 0 or len(documents) == 0:
        return []

    query_vec = vectorizer.transform([query])
    doc_vecs = vectorizer.transform(documents)
    similarities = cosine_similarity(query_vec, doc_vecs).flatten()
    # argsort is ascending: take the last top_k indices and reverse them.
    ranked_indices = similarities.argsort()[-top_k:][::-1]
    return [documents[i] for i in ranked_indices]

def fuse_documents(documents):
    """
    Concatenates several documents into one block of text.

    Parameters:
    documents (list): Document strings to combine, in order.

    Returns:
    str: The documents separated by a single newline character.
    """
    separator = "\n"
    fused_text = separator.join(documents)
    return fused_text

def split_text_into_chunks(text, chunk_size=2000):
    """
    Splits the text into smaller chunks to fit within the context length limit.

    Chunks are consecutive, non-overlapping slices of `text`; the last chunk
    may be shorter than `chunk_size`. Sizes are measured in characters.

    Parameters:
    text (str): The text to be split.
    chunk_size (int): The maximum size of each chunk; must be positive.

    Returns:
    list: List of text chunks (empty list for empty text).

    Raises:
    ValueError: If chunk_size is not a positive integer.
    """
    # A negative step would make range() empty and silently drop all text;
    # zero already raises ValueError from range(), so fail uniformly.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

def generate_answer(fused_docs, question):
    """
    Generates an answer from fused documents using the OpenAI API, ensuring the response addresses the question.

    The fused text is split into chunks with split_text_into_chunks and one
    chat-completion request is sent per chunk; the per-chunk answers are
    joined with a single space. If fused_docs is empty no request is made
    and an empty string is returned.

    Parameters:
    fused_docs (str): Fused documents.
    question (str): The initial question.

    Returns:
    str: Generated answer.

    Raises:
    Exception: If the API responds with a non-200 status code for any chunk.
    requests.exceptions.RequestException: On network failure or timeout.
    """
    # Loop-invariant request settings are built once, not once per chunk.
    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    answers = []
    for chunk in split_text_into_chunks(fused_docs):
        prompt = f"Based on the following information, generate a comprehensive answer to the question: '{question}'. Ensure that the answer directly addresses the question and provides relevant details.\n\n{chunk}"

        payload = {
            "model": "gpt-4",
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "temperature": 0.7
        }

        # Without a timeout, requests waits indefinitely on a stalled
        # connection. (connect timeout, read timeout) in seconds.
        response = requests.post(url, headers=headers, json=payload, timeout=(10, 120))

        if response.status_code != 200:
            raise Exception(f"Error {response.status_code}: {response.text}")

        answers.append(response.json()["choices"][0]["message"]["content"].strip())

    return " ".join(answers)

def openai_rag_multi_query(question, documents):
    """
    Implements the RAG (Retrieval Augmented Generation) process with multi-query.

    Steps: generate several related queries, fit a TF-IDF vectorizer on the
    corpus, retrieve the top documents for each query, de-duplicate them,
    fuse them into one text and ask the model for a final answer.

    Parameters:
    question (str): The initial question.
    documents (list): List of all available documents.

    Returns:
    str: Generated answer.

    Raises:
    ValueError: If no documents are supplied.
    """
    # Fail before spending an API call: the vectorizer cannot be fitted on
    # an empty corpus anyway.
    if not documents:
        raise ValueError("No documents provided for retrieval.")

    # Step 1: Generate multiple queries
    queries = generate_multi_queries(question)

    # Step 2: Vectorize documents
    vectorizer = TfidfVectorizer().fit(documents)

    # Step 3: Retrieve relevant documents for each query
    all_relevant_docs = []
    for query in queries:
        all_relevant_docs.extend(retrieve_documents(query, documents, vectorizer))

    # Related queries usually hit the same documents; keep the first
    # occurrence of each so the same text is not fused (and sent to the
    # API) several times. dict.fromkeys preserves insertion order.
    unique_docs = list(dict.fromkeys(all_relevant_docs))

    # Step 4: Fuse documents to generate a final answer
    fused_docs = fuse_documents(unique_docs)
    return generate_answer(fused_docs, question)


In [4]:
# Directory scanned for .txt and .pdf files; machine-specific absolute path.
directory_path = "/Users/simon-pierreboucher/Desktop/llm-toolkit"  # Replace this with the correct path
# Load every supported document in that directory into memory as plain strings.
documents = read_documents_from_directory(directory_path)
# The user question that drives query generation and the final answer.
question = "Tell me about the impact of climate change on the environment and potential solutions."

In [5]:
# Run the full multi-query RAG pipeline (this makes several OpenAI API calls) and show the result.
answer = openai_rag_multi_query(question, documents)
print(answer)

Climate change is a systematic and long-term change in the state of the atmosphere over multiple decades, or more. The consensus among climate scientists is that climate change is occurring now, and is primarily driven by human activity. There is also a strong agreement that we can take actions to reduce its impacts and progression. 

The impacts of climate change on the environment are profound and multifaceted. It affects weather patterns, contributes to the warming of the Earth's surface, alters precipitation distribution leading to changes in water availability, and contributes to sea-level rise due to melting ice caps and glaciers. These changes are beyond the range of natural climate variability and can have severe impacts on ecosystems, biodiversity, and human societies. 

In terms of potential solutions, reducing greenhouse gas emissions, particularly carbon dioxide, is the key to mitigating climate change. This can be accomplished through the use of renewable energy sources, i