In [34]:
import os
import numpy as np
import json
from openai import OpenAI
import re
import chromadb
import uuid
import gradio as gr
#import pickle

## Create File List

In [35]:
def get_file_list(dir_path):
    """Recursively list every file under ``dir_path``.

    Returns:
        A list with one dict per file, holding:
          - "file_path": the walked folder joined with the file name
          - "file_type": name of the folder that directly contains the file
          - "file_name": the file's base name
    """

    def _describe(folder, entry):
        # Build one record; the containing folder's name doubles as the file's category.
        full_path = os.path.join(folder, entry)
        return {
            "file_path": full_path,
            "file_type": os.path.basename(os.path.dirname(full_path)),
            "file_name": os.path.basename(full_path),
        }

    return [
        _describe(folder, entry)
        for folder, _subdirs, entries in os.walk(dir_path)
        for entry in entries
    ]
            


## Functions for Extracting and Chunking the Text from Files

In [36]:
def extract_text_from_file(file_path):
    """Return the contents of ``file_path`` decoded as UTF-8.

    The file is read as raw bytes and decoded with ``errors='ignore'``, so bytes
    that are not valid UTF-8 are dropped. On any exception the failure is printed
    and the string ``"Error reading file: <file_path>"`` is returned instead of
    raising.
    """
    print(f"Extracting text from {file_path}...")
    try:
        with open(file_path, 'rb') as handle:
            return handle.read().decode('utf-8', errors='ignore')
    except Exception as err:
        print(f"Failed to read file {file_path} with any encoding: {err}")
        return f"Error reading file: {file_path}"

def chunk_text(text, metadata, chunk_size=1000, overlap=200):
    """Split ``text`` into overlapping character windows, each with its own metadata.

    Args:
        text: The string to split.
        metadata: Dict copied into every chunk's metadata (the input is not mutated).
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        A list of ``{"text": ..., "metadata": ...}`` dicts. Each metadata dict
        extends ``metadata`` with "chunk_index", "start_char", "end_char" and
        "is_summary" (always False). Windows whose stripped length is 50
        characters or fewer are skipped.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``chunk_size``. The window
            could then never advance: a zero step makes ``range`` fail with an
            unrelated message and a negative step silently yields no chunks.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    chunks = []
    for start in range(0, len(text), step):
        piece = text[start:start + chunk_size]

        # Skip very small chunks (50 characters or fewer once stripped)
        if not piece or len(piece.strip()) <= 50:
            continue

        chunk_metadata = metadata.copy()
        chunk_metadata.update({
            "chunk_index": len(chunks),  # position among the chunks kept so far
            "start_char": start,
            "end_char": start + len(piece),
            "is_summary": False,
        })
        chunks.append({"text": piece, "metadata": chunk_metadata})

    return chunks

## Create Embeddings

In [37]:
def create_embeddings(texts, model="BAAI/bge-en-icl"):
    """Embed one or more texts through the module-level ``openai`` client.

    Args:
        texts: A list of strings, or a single string (treated as a one-item list).
        model: Embedding model name forwarded to ``openai.embeddings.create``.

    Returns:
        A list of embedding vectors, one per input text, in input order.
        Empty input (``[]``, ``""`` or ``None``) returns ``[]`` without calling the API.
    """
    # Handle empty input
    if not texts:
        return []

    # A bare string would otherwise be sliced into 100-character pieces by the
    # batching loop below, yielding one embedding per fragment.
    if isinstance(texts, str):
        texts = [texts]

    # Process in batches (requests are capped at 100 inputs each)
    batch_size = 100
    all_embeddings = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = openai.embeddings.create(
            model=model,
            input=batch
        )
        all_embeddings.extend(item.embedding for item in response.data)

    return all_embeddings

## Generate Summaries

In [38]:
def generate_file_summary(file_text):
    """Ask the chat model for a summary of ``file_text`` and return it.

    Sends a system prompt describing the summarization task plus the file text
    as the user message to ``gpt-4o-mini`` via the module-level ``openai``
    client, and returns the content of the first choice.
    """
    # Instructions for the summarization model
    system_prompt = """You are an expert summarization system.
    Create a detailed summary of the provided text. 
    Focus on capturing the main topics, key information, and important facts.
    Your summary should be comprehensive enough to understand what the file contains
    but more concise than the original."""

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Please summarize this text:\n\n{file_text}"},
    ]

    # Low temperature keeps the summary close to the source text
    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
        temperature=0.3,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

## Process all files to create Summaries & Chunks

In [39]:
def prepare_files(dir_path, chunk_size=200, overlap=50):
    """Summarize and chunk every file found under ``dir_path``.

    Args:
        dir_path: Root directory passed to ``get_file_list``.
        chunk_size: Characters per chunk, forwarded to ``chunk_text``.
        overlap: Characters shared by consecutive chunks, forwarded to ``chunk_text``.

    Returns:
        ``(summaries_list, file_chunks_list)``. Each summary entry is
        ``{"text": <summary>, "metadata": {"file_type", "file_name", "is_summary": True}}``;
        the chunk entries are whatever ``chunk_text`` returns for each file.

    Side effects:
        Assigns the module-level ``openai`` client and prints progress messages.
    """
    # generate_file_summary() reads the module-level name `openai`. Without this
    # declaration the assignment below would only create an unused local, and the
    # summary call would fail with NameError on a fresh kernel.
    global openai

    file_list = get_file_list(dir_path)
    summaries_list = []
    file_chunks_list = []
    openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    for file in file_list:
        file_text = extract_text_from_file(file["file_path"])

        file_metadata = {
            "file_type": file["file_type"],
            "file_name": file["file_name"]
        }

        print("Generating file summary...")
        summary_text = generate_file_summary(file_text)

        # The summary shares the file's metadata, flagged as a summary
        summary_metadata = file_metadata.copy()
        summary_metadata.update({"is_summary": True})
        summaries_list.append({
            "text": summary_text,
            "metadata": summary_metadata
        })

        file_chunks_list.extend(
            chunk_text(file_text, file_metadata, chunk_size=chunk_size, overlap=overlap)
        )

    print(f"Total number of chunks: {len(file_chunks_list)}")
    print(f"Total number of summaries: {len(summaries_list)}")

    return summaries_list, file_chunks_list









In [62]:
def create_category_summary(
    summaries_list,
    category_name,
    output_dir='/Users/harish/Harish_MAC/Learning/Projects/GenAI/unstructured_RAG/knowledge-base/summaries',
):
    """Write a markdown overview of all file summaries in one category.

    Args:
        summaries_list: Entries shaped like ``{"text": ..., "metadata": {"file_type", "file_name", ...}}``.
        category_name: Only entries whose ``file_type`` equals this value are included.
        output_dir: Directory that receives ``<category_name>_summary.md``. It is
            created if missing. Defaults to the path the notebook used originally.

    Returns:
        True once the file has been written.
    """
    sections = []
    for item in summaries_list:
        if item['metadata']['file_type'] == category_name:
            sections.append(
                f"File_Name: {item['metadata']['file_name']} \n\n"
                f"File_Text: {item['text']} \n\n"
                # Trailing newlines keep the next File_Name from being glued to the separator.
                "--------------------------------\n\n"
            )
    category_text = ''.join(sections)

    system_message = f"""
    You are an expert at creating summaries for any given context. 
    You will be given a context and you will need to create a summary for it.
    The summary should be in the same language as the context.
    Summary should contain statistics about the context,key information, and any other relevant information.
    The overview section should contain how many items of the {category_name} are there in the context.
    Summary should not exceed more than 1000 words.
    The output should be in markdown format.
    """
    user_message = f"""
    Create a summary for the following context:
    {category_text}
    """
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message}
    ]

    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages
    )

    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, f"{category_name}_summary.md")
    # Mode 'w' truncates any previous file; an explicit encoding keeps non-ASCII
    # model output from failing on platforms whose default encoding is not UTF-8.
    with open(file_path, 'w', encoding='utf-8') as summary_file:
        summary_file.write(response.choices[0].message.content or '')

    return True

# Write one overview file per knowledge-base category.
# NOTE(review): `summaries_list` is assigned by the "Add Embeddings to Chroma DB"
# cell further down, so on a fresh kernel this cell must run after that one.
create_category_summary(summaries_list, 'products')
create_category_summary(summaries_list, 'contracts')
create_category_summary(summaries_list, 'employees')





True

## Create embeddings for Summaries and File Chunks

In [40]:
def create_data_embeddings(dir_path):
    """Prepare all files under ``dir_path`` and embed their summaries and chunks.

    Returns:
        ``(summary_embeddings, chunk_embeddings, summaries_list, file_chunks_list)``;
        the two embedding lists come from ``create_embeddings`` with the
        ``text-embedding-3-small`` model, the other two from ``prepare_files``.
    """
    embedding_model = "text-embedding-3-small"
    summaries_list, file_chunks_list = prepare_files(dir_path)

    # File-level summaries first
    print("Creating embeddings for summaries of the files...")
    summary_embeddings = create_embeddings(
        [entry["text"] for entry in summaries_list],
        model=embedding_model,
    )

    # Then the detailed chunks
    print("Creating embeddings for the file chunks....")
    chunk_embeddings = create_embeddings(
        [entry["text"] for entry in file_chunks_list],
        model=embedding_model,
    )

    return summary_embeddings, chunk_embeddings, summaries_list, file_chunks_list

## Add Embeddings to Chroma DB

In [63]:
# Rebuild the Chroma collections from the knowledge-base directory.
dir_path = '/Users/harish/Harish_MAC/Learning/Projects/GenAI/unstructured_RAG/knowledge-base'
db_name = "chroma_db"

client = chromadb.PersistentClient(path=db_name)

# Start from empty collections on every run. Creating each collection before
# deleting it means the delete cannot fail when the database directory exists
# but a collection is missing (e.g. after an interrupted earlier run).
for collection_name in ("file_summaries", "file_chunks"):
    client.get_or_create_collection(collection_name)
    client.delete_collection(collection_name)

summary_collection = client.get_or_create_collection("file_summaries")
chunks_collection = client.get_or_create_collection("file_chunks")

summary_embeddings, chunk_embeddings, summaries_list, file_chunks_list = create_data_embeddings(dir_path)

## Collect everything needed for adding info to the summary collection
summary_metadata = [summary["metadata"] for summary in summaries_list]
summary_texts = [summary["text"] for summary in summaries_list]
# The file name is the summary id; retrieve_hierarchically() reuses these ids as
# the `file_name` filter for chunks, so two files sharing a name would collide.
summary_ids = [summary["metadata"]["file_name"] for summary in summaries_list]

# Skip the insert when nothing was found instead of sending empty lists
if summary_ids:
    summary_collection.add(
        embeddings=summary_embeddings,
        metadatas=summary_metadata,
        documents=summary_texts,
        ids=summary_ids,
    )

## Collect everything needed for adding info to the chunk collection
chunk_metadata = [chunk["metadata"] for chunk in file_chunks_list]
chunk_texts = [chunk["text"] for chunk in file_chunks_list]
chunk_ids = [f"chunk_{uuid.uuid4().hex[:8]}_{i}" for i in range(len(chunk_embeddings))]

if chunk_ids:
    chunks_collection.add(
        embeddings=chunk_embeddings,
        metadatas=chunk_metadata,
        documents=chunk_texts,
        ids=chunk_ids,
    )



Extracting text from /Users/harish/Harish_MAC/Learning/Projects/GenAI/unstructured_RAG/knowledge-base/products/Rellm.md...
Generating file summary...
Extracting text from /Users/harish/Harish_MAC/Learning/Projects/GenAI/unstructured_RAG/knowledge-base/products/Markellm.md...
Generating file summary...
Extracting text from /Users/harish/Harish_MAC/Learning/Projects/GenAI/unstructured_RAG/knowledge-base/products/Homellm.md...
Generating file summary...
Extracting text from /Users/harish/Harish_MAC/Learning/Projects/GenAI/unstructured_RAG/knowledge-base/products/Carllm.md...
Generating file summary...
Extracting text from /Users/harish/Harish_MAC/Learning/Projects/GenAI/unstructured_RAG/knowledge-base/contracts/Contract with GreenField Holdings for Markellm.md...
Generating file summary...
Extracting text from /Users/harish/Harish_MAC/Learning/Projects/GenAI/unstructured_RAG/knowledge-base/contracts/Contract with EverGuard Insurance for Rellm - AI-Powered Enterprise Reinsurance Solution.m

## Define retrieval functions for summaries and chunks

In [64]:
def retrieve_hierarchically(query, k_summaries=3, k_chunks=5):
    """Two-stage retrieval: pick the best-matching files, then chunks within each.

    Args:
        query: The user's question as a string.
        k_summaries: Number of file summaries to retrieve from ``summary_collection``.
        k_chunks: Number of chunks to retrieve per selected file from ``chunks_collection``.

    Returns:
        A list with one ``chunks_collection.query`` result per selected file, in
        the order the summaries were returned. A blank query returns ``[]``.
    """
    if not query or not query.strip():
        return []

    # Pass a one-item list: create_embeddings batches by slicing its input, and
    # slicing a bare string would cut a long query into 100-character pieces.
    query_embedding = create_embeddings([query], model="text-embedding-3-small")

    summary_results = summary_collection.query(
        query_embeddings=query_embedding,
        n_results=k_summaries,
        where={"is_summary": True}
    )

    # Summary ids are file names, so they double as the chunk filter
    chunk_results = []
    for file_list in summary_results['ids']:
        for file in file_list:
            chunk_results.append(
                chunks_collection.query(
                    query_embeddings=query_embedding,
                    n_results=k_chunks,
                    where={"file_name": file}
                )
            )

    return chunk_results


In [65]:
def generate_response(query, retrieved_chunks):
    """Answer ``query`` with the chat model, grounded in the retrieved chunks.

    Args:
        query: The user's question.
        retrieved_chunks: List of query results as returned by
            ``retrieve_hierarchically``; the documents of the first (only) query
            in each result are used as context.

    Returns:
        The content of the model's first choice.
    """
    documents = []
    for result in retrieved_chunks:
        per_query_docs = result['documents'] or [[]]
        documents.extend(per_query_docs[0])
    # Join across ALL results so the last chunk of one file is not glued to the
    # first chunk of the next.
    retrieved_context = "\n\n".join(documents)

    # Sentences are separated by spaces so the prompt does not run together.
    system_message = " ".join([
        "You are a helpful AI assistant answering questions based on the provided context.",
        "Use the information from the context to answer the user's question accurately.",
        "Except for general greetings, if the context doesn't contain relevant information, "
        "Say 'I am sorry, I don't have that information'. Do not hallucinate",
    ])

    user_message = f"Context: {retrieved_context}\n\nQuestion: {query}"

    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ]
    )

    return response.choices[0].message.content




### Putting it all together

In [66]:
def chat(message, history):
    """Gradio chat callback: retrieve context for ``message`` and return the reply.

    ``history`` is accepted because the chat interface passes it, but it is not used.
    """
    retrieved = retrieve_hierarchically(message, k_summaries=3, k_chunks=5)
    return generate_response(message, retrieved)

In [67]:
# Module-level OpenAI client. The helper functions in this notebook refer to the
# global name `openai`, so it must be assigned before they are called.
# The API key is read from the OPENAI_API_KEY environment variable.
openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))   
# Expose `chat` through a Gradio chat interface and start the local server.
gr.ChatInterface(fn=chat, type="messages").launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7867
* To create a public link, set `share=True` in `launch()`.




In [56]:
# Sanity check: how many embeddings were produced, and what shape they have as arrays.
summary_array = np.array(summary_embeddings)
chunk_array = np.array(chunk_embeddings)

for embedding_count in (len(summary_embeddings), len(chunk_embeddings)):
    print(embedding_count)

for embedding_matrix in (summary_array, chunk_array):
    print(embedding_matrix.shape)

31
597
(31, 1536)
(597, 1536)
