# DATA PREPARATION PIPELINE

In [None]:
# Mount Google Drive at /content/drive; the data folder and the vector
# database used further down are both addressed below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install beautifulsoup4 langchain sentence-transformers chromadb -q
import os
import glob
from bs4 import BeautifulSoup

In [None]:
# The DEFINITIVE Document Loading Logic (Using Your Findings)

def load_and_clean_document(file_path):
    """
    Load one exported HTML page and reduce it to a title plus plain text.

    The title is the first non-empty candidate among the <title> tag, the
    first <h1>, and (as a last resort) a prettified version of the file
    name. The body text comes from the element with id 'main-content' and
    class 'wiki-content group' when that element exists; otherwise the text
    of the whole page is used.

    Args:
        file_path: Path to an HTML file that can be read as UTF-8.

    Returns:
        A dict with the keys 'title', 'text' and 'source_file' (the base
        name of the file), or None if anything fails while reading or
        parsing. Failures are reported with print() instead of being
        raised, so callers must check for None.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, 'html.parser')

        # --- Title: <title> -> first <h1> -> file name ---
        # A candidate is only accepted when it is non-empty after stripping,
        # so a blank <title> or <h1> falls through to the next candidate
        # instead of producing an empty title.
        title = ''
        if soup.title and soup.title.string:
            title = soup.title.string.strip()
        if not title:
            heading = soup.find('h1')  # looked up once and reused
            if heading:
                title = heading.get_text(strip=True)
        if not title:
            filename = os.path.basename(file_path)
            title = filename.replace('_', ' ').replace('-', ' ').replace('.html', '').capitalize()

        # --- Content: prefer the dedicated content container ---
        # Matching on both id and the exact class string keeps the search
        # narrow; pages without that container fall back to the whole page.
        content_chest = soup.find(id='main-content', class_='wiki-content group')
        text_root = content_chest if content_chest else soup
        clean_text = text_root.get_text(separator=' ', strip=True)

        return {
            'title': title,
            'text': clean_text,
            'source_file': os.path.basename(file_path)
        }
    except Exception as e:
        # Best effort by design: report the problem and let the caller skip
        # this file.
        print(f"Error processing file {file_path}: {e}")
        return None

In [None]:
base_folder_path = '/content/drive/My Drive/AI_Internship_Project/data/'

# Recursively collect every .html file below the data folder.
html_files = glob.glob(
    os.path.join(base_folder_path, '**', '*.html'),
    recursive=True,
)

print(f"Found {len(html_files)} HTML files to process.")

# Smoke test: run the loader on a single file before processing everything.
if not html_files:
    print("No HTML files were found. Please check your `base_folder_path`.")
else:
    first_file_path = html_files[0]
    document_data = load_and_clean_document(first_file_path)

    # The loader returns None on failure (and prints the reason itself).
    if document_data:
        print("\n--- Successfully Processed First File ---")
        print(f"Source File: {document_data['source_file']}")
        print(f"Found Title: {document_data['title']}")
        print("\n--- First 500 Characters of Cleaned Text ---")
        print(document_data['text'][:500])

Found 479 HTML files to process.

--- Successfully Processed First File ---
Source File: Flow-Explorer-application_324469701.html
Found Title: PS Team Master Authorizers : Flow Explorer application

--- First 500 Characters of Cleaned Text ---
Functional Requirements Requirement Description Show granted Authz List the Entry authz with missing Exit List the Exit authz with missing Entry Show declined Authz List the not granted Entry/Exit authz with response message, this includes both use cases Declined Authz NOTFOUND customer Filtering Filter includes simple and combined filter conditions Manual Booking Operator should be able to close Opened Entry / Exit Authz through manual booking I18N Localization of the table header & table conte


In [None]:
# THE FULL PIPELINE

import time
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Part 1: Load and clean every HTML document found
print("--- Starting Part 1: Loading and Cleaning All Documents ---")
start_time = time.time()

# Collect every .html file anywhere below the project's data folder.
base_folder_path = '/content/drive/My Drive/AI_Internship_Project/data/'
html_files = glob.glob(
    os.path.join(base_folder_path, '**', '*.html'),
    recursive=True,
)
print(f"Found {len(html_files)} HTML files.")

# The loader returns None for files it could not process; those are left out.
documents = []
for file_path in html_files:
    doc_data = load_and_clean_document(file_path)
    if not doc_data:
        continue
    documents.append(doc_data)

end_time = time.time()
print(f"Successfully loaded and cleaned {len(documents)} documents in {end_time - start_time:.2f} seconds.")


# Part 2: Chunk the clean documents for the AI
print("\n--- Starting Part 2: Chunking All Documents ---")
start_time = time.time()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Every chunk carries the title and source file of the document it came from.
all_chunks = []
for doc in documents:
    # Documents with 100 characters of text or fewer are not chunked at all.
    if len(doc['text']) <= 100:
        continue
    all_chunks.extend(
        {
            'page_content': chunk_text,
            'metadata': {
                'title': doc['title'],
                'source': doc['source_file']
            }
        }
        for chunk_text in text_splitter.split_text(doc['text'])
    )

end_time = time.time()
print(f"Created a total of {len(all_chunks)} chunks from {len(documents)} documents in {end_time - start_time:.2f} seconds.")

# Verification Step: show a short preview of the first few chunks.
print("\n--- Sample of the first 3 chunks ---")
for i, chunk in enumerate(all_chunks[:3]):
    print(f"\n--- Chunk {i+1} ---")
    print(f"Content: {chunk['page_content'][:200]}...")
    print(f"Metadata: {chunk['metadata']}")

--- Starting Part 1: Loading and Cleaning All Documents ---
Found 479 HTML files.
Successfully loaded and cleaned 479 documents in 24.55 seconds.

--- Starting Part 2: Chunking All Documents ---
Created a total of 1400 chunks from 479 documents in 0.37 seconds.

--- Sample of the first 3 chunks ---

--- Chunk 1 ---
Content: Functional Requirements Requirement Description Show granted Authz List the Entry authz with missing Exit List the Exit authz with missing Entry Show declined Authz List the not granted Entry/Exit aut...
Metadata: {'title': 'PS Team Master Authorizers : Flow Explorer application', 'source': 'Flow-Explorer-application_324469701.html'}

--- Chunk 2 ---
Content: Consumers details are displayed in topic transactionmanager.event.1 with Entry&Exit events as shown: Entry Event { "source": "com.scheidtbachmann.phfa.message.TransactionMessage", "time": "2024-03-26T...
Metadata: {'title': 'PS Team Master Authorizers : Kafka event consumer details', 'source': 'Kafka-event-cons

In [None]:
# THE AI CORE

import chromadb
from sentence_transformers import SentenceTransformer
import time

print("--- Starting Part 3: Embedding and Indexing ---")
start_time = time.time()

# Step 1: Initialize the Embedding Model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')

# Step 2: Initialize the Vector Database (persisted on the mounted Drive)
db_path = '/content/drive/My Drive/AI_Internship_Project/chroma_db'
client = chromadb.PersistentClient(path=db_path)

# Step 3: (Re)create the collection so that re-running this cell starts from
# an empty index instead of failing on ids that already exist.
collection_name = "confluence_docs"
if collection_name in [c.name for c in client.list_collections()]:
    client.delete_collection(name=collection_name)

collection = client.create_collection(name=collection_name)

# Step 4: Add all the chunks to the database
# The id of a chunk is simply its position in `all_chunks`.
ids = [str(i) for i in range(len(all_chunks))]
documents_to_embed = [chunk['page_content'] for chunk in all_chunks]
metadatas_to_store = [chunk['metadata'] for chunk in all_chunks]

# Embed and store the chunks batch by batch.
batch_size = 32
# Ceiling division, so the reported total is also correct when the number of
# chunks is an exact multiple of batch_size.
total_batches = (len(documents_to_embed) + batch_size - 1) // batch_size
for batch_number, i in enumerate(range(0, len(documents_to_embed), batch_size), start=1):
    batch_docs = documents_to_embed[i:i+batch_size]
    batch_ids = ids[i:i+batch_size]
    batch_metadatas = metadatas_to_store[i:i+batch_size]

    # The embedding model converts our text to vectors
    embeddings = embedding_model.encode(batch_docs).tolist()

    # Add the batch to the collection
    collection.add(
        embeddings=embeddings,
        documents=batch_docs,
        metadatas=batch_metadatas,
        ids=batch_ids
    )
    print(f"  Processed batch {batch_number}/{total_batches}")


end_time = time.time()
print(f"\nSuccessfully created and indexed the vector database in {end_time - start_time:.2f} seconds.")
print(f"The database contains {collection.count()} chunks.")

# Verification Step
print("\n--- Testing the Search Functionality ---")

query_text = "What are the functional requirements for authorization?"
# Embed the query with the same model that produced the stored vectors.
# Passing `query_texts` instead would make Chroma embed the query with the
# collection's own embedding function, which is not `embedding_model`.
query_embedding = embedding_model.encode([query_text]).tolist()
results = collection.query(
    query_embeddings=query_embedding,
    n_results=3  # Ask for the top 3 most relevant results
)

print(f"Query: '{query_text}'")
print("\n--- Top 3 Results ---")
for i, (doc, meta) in enumerate(zip(results['documents'][0], results['metadatas'][0])):
    print(f"\n--- Result {i+1} ---")
    print(f"Text: {doc}")
    print(f"Metadata: {meta}")

--- Starting Part 3: Embedding and Indexing ---
  Processed batch 1/44
  Processed batch 2/44
  Processed batch 3/44
  Processed batch 4/44
  Processed batch 5/44
  Processed batch 6/44
  Processed batch 7/44
  Processed batch 8/44
  Processed batch 9/44
  Processed batch 10/44
  Processed batch 11/44
  Processed batch 12/44
  Processed batch 13/44
  Processed batch 14/44
  Processed batch 15/44
  Processed batch 16/44
  Processed batch 17/44
  Processed batch 18/44
  Processed batch 19/44
  Processed batch 20/44
  Processed batch 21/44
  Processed batch 22/44
  Processed batch 23/44
  Processed batch 24/44
  Processed batch 25/44
  Processed batch 26/44
  Processed batch 27/44
  Processed batch 28/44
  Processed batch 29/44
  Processed batch 30/44
  Processed batch 31/44
  Processed batch 32/44
  Processed batch 33/44
  Processed batch 34/44
  Processed batch 35/44
  Processed batch 36/44
  Processed batch 37/44
  Processed batch 38/44
  Processed batch 39/44
  Processed batch 40/44
 

In [None]:
# Part 4: Building the Generation Engine

# Step 1: Install the necessary library for the LLM
!pip install ctransformers

from ctransformers import AutoModelForCausalLM
import textwrap

# --- Part 4: Building the Generation Engine (Corrected) ---


from ctransformers import AutoModelForCausalLM
import textwrap

# Step 2: Load the Open-Source LLM with Increased Context
# Loads the GGUF model file named by `model_file` from the
# "TheBloke/Mistral-7B-Instruct-v0.2-GGUF" model repository and binds it to
# the module-level name `llm`, which get_rag_response() calls later.
print("--- Loading the LLM ---")
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    model_file="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    model_type="mistral",
    # NOTE(review): gpu_layers=0 presumably keeps every layer on the CPU --
    # confirm against the ctransformers documentation.
    gpu_layers=0,
    # NOTE(review): judging by the name, prompt plus generated tokens have to
    # fit into this window -- confirm against the ctransformers documentation.
    context_length=4096
)
print("--- LLM Loaded Successfully ---")

# Step 3: The RAG Function
def get_rag_response(query, n_results=5):
    """
    Answer a question with retrieval-augmented generation.

    The query is embedded, the `n_results` closest chunks are fetched from
    the vector store, the chunks are placed into an [INST] ... [/INST]
    prompt, the LLM generates an answer, and a list of source files is
    appended to that answer.

    Relies on the module-level objects `embedding_model`, `collection` and
    `llm` created in earlier cells.

    Args:
        query: The user's question as a string.
        n_results: Number of chunks to retrieve as context (default 5).

    Returns:
        The generated answer followed by a "Sources:" section that lists
        every distinct source file once, in retrieval order.
    """
    print(f"1. Starting retrieval for query: '{query}'")
    # RETRIEVAL
    # Embed the query with the same model that produced the stored vectors,
    # then fetch the closest chunks from the vector store.
    query_embedding = embedding_model.encode([query]).tolist()
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=n_results
    )
    retrieved_docs = results['documents'][0]
    retrieved_metadata = results['metadatas'][0]

    # AUGMENTATION
    # Combine the retrieved chunks into a single context string.
    context = "\n\n---\n\n".join(retrieved_docs)
    print("2. Retrieved context successfully.")

    # GENERATION
    # The prompt is assembled line by line so that the indentation of this
    # function body does not end up inside the text sent to the model.
    # The [INST] and [/INST] tags are specific to the Mistral Instruct model.
    prompt = "\n".join([
        "[INST]",
        "You are an expert technical assistant. Your task is to answer the user's question based ONLY on the following context from Confluence documentation.",
        "If the context does not contain the answer, state that you cannot find the information in the provided documents.",
        "Do not use any prior knowledge. Be concise and quote the source document when possible.",
        "",
        "CONTEXT:",
        context,
        "---",
        "QUESTION:",
        query,
        "[/INST]",
    ])

    print("3. Generating response...")
    response = llm(prompt, max_new_tokens=256, temperature=0.1)

    print("4. Response generated.")

    # Source citations: dict.fromkeys() removes duplicates while keeping the
    # retrieval order, so the output is deterministic and the best match is
    # listed first. Entries without a usable 'source' are skipped.
    sources = list(dict.fromkeys(
        meta['source']
        for meta in retrieved_metadata
        if meta and meta.get('source')
    ))
    source_str = "\n\nSources:\n" + "\n".join(f"- {source}" for source in sources)

    return response + source_str


#  Verification Step
print("\n--- Testing the Full RAG Pipeline ---")

query_text = "What are the functional requirements for authorization?"
final_answer = get_rag_response(query_text)

print("\n--- Final Generated Answer ---")

# Wrap every line on its own. Calling textwrap.fill() on the whole answer
# would replace the newlines with spaces and merge the "Sources:" list into
# the paragraph above it.
wrapped_answer = "\n".join(
    textwrap.fill(line, width=100) for line in final_answer.splitlines()
)
print(wrapped_answer)

--- Loading the LLM ---


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

--- LLM Loaded Successfully ---

--- Testing the Full RAG Pipeline ---
1. Starting retrieval for query: 'What are the functional requirements for authorization?'
2. Retrieved context successfully.
3. Generating response...
4. Response generated.

--- Final Generated Answer ---
Based on the provided context, the functional requirements for authorization include:  1. Securing
the portal by setting some pages as public and others as private via path or route protection. Users
must be authenticated to access certain page routes. (V1) 2. Protecting pages by role/permission
requirement at the application level. (V2) 3. Extending authentication to support the try it button
so that users have the ability to run real calls to APIs in a TEST tenant. (V2) 4. Configuring the
TEST tenant with users and roles allowed to use the API's. (V2) 5. Retrieving the authenticated
user's token and inserting it in the try it call as a header to run a real API call. (V2) 6.
Ignoring presence check parameter con

In [None]:
#  test a new, process-oriented question
print("\n--- Testing a 'How To' Question ---")
query_text = "What do I do In case of virtual machine crashing repeatedly" # A completely different type of query
final_answer = get_rag_response(query_text)

print("\n--- Final Generated Answer ---")
# Wrap every line on its own. Calling textwrap.fill() on the whole answer
# would replace the newlines with spaces and merge the "Sources:" list into
# the paragraph above it.
wrapped_answer = "\n".join(
    textwrap.fill(line, width=100) for line in final_answer.splitlines()
)
print(wrapped_answer)


--- Testing a 'How To' Question ---
1. Starting retrieval for query: 'What do I do In case of virtual machine crashing repeatedly'
2. Retrieved context successfully.
3. Generating response...


KeyboardInterrupt: 