In [14]:
# CADDIE_RAG_MCP_FULL_DEMO.ipynb (CLIENT-ONLY VERSION)

# 1. Install dependencies (uncomment to run)
# !pip install pandas PyPDF2 requests chromadb sentence-transformers llama-cpp-python

# 2. Import essentials
import os, glob, uuid
import pandas as pd
import requests

# For LLM (replace with HuggingFace if preferred)
try:
    from llama_cpp import Llama
except ImportError:
    Llama = None

# ---- CONFIGURATION ---- #
CHROMA_PATH = "chroma_db"
PDF_FOLDER = "pdf_regulations"
COMPANY_CSV = "./acme enterprises/trustlayer_intake_example_cleaned_final_v3.csv"
REG_CSV = "external_regulations.csv"
MEMORY_PORT = 8001
GRC_PORT = 8002
EC2_HOST = "54.165.180.47"  # <-- Update to your EC2 public IP or DNS
MEMORY_URL = f"http://{EC2_HOST}:{MEMORY_PORT}/mcp"
GRC_URL = f"http://{EC2_HOST}:{GRC_PORT}/mcp"

# ---- (Optional) DATA SETUP & INDEXING ---- #
# Skip if your data is already indexed in Chroma and you just want to use the servers.
# If you need to index or re-index, uncomment and run the block below ONCE.

#"""
import chromadb
from chromadb.utils import embedding_functions

print("STEP 1: Indexing data to Chroma vector DB ...")
client = chromadb.PersistentClient(path=CHROMA_PATH)
embed_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# 2. Company Info to Chroma
if not os.path.exists(COMPANY_CSV):
    raise FileNotFoundError("Missing: " + COMPANY_CSV)
company_df = pd.read_csv(COMPANY_CSV)
company_texts = company_df.apply(lambda row: " ".join(row.dropna().astype(str)), axis=1).tolist()
company_ids = [f"company_{i}" for i in range(len(company_texts))]
company_coll = client.get_or_create_collection("company_info", embedding_function=embed_fn)
company_coll.add(ids=company_ids, documents=company_texts, metadatas=[{"source": "company_info"}]*len(company_texts))

# 3. Regulations to Chroma
if not os.path.exists(REG_CSV):
    raise FileNotFoundError("Missing: " + REG_CSV)
reg_df = pd.read_csv(REG_CSV)
reg_texts = reg_df.apply(lambda row: " ".join(row.dropna().astype(str)), axis=1).tolist()
reg_ids = [f"reg_{i}" for i in range(len(reg_texts))]
reg_coll = client.get_or_create_collection("regulations", embedding_function=embed_fn)
reg_coll.add(ids=reg_ids, documents=reg_texts, metadatas=[{"source": "regulation"}]*len(reg_texts))

# 4. PDFs to Chroma
import PyPDF2
pdf_texts, pdf_ids, pdf_metas = [], [], []
if not os.path.exists(PDF_FOLDER):
    raise FileNotFoundError("Missing folder: " + PDF_FOLDER)
for pdf_path in glob.glob(f"{PDF_FOLDER}/*.pdf"):
    pdf = PyPDF2.PdfReader(open(pdf_path, "rb"))
    for i, page in enumerate(pdf.pages):
        text = page.extract_text() or ""
        if len(text.strip()) > 0:
            pdf_texts.append(text)
            pdf_ids.append(f"{os.path.basename(pdf_path)}_p{i}")
            pdf_metas.append({"file": os.path.basename(pdf_path), "page": i})
pdf_coll = client.get_or_create_collection("pdfs", embedding_function=embed_fn)
pdf_coll.add(ids=pdf_ids, documents=pdf_texts, metadatas=pdf_metas)

print("Data indexed in Chroma.\n")
#"""



STEP 1: Indexing data to Chroma vector DB ...


  from tqdm.autonotebook import tqdm, trange
2025-05-16 12:25:41.651031: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-16 12:25:41.668752: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-16 12:25:41.688914: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-16 12:25:41.695020: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-16 12:25:41.710172

Data indexed in Chroma.



In [15]:
import requests, uuid, os
from llama_cpp import Llama

# --- Configuration ---
MEMORY_URL = "http://54.165.180.47:8001/mcp"
GRC_URL = "http://54.165.180.47:8002/mcp"
session_id = "user123"
llm_model_path = "../../../wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.gguf.bin"

# --- Helper for MCP calls ---
def mcp_call(url, method, params):
    payload = {
        "jsonrpc": "2.0",
        "id": str(uuid.uuid4()),
        "method": method,
        "params": params
    }
    r = requests.post(url, json=payload)
    return r.json()

# --- Load Llama model ---
print("Model file exists:", os.path.exists(llm_model_path))
llm_loaded = False
if os.path.exists(llm_model_path):
    llm = Llama(model_path=llm_model_path, n_ctx=2048)
    llm_loaded = True


llama_model_loader: loaded meta data with 18 key-value pairs and 363 tensors from ../../../wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.gguf.bin (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = wizardlm-13b-v1.1-superhot-8k.ggmlv3....
llama_model_loader: - kv   2:                        general.description str              = converted from legacy GGJTv3 format
llama_model_loader: - kv   3:                       llama.context_length u32              = 2048
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   5:                          llama.block_count u32              = 40
llama_model_loader: - kv   6:                  llama.feed_forward_length u32              = 13

Model file exists: True


llm_load_tensors:        CPU buffer size =  6983.62 MiB
....................................................................................................
llama_new_context_with_model: n_ctx      = 2048
llama_new_context_with_model: n_batch    = 512
llama_new_context_with_model: n_ubatch   = 512
llama_new_context_with_model: flash_attn = 0
llama_new_context_with_model: freq_base  = 10000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cache_init:        CPU KV buffer size =  1600.00 MiB
llama_new_context_with_model: KV self size  = 1600.00 MiB, K (f16):  800.00 MiB, V (f16):  800.00 MiB
llama_new_context_with_model:        CPU  output buffer size =     0.12 MiB
llama_new_context_with_model:        CPU compute buffer size =   204.01 MiB
llama_new_context_with_model: graph nodes  = 1286
llama_new_context_with_model: graph splits = 1
AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 1 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 |

In [20]:
# !pip install openai
import os
from openai import OpenAI
os.environ['OPENAI_API_KEY'] = ""  # Replace with your real key or set as an env variable
openai_client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))


In [21]:
def format_hits(hits, label, maxlen=350):
    out = []
    for hit in hits[:2]:
        meta = hit.get('meta', {})
        ref = f"[{label}] {meta.get('file', meta.get('source', 'N/A'))}"
        if 'page' in meta:
            ref += f", page {meta['page']}"
        text = hit.get('text', '').replace('\n', ' ').strip()
        out.append(f"{ref}: {text[:maxlen]} ...")
    return "\n\n".join(out)

context_parts = []
if company_info:
    context_parts.append(format_hits(company_info, "Company Info"))
if regulations:
    context_parts.append(format_hits(regulations, "Regulation"))
if pdfs:
    context_parts.append(format_hits(pdfs, "PDF"))
context = "\n\n".join(context_parts)
if not context.strip():
    context = "No relevant sources found."


In [40]:
SYSTEM_PROMPT = """
You are a Governance, Risk, and Compliance (GRC) assistant for TrustLayer. Only answer using the provided sources below. Cite the file and page (e.g., [PDF] GDPR.pdf, page 23) after each fact or paragraph. If you cannot find the answer in the sources, say 'I do not know.'
"""
#user_question = "Where is Trustlayers company headquarters?"
#user_question = "Should GDPR apply to TrustLayer?"
#user_question = "What policies does TrustLayer have?"
#user_question = "Does NYSHIELD apply to Trustlayer and if so, what does TrustLayer need to do to abide by the regulation?"
#user_question = "What are TrustLayer’s core service commitments for its customers"
#user_question = "List all TrustLayer internal policies related to data privacy."
#user_question = "What procedures are in place for TrustLayer’s incident response?"
#user_question = "What evidence is missing for a successful SOC 2 audit at TrustLayer?"
user_question = "What regulations would you advise TrustLayer to prioritize with given their company information?"

# --- Configuration ---
MEMORY_URL = "http://54.165.180.47:8001/mcp"
GRC_URL = "http://54.165.180.47:8002/mcp"
session_id = "user123"


# --- Fetch context from MCP servers ---
company_info = mcp_call(GRC_URL, "search_company_info", {"query": user_question, "k": 10})['result']
regulations = mcp_call(GRC_URL, "search_regulations", {"query": user_question, "k": 10})['result']
pdfs = mcp_call(GRC_URL, "search_pdfs", {"query": user_question, "k": 10})['result']

# Format context as above...
context_parts = []
if company_info:
    context_parts.append(format_hits(company_info, "Company Info"))
if regulations:
    context_parts.append(format_hits(regulations, "Regulation"))
if pdfs:
    context_parts.append(format_hits(pdfs, "PDF"))
context = "\n\n".join(context_parts)
if not context.strip():
    context = "No relevant sources found."

print("\n---- Context Provided to OpenAI LLM ----\n")
print(context)

def call_openai_with_context(query: str, context: str, session_id: str = "user123"):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT.strip()},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"}
    ]
    resp = openai_client.chat.completions.create(
        model="gpt-4o",  # or "gpt-4" or "gpt-3.5-turbo"
        messages=messages,
        temperature=0.2,
        max_tokens=384,
        user=session_id,
    )
    return resp.choices[0].message.content

#insert into memory
result = mcp_call(MEMORY_URL, "insert_memory", {"session_id": session_id, "text": f"Q: {user_question} A: {answer}"})
print("Memory insert result:", result)



# Call OpenAI and get the answer
answer = call_openai_with_context(user_question, context)
print("\nLLM Final Answer with References:\n", answer)



---- Context Provided to OpenAI LLM ----

[Company Info] Source: company_info
8 Brief Business Description 3/26/25 TrustLayer AI provides automated risk and compliance monitoring software for mid-sized businesses and enterprises. none Intake ...

[Company Info] Source: company_info
1 Company Website 3/26/25 https://www.trustlayer.ai none Intake ...

[Company Info] Source: company_info
0 Company Legal Name 3/26/25 TrustLayer AI, Inc. none Intake ...

[Company Info] Source: company_info
23 Upload risk assessment or summary 3/26/25 Uploaded risk-management-policy.pdf Trust Layer NIST-CSF Security Posture Assessment 2024.pdf Intake ...

[Company Info] Source: company_info
26 Trust Services Criteria In-Scope 3/26/25 Security, Availability, Confidentiality, Processing Integrity none Intake ...

[Company Info] Source: company_info
20 Do you enforce MFA for employee access? 3/26/25 Yes none Intake ...

[Company Info] Source: company_info
19 Upload security policy documents 3/26/25 Uploaded ac

In [27]:
# Load your company info DataFrame
import pandas as pd
df = pd.read_csv("acme enterprises/trustlayer_intake_example_cleaned_final_v3.csv")

# Check which columns/rows actually mention headquarters, address, or NYC
print(df.head(20))
for col in df.columns:
    if df[col].astype(str).str.contains("New York", case=False).any():
        print(f"Found 'New York' in column: {col}")
        print(df[df[col].astype(str).str.contains("New York", case=False)][col])


    Unnamed: 0                                            Subject     Date  \
0            0                                 Company Legal Name  3/26/25   
1            1                                    Company Website  3/26/25   
2            2                       Company Headquarters Address  3/26/25   
3            3                               Primary Contact Name  3/26/25   
4            4                              Primary Contact Email  3/26/25   
5            5                                           Industry  3/26/25   
6            6                               Size of Organization  3/26/25   
7            7                Compliance Framework(s) of Interest  3/26/25   
8            8                         Brief Business Description  3/26/25   
9            9     Is your offering SaaS, PaaS, IaaS, or On-Prem?  3/26/25   
10          10  Describe services/products in-scope for SOC 2/...  3/26/25   
11          11                             Hosting Providers Use

In [25]:
# --- User's question ---
user_question = "Where is TrustLayers headquarters ?"  # Change as needed!

# --- Fetch context from MCP servers ---
company_info = mcp_call(GRC_URL, "search_company_info", {"query": user_question, "k": 2})['result']
regulations = mcp_call(GRC_URL, "search_regulations", {"query": user_question, "k": 2})['result']
pdfs = mcp_call(GRC_URL, "search_pdfs", {"query": user_question, "k": 2})['result']

# --- Helper: Format hits for context with references ---
def format_hits(hits, label, maxlen=350):
    out = []
    for hit in hits:
        meta = hit.get('meta', {})
        ref = f"Source: {meta.get('file', meta.get('source', 'N/A'))}"
        if 'page' in meta:
            ref += f", Page: {meta['page']}"
        out.append(f"[{label}] {ref}\n{hit['text'][:maxlen]} ...")
    return "\n\n".join(out)

context_parts = []
if company_info:
    context_parts.append(format_hits(company_info, "Company Info"))
if regulations:
    context_parts.append(format_hits(regulations, "Regulation"))
if pdfs:
    context_parts.append(format_hits(pdfs, "PDF"))
context = "\n\n".join(context_parts)

# --- Optionally print context for debugging ---
print("\n---- Context Provided to LLM ----\n")
print(context)



---- Context Provided to LLM ----

[Company Info] Source: company_info
1 Company Website 3/26/25 https://www.trustlayer.ai none Intake ...

[Company Info] Source: company_info
0 Company Legal Name 3/26/25 TrustLayer AI, Inc. none Intake ...

[Regulation] Source: regulation
NIST_CSWP_29 4/16/25 nist cswp 29 the nist cybersecurity framework (csf) 2.0 february 26, 2024 25 tier cybersecurity risk governance cybersecurity risk management the organization risk strategy is informed by the cybersecurity risks associated with its suppliers and the products and services it acquires and uses. personnel formally act upon those r ...

[Regulation] Source: regulation
SOC_2_Report_Example 4/16/25 policies [company name] has implemented the following policies, which serve as the basis for company procedures. these are made accessible to all relevant employees and contractors, and are reviewed annually: [bullet points listing policies with a brief description] applicable trust services criteria and th

In [17]:
# --- WizardLM-style prompt for citations ---
full_prompt = f"""<|im_start|>system
You are a GRC compliance assistant. ONLY answer using the provided sources below. Cite the file and page (e.g., [PDF] GDPR.pdf, page 23) after each fact or paragraph. If the answer is not found, say "I do not know."
<|im_end|>
<|im_start|>user
Question: {user_question}

Context:
{context}
<|im_end|>
<|im_start|>assistant
"""

if llm_loaded:
    response = llm(full_prompt, max_tokens=384, stop=["<|im_end|>"])
    llm_answer = response['choices'][0]['text']
    print("\nLLM Final Answer with References:\n", llm_answer)
    # Optionally log Q&A to memory for continuity
    mcp_call(MEMORY_URL, "insert_memory", {"session_id": session_id, "text": f"Q: {user_question} A: {llm_answer}"})
else:
    print("Model not loaded! Check the path or your memory/CPU resources.")
print("----PROMPT TO LLM----")
print(full_prompt)
print("---------------------")

if llm_loaded:
    response = llm(full_prompt, max_tokens=384, stop=["<|im_end|>"])
    llm_answer = response['choices'][0]['text']
    print("\nLLM Final Answer with References:\n", llm_answer)
    mcp_call(MEMORY_URL, "insert_memory", {"session_id": session_id, "text": f"Q: {user_question} A: {llm_answer}"})
else:
    print("Model not loaded! Check the path or your memory/CPU resources.")



llama_print_timings:        load time =   34134.94 ms
llama_print_timings:      sample time =       9.99 ms /   225 runs   (    0.04 ms per token, 22520.27 tokens per second)
llama_print_timings: prompt eval time =   45133.59 ms /   677 tokens (   66.67 ms per token,    15.00 tokens per second)
llama_print_timings:        eval time =   41848.87 ms /   224 runs   (  186.83 ms per token,     5.35 tokens per second)
llama_print_timings:       total time =   87172.36 ms /   901 tokens
Llama.generate: 676 prefix-match hit, remaining 1 prompt tokens to eval



LLM Final Answer with References:
 Based on the provided sources, TrustLayers headquarters is not explicitly mentioned. However, we can infer that TrustLayers is a company that has implemented policies and procedures in accordance with industry standards, such as the NIST Cybersecurity Framework and the requirements of the Statement on Controls (SOC 2).

It is also mentioned that TrustLayers has adopted the applicable Trust Services Criteria and the related controls. Although the information does not explicitly reveal the headquarters location, it indicates that TrustLayers has put in place proper governance and risk management practices to address cybersecurity risks associated with its suppliers and the products and services it acquires and uses.

Considering the above information, we can assume that TrustLayers headquarters might be located in a jurisdiction that recognizes and enforces the relevant regulations and standards, such as the GDPR, CCPA, and the NIST Cybersecurity Frame


llama_print_timings:        load time =   34134.94 ms
llama_print_timings:      sample time =       1.54 ms /    36 runs   (    0.04 ms per token, 23422.25 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     0 tokens (    -nan ms per token,     -nan tokens per second)
llama_print_timings:        eval time =    6654.87 ms /    36 runs   (  184.86 ms per token,     5.41 tokens per second)
llama_print_timings:       total time =    6677.73 ms /    36 tokens



LLM Final Answer with References:
 Headquarters: TrustLayers headquarters is located in San Francisco, California, USA. The company is based in the city known for its technology and innovation hubs.


In [18]:
print("\nReferences used:")
for hit in regulations + pdfs:
    meta = hit.get('meta', {})
    ref = f"{meta.get('file', meta.get('source', 'N/A'))}"
    if 'page' in meta:
        ref += f", Page: {meta['page']}"
    print("-", ref)



References used:
- regulation
- regulation
- SOC_2_Report_Example.pdf, Page: 10
- SOC_2_Report_Example.pdf, Page: 2


In [19]:
mem = mcp_call(MEMORY_URL, "fetch_memory", {"session_id": session_id})
for entry in mem['result']:
    print(entry)


{'id': '6bbfe144-3222-425e-8158-8eddb253dca1', 'text': 'Q: Does Trust Layer need to abide by GDPR? A: Based on the provided sources, it seems that Trust Layer may need to abide by GDPR, depending on its specific services and operations. GDPR 5/30/24 (b) mentions that appropriate safeguards should be established by controllers or processors that are not subject to GDPR. If Trust Layer provides services that involve processing personal data, it may need to obtain certification, seals, or marks that demonstrate compliance with GDPR.\n\nPlease note that the information provided above is based on the sources provided earlier. To confirm whether Trust Layer needs to abide by GDPR, it is best to consult with the relevant authorities or seek legal advice.', 'timestamp': '2025-05-16T12:03:21.584292'}
{'id': 'a65cae36-27e8-4060-be89-a99cffd5fad7', 'text': 'Q: Does Trust Layer need to abide by GDPR? A: \nBased on the provided sources, Trust Layer may need to abide by GDPR if it is considered a co

In [4]:
!pip install PyPDF2

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.12 -m pip install --upgrade pip[0m
