<a href="https://colab.research.google.com/github/Jawhy/GDPR-COMPLY/blob/main/Compliance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =======================================================================
# 🚀 AI Compliance Agent: Full Production-Ready Colab Script
# =======================================================================
# INSTRUCTIONS:
# 1. Run this entire cell.
# 2. It will install all dependencies (this may take 2-3 minutes).
# 3. It will then prompt you to upload your 10 files.
# 4. After processing, it will load the AI model (this may take 2-3 minutes).
# 5. Finally, it will output a public Gradio URL. Click it to use your app.
# =======================================================================

# -----------------------------------------------------------------------
# 📦 STEP 1: INSTALL ALL DEPENDENCIES
# -----------------------------------------------------------------------
# We install all required packages at the start.
# The lines starting with "!" are IPython/Colab shell magics, so this cell only
# runs inside a notebook kernel (it is not valid plain Python).
print("📦 Installing dependencies...")
# -U upgrades each package to the newest release available at run time; no
# versions are pinned, so the installed set can differ from run to run.
# RAG stack: LangChain packages plus the Chroma vector store.
!pip install -U langchain langchain-core langchain-community langchain-text-splitters chromadb
# Local model stack: embedding model runtime and Hugging Face model loading.
!pip install -U sentence-transformers transformers accelerate bitsandbytes huggingface_hub
# Web UI and tabular audit-log display.
!pip install -U gradio pandas

print("✅ All dependencies installed.")

# -----------------------------------------------------------------------
# 📚 STEP 2: IMPORT LIBRARIES (WITH ALL FIXES)
# -----------------------------------------------------------------------
import os
import re
import json
import pandas as pd
import torch
import gradio as gr
from datetime import datetime
from getpass import getpass
from google.colab import files

# LangChain Imports (Correct 2025 Structure)
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

# Transformers Imports (for loading the local LLM)
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

print("✅ All libraries imported.")

# -----------------------------------------------------------------------
# 📄 STEP 3: UPLOAD, LOAD, CHUNK, AND EMBED YOUR 10 FILES
# -----------------------------------------------------------------------

print("Please upload your 10 policy and framework files (.txt):")
# files.upload() opens the Colab upload dialog; the loop below treats its
# result as a mapping of filename -> raw bytes.
uploaded = files.upload()

if not uploaded:
    # Fail fast with a clear message instead of letting an empty document
    # list reach the splitter / vector store.
    raise ValueError("No files were uploaded; at least one .txt file is required.")

# Decode every upload and remember which file each text came from, so that
# retrieved chunks can be traced back to their source document.
all_docs = []
doc_metadatas = []
for filename, content in uploaded.items():
    print(f"  -> Loaded file: {filename}")
    # errors='replace' keeps a single badly-encoded file from aborting the
    # whole cell; undecodable bytes become U+FFFD.
    all_docs.append(content.decode('utf-8', errors='replace'))
    doc_metadatas.append({"source": filename})

print(f"✅ Successfully loaded {len(all_docs)} files.")

# 1. Split documents into overlapping chunks (1000 chars, 200 overlap) and
#    attach the originating filename to each chunk's metadata.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_chunks = text_splitter.create_documents(all_docs, metadatas=doc_metadatas)

# 2. Load the embedding model (free, runs in Colab).
#    Use the GPU when one is attached, otherwise fall back to CPU so the cell
#    still runs on a CPU-only runtime.
print("Embedding documents... (This may take a moment)")
embedding_device = 'cuda' if torch.cuda.is_available() else 'cpu'
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': embedding_device}
)

# 3. Create the Chroma Vector Store (stored under ./chroma_db)
vector_db = Chroma.from_documents(
    documents=all_chunks,
    embedding=embeddings,
    persist_directory="chroma_db"
)

# 4. Define the Retriever
#    NOTE: in the visible part of this notebook the retriever is not referenced
#    again; the confidence helper queries vector_db directly.
retriever = vector_db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 5, "score_threshold": 0.5}
)

print(f"✅ Vector DB created with {len(all_chunks)} chunks from your files.")


# -----------------------------------------------------------------------
# 🤖 STEP 4: LOAD FREE, NON-GATED LLM (FIXES MEMORY/ACCESS ISSUES)
# -----------------------------------------------------------------------
print("Loading AI model (microsoft/phi-2)... (This may take 2-3 minutes)")
# We use microsoft/phi-2, a small but powerful model that fits in
# the free Colab GPU.

model_id = "microsoft/phi-2"

# Half precision keeps the model small on a GPU; on a CPU-only runtime fall
# back to float32 rather than forcing float16.
model_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# NOTE(review): trust_remote_code=True allows code shipped in the model repo to
# run locally — confirm it is still required for this model before keeping it.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=model_dtype,    # float16 on GPU for memory efficiency
    device_map="auto",          # Automatically uses GPU if available
    trust_remote_code=True
)

# Create a Hugging Face Pipeline.
# The model object above is already loaded with its dtype and device
# placement, so those options are not repeated here (repeating torch_dtype is
# what produced the second deprecation warning in the cell output).
hf_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    return_full_text=False  # return only the completion, not the echoed prompt
)

# Wrap it for LangChain
llm = HuggingFacePipeline(pipeline=hf_pipeline)

print(f"✅ Successfully loaded model: {model_id}")


# -----------------------------------------------------------------------
# 🏛️ STEP 5: DEFINE ADVANCED GOVERNANCE & AUDIT FEATURES
# -----------------------------------------------------------------------
# This section implements the advanced features from your design.

# --- 1. Global Audit Log ---
# In-memory list of audit records; log_decision appends one dict per query.
# Nothing in this cell writes it to disk, so it lives only as long as the kernel.
audit_log: list = []
# Recorded on every audit entry so a response can be tied to the model used.
MODEL_VERSION = model_id
# Short name of the embedding model; this string is maintained separately from
# the model_name passed to HuggingFaceEmbeddings, so keep the two in sync.
EMBEDDING_VERSION = "all-MiniLM-L6-v2"

# --- 2. Audit Logging Function ---
def log_decision(query, context, response, confidence, industry, hitl_flag):
    """Append one audit record for a handled query to the global ``audit_log``.

    The record stores the current local timestamp, the model and embedding
    identifiers, the persona, the query, the retrieved context, the response,
    the confidence score and the human-review flag. A confirmation line with
    the timestamp is printed after the record is stored.
    """
    stamp = datetime.now().isoformat()
    record = dict(
        timestamp=stamp,
        model_version=MODEL_VERSION,
        embedding_version=EMBEDDING_VERSION,
        industry_persona=industry,
        query=query,
        retrieved_context=context,
        response=response,
        confidence_score=confidence,
        human_review_required=hitl_flag,
    )
    audit_log.append(record)
    print(f"Log Entry Added: {stamp}")

# --- 3. Confidence Scoring ---
def get_confidence_and_context(query):
    """Retrieve the top matching chunks for *query* and derive a confidence score.

    Queries ``vector_db`` for up to three chunks with relevance scores.

    Returns:
        A ``(context, confidence)`` tuple. ``context`` is the chunk texts joined
        with a ``---`` separator line; ``confidence`` is the best relevance
        score as a plain float clamped to the range [0.0, 1.0]. When nothing is
        retrieved the result is ``("", 0.0)``.
    """
    docs_with_scores = vector_db.similarity_search_with_relevance_scores(query, k=3)

    if not docs_with_scores:
        return "", 0.0  # No documents found

    context = "\n---\n".join(doc.page_content for doc, _ in docs_with_scores)

    # Take the maximum explicitly instead of trusting the result ordering.
    best_score = max(score for _, score in docs_with_scores)

    # Clamp so downstream thresholding and the percentage shown to the user
    # never see a value outside 0-100%, and convert to a built-in float so the
    # value is a plain number in the audit log.
    confidence = min(1.0, max(0.0, float(best_score)))

    return context, confidence

# --- 4. Human-in-the-Loop (HITL) Flag ---
def require_human_review(confidence_score, industry):
    """Decide whether a response must be escalated to a human reviewer.

    Returns a ``(flag, reason)`` tuple. Low confidence (below 0.6) is checked
    first; otherwise the healthcare and government personas always require
    review. Anything else yields ``(False, "N/A")``.
    """
    high_risk_personas = ("Healthcare (HIPAA)", "Government (FISMA)")

    if confidence_score < 0.6:
        reason = "Reason: Low Confidence"
    elif industry in high_risk_personas:
        reason = "Reason: High-Risk Industry"
    else:
        return False, "N/A"

    return True, reason

# --- 5. Industry Specialization ---
# Persona -> trigger keywords, checked in this order (first match wins).
_INDUSTRY_KEYWORDS = (
    ("Healthcare (HIPAA)", ("phi", "hipaa", "patient", "ephi")),
    ("Financial (PCI/SOX)", ("pci", "cardholder", "chd", "sox")),
    ("Government (FISMA)", ("nist", "fisma", "fedramp", "classified")),
)

# One compiled pattern per persona. A keyword only counts when it is not
# embedded in a longer alphanumeric token, so "phishing" no longer triggers
# "phi" and "administrator" no longer triggers "nist". An optional trailing
# "s" keeps simple plurals ("patients", "cardholders") matching, and "_" / "-"
# are treated as separators so "pci_dss" and "PCI-DSS" still match.
_INDUSTRY_PATTERNS = tuple(
    (
        persona,
        re.compile(
            r"(?<![a-z0-9])(?:"
            + "|".join(re.escape(keyword) for keyword in keywords)
            + r")s?(?![a-z0-9])"
        ),
    )
    for persona, keywords in _INDUSTRY_KEYWORDS
)


def detect_industry(query):
    """Return the industry persona implied by keywords in *query*.

    The query is lower-cased and tested against each persona's keywords as
    whole tokens, in the order healthcare, financial, government.

    Returns:
        One of "Healthcare (HIPAA)", "Financial (PCI/SOX)",
        "Government (FISMA)", or "General" when no keyword is present.
    """
    query_lower = query.lower()
    for persona, pattern in _INDUSTRY_PATTERNS:
        if pattern.search(query_lower):
            return persona
    return "General"

# --- 6. Security (Placeholder) ---
def anonymize_query(query):
    """Mask user identifiers in *query* (placeholder anonymizer).

    Every token of the form ``user_`` followed by one or more letters, digits
    or underscores is replaced with ``[USER_ID]``. A production system would
    need broader PII detection (more patterns or an NER model).
    """
    user_id_pattern = re.compile(r"user_[a-zA-Z0-9_]+")
    return user_id_pattern.sub("[USER_ID]", query)

print("✅ Advanced governance functions defined.")

# -----------------------------------------------------------------------
# 🔗 STEP 6: BUILD THE MODERN LANGCHAIN RAG CHAIN (LCEL)
# -----------------------------------------------------------------------
# NOTE: Despite the heading, the chain below is a plain Python function that
# calls each step in order; it does not compose LCEL runnables.

# --- Base Prompt ---
# Prompt text sent to the LLM. It has three placeholders that
# governed_agent_chain fills in: {persona}, {context} and {question}.
# Everything inside the triple quotes is sent to the model verbatim, including
# the leading and trailing newlines.
base_template = """
You are an AI Compliance Agent. Your persona is: {persona}
Use the following retrieved context to answer the user's question.
If you don't know the answer, just say so. Do not make up information.
Provide a concise answer and cite the source control (e.g., "NIST AC-2", "HIPAA §164.312").

CONTEXT:
{context}

QUESTION:
{question}

CONCISE ANSWER:
"""
# Build the reusable template object from the text above.
prompt = PromptTemplate.from_template(base_template)

# --- Define the LLM processing step ---
def run_llm(inputs):
    """Send *inputs* (the formatted prompt) to the global ``llm`` and return its output."""
    completion = llm.invoke(inputs)
    return completion

# --- Define the full RAG and Governance Chain ---
def governed_agent_chain(query, history=None):
    """Run the full governed RAG pipeline for one user query.

    Steps: anonymize the query, pick an industry persona, retrieve context
    with a confidence score, decide whether human review is needed, call the
    LLM, record an audit entry, and format the reply.

    Args:
        query: The user's question.
        history: Prior chat turns. Accepted because this function is used as
            the ``fn`` of ``gr.ChatInterface``, which passes the chat history
            as a second positional argument; it is not used, so each question
            is answered independently.

    Returns:
        A Markdown string with the answer followed by the governance details
        (confidence, human-review flag and reason, persona).
    """
    # 1. Anonymize (Security)
    safe_query = anonymize_query(query)

    # 2. Detect Industry (Specialization)
    industry_persona = detect_industry(safe_query)

    # 3. Retrieve Context and Confidence (RAG + Governance)
    context, confidence = get_confidence_and_context(safe_query)

    # 4. Check for HITL (Governance)
    is_hitl, hitl_reason = require_human_review(confidence, industry_persona)

    # 5. Format the prompt; tell the model explicitly when nothing was retrieved
    formatted_prompt = prompt.format(
        persona=industry_persona,
        context=context if context else "No relevant context found.",
        question=safe_query
    )

    # 6. Run the LLM
    response = run_llm(formatted_prompt)

    # 7. Log the entire decision (Audit) — only the anonymized query is stored
    log_decision(safe_query, context, response, confidence, industry_persona, is_hitl)

    # 8. Format the final output for the user
    output_message = f"""**Answer:**
{response}

---
**Governance Details:**
* **Confidence:** {confidence:.2%}
* **Human Review Required:** {is_hitl} ({hitl_reason})
* **Persona:** {industry_persona}
    """
    return output_message

print("✅ Governed RAG chain built with LCEL.")


# -----------------------------------------------------------------------
# 🖥️ STEP 7: CREATE GRADIO WEB INTERFACE
# -----------------------------------------------------------------------
# This creates a two-tab web app:
# 1. A chatbot to query the agent.
# 2. An audit dashboard to review all decisions.

def get_audit_log_df():
    """Return the audit log as a DataFrame limited to the dashboard columns.

    When no entries have been logged yet, an empty DataFrame carrying the same
    column names is returned so the dashboard still has its layout.
    """
    # Key fields shown on the dashboard (a subset of each audit record)
    dashboard_cols = [
        "timestamp", "model_version", "industry_persona", "query",
        "response", "confidence_score", "human_review_required"
    ]

    if audit_log:
        return pd.DataFrame(audit_log)[dashboard_cols]

    return pd.DataFrame(columns=dashboard_cols)

# --- Build the UI ---
# Assemble the two-tab Gradio app. Components created inside each `with`
# block are attached to that container, so statement order defines the layout.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 AI Compliance & Governance Agent")
    gr.Markdown("Query your policies and frameworks, with full audit logging.")

    with gr.Tabs():
        # --- Tab 1: Compliance Agent Chat ---
        with gr.TabItem("Query Agent"):
            # governed_agent_chain handles every chat message.
            # NOTE(review): gr.ChatInterface is documented to call
            # fn(message, history) — confirm the handler accepts both arguments.
            chatbot = gr.ChatInterface(
                fn=governed_agent_chain,
                title="AI Compliance Agent",
                description="Ask a question about your policies (e.g., 'What is our policy on MFA?' or 'Does our MFA policy meet NIST standards?').",
                examples=[
                    "What is our policy on MFA for remote access?",
                    "Does our MFA policy meet NIST IA-2?",
                    "What is the rule for reporting a PHI breach according to HIPAA?"
                ]
            )

        # --- Tab 2: Governance Audit Dashboard ---
        with gr.TabItem("Governance & Audit Log"):
            gr.Markdown("Review all queries and agent decisions. Click 'Refresh' to update.")
            refresh_button = gr.Button("Refresh Audit Log")
            # Read-only table. The headers declared here differ from the column
            # names of the DataFrame returned by get_audit_log_df.
            # NOTE(review): confirm which set of names Gradio displays after a refresh.
            audit_dashboard = gr.DataFrame(
                headers=[
                    "Timestamp", "Model", "Persona", "Query",
                    "Response", "Confidence", "Review?"
                ],
                datatype=["str", "str", "str", "str", "str", "number", "bool"],
                interactive=False
            )
            # The table is only updated when the button is clicked; it does not
            # refresh automatically after each chat message.
            refresh_button.click(
                fn=get_audit_log_df,
                inputs=[],
                outputs=audit_dashboard
            )

print("✅ Gradio UI defined.")

# -----------------------------------------------------------------------
# 🚀 STEP 8: LAUNCH THE APPLICATION
# -----------------------------------------------------------------------
# share=True provides a public link to your app.
print("Launching Gradio App... Click the public link to interact.")
# share=True requests a public URL, so anyone holding the link can query the
# uploaded documents. debug=True keeps the cell running so errors and logs
# stay visible in the notebook output.
demo.launch(share=True, debug=True)

📦 Installing dependencies...
Collecting langchain
  Downloading langchain-1.0.2-py3-none-any.whl.metadata (4.7 kB)
Collecting langchain-core
  Downloading langchain_core-1.0.1-py3-none-any.whl.metadata (3.5 kB)
Collecting langchain-community
  Downloading langchain_community-0.4-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-text-splitters
  Downloading langchain_text_splitters-1.0.0-py3-none-any.whl.metadata (2.6 kB)
Collecting chromadb
  Downloading chromadb-1.2.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting langgraph<1.1.0,>=1.0.0 (from langchain)
  Downloading langgraph-1.0.1-py3-none-any.whl.metadata (7.4 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-

Saving example_policy.txt.txt to example_policy.txt.txt
Saving nist_controls.txt.txt to nist_controls.txt.txt
Saving iso_27001.txt.txt to iso_27001.txt.txt
Saving hipaa_healthcare.txt.txt to hipaa_healthcare.txt.txt
Saving gdpr_privacy.txt.txt to gdpr_privacy.txt.txt
Saving pci_dss.txt.txt to pci_dss.txt.txt
Saving sox_governance.txt.txt to sox_governance.txt.txt
Saving fedramp_cloud.txt.txt to fedramp_cloud.txt.txt
Saving fisma_federal.txt.txt to fisma_federal.txt.txt
Saving soc2_service.txt.txt to soc2_service.txt.txt
  -> Loaded file: example_policy.txt.txt
  -> Loaded file: nist_controls.txt.txt
  -> Loaded file: iso_27001.txt.txt
  -> Loaded file: hipaa_healthcare.txt.txt
  -> Loaded file: gdpr_privacy.txt.txt
  -> Loaded file: pci_dss.txt.txt
  -> Loaded file: sox_governance.txt.txt
  -> Loaded file: fedramp_cloud.txt.txt
  -> Loaded file: fisma_federal.txt.txt
  -> Loaded file: soc2_service.txt.txt
✅ Successfully loaded 10 files.
Embedding documents... (This may take a moment)


  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Vector DB created with 10 chunks from your files.
Loading AI model (microsoft/phi-2)... (This may take 2-3 minutes)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=hf_pipeline)


✅ Successfully loaded model: microsoft/phi-2
✅ Advanced governance functions defined.
✅ Governed RAG chain built with LCEL.


  self.chatbot = Chatbot(


✅ Gradio UI defined.
Launching Gradio App... Click the public link to interact.
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://4a6d46901aec79ffe5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
