In [None]:
! pip install -q gradio transformers evaluate rouge_score datasets nltk langchain_chroma langchain_huggingface mauve-text


Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting langchain_chroma
  Downloading langchain_chroma-0.2.5-py3-none-any.whl.metadata (1.1 kB)
Collecting langchain_huggingface
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Collecting chromadb>=1.0.9 (from langchain_chroma)
  Downloading chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting pybase64>=1.4.1 (from chromadb>=1.0.9->langchain_chroma)
  Downloading pybase64-1.4.2-cp311-cp311-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb>=1.0.9->langchain_chroma)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb>=1.0.9->langchain_chroma)
  

In [None]:
import gradio as gr
import os
import zipfile
import requests
from huggingface_hub import hf_hub_download
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
import evaluate
from sklearn.metrics import f1_score
from nltk.tokenize import word_tokenize
import nltk
import mauve
nltk.download('punkt_tab')

In [None]:

# Load embedding model
embed_model = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Download and load the Chroma database
def load_vector_db():
    chroma_db_path = hf_hub_download(
        repo_id="Ola1mohammed/GovernAI_dataset",
        filename="chroma_db.zip",
        repo_type="dataset",
    )
    extract_dir = "./chroma_db"
    os.makedirs(extract_dir, exist_ok=True)
    with zipfile.ZipFile(chroma_db_path, "r") as zip_ref:
        zip_ref.extractall(extract_dir)
    return Chroma(persist_directory=extract_dir, embedding_function=embed_model)

vector_db = load_vector_db()
TOGETHER_API_KEY = "Token_Here"  # Replace with your actual API key

# Load API key from environment variables
TOGETHER_API_URL = "https://api.together.xyz/v1/completions"

In [7]:
if not TOGETHER_API_KEY:
    raise ValueError("Missing Together AI API key. Set it in your environment.")

# Function to retrieve context from the vector database
def query_rag_system(question):
    best_doc_list = vector_db.similarity_search(question, k=1)
    if not best_doc_list:  # Check if the list is empty
        return "No relevant context found.", None, []  # Return a default message and empty lists

    best_doc = best_doc_list[0]
    additional_docs = vector_db.similarity_search(question, k=3)
    additional_docs = [doc for doc in additional_docs if doc != best_doc]

    context = f"Best Chunk:\n{best_doc.page_content}\n\nAdditional Context:\n" + "\n".join(doc.page_content for doc in additional_docs)
    return context, best_doc, additional_docs

# Function to call Together AI for generating a response
def query_together_ai(prompt):
    headers = {
        "Authorization": f"Bearer {TOGETHER_API_KEY}",
        "Content-Type": "application/json",
    }
    data = {
        "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
        "prompt": prompt,
        "max_tokens": 150,
        "temperature": 0.7,
    }
    response = requests.post(TOGETHER_API_URL, headers=headers, json=data)

    if response.status_code == 200:
        return response.json()["choices"][0]["text"]
    return "Error: Unable to generate a response from Together AI."

# Function to process chat interactions
def chat_interface(question, chat_history):
    if not question:
        return chat_history

    # Retrieve RAG context
    context, best_doc, additional_docs = query_rag_system(question)
    prompt = f"""
    You are an AI and Data Governance Compliance Agent. Your task is to analyze and classify the given dataset based on the most relevant principle. Follow these steps:

    1. **Identify the Relevant Principle:** Determine which principle applies best to the given query.
    2. **Provide a Clear and Concise Answer:** Explain the classification or guideline using simple, precise language.
    3. **Justify Your Answer:** Support your response with a logical explanation based on risk, access control, and best practices.

    Here are some classification principles you should consider:

    - **Open by Default**: Determine if the dataset should be publicly accessible unless its nature requires higher security.
    - **Necessity and Proportionality**: Classify the dataset based on its necessity and sensitivity.
    - **Timely Classification**: Explain when the dataset should be classified and why timely classification is crucial.
    - **Highest Level of Protection**: Assess if the dataset requires a top-level security classification.
    - **Segregation of Duties**: Explain the role separation needed for handling the dataset.
    - **Need to Know**: Identify who should have access based on their role and necessity.
    - **Least Privilege**: Ensure access is granted only at the minimal required level.
    - **Data Classification Levels**: Categorize the dataset into Top Secret, Secret, Restricted, or Public.
    - **Restricted Sub-Levels**: Determine the appropriate restriction category (A, B, or C) based on its impact.
    - **Impact Assessment**: Analyze the risks of unauthorized disclosure.
    - **Data Classification Controls**: Recommend security measures such as encryption, retention policies, and access control.
    - **Data Classification Process**: Outline the steps for proper classification.
    - **Roles and Responsibilities**: Describe the responsibilities of individuals involved in data classification.

    **Dataset / Query:**
    {question}

    **Context:**
    {context}

    Use the following context to answer the question concisely and do not repeat any phrases:
    {context}

    Question: {question}

    **Answer:**
    """

    # Get response from Together AI
    response_text = query_together_ai(prompt)
    if best_doc: # Check if best_doc is not None
        sources = set(doc.metadata["category"] for doc in [best_doc] + additional_docs)
        final_response = f"{response_text}\n\n📌 **Sources**: {', '.join(sources)}"
    else:
        final_response = response_text + "\n\n📌 **Sources**: No relevant sources found."


    # Append messages with user on the right and chatbot on the left
    chat_history.append((question, None))  # User message on right
    chat_history.append((None, final_response))  # Chatbot message on left
    return chat_history

# Helper function for evaluation to generate a response using the same pipeline
def generate_response(question):
    context, best_doc, additional_docs = query_rag_system(question)

    prompt = f"""
    You are an AI and Data Governance Compliance Agent. Your task is to analyze and classify the given dataset based on the most relevant principle. Follow these steps:

    1. **Identify the Relevant Principle:** Determine which principle applies best to the given query.
    2. **Provide a Clear and Concise Answer:** Explain the classification or guideline using simple, precise language.
    3. **Justify Your Answer:** Support your response with a logical explanation based on risk, access control, and best practices.

    **Dataset / Query:**
    {question}

    **Context:**
    {context}

    Use the following context to answer the question concisely and do not repeat any phrases:
    {context}

    Question: {question}

    **Answer:**
    """
    return query_together_ai(prompt).strip()

# Function to evaluate the RAG system
def evaluate_rag_system():
    # Test set
    test_questions = [
        "What are the key data classification principles in our organization?",
        "How should we handle integrated datasets with mixed classification levels?",
        "What is the 'Open by Default' principle and how is it applied differently across sectors?",
        "When should data classification be timebound?",
        "How does the 'Segregation of Duties' principle affect data handling responsibilities?",
        "What factors determine the classification level according to the 'Necessity and Proportionality' principle?"
    ]

    test_refs = [
        "The key data classification principles include: Open by Default, Necessity and Proportionality, Timely Classification, Highest Level of Protection, and Segregation of Duties.",
        "According to Principle 4 (Highest Level of Protection), if information includes an integrated dataset with different classification levels, the highest classification level shall be approved.",
        "The Open by Default principle states that data shall primarily be accessible in the development sector unless its sensitivity requires higher protection, and top secret in political and security sectors unless its sensitivity requires lower protection.",
        "According to Principle 3 (Timely Classification), data shall be classified upon creation or receipt from other entities, and said classification should be timebound.",
        "The Segregation of Duties principle requires that worker responsibilities related to data classification, access, disclosure, use, modification, or destruction shall be segregated to prevent overlap of powers and avoid dispersal of responsibilities.",
        "According to the Necessity and Proportionality principle, data shall be classified based on its nature, sensitivity, and impact, balancing its value against its confidentiality level."
    ]


    # Generate predictions
    pred_responses = [generate_response(question) for question in test_questions]

    # BLEU Score
    bleu = evaluate.load("bleu")
    bleu_results = bleu.compute(predictions=pred_responses, references=[[ref] for ref in test_refs])
    # print(f"BLEU Score: {bleu_results['bleu']:.2f}")

    # ROUGE Score
    rouge = evaluate.load("rouge")
    rouge_results = rouge.compute(predictions=pred_responses, references=test_refs)
    for key, value in rouge_results.items():
        print(f"{key}: {value:.2f}")

    # F1 Score
    pred_tokens = [word_tokenize(response) for response in pred_responses]
    ref_tokens = [word_tokenize(ref) for ref in test_refs]
    pred_flat = [token for sublist in pred_tokens for token in sublist]
    ref_flat = [token for sublist in ref_tokens for token in sublist]

    # Align the lengths of the token lists
    min_length = min(len(pred_flat), len(ref_flat))
    pred_flat = pred_flat[:min_length]
    ref_flat = ref_flat[:min_length]

    # Calculate F1 score
    f1 = f1_score(ref_flat, pred_flat, average='weighted')
    print(f"F1 Score: {f1:.2f}")

# Welcome message
welcome_message = """
👋 Welcome to **GovernAI** — your intelligent assistant for Data & AI Governance!

Ask me anything about data classification principles, governance policies, access control, or compliance best practices.
I'm here to help you interpret and apply regulatory guidelines clearly and effectively.

Start by typing your question below ⬇️
"""

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# **GovernAI** Chatbot")
    gr.Markdown(welcome_message)

    chatbot = gr.Chatbot(label="Chat History", bubble_full_width=False)
    question = gr.Textbox(lines=2, placeholder="Type your question...", label="Your Question", value="")
    submit_button = gr.Button("Send")
    chat_history = gr.State([])

    submit_button.click(fn=chat_interface, inputs=[question, chat_history], outputs=chatbot)

# Run the evaluation
print("Running evaluation...")
evaluate_rag_system()

# Launch the Gradio app
demo.launch(share=True)

  chatbot = gr.Chatbot(label="Chat History", bubble_full_width=False)
  chatbot = gr.Chatbot(label="Chat History", bubble_full_width=False)


Running evaluation...


  return forward_call(*args, **kwargs)


rouge1: 0.12
rouge2: 0.02
rougeL: 0.10
rougeLsum: 0.11
F1 Score: 0.02
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://cbbbb6606af05bc0b1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


