In [2]:
!pip install pymilvus sentence-transformers pandas openpyxl --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/273.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m266.2/273.8 kB[0m [31m9.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m273.8/273.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from pymilvus import connections

import os
from getpass import getpass

# Endpoint of the Zilliz Cloud cluster. The original endpoint stays the default;
# set the MILVUS_URI environment variable to target another deployment.
MILVUS_URI = os.environ.get(
    "MILVUS_URI",
    "https://in03-001d3ad9002d3cc.serverless.aws-eu-central-1.cloud.zilliz.com",
)

# SECURITY: the API token must not live in the notebook source. It is taken
# from the MILVUS_TOKEN environment variable, with an interactive prompt as a
# fallback. The token that was previously hardcoded here has been exposed to
# anyone who can read the notebook and should be rotated.
MILVUS_TOKEN = os.environ.get("MILVUS_TOKEN") or getpass("Milvus API token: ")

connections.connect(
    alias="default",
    uri=MILVUS_URI,
    token=MILVUS_TOKEN,
)

print("Connected:", connections.has_connection("default"))


Connected: True


In [4]:
import pandas as pd

import os

# Location of the papers spreadsheet. The original Colab upload path remains
# the default; set RAI_PAPERS_XLSX to load the file from somewhere else.
RAI_PAPERS_XLSX = os.environ.get(
    "RAI_PAPERS_XLSX", "/content/RAI_Papers_Expanded_HQ.xlsx"
)

# Fail early with an actionable message: the ingestion cells below need `df`.
if not os.path.isfile(RAI_PAPERS_XLSX):
    raise FileNotFoundError(
        f"Papers spreadsheet not found at {RAI_PAPERS_XLSX!r}. Upload the file "
        "or set the RAI_PAPERS_XLSX environment variable to its location."
    )

df = pd.read_excel(RAI_PAPERS_XLSX)
df.head()


FileNotFoundError: [Errno 2] No such file or directory: '/content/RAI_Papers_Expanded_HQ.xlsx'

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence encoder used by the ingestion and search cells in this notebook.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# The vector width is read from the model instead of being hardcoded; the
# collection schemas declare their `embedding` field with this dimension.
EMBED_DIM = embedding_model.get_sentence_embedding_dimension()

print("Embedding dimension:", EMBED_DIM)


In [None]:
from pymilvus import (
    FieldSchema, CollectionSchema, DataType, Collection, utility
)

COLLECTION_NAME = "rai_papers_hq"


def create_collection():
    """Return the papers collection, creating and indexing it when absent.

    If a collection named ``COLLECTION_NAME`` already exists it is returned
    as-is (no schema or index changes). Otherwise a new collection is created
    with an INT64 primary key, scalar metadata fields and an ``EMBED_DIM``-wide
    float vector, and an HNSW index using the COSINE metric is built on the
    ``embedding`` field.

    Returns:
        Collection: Handle to the existing or newly created collection.
    """
    already_present = utility.has_collection(COLLECTION_NAME)
    if already_present:
        print(f"Collection `{COLLECTION_NAME}` already exists.")
        return Collection(COLLECTION_NAME)

    def varchar(name, max_length):
        # Shorthand for the string-typed metadata columns.
        return FieldSchema(name=name, dtype=DataType.VARCHAR, max_length=max_length)

    schema = CollectionSchema(
        fields=[
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=False),
            varchar("title", 512),
            varchar("domain", 128),
            varchar("risk_area", 128),
            varchar("lifecycle_phase", 128),
            FieldSchema(name="year", dtype=DataType.INT64),
            varchar("link", 512),
            varchar("abstract", 4000),
            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=EMBED_DIM),
        ],
        description="Responsible AI / AI Safety Papers",
    )

    papers = Collection(
        name=COLLECTION_NAME,
        schema=schema,
        using="default",
        shards_num=2,
    )

    # Vector index on the embedding field.
    papers.create_index(
        "embedding",
        {
            "index_type": "HNSW",
            "metric_type": "COSINE",
            "params": {"M": 16, "efConstruction": 200},
        },
    )

    print(f"Collection `{COLLECTION_NAME}` created and indexed.")
    return papers


collection = create_collection()


In [None]:
import numpy as np

# The collection schema declares `id` as an INT64 primary key, so the column is
# cast to integers before ingestion. Note this overwrites `df["id"]` in place.
df["id"] = df["id"].astype(int)

def get_text(row):
    """Return the text to embed for one paper row.

    Uses the stripped abstract when there is one and falls back to the
    stripped title otherwise. Missing cells count as empty: pandas represents
    a blank spreadsheet cell as NaN, which is truthy, so the previous
    ``value or ""`` test turned it into the literal string "nan" and the title
    fallback never ran.

    Args:
        row: Mapping-like row (``pandas.Series`` or ``dict``) exposing ``.get``.

    Returns:
        str: The abstract, else the title, else ``""``.
    """

    def _as_text(value):
        # NaN/NA/None mean "no value"; other falsy values keep mapping to ""
        # exactly as the original `value or ""` did.
        if pd.isna(value) or not value:
            return ""
        return str(value).strip()

    return _as_text(row.get("abstract")) or _as_text(row.get("title"))

def embed_and_insert(df, collection, batch_size=32):
    """Embed every paper in ``df`` and insert the rows into ``collection``.

    Rows are processed in slices of ``batch_size``. Each slice is encoded with
    a single ``embedding_model.encode`` call (the previous version invoked the
    model once per row) and written with one column-ordered ``insert``. The
    collection is flushed once after the last slice.

    Args:
        df: DataFrame providing the columns ``id``, ``title``, ``domain``,
            ``risk_area``, ``lifecycle_phase``, ``year`` and ``link``, plus
            whatever ``get_text`` reads from each row.
        collection: Target collection; the column order used below is
            id, title, domain, risk_area, lifecycle_phase, year, link,
            abstract, embedding.
        batch_size: Rows per encode/insert round trip. Values below 1 are
            treated as 1 (the old loop inserted row by row in that case).

    Note:
        Rows are written with ``insert``; re-running this on the same frame
        presumably stores them a second time — TODO confirm, and switch to an
        upsert if re-runs are expected.
    """
    step = max(1, int(batch_size))

    for start in range(0, len(df), step):
        chunk = df.iloc[start:start + step]

        # The text that gets embedded is also what is stored in the `abstract`
        # field, i.e. whatever get_text() chose for the row.
        texts = [get_text(row) for _, row in chunk.iterrows()]
        vectors = embedding_model.encode(texts, batch_size=step)

        collection.insert([
            [int(value) for value in chunk["id"]],
            [str(value) for value in chunk["title"]],
            [str(value) for value in chunk["domain"]],
            [str(value) for value in chunk["risk_area"]],
            [str(value) for value in chunk["lifecycle_phase"]],
            [int(value) for value in chunk["year"]],
            [str(value) for value in chunk["link"]],
            texts,
            list(vectors),  # one 1-D vector per row, same shape as before
        ])

    collection.flush()
    print("Done inserting rows.")


embed_and_insert(df, collection)


In [None]:
def build_filter_expr(domain=None, risk_area=None, lifecycle_phase=None, min_year=None, max_year=None):
    """Build a boolean filter expression from the optional search criteria.

    String criteria become ``field == "value"`` clauses and year bounds become
    ``year >= N`` / ``year <= N`` clauses; all clauses are joined with ``and``.

    Compared with the previous version, string values are escaped before being
    placed inside the quoted literal (a value containing ``"`` used to break
    or alter the expression), and year bounds are coerced with ``int`` so only
    integers can reach the expression.

    Args:
        domain: Exact ``domain`` to match; falsy values add no clause.
        risk_area: Exact ``risk_area`` to match; falsy values add no clause.
        lifecycle_phase: Exact ``lifecycle_phase`` to match; falsy values add
            no clause.
        min_year: Inclusive lower bound on ``year``; ``None`` adds no clause.
        max_year: Inclusive upper bound on ``year``; ``None`` adds no clause.

    Returns:
        str | None: The combined expression, or ``None`` when no criterion
        was supplied.

    Raises:
        ValueError: If a year bound cannot be converted to an integer.
    """

    def _literal(value):
        # Escape backslashes first, then quotes, so the value cannot close the
        # string literal early.
        escaped = str(value).replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    clauses = []
    for field, value in (
        ("domain", domain),
        ("risk_area", risk_area),
        ("lifecycle_phase", lifecycle_phase),
    ):
        if value:
            clauses.append(f"{field} == {_literal(value)}")
    if min_year is not None:
        clauses.append(f"year >= {int(min_year)}")
    if max_year is not None:
        clauses.append(f"year <= {int(max_year)}")
    return " and ".join(clauses) if clauses else None


def search_rai(query, top_k=5, domain=None, risk_area=None, lifecycle_phase=None, min_year=None, max_year=None):
    """Run a filtered semantic search over the papers collection.

    The query is encoded with ``embedding_model``, the metadata criteria are
    turned into a filter via ``build_filter_expr`` (printed for inspection),
    and the global ``collection`` is loaded and searched on its ``embedding``
    field with the COSINE metric.

    Args:
        query: Free-text query to embed.
        top_k: Maximum number of hits to return.
        domain, risk_area, lifecycle_phase, min_year, max_year: Optional
            metadata criteria forwarded to ``build_filter_expr``.

    Returns:
        The hits for the single submitted query vector.
    """
    query_embedding = embedding_model.encode(query)

    filter_expr = build_filter_expr(domain, risk_area, lifecycle_phase, min_year, max_year)
    print("Filter:", filter_expr)

    # The collection is loaded into memory ahead of every search.
    collection.load()

    ann_params = {"metric_type": "COSINE", "params": {"ef": 128}}
    returned_fields = [
        "title",
        "domain",
        "risk_area",
        "lifecycle_phase",
        "year",
        "link",
        "abstract",
    ]
    hits_per_query = collection.search(
        data=[query_embedding],
        anns_field="embedding",
        param=ann_params,
        limit=top_k,
        expr=filter_expr,
        output_fields=returned_fields,
        consistency_level="Strong",
    )

    # Only one query vector was sent, so only the first result list is relevant.
    return hits_per_query[0]

In [None]:
# Filtered search demo: restrict results to the "governance" domain.
hits = search_rai("requirements for high-risk AI systems", domain="governance")

for h in hits:
    print(f"→ {h.entity.get('title')} | {h.entity.get('year')}")


In [None]:
from pymilvus import utility, Collection

# Diagnostics: confirm the "default" connection and the papers collection exist.
print("Has connection:", connections.has_connection("default"))
print("Has collection rai_papers_hq:", utility.has_collection("rai_papers_hq"))

# Rebinds the global `collection` to a handle looked up by name, loads it, and
# reports how many entities it holds.
collection = Collection("rai_papers_hq")
collection.load()
print("Number of entities:", collection.num_entities)

In [None]:
from sentence_transformers import SentenceTransformer

# (Re-use your existing embedding_model; only run this if it's missing)
# embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Unfiltered query straight against the collection, bypassing search_rai, to
# check that vector search works end to end.
query_vec = embedding_model.encode("test query about responsible AI")

results = collection.search(
    data=[query_vec],
    anns_field="embedding",
    param={"metric_type": "COSINE", "params": {"ef": 128}},
    limit=3,
    output_fields=["title", "year", "link", "domain", "risk_area"],
)

for hit in results[0]:
    print("Score (distance):", hit.distance)
    for label, field in (
        ("Title:", "title"),
        ("Year:", "year"),
        ("Domain:", "domain"),
        ("Risk area:", "risk_area"),
        ("Link:", "link"),
    ):
        print(label, hit.entity.get(field))
    print("----")


In [None]:
def build_filter_expr(domain=None, risk_area=None, lifecycle_phase=None, min_year=None, max_year=None):
    """Combine the optional search criteria into one boolean filter expression.

    Each supplied string criterion yields a ``field == "value"`` clause and
    each year bound a ``year >= N`` / ``year <= N`` clause; clauses are joined
    with ``and``.

    String values are now escaped before being embedded in the quoted literal.
    Previously a value containing ``"`` terminated the literal early, which
    produced an invalid expression or let the value inject extra conditions.

    Args:
        domain: Exact ``domain`` to match; falsy values add no clause.
        risk_area: Exact ``risk_area`` to match; falsy values add no clause.
        lifecycle_phase: Exact ``lifecycle_phase`` to match; falsy values add
            no clause.
        min_year: Inclusive lower bound on ``year`` (coerced with ``int``).
        max_year: Inclusive upper bound on ``year`` (coerced with ``int``).

    Returns:
        str | None: The expression, or ``None`` when nothing was supplied.

    Raises:
        ValueError: If a year bound cannot be converted to an integer.
    """

    def _equals(field, value):
        # Backslashes are escaped before quotes so the backslashes added for
        # the quotes are not themselves doubled.
        safe = str(value).replace("\\", "\\\\").replace('"', '\\"')
        return f'{field} == "{safe}"'

    expr_parts = []
    if domain:
        expr_parts.append(_equals("domain", domain))
    if risk_area:
        expr_parts.append(_equals("risk_area", risk_area))
    if lifecycle_phase:
        expr_parts.append(_equals("lifecycle_phase", lifecycle_phase))
    if min_year is not None:
        expr_parts.append(f'year >= {int(min_year)}')
    if max_year is not None:
        expr_parts.append(f'year <= {int(max_year)}')
    if not expr_parts:
        return None
    return " and ".join(expr_parts)


In [None]:
def search_rai(
    query,
    top_k=5,
    domain=None,
    risk_area=None,
    lifecycle_phase=None,
    min_year=None,
    max_year=None,
):
    """Semantic search over the papers collection with optional filters.

    Encodes ``query`` with ``embedding_model``, builds the metadata filter with
    ``build_filter_expr`` (printed for inspection) and searches the global
    ``collection`` on its ``embedding`` field using the COSINE metric.

    The previous version repeated the whole ``search`` call in an if/else that
    differed only in whether ``expr`` was passed. ``build_filter_expr`` returns
    ``None`` when there is nothing to filter on, and ``expr=None`` is the
    no-filter default of ``search``, so a single call covers both cases.

    Args:
        query: Free-text query to embed.
        top_k: Maximum number of hits to return.
        domain, risk_area, lifecycle_phase, min_year, max_year: Optional
            metadata criteria forwarded to ``build_filter_expr``.

    Returns:
        The hits for the single submitted query vector.
    """
    query_vec = embedding_model.encode(query)
    expr = build_filter_expr(domain, risk_area, lifecycle_phase, min_year, max_year)
    print("Filter:", expr)

    search_params = {"metric_type": "COSINE", "params": {"ef": 128}}

    results = collection.search(
        data=[query_vec],
        anns_field="embedding",
        param=search_params,
        limit=top_k,
        expr=expr,  # None means "no filter"
        output_fields=[
            "title",
            "domain",
            "risk_area",
            "lifecycle_phase",
            "year",
            "link",
            "abstract",
        ],
    )

    # One query vector in, so one result list out.
    return results[0]


In [None]:
# Unfiltered search demo: print the main metadata of the five closest papers.
hits = search_rai(query="requirements for high-risk AI systems", top_k=5)

for h in hits:
    for label, field in (
        ("Title:", "title"),
        ("Year:", "year"),
        ("Domain:", "domain"),
        ("Risk area:", "risk_area"),
        ("Link:", "link"),
    ):
        print(label, h.entity.get(field))
    print("---")


In [None]:
from pymilvus import FieldSchema, CollectionSchema, DataType, Collection, utility

VALUES_COLLECTION_NAME = "values_hq"


def create_values_collection():
    """Return the values collection, creating and indexing it when absent.

    An existing collection named ``VALUES_COLLECTION_NAME`` is returned
    unchanged. Otherwise a collection with an INT64 primary key, a name, a
    description and an ``EMBED_DIM``-wide float vector is created, and an HNSW
    index using the COSINE metric is built on the ``embedding`` field.

    Returns:
        Collection: Handle to the existing or newly created collection.
    """
    if utility.has_collection(VALUES_COLLECTION_NAME):
        print(f"Collection `{VALUES_COLLECTION_NAME}` already exists.")
        return Collection(VALUES_COLLECTION_NAME)

    id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=False)
    name_field = FieldSchema(name="value_name", dtype=DataType.VARCHAR, max_length=256)
    description_field = FieldSchema(name="description", dtype=DataType.VARCHAR, max_length=4000)
    vector_field = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=EMBED_DIM)

    schema = CollectionSchema(
        fields=[id_field, name_field, description_field, vector_field],
        description="Stakeholder and Responsible AI values for alignment.",
    )

    created = Collection(
        name=VALUES_COLLECTION_NAME,
        schema=schema,
        using="default",
        shards_num=2,
    )

    # Vector index on the embedding field.
    created.create_index(
        "embedding",
        {
            "index_type": "HNSW",
            "metric_type": "COSINE",
            "params": {"M": 16, "efConstruction": 200},
        },
    )

    print(f"Collection `{VALUES_COLLECTION_NAME}` created and indexed.")
    return created


values_collection = create_values_collection()
values_collection.load()


In [None]:
# Seed catalogue of 50 Responsible-AI values, grouped by theme in blocks of ten.
# Every record carries:
#   id          - integer key, numbered 1..50
#   value_name  - short name of the value
#   description - one-sentence definition of the value
# ingest_values() embeds "<value_name>: <description>" for each record and
# stores the three fields plus that embedding.
VALUES_DATA = [
    # 1–10: Core fairness / transparency / accountability
    {"id": 1, "value_name": "Fairness", "description": "Avoid unjust bias, discrimination, or unfair treatment of individuals or groups in data, models, and decisions."},
    {"id": 2, "value_name": "Non-discrimination", "description": "Ensure the system does not treat people differently based on protected characteristics such as race, gender, religion, or disability."},
    {"id": 3, "value_name": "Equity", "description": "Design and evaluate systems to reduce unjust gaps in outcomes across different groups, not just treat everyone the same."},
    {"id": 4, "value_name": "Transparency", "description": "Make key aspects of the AI system visible and understandable to appropriate stakeholders, including limitations and risks."},
    {"id": 5, "value_name": "Accountability", "description": "Ensure that humans and organisations remain responsible for AI outcomes, with clear roles, escalation paths, and remediation processes."},
    {"id": 6, "value_name": "Explainability", "description": "Provide meaningful explanations of how inputs relate to outputs so that stakeholders can understand and challenge decisions."},
    {"id": 7, "value_name": "Interpretability", "description": "Use models and techniques that allow practitioners to inspect and reason about internal behaviour or feature influence."},
    {"id": 8, "value_name": "Traceability", "description": "Maintain links between data, models, decisions, and requirements so that behaviour can be audited and reconstructed."},
    {"id": 9, "value_name": "Auditability", "description": "Design systems so that internal processes and outcomes can be independently assessed for compliance and performance."},
    {"id": 10, "value_name": "Reproducibility", "description": "Ensure that model training and evaluation can be repeated with consistent results given the same data and configuration."},

    # 11–20: Safety / robustness / reliability / security
    {"id": 11, "value_name": "Safety", "description": "Avoid causing physical, psychological, economic, or societal harm through AI behaviour, both in normal and stressed conditions."},
    {"id": 12, "value_name": "Robustness", "description": "Ensure the system behaves reliably under distribution shifts, noisy inputs, and adversarial attempts to break it."},
    {"id": 13, "value_name": "Reliability", "description": "Deliver consistent, predictable performance over time and across relevant scenarios."},
    {"id": 14, "value_name": "Security", "description": "Protect the system, data, and models from unauthorised access, tampering, extraction, or adversarial manipulation."},
    {"id": 15, "value_name": "Resilience", "description": "Design systems to recover gracefully from failures, attacks, or unexpected behaviour with minimal harm."},
    {"id": 16, "value_name": "Misuse Prevention", "description": "Reduce the likelihood that the system will be used to facilitate harmful, illegal, or unethical activities."},
    {"id": 17, "value_name": "Long-term Safety", "description": "Consider potential long-horizon impacts of deploying the system, including emergent risks and compounding effects."},
    {"id": 18, "value_name": "Safe Intervention", "description": "Allow human operators to understand when intervention is needed and safely pause, override, or shut down the system."},
    {"id": 19, "value_name": "Corrigibility", "description": "Design the system so it cooperates with human oversight and can be corrected without resisting or disabling that oversight."},
    {"id": 20, "value_name": "Predictability", "description": "Ensure that system behaviour can be anticipated within reasonable bounds by domain experts and operators."},

    # 21–30: Human-centred / ethics / well-being / autonomy
    {"id": 21, "value_name": "Human Agency", "description": "Support human decision-making rather than replacing it inappropriately; preserve meaningful human control over important outcomes."},
    {"id": 22, "value_name": "Human Oversight", "description": "Embed mechanisms for humans to monitor, review, and override AI decisions where necessary."},
    {"id": 23, "value_name": "Respect for Autonomy", "description": "Avoid manipulating, coercing, or unduly nudging users; support informed and voluntary choices."},
    {"id": 24, "value_name": "Respect for Dignity", "description": "Treat individuals as ends in themselves, not merely as data points, and avoid demeaning or dehumanising uses of AI."},
    {"id": 25, "value_name": "Well-being", "description": "Promote the physical, mental, and social well-being of users and affected communities."},
    {"id": 26, "value_name": "Non-harm", "description": "Avoid causing unnecessary harm and minimise unavoidable harms through good design and safeguards."},
    {"id": 27, "value_name": "Beneficence", "description": "Actively seek to create positive benefits for users and society through the AI system."},
    {"id": 28, "value_name": "Justice", "description": "Distribute benefits and burdens of AI fairly across individuals and groups."},
    {"id": 29, "value_name": "Accessibility", "description": "Ensure the system is usable and understandable by people with diverse abilities, backgrounds, and contexts."},
    {"id": 30, "value_name": "Cultural Sensitivity", "description": "Design and deploy AI in a way that respects local norms, languages, and cultural contexts."},

    # 31–40: Data governance / privacy / lifecycle / environment
    {"id": 31, "value_name": "Privacy", "description": "Respect individuals’ rights to control their personal data and limit intrusive surveillance or inference."},
    {"id": 32, "value_name": "Data Governance", "description": "Manage data collection, labelling, storage, access, and deletion according to policies and regulations."},
    {"id": 33, "value_name": "Data Quality", "description": "Use data that is as accurate, relevant, representative, and up to date as reasonably possible for the task."},
    {"id": 34, "value_name": "Data Minimisation", "description": "Collect and use only the data that is necessary to achieve legitimate purposes."},
    {"id": 35, "value_name": "Lifecycle Governance", "description": "Apply governance practices consistently from ideation through deployment and retirement of the system."},
    {"id": 36, "value_name": "Monitoring & Feedback", "description": "Continuously monitor system behaviour in production and integrate feedback to fix issues and improve alignment."},
    {"id": 37, "value_name": "Incident Reporting", "description": "Provide channels and processes for reporting, investigating, and resolving safety or ethics incidents."},
    {"id": 38, "value_name": "Sustainability", "description": "Consider environmental impacts of data, compute, and deployment and aim to minimise unnecessary resource use."},
    {"id": 39, "value_name": "Social Benefit", "description": "Align the system with broader societal goals, avoiding applications that primarily create harm or extraction."},
    {"id": 40, "value_name": "Legal Compliance", "description": "Ensure the system adheres to applicable laws, regulations, and standards in each deployment context."},

    # 41–50: Alignment-specific / documentation / trust
    {"id": 41, "value_name": "Goal Alignment", "description": "Align the system’s optimisation objective with the human-defined objectives and constraints for the task and domain."},
    {"id": 42, "value_name": "Value Alignment", "description": "Align AI behaviour with the stated values and principles of the organisation and its stakeholders."},
    {"id": 43, "value_name": "Reward Design Integrity", "description": "Design reward signals and metrics that reflect true success, and avoid easy-to-game proxies that encourage bad behaviour."},
    {"id": 44, "value_name": "Misuse Resistance", "description": "Reduce the ability of users or attackers to repurpose the system for harmful, unethical, or illegal aims."},
    {"id": 45, "value_name": "Documentation", "description": "Produce clear, honest documentation of data, models, limitations, risks, and appropriate use cases."},
    {"id": 46, "value_name": "User Education", "description": "Help users understand how the system works, when it may fail, and how to use it responsibly."},
    {"id": 47, "value_name": "Trustworthiness", "description": "Earn justified trust from users and stakeholders by consistently demonstrating reliable, ethical, and transparent behaviour."},
    {"id": 48, "value_name": "Context-Appropriate Use", "description": "Deploy the system only in use cases and environments where its performance, risks, and safeguards are adequate."},
    {"id": 49, "value_name": "Continuous Improvement", "description": "Iteratively refine the system, values mapping, and safeguards as new risks, data, and standards emerge."},
    {"id": 50, "value_name": "Alignment with Stakeholder Values", "description": "Ensure that the system’s behaviour reflects the articulated values of affected stakeholders, not only the organisation’s internal priorities."},
]


In [None]:
def ingest_values(values_data, values_collection):
    """Embed the value records and insert them into ``values_collection``.

    The text embedded for each record is ``"<value_name>: <description>"``.
    All texts are encoded with one ``embedding_model.encode`` call (the
    previous version invoked the model once per record), then written with a
    single column-ordered ``insert`` followed by a flush.

    Args:
        values_data: Iterable of dicts with the keys ``id``, ``value_name``
            and ``description``.
        values_collection: Target collection; columns are inserted in the
            order id, value_name, description, embedding.

    Note:
        Rows are written with ``insert``; re-running this cell presumably
        stores the same ids a second time — TODO confirm, and guard or upsert
        if re-runs are expected.
    """
    # Materialise once so iterators/generators can be traversed several times.
    rows = list(values_data)
    if not rows:
        # Nothing to encode or send; skip the empty insert and just report.
        print("Ingested 0 values into `values_hq`.")
        return

    ids = [int(row["id"]) for row in rows]
    names = [row["value_name"] for row in rows]
    descriptions = [row["description"] for row in rows]

    texts_for_embedding = [f"{name}: {text}" for name, text in zip(names, descriptions)]
    embeddings = list(embedding_model.encode(texts_for_embedding))

    values_collection.insert([
        ids,
        names,
        descriptions,
        embeddings,
    ])
    values_collection.flush()
    print(f"Ingested {len(ids)} values into `values_hq`.")


ingest_values(VALUES_DATA, values_collection)


In [None]:
def search_values(query, top_k=5):
    """Return the values most similar to a free-text query.

    Opens the collection named ``VALUES_COLLECTION_NAME``, loads it, encodes
    ``query`` with ``embedding_model`` and searches the ``embedding`` field
    using the COSINE metric.

    Args:
        query: Free-text description to match against the stored values.
        top_k: Maximum number of hits to return.

    Returns:
        The hits for the single submitted query vector; each hit exposes the
        ``value_name`` and ``description`` fields.
    """
    target = Collection(VALUES_COLLECTION_NAME)
    target.load()

    hits_per_query = target.search(
        data=[embedding_model.encode(query)],
        anns_field="embedding",
        param={"metric_type": "COSINE", "params": {"ef": 128}},
        limit=top_k,
        output_fields=["value_name", "description"],
    )

    # One query vector was submitted, so take the first (only) result list.
    return hits_per_query[0]


In [None]:
# Values lookup demo: a fairness-flavoured requirement.
hits = search_values(
    query="avoid bias in credit decisions and treat groups fairly",
    top_k=5,
)
for h in hits:
    print(f"{h.entity.get('value_name')} | {h.entity.get('description')}")
    print("---")


In [None]:
# Values lookup demo: a human-oversight requirement; only the names are shown.
hits = search_values(
    query="keep humans in control and able to override the AI",
    top_k=5,
)
for h in hits:
    matched_entity = h.entity
    print(matched_entity.get("value_name"))
