Researchers can access and explore the IRT Privacy‑Policy Dataset according to their specific needs. This is an example Python script demonstrating four core functionalities:

1. Database Statistics: This function prints the total number of clauses in the database, the number and percentage of clauses labeled with each privacy concern, the number of clauses carrying each combination of concerns, and the number of clauses with no matching labels (no privacy concern).
2. Keyword Search: This function allows users to input a keyword or phrase. It then retrieves and prints all clauses whose text contains that term (a case-insensitive substring match).
3. Label-Based Filtering: This function enables users to specify True or False for any of the three privacy concern labels (or skip a label). All clauses matching every specified label value are then retrieved and displayed.
4. Company-Based Search: Leveraging the company field associated with each entry, this function allows users to enter a company name and retrieves all clauses attributed to that company.

In [4]:
"""
Read-only CLI for the Columbia IRT Privacy-Policy Clause dataset
"""

import requests
import itertools
from urllib.parse import quote_plus

# ---------- Project-specific constants ----------
# NOTE(review): the API key is committed in source; confirm it is meant to be
# public and is restricted to read access on the Firestore side.
API_KEY = "AIzaSyDfvG2Y2mnUxl6QKivpf8RSucx0IqOCELQ"
PROJECT_ID = "privacy-policy-database-apr25"
COLLECTION = "privacy_policies"

# Root of the Firestore REST "documents" resource for COLLECTION.
# NOTE(review): quote_plus() encodes spaces as "+", which is query-string
# encoding; harmless for the current collection name, but revisit if the
# collection is ever renamed to something containing spaces.
BASE_URL = "https://firestore.googleapis.com/v1/projects/{project}/databases/(default)/documents/{collection}".format(
    project=PROJECT_ID,
    collection=quote_plus(COLLECTION),
)

def _extract_value(value_dict):
    """Firestore REST wraps every primitive in a one-key dict."""
    if not isinstance(value_dict, dict):
        return value_dict
    # Grab the first (and only) key → value
    key, val = next(iter(value_dict.items()))
    return val

def _parse_document(doc_json):
    """Flatten Firestore document proto to a normal dict."""
    fields = doc_json.get("fields", {})
    return {k: _extract_value(v) for k, v in fields.items()}

def _fetch_all_docs():
    """Yield every document in COLLECTION as a flat dict, following pagination.

    Pages of up to 1000 documents are requested from BASE_URL until the
    response no longer carries a ``nextPageToken``.

    Yields:
        dict: One parsed document (see ``_parse_document``) per Firestore
        document.

    Raises:
        requests.HTTPError: If a page request returns an error status.
        Other ``requests`` exceptions (timeouts, connection errors) raised
        by ``requests.get`` propagate unchanged.
    """
    # Let requests build the query string so the page token (which may
    # contain characters such as "+", "/" or "=") is URL-encoded correctly.
    params = {"pageSize": 1000, "key": API_KEY}
    while True:
        resp = requests.get(BASE_URL, params=params, timeout=30)
        resp.raise_for_status()
        data = resp.json()
        for doc in data.get("documents", []):
            yield _parse_document(doc)
        page_token = data.get("nextPageToken")
        if not page_token:
            break
        params["pageToken"] = page_token

def list_all_entries():
    """Print summary statistics for the whole clause collection.

    Fetches every clause, then prints the total, the count and percentage
    for each of the three privacy-concern labels, the count of clauses for
    each pair of labels and for all three together, and the count and
    percentage of clauses for which none of the labels is truthy.
    """
    advertisers_key = "Data Shared with Advertisers"
    transfer_key = "Data Transferred Outside USA, EU, or UK"
    ai_key = "Data Used for AI Training"

    clauses = list(_fetch_all_docs())
    total_count = len(clauses)

    # One (advertisers, transfer, ai) truthiness triple per clause.
    flags = [
        (
            bool(clause.get(advertisers_key)),
            bool(clause.get(transfer_key)),
            bool(clause.get(ai_key)),
        )
        for clause in clauses
    ]

    def tally(predicate):
        """Count the clauses whose flag triple satisfies ``predicate``."""
        return sum(1 for triple in flags if predicate(*triple))

    def share(amount):
        """Express ``amount`` as a percentage of all clauses (0 if empty)."""
        if not total_count:
            return 0
        return amount / total_count * 100

    n_shared = tally(lambda adv, out, ai: adv)
    n_transferred = tally(lambda adv, out, ai: out)
    n_ai = tally(lambda adv, out, ai: ai)

    print("\nPrivacy Policy Database Statistics")
    print("-" * 80)
    print(f"Total number of clauses: {total_count}")
    print("\nBreakdown by privacy concerns:")
    print(f"- Shared with advertisers: {n_shared} ({share(n_shared):.1f}%)")
    print(f"- Transferred outside USA, EU, or UK: {n_transferred} ({share(n_transferred):.1f}%)")
    print(f"- Used for AI training: {n_ai} ({share(n_ai):.1f}%)")

    # Overlaps between labels.
    shared_and_transferred = tally(lambda adv, out, ai: adv and out)
    shared_and_ai = tally(lambda adv, out, ai: adv and ai)
    transferred_and_ai = tally(lambda adv, out, ai: out and ai)
    all_three = tally(lambda adv, out, ai: adv and out and ai)

    print("\nCombinations of privacy concerns:")
    print(f"- Shared & transferred: {shared_and_transferred}")
    print(f"- Shared & AI: {shared_and_ai}")
    print(f"- Transferred & AI: {transferred_and_ai}")
    print(f"- All three: {all_three}")

    # Clauses outside the union of the three labels; equal to the total minus
    # the inclusion-exclusion size of that union.
    no_concerns = tally(lambda adv, out, ai: not (adv or out or ai))
    print(f"\nClauses with no privacy concerns: {no_concerns} ({share(no_concerns):.1f}%)")
    print("-" * 80)

def search_by_text(search_term):
    """Print every clause whose text contains ``search_term``.

    The match is a case-insensitive substring test against each document's
    ``"text"`` field. Documents whose ``"text"`` field is missing, null, or
    not a string are skipped rather than crashing the search.

    Args:
        search_term: The substring to look for.
    """
    needle = search_term.lower()
    print(f"\nSearching for clauses containing: '{search_term}'")
    print("-" * 80)
    found = False
    for d in _fetch_all_docs():
        text = d.get("text")
        # dict.get's default only covers a missing key; a present-but-null
        # field comes back as None, which has no .lower().
        if not isinstance(text, str) or needle not in text.lower():
            continue
        found = True
        print(f"Text: {text}")
        print(f"Shared with advertisers: {d.get('Data Shared with Advertisers')}")
        print(f"Transferred outside USA, EU, or UK: {d.get('Data Transferred Outside USA, EU, or UK')}")
        print(f"Used for AI training: {d.get('Data Used for AI Training')}")
        print("-" * 80)
    if not found:
        print("No matching clauses found.")

def _filter_docs(**kwargs):
    """Lazily select documents whose fields equal the requested values.

    Filtering happens client-side over ``_fetch_all_docs()``. Each keyword
    is a field name mapped to the value it must equal; a value of ``None``
    means "do not filter on this field".
    """
    def matches(doc):
        for field, wanted in kwargs.items():
            if wanted is None:
                continue
            if doc.get(field) != wanted:
                return False
        return True

    return filter(matches, _fetch_all_docs())

def find_by_privacy_concerns(shared_with_advertisers=None,
                              transferred_outside=None,
                              used_for_ai=None):
    """Print the clauses whose privacy-concern labels match the given values.

    Each argument is ``True``, ``False``, or ``None``; ``None`` leaves the
    corresponding label unconstrained. The active criteria are echoed before
    the matching clauses are printed.
    """
    criteria = {
        "Data Shared with Advertisers": shared_with_advertisers,
        "Data Transferred Outside USA, EU, or UK": transferred_outside,
        "Data Used for AI Training": used_for_ai,
    }
    # Materialize first so the whole fetch completes before anything prints.
    matches = [*_filter_docs(**criteria)]

    print("\nSearching for clauses with specified privacy concerns:")
    for label, wanted in criteria.items():
        if wanted is None:
            continue
        print(f"- {label}: {wanted}")
    print("-" * 80)

    if matches:
        for clause in matches:
            print(f"Text: {clause.get('text')}")
            print(f"Shared with advertisers: {clause.get('Data Shared with Advertisers')}")
            print(f"Transferred outside USA, EU, or UK: {clause.get('Data Transferred Outside USA, EU, or UK')}")
            print(f"Used for AI training: {clause.get('Data Used for AI Training')}")
            print("-" * 80)
    else:
        print("No matching clauses found.")

def search_by_company(company_term):
    """Print every clause whose company name contains ``company_term``.

    The match is a case-insensitive substring test against each document's
    ``"company"`` field. Documents whose ``"company"`` field is missing,
    null, or not a string are skipped rather than crashing the search.

    Args:
        company_term: The substring to look for in company names.
    """
    needle = company_term.lower()
    print(f"\nSearching for clauses from companies containing: '{company_term}'")
    print("-" * 80)
    found = False
    for d in _fetch_all_docs():
        company = d.get("company")
        # dict.get's default only covers a missing key; a present-but-null
        # field comes back as None, which has no .lower().
        if not isinstance(company, str) or needle not in company.lower():
            continue
        found = True
        print(f"Company: {company}")
        print(f"Text: {d.get('text')}")
        print(f"Shared with advertisers: {d.get('Data Shared with Advertisers')}")
        print(f"Transferred outside USA, EU, or UK: {d.get('Data Transferred Outside USA, EU, or UK')}")
        print(f"Used for AI training: {d.get('Data Used for AI Training')}")
        print("-" * 80)
    if not found:
        print("No matching clauses found.")

def main():
    """Run the interactive menu loop until the user chooses "5" (Exit).

    Menu choices and True/False answers are stripped of surrounding
    whitespace before being interpreted, so input such as ``" 1"`` or
    ``"True "`` is accepted.
    """

    def parse_flag(raw):
        """Map 'true'/'false' (any case) to a bool; anything else to None."""
        answer = raw.strip().lower()
        if answer == "true":
            return True
        if answer == "false":
            return False
        if answer:
            # Tell the user instead of silently dropping the filter.
            print(f"Unrecognized value '{raw.strip()}' ignored (expected True or False).")
        return None

    while True:
        print("\nPrivacy Policy Database Query Tool")
        print("1. Show Database Statistics")
        print("2. Search by text")
        print("3. Find by privacy concerns")
        print("4. Search by company")
        print("5. Exit")

        choice = input("\nEnter your choice (1-5): ").strip()

        if choice == "1":
            list_all_entries()
        elif choice == "2":
            search_term = input("Enter text to search for: ")
            search_by_text(search_term)
        elif choice == "3":
            print("\nEnter True/False for each concern (or press Enter to skip):")
            shared = input("Shared with advertisers? (True/False/Enter): ")
            transferred = input("Transferred outside USA, EU, or UK? (True/False/Enter): ")
            ai_used = input("Used for AI training? (True/False/Enter): ")

            find_by_privacy_concerns(
                parse_flag(shared),
                parse_flag(transferred),
                parse_flag(ai_used),
            )
        elif choice == "4":
            company_term = input("Enter company name: ")
            search_by_company(company_term)
        elif choice == "5":
            print("Goodbye!")
            break
        else:
            print("Invalid choice.")

In [5]:
# Start the interactive menu only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()


Privacy Policy Database Query Tool
1. Show Database Statistics
2. Search by text
3. Find by privacy concerns
4. Search by company
5. Exit

Enter your choice (1-5): 1

Privacy Policy Database Statistics
--------------------------------------------------------------------------------
Total number of clauses: 5091

Breakdown by privacy concerns:
- Shared with advertisers: 4267 (83.8%)
- Transferred outside USA, EU, or UK: 664 (13.0%)
- Used for AI training: 182 (3.6%)

Combinations of privacy concerns:
- Shared & transferred: 18
- Shared & AI: 15
- Transferred & AI: 0
- All three: 0

Clauses with no privacy concerns: 11 (0.2%)
--------------------------------------------------------------------------------

Privacy Policy Database Query Tool
1. Show Database Statistics
2. Search by text
3. Find by privacy concerns
4. Search by company
5. Exit

Enter your choice (1-5): 5
Goodbye!
