# UKB Compliance Auditor (Activity 2.1.8.1)

## Overview
This notebook implements a compliance auditing script that scans the `rogen_aging` repository for restricted identifiers, specifically UK Biobank (UKB) participant IDs (EIDs). The tool helps ensure that no sensitive data is accidentally pushed to public portals, in line with UK Biobank data usage agreements. Its findings are pattern-based and should be reviewed manually before pushing.

### Objectives:
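1. Scan file contents for 7-digit numbers (the common UKB EID format) and for identifier field names such as `participant_id`, `eid`, and `patient_name`.
2. Flag files with restricted extensions (for example `.csv`, `.xlsx`, `.bam`).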
3. Provide a report compatible with the UK Biobank Git Audit requirements.
4. Report the researcher's GitHub username as part of the audit trail.

In [None]:
import os
import re
from pathlib import Path
import polars as pl
from typing import List, Dict

# --- Configuration ---
# Identity recorded in the audit trail; handed to UKBGitAuditAPI further down.
GITHUB_USERNAME = "mitya-toren"  # Researcher GitHub username
# Resolved against the current working directory, so the result depends on
# where the notebook is launched from.
# NOTE(review): assumes the notebook sits two levels below the repo root — confirm.
REPO_ROOT = Path("../..").resolve()  # Point to repository root
# Directory names that exclude a file when they appear as any component of its path.
IGNORE_DIRS = [".git", ".venv", "__pycache__", "node_modules", ".ipynb_checkpoints"]
# File names (final path component only) that are never audited.
IGNORE_FILES = ["uv.lock", "package-lock.json", "UKB_Compliance_Auditor.ipynb"]
# Regular expressions tested against every line of each scanned file.
# NOTE(review): the three name patterns are unanchored and case-sensitive, so
# "eid" also matches inside longer words and "EID" is not matched — confirm
# this trade-off is intended.
RESTRICTED_PATTERNS = [
    r"\b\d{7}\b",  # Matches exact 7-digit numbers (typical UKB EID)
    r"participant_id",
    r"eid",
    r"patient_name"
]
# File suffixes that are reported on sight; compared against the lower-cased suffix.
RESTRICTED_EXTENSIONS = [".csv", ".tsv", ".xlsx", ".sqlite", ".bam", ".pod5"]

# Echo the effective configuration so the audit output records who ran it and where.
print(f"Auditor initialized for user: {GITHUB_USERNAME}")
print(f"Scanning repository at: {REPO_ROOT}")

## Compliance Scanner Logic

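The following functions implement the local scan: `is_ignored` filters out excluded paths, `scan_file_contents` matches the restricted patterns line by line within one file, and `audit_repository` walks the repository and collects all findings into a Polars DataFrame.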

In [None]:
def is_ignored(path: Path) -> bool:
    """Tell whether *path* is excluded from the audit.

    A path is excluded when its final component is listed in
    ``IGNORE_FILES`` or when any of its components equals an entry of
    ``IGNORE_DIRS``.
    """
    components = path.parts
    return path.name in IGNORE_FILES or any(
        dir_name in components for dir_name in IGNORE_DIRS
    )

def scan_file_contents(file_path: Path) -> List[Dict]:
    """Scan a single file for restricted patterns.

    The file is read as UTF-8 text with undecodable bytes dropped, and every
    line is tested against each entry of ``RESTRICTED_PATTERNS``. At most one
    finding is recorded per (line, pattern) pair; it carries the first match
    on that line that is not the ``"0000000"`` placeholder.

    Args:
        file_path: File to scan.

    Returns:
        One dict per finding with the keys ``file``, ``line``, ``pattern``,
        ``match`` and ``type``. If the file cannot be opened or read, a
        warning is printed and the findings collected so far are returned.
    """
    # Report paths relative to the repository root. Fall back to the path as
    # given so a file outside REPO_ROOT still yields findings instead of
    # having them discarded.
    try:
        display_path = str(file_path.relative_to(REPO_ROOT))
    except ValueError:
        display_path = str(file_path)

    # Compile once per file instead of resolving each pattern on every line.
    compiled = [(pattern, re.compile(pattern)) for pattern in RESTRICTED_PATTERNS]

    findings = []
    try:
        # errors="ignore" keeps binary or mis-encoded content from raising.
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            for line_num, line in enumerate(f, 1):
                for pattern, regex in compiled:
                    # Filter out common false positives (e.g. 0000000)
                    valid_matches = [m for m in regex.findall(line) if m != "0000000"]
                    if valid_matches:
                        findings.append({
                            "file": display_path,
                            "line": line_num,
                            "pattern": pattern,
                            "match": str(valid_matches[0]),
                            "type": "Content Match"
                        })
    except OSError as exc:
        # An unreadable file must not abort the audit, but it must not pass
        # silently either: an unscanned file is not a compliant file.
        print(f"WARNING: Could not scan {display_path}: {exc}")
    return findings

def audit_repository() -> pl.DataFrame:
    """Walk ``REPO_ROOT`` recursively and gather every compliance finding.

    For each regular file that ``is_ignored`` does not exclude, a
    "Restricted File Type" finding is added when its lower-cased suffix is in
    ``RESTRICTED_EXTENSIONS``, and its contents are scanned with
    ``scan_file_contents`` when the file is smaller than 5 MB.

    Returns:
        A DataFrame with the columns ``file``, ``line``, ``pattern``,
        ``match`` and ``type``; it has no rows when nothing was flagged.
    """
    size_limit = 5 * 1024 * 1024  # 5MB limit
    columns = ["file", "line", "pattern", "match", "type"]
    collected = []

    for candidate in REPO_ROOT.rglob("*"):
        if not candidate.is_file() or is_ignored(candidate):
            continue

        # Extension check: flagged regardless of what the file contains.
        suffix = candidate.suffix
        if suffix.lower() in RESTRICTED_EXTENSIONS:
            collected.append(
                dict(
                    file=str(candidate.relative_to(REPO_ROOT)),
                    line=0,
                    pattern=suffix,
                    match="File Extension",
                    type="Restricted File Type",
                )
            )

        # Content check: only for files below the size limit.
        if candidate.stat().st_size < size_limit:
            collected += scan_file_contents(candidate)

    if not collected:
        return pl.DataFrame(schema=columns)
    return pl.DataFrame(collected)

## Integration with UK Biobank Git Audit Tool API

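This section provides a mock wrapper for the UKB Git Audit Tool API. The endpoint is a placeholder and no network request is made: `submit_report` only prints a summary and returns a status dictionary.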

In [None]:
class UKBGitAuditAPI:
    """Stand-in client for the UK Biobank Git Audit Tool API."""

    def __init__(self, username: str):
        """Remember the auditing user's name and the placeholder endpoint URL."""
        self.username = username
        self.api_endpoint = "https://api.ukbiobank.ac.uk/git-audit/v1"  # Placeholder

    def submit_report(self, report_df: pl.DataFrame):
        """
        Print a summary of the audit report and return a status record.

        Nothing is sent anywhere: the row count of ``report_df`` is read,
        a success or warning message is printed, and a dict with the keys
        ``status``, ``audited_by`` and ``issues`` is returned.
        """
        print(f"Preparing submission for user: {self.username}...")
        issue_count = report_df.height

        if issue_count != 0:
            print(f"WARNING: Found {issue_count} potential compliance issues.")
            print("Please review the results below before proceeding.")
        else:
            print("SUCCESS: No compliance issues found. Safe to push.")

        outcome = {
            "status": "success",
            "audited_by": self.username,
            "issues": issue_count,
        }
        return outcome

# Module-level API wrapper instance, bound to the configured researcher username.
audit_api = UKBGitAuditAPI(GITHUB_USERNAME)

## Run Audit and Display Results

In [None]:
# Collect findings from the local repository scan
results = audit_repository()

# Pass the findings to the UKB API wrapper (mock; prints a summary)
submission_status = audit_api.submit_report(results)

# Confirm a clean result, or print the findings table (up to 100 rows)
if results.height == 0:
    print("\nRepository is compliant.")
else:
    print("\nPotential Compliance Issues Found:")
    with pl.Config(tbl_rows=100):
        print(results)