# 03: Silver Layer - Anonymization and Utility

This notebook demonstrates:
- k-anonymity (differential privacy is not exercised in this notebook; only the `kanonymity` technique is run)
- Utility assessment
- Privacy-utility tradeoffs

In [None]:
import sys
sys.path.insert(0, '../src')

from pyspark.sql import SparkSession
from faircare.silver.anonymization import AnonymizationEngine
from faircare.silver.utilityassessment import UtilityAssessment
from faircare.metrics.layermetrics import SilverMetrics
import yaml

In [None]:
# Get (or create) the Spark session used throughout this notebook.
session_builder = SparkSession.builder.appName("FAIR-CARE-Silver")
spark = session_builder.getOrCreate()

# Parse the shared YAML configuration, then pick out the COMPAS dataset section.
with open('../configs/default.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

dataset_config = config['datasets']['compas']

## Load Bronze Data

In [None]:
# Read the Bronze table (Delta format) from the path configured for this dataset.
delta_reader = spark.read.format("delta")
bronze_df = delta_reader.load(dataset_config['bronze_path'])
print(f"Bronze records: {bronze_df.count()}")

## Anonymization: k-Anonymity

In [None]:
# Build the k-anonymity settings on top of the shared anonymization defaults.
# The dict literal makes a shallow copy, so config['anonymization'] is not modified.
anon_config = {
    **config['anonymization'],
    'quasi_identifiers': dataset_config['quasi_identifiers'],
    'technique': 'kanonymity',
    'k': 5,
}

anonymizer = AnonymizationEngine(anon_config)
silver_df = anonymizer.anonymize(bronze_df, spark)

# Each count() is a separate Spark action; run each one once and reuse the result
# instead of re-evaluating the anonymized DataFrame for every print.
silver_total = silver_df.count()
bronze_total = bronze_df.count()

print(f"\nSilver records after k-anonymity: {silver_total}")
if bronze_total:
    print(f"Suppression rate: {(1 - silver_total/bronze_total)*100:.1f}%")
else:
    # An empty Bronze table would otherwise raise ZeroDivisionError here.
    print("Suppression rate: N/A (no Bronze records)")

In [None]:
# Preview the same three columns from the Bronze and Silver frames side by side.
preview_columns = ['age', 'sex', 'race']
previews = [
    ("\nBefore anonymization:", bronze_df),
    ("\nAfter k-anonymity:", silver_df),
]

for heading, frame in previews:
    print(heading)
    frame.select(*preview_columns).show(5)

## Utility Assessment

In [None]:
def format_metric(value, spec='.3f'):
    """Format a report value for display.

    Args:
        value: The metric to render (normally a number).
        spec: Format spec applied to ``value``.

    Returns:
        ``format(value, spec)``, or ``str(value)`` when the value does not
        support the spec (for example the ``'N/A'`` placeholder or ``None``).
    """
    try:
        return format(value, spec)
    except (TypeError, ValueError):
        # A str placeholder raises ValueError for 'f'; None raises TypeError.
        return str(value)


utility_assessor = UtilityAssessment(dataset_config)
utility_report = utility_assessor.assess(bronze_df, silver_df)

print("\nUtility Assessment:")
print(f"  Correlation distance: {utility_report.get('correlation_distance', 'N/A')}")
# Applying ':.3f' directly to the 'N/A' default would raise ValueError when a key
# is missing from the report, so the AUC values go through format_metric instead.
print(f"  Original AUC: {format_metric(utility_report.get('original_auc', 'N/A'))}")
print(f"  Anonymized AUC: {format_metric(utility_report.get('anonymized_auc', 'N/A'))}")
print(f"  Utility retention: {utility_report.get('utility_retention', 0)*100:.1f}%")

## Compare Different k Values

In [None]:
import matplotlib.pyplot as plt

# Sweep several k values and record how many records and how much utility remain.
k_values = [3, 5, 7, 10]
results = []

for k in k_values:
    # Use a per-run copy of the settings rather than writing k back into
    # anon_config: mutating the shared dict (and rebinding `anonymizer`) would
    # leave both describing the last sweep value instead of the run that
    # produced silver_df.
    sweep_config = {**anon_config, 'k': k}
    sweep_anonymizer = AnonymizationEngine(sweep_config)
    sweep_df = sweep_anonymizer.anonymize(bronze_df, spark)

    utility = utility_assessor.assess(bronze_df, sweep_df)

    results.append({
        'k': k,
        'records': sweep_df.count(),
        'utility': utility.get('utility_retention', 0)
    })

# Plot: records retained (left) and utility retained (right) against k.
swept_ks = [r['k'] for r in results]
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

ax1.plot(swept_ks, [r['records'] for r in results], marker='o')
ax1.set_xlabel('k value')
ax1.set_ylabel('Records remaining')
ax1.set_title('Privacy (k) vs Data Retention')
ax1.grid(True)

ax2.plot(swept_ks, [r['utility'] for r in results], marker='o', color='orange')
ax2.set_xlabel('k value')
ax2.set_ylabel('Utility retention')
ax2.set_title('Privacy (k) vs Utility')
ax2.grid(True)

plt.tight_layout()
plt.show()

## Calculate Silver Score

In [None]:
# Assemble the inputs for the Silver Score from the utility report computed earlier.
silver_inputs = {
    'utility_retention': utility_report.get('utility_retention', 0),
    # NOTE(review): causal_validity is hard-coded to 'PASS' in this notebook
    # rather than computed here — confirm this is intended.
    'causal_validity': 'PASS',
}

silver_metrics = SilverMetrics()
ss = silver_metrics.calculate(silver_inputs)

print(f"\nSilver Score (SS): {ss:.3f}")

## Summary

Silver layer complete:
- ✅ k-anonymity applied
- ✅ Utility assessed
- ✅ Privacy-utility tradeoffs analyzed
- ✅ Silver Score calculated

**Next**: Proceed to notebook 04 for causal validation.

In [None]:
# Shut down the Spark session now that the notebook is finished with it.
spark.stop()