# 02: Bronze Layer - Ingestion and PII Detection

This notebook demonstrates the Bronze layer of the FAIR-CARE pipeline:
- Data ingestion with metadata
- PII detection
- Provenance tracking (audit trail)
- Bronze score calculation

In [None]:
import sys
sys.path.insert(0, '../src')

from pyspark.sql import SparkSession
from faircare.bronze.ingestion import DataIngestion
from faircare.bronze.piidetection import PIIDetection
from faircare.bronze.audittrail import AuditTrail
from faircare.metrics.layermetrics import BronzeMetrics
import yaml

## Initialize Spark

In [None]:
# Build the Spark session with the Delta Lake package, SQL extension and catalog
# configured. getOrCreate() hands back an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName("FAIR-CARE-Bronze")
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

print(f"Spark version: {spark.version}")

## Load Configuration

In [None]:
# Load the pipeline configuration. The encoding is passed explicitly so the file
# is decoded the same way on every platform instead of using the locale default.
with open('../configs/default.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Sections used by the remaining cells: dataset paths and PII detector settings.
dataset_config = config['datasets']['compas']
pii_config = config['pii_detection']

## Step 1: Data Ingestion

In [None]:
# Ingest the raw COMPAS data from the configured raw path into the Bronze path.
ingestion = DataIngestion(spark)

ingest_options = {
    'source_path': dataset_config['raw_path'],
    'output_path': dataset_config['bronze_path'],
    'dataset_name': 'compas',
    'source_system': 'manual_upload',
}
bronze_df = ingestion.ingest(**ingest_options)

# Report the row count and the resulting schema of the ingested frame.
print(f"Ingested {bronze_df.count()} records")
bronze_df.printSchema()

In [None]:
# Preview five rows: the underscore-prefixed metadata columns next to a few
# source attributes.
metadata_columns = ['_ingestion_timestamp', '_source_system', '_dataset_name']
attribute_columns = ['age', 'race', 'sex']
bronze_df.select(*metadata_columns, *attribute_columns).show(5)

## Step 2: PII Detection

In [None]:
# Run the PII detector over the Bronze frame using the settings from the config.
pii_detector = PIIDetection(pii_config)
pii_report = pii_detector.detect(bronze_df, sample_size=1000)

print("\nPII Detection Report:")
if not pii_report:
    # Say so explicitly; a bare header would be indistinguishable from a
    # report that failed to print.
    print("\nNo columns were flagged as PII.")
else:
    # One entry per flagged column, with the detected types, confidence and
    # the detector's recommendation.
    for column, info in pii_report.items():
        print(f"\n{column}:")
        print(f"  PII Types: {info['pii_types']}")
        print(f"  Confidence: {info['confidence']}")
        print(f"  Recommendation: {info['recommendation']}")

## Step 3: Audit Trail

In [None]:
# Record the ingestion in the audit log: dataset name, row count, and the
# columns that appear in the PII report.
audit = AuditTrail(log_dir='../results/logs')

ingestion_event = {
    'dataset': 'compas',
    'record_count': bronze_df.count(),
    'pii_detected': list(pii_report.keys()),
}
audit.log_event('BRONZE_INGESTION', ingestion_event)

print("Audit event logged")

## Step 4: Calculate Bronze Score

In [None]:
# Compute the Bronze score from the layer's inputs.
# NOTE(review): 'provenance_complete' and 'quality_score' are fixed literals
# here rather than values derived from the data - confirm this is intended.
bronze_metrics = BronzeMetrics()
score_inputs = {
    'provenance_complete': True,
    'pii_found': len(pii_report) > 0,
    'quality_score': 0.9,
}
sb = bronze_metrics.calculate(score_inputs)

print(f"\nBronze Score (SB): {sb:.3f}")

# Status bands, checked from the highest floor down; a score below every
# floor falls through to the "AT RISK" line.
status_bands = (
    (0.85, "Status: EXCELLENT"),
    (0.70, "Status: ACCEPTABLE"),
)
status_line = next(
    (line for floor, line in status_bands if sb >= floor),
    "Status: AT RISK",
)
print(status_line)

## Summary

Bronze layer complete:
- ✅ Data ingested with metadata
- ✅ PII detected and documented
- ✅ Provenance tracked
- ✅ Bronze Score calculated

**Next**: Proceed to notebook 03 for Silver layer anonymization.

In [None]:
# Shut down the Spark session now that the Bronze layer steps are finished.
spark.stop()