<a id='1'></a>
## 1 · Setup

In [None]:
import sys, os, warnings, re, json
# Silence library warnings so the rendered notebook output stays readable.
# NOTE(review): this hides every warning category, deprecations included.
warnings.filterwarnings('ignore')

# Resolve the project root so that the `src.*` imports below work.
# Re-running this cell must not climb another directory: when the current
# working directory already contains the `src` package we are at the root and
# stay there; otherwise the notebook is assumed to live one level below it.
_cwd = os.getcwd()
PROJECT_ROOT = _cwd if os.path.isdir(os.path.join(_cwd, 'src')) else os.path.abspath('..')
if PROJECT_ROOT not in sys.path:
    # Avoid stacking duplicate entries on sys.path across re-runs.
    sys.path.insert(0, PROJECT_ROOT)
os.chdir(PROJECT_ROOT)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

# Plot styling shared by every figure in this notebook: seaborn theme first,
# then the matplotlib defaults that the theme does not cover.
sns.set_theme(
    style='whitegrid',
    palette='viridis',
    font_scale=1.1,
)
plt.rcParams['figure.figsize'] = (14, 5)
plt.rcParams['figure.dpi'] = 110
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['axes.titleweight'] = 'bold'

# Spark session on the local master, obtained through the project helper.
from src.utils.spark_utils import get_spark_session
spark = get_spark_session(
    app_name='NB03-PIIDetection',
    master='local[*]',
    driver_memory='4g',
)
print(f'‚úÖ SparkSession ready  ¬∑  v{spark.version}')

<a id='2'></a>
## 2 · Data Generation with PII Leakage

The generator injects PII into free-text fields:
- `delivery_instructions` — ~10% contain phone, email, Aadhaar
- `review_text` — ~15% contain PII
- `description` — ~5% contain PII

In [None]:
from src.utils.data_generator import generate_customers, generate_products, generate_orders, generate_reviews

# Build the synthetic tables; orders and reviews are keyed to the generated
# customer and product ids.
print('‚è≥ Generating data with PII leakage ‚Ä¶')
customers_pdf = generate_customers(n=5_000)
products_pdf = generate_products(n=1_000)


def _id_kwargs():
    """Return freshly built customer/product id lists as generator keyword arguments."""
    return {
        'customer_ids': customers_pdf['customer_id'].tolist(),
        'product_ids': products_pdf['product_id'].tolist(),
    }


orders_pdf = generate_orders(n=20_000, **_id_kwargs())
reviews_pdf = generate_reviews(n=10_000, **_id_kwargs())

# Hand-written demonstration texts: phone, email + Aadhaar, email, PAN,
# a PII-free control, phone, partially masked card, IFSC, IPv4 and address.
sample_texts = [
    'Please call me at +91-9876543210 for delivery',
    'My email is rahul.kumar@gmail.com and aadhaar is 1234 5678 9012',
    'Great product! Contact support at help@company.in',
    'PAN: ABCDE1234F, please send invoice',
    'Normal review with no personal information at all',
    'Deliver to Raj Sharma, phone 9123456789, near MG Road',
    'Credit card ending 4532-xxxx-xxxx-1234 was charged',
    'IFSC code SBIN0001234, account 12345678',
    'IP address 192.168.1.100 detected in log',
    'Address: 42 Park Street, Kolkata 700001',
]

print(
    f'‚úÖ Generated {len(orders_pdf):,} orders, {len(reviews_pdf):,} reviews',
    f'   Sample texts prepared: {len(sample_texts)}',
    sep='\n',
)

<a id='3'></a>
## 3 · PIIDetector — 8 Regex Patterns

The detector uses 8 compiled regex patterns for Indian PII:

| Pattern | Example | Regex |
|---------|---------|-------|
| **EMAIL** | user@domain.com | `[A-Za-z0-9._%+-]+@...` |
| **PHONE_NUMBER** | +91-9876543210 | `(\+91[\-\s]?)?[6-9]\d{9}` |
| **AADHAAR** | 1234 5678 9012 | `\d{4}\s\d{4}\s\d{4}` |
| **PAN** | ABCDE1234F | `[A-Z]{5}\d{4}[A-Z]` |
| **CREDIT_CARD** | 4532-xxxx-1234 | `\d{4}[\-\s]?\d{4}[\-\s]?...` |
| **IPV4** | 192.168.1.1 | `\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}` |
| **ADDRESS** | 42 Park Street | address pattern |
| **IFSC** | SBIN0001234 | `[A-Z]{4}0[A-Z0-9]{6}` |

In [None]:
from src.pii_detection.pii_detector import PIIDetector, PII_PATTERNS

# Regex-only detector; the NER model is switched off for this section.
detector = PIIDetector(confidence_threshold=0.85, use_ner_model=False)

# List every registered pattern, truncating regex sources longer than 60 chars.
print(f'üìã PII Patterns registered: {len(PII_PATTERNS)}')
for pattern_name, compiled in PII_PATTERNS.items():
    regex_src = compiled.pattern
    if len(regex_src) > 60:
        regex_src = regex_src[:60] + '‚Ä¶'
    print(f'   {pattern_name:15s}  {regex_src}')

# Run the detector over the demonstration texts and collect every hit.
banner = '‚ïê' * 70
print(f'\n{banner}')
print(f'{"  PII DETECTION RESULTS":^70}')
print(banner)

all_entities = []
for sample_text in sample_texts:
    found = detector.detect_pii(sample_text)
    flagged = detector.has_pii(sample_text)

    shown = sample_text
    if len(sample_text) > 55:
        shown = sample_text[:55] + '‚Ä¶'
    marker = 'üî¥' if flagged else 'üü¢'
    print(f'\n{marker} "{shown}"')

    if not found:
        print(f'     ‚Üí No PII detected')
        continue
    for hit in found:
        print(f'     ‚Üí {hit.entity_type:15s}  "{hit.text}"  (score: {hit.score:.2f})')
        all_entities.append({'type': hit.entity_type, 'text': hit.text, 'score': hit.score})

print(f'\n{banner}')
print(f'Total entities detected: {len(all_entities)}')

In [None]:
# Two-panel summary of the detections collected above: counts per entity type
# on the left, confidence-score histogram with the 0.85 threshold on the right.
entity_df = pd.DataFrame(all_entities)
if entity_df.empty:
    print('No entities to visualise')
else:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    ax_types, ax_scores = axes

    # Left panel: how often each entity type was detected.
    type_counts = entity_df['type'].value_counts()
    bar_colours = sns.color_palette('Set2', len(type_counts))
    type_counts.plot(kind='bar', ax=ax_types, color=bar_colours, edgecolor='white')
    ax_types.set_title('üîê PII Entity Types Detected')
    ax_types.set_ylabel('Count')
    plt.setp(ax_types.xaxis.get_majorticklabels(), rotation=45, ha='right')

    # Right panel: distribution of confidence scores against the threshold line.
    ax_scores.hist(entity_df['score'], bins=20, color='#9b59b6', edgecolor='white', alpha=0.8)
    ax_scores.axvline(x=0.85, color='red', linestyle='--', linewidth=2, label='Threshold (0.85)')
    ax_scores.set_xlabel('Confidence Score')
    ax_scores.set_ylabel('Frequency')
    ax_scores.set_title('Detection Confidence Distribution')
    ax_scores.legend()

    plt.tight_layout()
    plt.show()

<a id='4'></a>
## 4 · PIIDetector — NER (BERT) Model

The framework supports `dslim/bert-base-NER` via HuggingFace Transformers for detecting PII that regex cannot catch (e.g. person names, organisations, locations).

In [None]:
# Try to build the NER-enabled detector; if construction fails for any reason
# the NER demo below is skipped instead of aborting the notebook.
try:
    ner_detector = PIIDetector(confidence_threshold=0.85, use_ner_model=True,
                               model_name='dslim/bert-base-NER')
except Exception as load_error:
    ner_available = False
    print(f'‚ö†Ô∏è  NER model not available: {load_error}')
    print('   (Requires transformers + torch ‚Äî available in Docker environment)')
else:
    ner_available = True
    print('‚úÖ BERT NER model loaded (dslim/bert-base-NER)')

if not ner_available:
    print('\nüìù NER detection demo skipped ‚Äî model not available in this environment')
    print('   The Docker deployment includes transformers + torch for full NER support')
else:
    ner_test_texts = [
        'Rahul Sharma ordered from Mumbai office at +91-9876543210',
        'Priya Patel from Bangalore sent email to support@company.in',
        'Dr Amit Verma from AIIMS Delhi reviewed the product',
    ]
    # Entity types labelled as NER output; every other type is labelled Regex.
    ner_entity_types = ('PERSON', 'ORG', 'LOC')

    rule = '‚ïê' * 70
    print(f'\n{rule}')
    print(f'{"  NER + REGEX COMBINED DETECTION":^70}')
    print(rule)

    for ner_text in ner_test_texts:
        print(f'\nüîç "{ner_text[:60]}"')
        for hit in ner_detector.detect_pii(ner_text):
            if hit.entity_type in ner_entity_types:
                source = 'NER'
            else:
                source = 'Regex'
            print(f'   ‚Üí [{source}] {hit.entity_type:15s}  "{hit.text}"  (score: {hit.score:.2f})')

<a id='5'></a>
## 5 · PIIMasker — Hash / Redact / Tokenize

Three masking strategies:

| Strategy | Method | Example |
|----------|--------|---------|
| **Hash** | SHA-256 (salted) | `abc123...` |
| **Redact** | Replace with `[TYPE_REDACTED]` | `[EMAIL_REDACTED]` |
| **Tokenize** | Random token | `TOK_a1b2c3` |

In [None]:
from src.pii_detection.pii_masker import PIIMasker

# One text containing an email, a phone number, an Aadhaar number and a PAN,
# masked once per strategy so the outputs can be compared side by side.
test_text = 'Contact Rahul at rahul.kumar@gmail.com or +91-9876543210. Aadhaar: 1234 5678 9012, PAN: ABCDE1234F'

print(f'üìÑ Original Text:')
print(f'   "{test_text}"\n')

for strategy in ['hash', 'redact', 'tokenize']:
    masker = PIIMasker(strategy=strategy, detector=detector)
    result = masker.mask_text_with_report(test_text)
    masked_text = result['masked_text']

    print(f'\n{"‚îÄ"*70}')
    print(f'üîí Strategy: {strategy.upper()}')
    # Long outputs are cut at 80 characters and marked with an ellipsis.
    if len(masked_text) > 80:
        print(f'   Masked:  "{masked_text[:80]}‚Ä¶"')
    else:
        print(f'   Masked:  "{masked_text}"')
    print(f'   Entities masked: {result["entities_masked"]}')
    for entry in result.get('entities', []):
        print(f'     ‚Ä¢ {entry["entity_type"]:15s}  "{entry["original"]}"  ‚Üí  "{entry["masked"][:30]}"')

<a id='6'></a>
## 6 · Masking Strategy Comparison

In [None]:
# Compare all 3 strategies on the same batch of (non-null) delivery instructions:
# for each strategy count the texts that had anything masked and the total
# number of masked entities, then plot and tabulate the results.
comparison_texts = orders_pdf['delivery_instructions'].dropna().head(500).tolist()
n_texts = len(comparison_texts)

strategy_results = {}
for strategy in ['hash', 'redact', 'tokenize']:
    masker = PIIMasker(strategy=strategy, detector=detector)
    entity_counts = [
        masker.mask_text_with_report(str(text))['entities_masked']
        for text in comparison_texts
    ]
    texts_with_pii = sum(1 for count in entity_counts if count > 0)
    strategy_results[strategy] = {
        'texts_with_pii': texts_with_pii,
        'entities_masked': sum(entity_counts),
        # An empty batch (every instruction null) yields 0% instead of ZeroDivisionError.
        'pii_rate': texts_with_pii / n_texts * 100 if n_texts else 0.0,
    }

# Build the frame row-wise (one row per strategy) so each metric column keeps its
# own dtype. Building column-wise and transposing mixes int and float in every
# column, which upcasts the counts to float and renders them as e.g. "12.0".
comp_df = pd.DataFrame.from_dict(strategy_results, orient='index').astype(
    {'texts_with_pii': 'int64', 'entities_masked': 'int64'}
)
comp_df.index.name = 'Strategy'

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

comp_df['texts_with_pii'].plot.bar(ax=ax1, color=['#3498db','#e74c3c','#f39c12'], edgecolor='white')
ax1.set_title('Texts with PII Detected (per strategy)')
ax1.set_ylabel('Count')
plt.setp(ax1.xaxis.get_majorticklabels(), rotation=0)

comp_df['entities_masked'].plot.bar(ax=ax2, color=['#3498db','#e74c3c','#f39c12'], edgecolor='white')
ax2.set_title('Total Entities Masked')
ax2.set_ylabel('Count')
plt.setp(ax2.xaxis.get_majorticklabels(), rotation=0)

plt.tight_layout()
plt.show()

display(comp_df.style.set_caption('üîí Masking Strategy Comparison').format({
    'texts_with_pii': '{:,}', 'entities_masked': '{:,}', 'pii_rate': '{:.1f}%'
}))

<a id='7'></a>
## 7 · Spark UDF — PII at Scale

The framework provides `create_spark_detect_udf()` and `create_spark_mask_udf()` for distributed PII processing.

In [None]:
from pyspark.sql import functions as F

# Move the generated orders from pandas into Spark.
orders_sdf = spark.createDataFrame(orders_pdf)

# Detection UDF comes from the detector class; the masking UDF comes from a
# masker configured with the 'redact' strategy.
detect_udf = PIIDetector.create_spark_detect_udf(use_ner=False)
masker = PIIMasker(strategy='redact', detector=detector)
mask_udf = masker.create_spark_mask_udf()

# Add a detection flag and a masked copy of the free-text column.
print('‚è≥ Applying PII detection + masking on delivery_instructions ‚Ä¶')
instructions_col = F.col('delivery_instructions')
result_sdf = (
    orders_sdf
    .withColumn('has_pii', detect_udf(instructions_col))
    .withColumn('masked_instructions', mask_udf(instructions_col))
)

# Rows flagged by the detection UDF, reused for the count and the sample below.
flagged_sdf = result_sdf.filter(F.col('has_pii') == True)
pii_count = flagged_sdf.count()
total = result_sdf.count()

print(f'\n‚úÖ PII Detection at Scale')
print(f'   Total records:    {total:>10,}')
print(f'   Records with PII: {pii_count:>10,}  ({pii_count/total*100:.1f}%)')

# Show up to three flagged records, original next to masked (first 70 chars each).
print(f'\nüìã Sample Masked Records:')
sample = (
    flagged_sdf
    .select('delivery_instructions', 'masked_instructions')
    .limit(3)
    .toPandas()
)
for original, masked in zip(sample['delivery_instructions'], sample['masked_instructions']):
    print(f'   Original: {str(original)[:70]}')
    print(f'   Masked:   {str(masked)[:70]}')
    print()

<a id='8'></a>
## 8 · AdaptivePIITuner — Threshold Learning

The `AdaptivePIITuner` uses feedback events (true/false positives) to **adaptively tune detection thresholds** per entity type.

In [None]:
from src.pii_detection.adaptive_pii_tuner import AdaptivePIITuner, PIIFeedbackEvent
import tempfile

# Tuner writing its feedback under a fresh temporary directory.
tuner = AdaptivePIITuner(
    feedback_dir=os.path.join(tempfile.mkdtemp(), 'pii_feedback'),
    default_threshold=0.85,
    min_threshold=0.50,
    max_threshold=0.99,
    min_feedback_count=10,
)

# Simulated feedback, seeded so the scores are reproducible.
np.random.seed(42)

# One row per scenario:
# (entity_type, text, n_events, score_low, score_high, predicted_pii, actual_pii)
feedback_spec = [
    ('EMAIL',        'user@example.com', 60, 0.85, 0.99, True,  True),   # true positives
    ('EMAIL',        'not-an-email',     15, 0.50, 0.85, True,  False),  # false positives
    ('PHONE_NUMBER', '+91-9876543210',   40, 0.88, 0.99, True,  True),   # true positives
    ('AADHAAR',      '1234 5678 9012',   25, 0.40, 0.84, False, True),   # false negatives
]

# Scores are drawn one at a time, scenario by scenario, so the random stream
# is consumed in the order the spec rows are listed.
feedback_events = [
    PIIFeedbackEvent(
        entity_type=spec_entity,
        text=spec_text,
        score=np.random.uniform(score_low, score_high),
        predicted_pii=predicted,
        actual_pii=actual,
    )
    for spec_entity, spec_text, n_events, score_low, score_high, predicted, actual in feedback_spec
    for _ in range(n_events)
]

# Hand the whole batch to the tuner.
tuner.record_batch_feedback(feedback_events)
print(f'‚úÖ Recorded {len(feedback_events)} feedback events')

# Per-entity metrics computed by the tuner from the recorded feedback.
metrics = tuner.compute_entity_metrics()
rule = '‚ïê' * 60
print(f'\n{rule}')
print(f'{"  ENTITY-LEVEL METRICS":^60}')
print(rule)
for entity_type, entity_metrics in metrics.items():
    precision_val = entity_metrics.get("precision", 0)
    recall_val = entity_metrics.get("recall", 0)
    f1_val = entity_metrics.get("f1", 0)
    sample_count = entity_metrics.get("count", 0)
    print(f'\n  {entity_type}:')
    print(f'    Precision: {precision_val:.2f}')
    print(f'    Recall:    {recall_val:.2f}')
    print(f'    F1-score:  {f1_val:.2f}')
    print(f'    Samples:   {sample_count}')

In [None]:
# Tune thresholds based on the recorded feedback, then compare the tuned value
# of every entity type against the default threshold.
DEFAULT_THRESHOLD = 0.85  # same value as default_threshold given to AdaptivePIITuner

tuned = tuner.tune_thresholds()
current = tuner.get_thresholds()

print(f'\nüéØ Tuned Detection Thresholds:')
threshold_data = []
for entity_type, thresh in current.items():
    print(f'   {entity_type:15s}  default={DEFAULT_THRESHOLD:.2f} ‚Üí tuned={thresh:.3f}')
    threshold_data.append({'Entity': entity_type, 'Default': DEFAULT_THRESHOLD, 'Tuned': thresh})

# Explicit columns keep the frame well-formed even when no thresholds came back.
thresh_df = pd.DataFrame(threshold_data, columns=['Entity', 'Default', 'Tuned'])

if thresh_df.empty:
    # Nothing to draw; indexing columns of a column-less frame would raise KeyError.
    print('   No thresholds available to visualise')
else:
    fig, ax = plt.subplots(figsize=(10, 5))
    x = np.arange(len(thresh_df))
    width = 0.35
    ax.bar(x - width/2, thresh_df['Default'], width, label='Default', color='#bdc3c7', edgecolor='white')
    ax.bar(x + width/2, thresh_df['Tuned'], width, label='Tuned', color='#3498db', edgecolor='white')
    ax.axhline(y=DEFAULT_THRESHOLD, color='red', linestyle='--', alpha=0.5, label='Original default')
    ax.set_ylabel('Threshold')
    ax.set_title('üéØ Adaptive PII Thresholds ‚Äî Before vs After Tuning')
    ax.set_xticks(x)
    ax.set_xticklabels(thresh_df['Entity'])
    ax.set_ylim(0, 1.0)
    # The legend is built last so that it also lists the reference line drawn above.
    ax.legend()
    plt.tight_layout()
    plt.show()

<a id='9'></a>
## 9 · PII Drift Detection

Detects if PII patterns are changing over time (new types appearing, rates shifting).

In [None]:
# Ask the tuner for its drift report and print the headline fields,
# falling back to a default when a key is absent.
drift_report = tuner.detect_pii_drift()

print(f'üìä PII Drift Report')
print(f'   Drift detected:  {drift_report.get("drift_detected", False)}')
print(f'   Entity types:    {drift_report.get("entity_types_monitored", 0)}')
print(f'   Total feedback:  {drift_report.get("total_feedback", 0)}')

if drift_report.get('entity_drift'):
    print(f'\n   Entity-level drift:')
    for entity_type, info in drift_report['entity_drift'].items():
        print(f'     {entity_type}: {info}')

# Grouped bars (precision / recall / F1) per entity type, from the metrics
# computed earlier; skipped entirely when there are none.
if metrics:
    entities = list(metrics.keys())
    positions = np.arange(len(entities))
    bar_width = 0.25

    fig, ax = plt.subplots(figsize=(10, 5))
    # (metric key, legend label, colour, horizontal shift of the bar group)
    bar_series = [
        ('precision', 'Precision', '#3498db', -bar_width),
        ('recall',    'Recall',    '#e74c3c', 0),
        ('f1',        'F1-Score',  '#2ecc71', bar_width),
    ]
    for metric_key, legend_label, colour, shift in bar_series:
        heights = [metrics[name].get(metric_key, 0) for name in entities]
        ax.bar(positions + shift, heights, bar_width,
               label=legend_label, color=colour, edgecolor='white')

    ax.set_xticks(positions)
    ax.set_xticklabels(entities)
    ax.set_ylabel('Score')
    ax.set_title('üìä PII Detection Performance Metrics')
    ax.legend()
    ax.set_ylim(0, 1.1)
    plt.tight_layout()
    plt.show()

<a id='10'></a>
## 10 · DPDP Act Compliance Analysis

India's **Digital Personal Data Protection Act 2023** requires:
- Explicit consent before processing personal data
- Right to erasure
- Purpose limitation
- Data minimisation

In [None]:
# DPDP compliance assessment: consent rates, PII leakage in order free text,
# presence of sensitive identifier columns, and the status of the controls
# demonstrated earlier in the notebook.
total_cust   = len(customers_pdf)
mkt_consent  = customers_pdf['consent_marketing'].fillna(False).astype(bool).sum()
# A missing flag is not explicit consent, so it is counted as "no consent";
# defaulting it to True would overstate the data-processing consent rate.
dp_consent   = customers_pdf['consent_data_processing'].fillna(False).astype(bool).sum()

# Rates as fractions; an empty customer table gives 0.0 rather than ZeroDivisionError.
mkt_rate = mkt_consent / total_cust if total_cust else 0.0
dp_rate  = dp_consent / total_cust if total_cust else 0.0

# PII exposure in free text: number of non-null delivery instructions flagged by the detector.
order_pii_n = int(orders_pdf['delivery_instructions'].dropna().apply(
    lambda x: detector.has_pii(str(x))).sum())

# Sensitive fields present: non-null count per identifier column.
pii_columns = ['aadhaar', 'pan_card', 'phone', 'email']
pii_coverage = {col: customers_pdf[col].notna().sum() for col in pii_columns}

# Each entry is (check name, displayed value, status marker).
compliance_checks = [
    ('Marketing consent rate',   f'{mkt_rate*100:.1f}%',
     'üü°' if mkt_rate < 0.8 else 'üü¢'),
    ('Data processing consent',  f'{dp_rate*100:.1f}%',
     'üü¢' if dp_rate > 0.9 else 'üî¥'),
    ('PII leakage (orders)',     f'{order_pii_n:,} records',
     'üî¥' if order_pii_n > 0 else 'üü¢'),
    ('Aadhaar stored',           f'{pii_coverage["aadhaar"]:,} records',
     'üü°'),
    ('PAN stored',               f'{pii_coverage["pan_card"]:,} records',
     'üü°'),
    ('Masking strategy',         'Redact',
     'üü¢'),
    ('Drift monitoring',         'Active',
     'üü¢'),
    ('Adaptive thresholds',      f'{len(current)} entity types tuned',
     'üü¢'),
]

# Render the checks as one HTML table row each: status, name, value.
display(HTML(f'''
<div style="border:2px solid #3498db;border-radius:10px;overflow:hidden;margin:10px 0;">
  <div style="background:#2c3e50;color:white;padding:12px 16px;">
    <b>üáÆüá≥ DPDP Act Compliance Assessment</b></div>
  <table style="width:100%;border-collapse:collapse;">
    {''.join(f"""<tr style="border-bottom:1px solid #eee;">
      <td style="padding:8px 12px;">{status}</td>
      <td style="padding:8px 12px;font-weight:bold;">{check}</td>
      <td style="padding:8px 12px;text-align:right;">{value}</td>
    </tr>""" for check, value, status in compliance_checks)}
  </table>
</div>
'''))

<a id='11'></a>
## 11 · NER Training Dataset Generation

The framework can generate BIO-tagged training data for fine-tuning custom NER models.

In [None]:
from src.pii_detection.ner_trainer import generate_training_sample, generate_training_dataset

# Preview one generated sample: the first 20 token/tag pairs, with a marker
# in front of tokens whose tag starts with B- or I-.
tokens, tags = generate_training_sample()
preview_limit = 20

print('üìã NER Training Sample (BIO format):')
print(f'{"Token":20s}  {"Tag"}')
print(f'{"‚îÄ"*30}')
for token, tag in zip(tokens[:preview_limit], tags[:preview_limit]):
    if tag.startswith('B-'):
        color = 'üî¥'
    elif tag.startswith('I-'):
        color = 'üü°'
    else:
        color = '  '
    print(f'{color} {token:20s}  {tag}')
if len(tokens) > preview_limit:
    print(f'   ‚Ä¶ {len(tokens)-20} more tokens')

# Small dataset of 100 samples.
train_data = generate_training_dataset(n=100)
tokens_per_sample = [len(s["tokens"]) for s in train_data]
print(f'\n‚úÖ Generated {len(train_data)} training samples')
print(f'   Avg tokens/sample: {np.mean(tokens_per_sample):.1f}')

# Tally how often each tag occurs across the whole dataset.
tag_counts = {}
for tag_name in (t for training_sample in train_data for t in training_sample['tags']):
    tag_counts[tag_name] = tag_counts.get(tag_name, 0) + 1

# Horizontal bar chart of the tag frequencies, sorted ascending by count.
tag_df = pd.DataFrame(list(tag_counts.items()), columns=['Tag', 'Count'])
tag_df = tag_df.sort_values('Count', ascending=True)
fig, ax = plt.subplots(figsize=(10, 4))
tag_df.plot(kind='barh', x='Tag', y='Count', ax=ax,
            color=sns.color_palette('Set2', len(tag_df)),
            edgecolor='white', legend=False)
ax.set_title('NER Training Data ‚Äî Tag Distribution (BIO)')
ax.set_xlabel('Count')
plt.tight_layout()
plt.show()

<a id='12'></a>
## 12 · Executive Summary

In [None]:
# Summary card grid. Counts are read from the objects built earlier in the
# notebook (PII_PATTERNS, compliance_checks, train_data) instead of being
# typed in, so the card cannot drift from what was actually run.
# An empty customer table gives a 0% consent rate rather than ZeroDivisionError.
summary_consent_pct = dp_consent / total_cust * 100 if total_cust else 0.0

display(HTML(f'''
<div style="background:linear-gradient(135deg,#1a1a2e,#16213e,#0f3460);color:white;
            padding:30px;border-radius:12px;font-family:sans-serif;">
  <h2 style="text-align:center;margin:0 0 20px;">üîê PII Detection & Privacy ‚Äî Executive Summary</h2>
  <div style="display:grid;grid-template-columns:repeat(3,1fr);gap:15px;">
    <div style="background:rgba(231,76,60,0.2);padding:18px;border-radius:10px;text-align:center;">
      <div style="font-size:11px;opacity:0.7;">REGEX PATTERNS</div>
      <div style="font-size:32px;font-weight:bold;color:#e74c3c;">{len(PII_PATTERNS)}</div></div>
    <div style="background:rgba(155,89,182,0.2);padding:18px;border-radius:10px;text-align:center;">
      <div style="font-size:11px;opacity:0.7;">NER MODEL</div>
      <div style="font-size:20px;font-weight:bold;">BERT NER</div>
      <div style="font-size:10px;opacity:0.6;">dslim/bert-base-NER</div></div>
    <div style="background:rgba(52,152,219,0.2);padding:18px;border-radius:10px;text-align:center;">
      <div style="font-size:11px;opacity:0.7;">MASKING STRATEGIES</div>
      <div style="font-size:32px;font-weight:bold;">3</div>
      <div style="font-size:10px;opacity:0.6;">Hash ¬∑ Redact ¬∑ Tokenize</div></div>
    <div style="background:rgba(255,255,255,0.08);padding:18px;border-radius:10px;text-align:center;">
      <div style="font-size:11px;opacity:0.7;">PII IN ORDERS</div>
      <div style="font-size:28px;font-weight:bold;">{order_pii_n:,}</div></div>
    <div style="background:rgba(255,255,255,0.08);padding:18px;border-radius:10px;text-align:center;">
      <div style="font-size:11px;opacity:0.7;">ENTITY TYPES TUNED</div>
      <div style="font-size:28px;font-weight:bold;">{len(current)}</div></div>
    <div style="background:rgba(46,204,113,0.2);padding:18px;border-radius:10px;text-align:center;">
      <div style="font-size:11px;opacity:0.7;">DRIFT MONITORING</div>
      <div style="font-size:20px;font-weight:bold;color:#2ecc71;">Active</div></div>
    <div style="background:rgba(255,255,255,0.08);padding:18px;border-radius:10px;text-align:center;">
      <div style="font-size:11px;opacity:0.7;">DPDP CHECKS</div>
      <div style="font-size:28px;font-weight:bold;">{len(compliance_checks)}</div></div>
    <div style="background:rgba(255,255,255,0.08);padding:18px;border-radius:10px;text-align:center;">
      <div style="font-size:11px;opacity:0.7;">TRAINING SAMPLES</div>
      <div style="font-size:28px;font-weight:bold;">{len(train_data)}</div>
      <div style="font-size:10px;opacity:0.6;">BIO-tagged for NER</div></div>
    <div style="background:rgba(255,255,255,0.08);padding:18px;border-radius:10px;text-align:center;">
      <div style="font-size:11px;opacity:0.7;">CONSENT RATE</div>
      <div style="font-size:28px;font-weight:bold;">{summary_consent_pct:.0f}%</div></div>
  </div>
  <p style="text-align:center;margin:20px 0 0;opacity:0.6;font-size:12px;">
    Proceed to <b>Notebook 04</b> for End-to-End Pipeline & all 12 AI Models</p>
</div>
'''))

In [None]:
# Shut the Spark session down as the last step of the notebook.
spark.stop()
completion_message = '‚úÖ SparkSession stopped ‚Äî Notebook 03 complete'
print(completion_message)