In [2]:
# PART 1: STRATIFIED RANDOM SAMPLING

# This script performs stratified random sampling on Vietnamese census data
# and calculates population risk scores for privacy analysis

import pandas as pd

# PART 1: STRATIFIED RANDOM SAMPLING

# Load the Vietnamese census dataset (2019)
df = pd.read_csv("nswiss2011.csv", on_bad_lines='skip')

# Define the stratification variables (quasi-identifiers)
strat_cols = ["persons", "hhwt", "gq", "regionw", "ownershipd"]

# Set the sampling fraction (1% of the population)
sample_fraction = 0.01

# Create a unique stratification key by combining all stratification columns
df["strat_key"] = df[strat_cols].astype(str).agg("_".join, axis=1)

# Configuration for generating multiple samples
num_samples = 2                    # Number of independent samples to generate
random_seeds = [42, 43]           # Different seeds ensure different samples

# Generate stratified samples
print("Generating stratified samples...")
for i, seed in enumerate(random_seeds, start=1):

    # Calculate target sample size (1% of total population)
    sample_size = int(len(df) * sample_fraction)

    # Perform stratified sampling within each stratum
    # Each group (strat_key) contributes proportionally to its size
    stratified_sample = df.groupby("strat_key", group_keys=False).apply(
        lambda g: g.sample(
            n=max(1, round(len(g) * sample_fraction)),  # At least 1 record per stratum
            random_state=seed
        )
    )

    # Ensure exact sample size by random sampling from the stratified result
    # This step handles any rounding discrepancies from the stratified sampling
    stratified_sample = stratified_sample.sample(n=sample_size, random_state=seed)

    # Remove the temporary stratification key column
    stratified_sample = stratified_sample.drop(columns="strat_key")

    # Save sample to CSV file for further analysis
    output_filename = f"stratified_sample_{i}.csv"
    stratified_sample.to_csv(output_filename, index=False)

    print(f"Sample {i}: {len(stratified_sample)} rows saved to {output_filename}")

# Clean up: remove stratification key from original dataset
df.drop(columns="strat_key", inplace=True)

Generating stratified samples...


  stratified_sample = df.groupby("strat_key", group_keys=False).apply(


Sample 1: 2380 rows saved to stratified_sample_1.csv
Sample 2: 2380 rows saved to stratified_sample_2.csv


  stratified_sample = df.groupby("strat_key", group_keys=False).apply(


In [3]:
# PART 2: POPULATION RISK CALCULATION

# Reload the original census data for risk analysis
df = pd.read_csv("nswiss2011.csv")

# Define quasi-identifiers (same as stratification variables)
key_vars = ["persons", "hhwt", "gq", "regionw", "ownershipd"]

# Step 1: Create equivalence classes
# Records with identical quasi-identifier values form an equivalence class
df['eq_class'] = df[key_vars].astype(str).agg('-'.join, axis=1)

# Step 2: Count the size of each equivalence class
# Larger classes provide better privacy protection (lower re-identification risk)
equiv_counts = df['eq_class'].value_counts()

# Step 3: Map equivalence class sizes back to individual records
# Each record gets labeled with its class size
df['eq_class_size'] = df['eq_class'].map(equiv_counts)

# Step 4: Calculate individual re-identification risk
# Risk = 1 / class_size (smaller classes = higher individual risk)
df['individual_risk'] = 1 / df['eq_class_size']

# Step 5: Calculate total population risk
# Sum of all individual risks across the entire population
total_risk = df['individual_risk'].sum()

# OUTPUT RESULTS
print(f"True Population Risk Score for Switzerland 2011: {total_risk:.4f}")
print("\nSample of risk analysis results:")
print(df[['eq_class', 'eq_class_size', 'individual_risk']].head())

# =============================================================================
# INTERPRETATION NOTES:
# - Higher total risk = greater privacy vulnerability across the population
# - Individual risk of 1.0 = unique record (highest re-identification risk)
# - Individual risk of 0.1 = part of 10-record equivalence class (lower risk)
# =============================================================================

True Population Risk Score for Switzerland 2011: 15061.0000

Sample of risk analysis results:
            eq_class  eq_class_size  individual_risk
0  1-17.04-10-44-210             35         0.028571
1  1-17.89-10-44-210             96         0.010417
2  1-31.99-10-44-210             13         0.076923
3  1-10.22-10-44-210             82         0.012195
4   1-12.6-10-44-100             27         0.037037
