# Federated Privacy-Preserving Neural Network Record Linkage (FPN-RL)

Neural network-based record linkage with federated learning and differential privacy.

# CHANGE 1: Import Dependencies

In [None]:
import numpy as np
import pandas as pd
import hashlib
import random
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
import matplotlib.pyplot as plt
import difflib
from typing import List, Dict, Tuple, Any, Optional

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)
random.seed(42)

print("All dependencies imported successfully!")

# CHANGE 2: FPN-RL Model Definition

In [None]:
class FederatedEmbeddingLinkage:
    """
    Federated Privacy-Preserving Neural Network Record Linkage (FPN-RL).

    The class body itself only stores configuration and bookkeeping state.
    The preprocessing, noise and model-building methods are attached to the
    class by helper functions elsewhere in this file.
    """

    def __init__(self,
                 embedding_dim: int = 128,
                 epsilon: float = 1.0,
                 delta: float = 1e-5,
                 noise_multiplier: float = 1.1,
                 l2_norm_clip: float = 1.0,
                 min_sim_threshold: float = 0.5,
                 max_vocab_size: int = 10000,
                 max_text_length: int = 500):
        """
        Store the configuration and reset all model/privacy state.

        No model is built here; the model slots start as None.

        Parameters:
        - embedding_dim: Dimension of learned embeddings (must be >= 1)
        - epsilon: Differential privacy epsilon parameter (privacy budget);
          must be strictly positive because the noise scale divides by it
        - delta: Differential privacy delta parameter (stored and printed)
        - noise_multiplier: Gaussian noise multiplier for DP
        - l2_norm_clip: L2 clipping bound used to derive the noise sensitivity
        - min_sim_threshold: Minimum similarity threshold for matches; also
          the initial value of optimal_threshold
        - max_vocab_size: Maximum vocabulary size for text processing
        - max_text_length: Maximum text length for processing (stored only)

        Raises:
        - ValueError: if embedding_dim < 1 or epsilon is not > 0
        """
        # Fail fast: these values would otherwise only blow up much later
        # (zero-width layers, division by zero, negative noise scale).
        if embedding_dim < 1:
            raise ValueError(f"embedding_dim must be >= 1, got {embedding_dim!r}")
        if not epsilon > 0:  # written this way so NaN is rejected too
            raise ValueError(f"epsilon must be > 0, got {epsilon!r}")

        self.embedding_dim = embedding_dim
        self.epsilon = epsilon
        self.delta = delta
        self.noise_multiplier = noise_multiplier
        self.l2_norm_clip = l2_norm_clip
        self.min_sim_threshold = min_sim_threshold
        self.max_vocab_size = max_vocab_size
        self.max_text_length = max_text_length

        # Model components (all created lazily by other methods)
        self.encoder_model = None
        self.classifier_model = None
        self.text_vectorizer = None
        self.scaler = None
        self.optimal_threshold = min_sim_threshold

        # Privacy tracking
        self.privacy_spent = 0.0
        self.composition_steps = 0

        print(f"Initialized FPN-RL with ε={epsilon}, δ={delta}")
        print(f"Embedding dimension: {embedding_dim}")
        print(f"Privacy guarantees: ({epsilon}, {delta})-differential privacy")

# CHANGE 3: Privacy-Preserving Methods

In [None]:
def add_privacy_methods_to_class():
    """
    Attach the noise and preprocessing helpers to FederatedEmbeddingLinkage.

    Defines three functions and assigns them as methods on the class:
    _add_differential_privacy_noise, _preprocess_structured_data and
    _preprocess_unstructured_data.
    """

    def _add_differential_privacy_noise(self, embeddings: np.ndarray) -> np.ndarray:
        """
        Return `embeddings` plus element-wise zero-mean Gaussian noise.

        The noise standard deviation is
        noise_multiplier * (2 * l2_norm_clip) / epsilon. Every call adds
        self.epsilon to self.privacy_spent and increments
        self.composition_steps. Noise is drawn from NumPy's global RNG.

        NOTE(review): the embeddings are not clipped here, so the
        2 * l2_norm_clip sensitivity presumably relies on the caller bounding
        each row's L2 norm — confirm.
        """
        sensitivity = 2 * self.l2_norm_clip  # L2 sensitivity
        noise_scale = self.noise_multiplier * sensitivity / self.epsilon

        noise = np.random.normal(0, noise_scale, embeddings.shape)
        noisy_embeddings = embeddings + noise

        # Update privacy accounting
        self.privacy_spent += self.epsilon
        self.composition_steps += 1

        return noisy_embeddings

    def _preprocess_structured_data(self, data: pd.DataFrame) -> np.ndarray:
        """
        Turn a DataFrame into a numeric feature matrix, one row per record.

        Numeric columns contribute one feature (missing values imputed with
        the column mean, or 0.0 when the whole column is missing). Every other
        column is treated as text and contributes five salted MD5 hash buckets
        (0-999) plus, when the frame has more than one row, the mean
        SequenceMatcher ratio against the column's first 100 values.
        """
        processed_features = []

        for col in data.columns:
            column = data[col]

            if pd.api.types.is_numeric_dtype(column):
                # Impute only; no scaling or noise is applied at this stage.
                fill_value = column.mean()
                if pd.isna(fill_value):  # column is entirely missing
                    fill_value = 0.0
                processed_features.append(column.fillna(fill_value).tolist())
                continue

            # Text / categorical data. Blank out missing values BEFORE the
            # string conversion; converting first would turn NaN into the
            # literal 'nan'. Going through object dtype keeps this working for
            # categorical and pandas string dtypes as well.
            col_data = column.astype(object).where(column.notna(), '').astype(str)

            # Several salted hashes per value reduce the impact of collisions.
            for i in range(5):  # 5 different hash functions
                hashes = [int(hashlib.md5(f"{val}_{i}".encode()).hexdigest(), 16) % 1000
                          for val in col_data]
                processed_features.append(hashes)

            # Add string similarity features
            if len(col_data) > 1:
                # Compare against the first 100 values only (limit for
                # efficiency); built once instead of once per value.
                reference = col_data.iloc[:100].tolist()
                sim_features = [
                    np.mean([difflib.SequenceMatcher(None, val, other).ratio()
                             for other in reference])
                    for val in col_data
                ]
                processed_features.append(sim_features)

        # Features were collected column-wise; transpose to (records, features).
        return np.array(processed_features).T

    def _preprocess_unstructured_data(self, texts: List[str]) -> np.ndarray:
        """
        Vectorize texts with TF-IDF and return a dense array.

        The first call fits self.text_vectorizer on `texts`; later calls reuse
        the fitted vectorizer and only transform.

        NOTE(review): min_df=2 / max_df=0.8 may prune every term on a very
        small corpus — confirm behaviour for tiny inputs.
        """
        if self.text_vectorizer is None:
            self.text_vectorizer = TfidfVectorizer(
                max_features=self.max_vocab_size,
                max_df=0.8,
                min_df=2,
                stop_words='english',
                ngram_range=(1, 2)
            )
            text_features = self.text_vectorizer.fit_transform(texts)
        else:
            text_features = self.text_vectorizer.transform(texts)

        return text_features.toarray()

    # Attach methods to the class
    FederatedEmbeddingLinkage._add_differential_privacy_noise = _add_differential_privacy_noise
    FederatedEmbeddingLinkage._preprocess_structured_data = _preprocess_structured_data
    FederatedEmbeddingLinkage._preprocess_unstructured_data = _preprocess_unstructured_data

# Call the function to add methods
add_privacy_methods_to_class()
print("Privacy-preserving methods added to class!")

# CHANGE 4: Neural Network Architecture

In [None]:
def add_neural_network_methods():
    """
    Attach the Keras model builders to FederatedEmbeddingLinkage.

    Adds _build_encoder_model and _build_classifier_model as methods.
    """

    def _hidden_block(tensor, units, dropout_rate, batch_norm=True):
        """Apply L2-regularized ReLU Dense -> (optional) BatchNorm -> Dropout."""
        tensor = Dense(units, activation='relu',
                       kernel_regularizer=regularizers.l2(0.01))(tensor)
        if batch_norm:
            tensor = BatchNormalization()(tensor)
        return Dropout(dropout_rate)(tensor)

    def _build_encoder_model(self, input_dim: int):
        """
        Build an autoencoder and the encoder that shares its first half.

        Layout: input -> 256 -> 128 -> tanh embedding of size
        self.embedding_dim -> 128 -> 256 -> linear reconstruction of size
        input_dim.

        Returns (autoencoder, encoder); both models share the same layers.
        """
        inputs = Input(shape=(input_dim,))

        # Encoder half
        hidden = _hidden_block(inputs, 256, 0.3)
        hidden = _hidden_block(hidden, 128, 0.2)

        # Bottleneck; named so it can be looked up by layer name
        embeddings = Dense(self.embedding_dim, activation='tanh', name='embeddings',
                           kernel_regularizer=regularizers.l2(0.01))(hidden)

        # Decoder half mirrors the encoder
        decoded = _hidden_block(embeddings, 128, 0.2)
        decoded = _hidden_block(decoded, 256, 0.3)
        outputs = Dense(input_dim, activation='linear')(decoded)

        autoencoder = Model(inputs, outputs, name='privacy_autoencoder')
        encoder = Model(inputs, embeddings, name='privacy_encoder')
        return autoencoder, encoder

    def _build_classifier_model(self, embedding_dim: int) -> Model:
        """
        Build the binary match classifier.

        Takes one vector of size embedding_dim and passes it through
        64 -> 32 -> 16 ReLU layers (the last one without batch normalization)
        to a single sigmoid unit.
        """
        input_diff = Input(shape=(embedding_dim,), name='embedding_difference')

        hidden = _hidden_block(input_diff, 64, 0.3)
        hidden = _hidden_block(hidden, 32, 0.2)
        hidden = _hidden_block(hidden, 16, 0.1, batch_norm=False)

        # Sigmoid output: probability that the pair is a match
        output = Dense(1, activation='sigmoid', name='match_probability')(hidden)

        return Model(inputs=input_diff, outputs=output, name='linkage_classifier')

    # Attach methods to the class
    FederatedEmbeddingLinkage._build_encoder_model = _build_encoder_model
    FederatedEmbeddingLinkage._build_classifier_model = _build_classifier_model

# Add the neural network methods
add_neural_network_methods()
print("Neural network architecture methods added!")

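# CHANGE 5: Record Linkage Methods

**Note:** no code cell follows this heading in this notebook. Later cells call `train_and_link` on `FederatedEmbeddingLinkage`, so that method must be defined and attached to the class here before those cells are run.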

# CHANGE 6: Configure FPN-RL Parameters

**Modify these values to tune the model:**

In [None]:
# MODIFY THESE PARAMETERS TO TUNE LINKAGE ACCURACY:
# These module-level names are read by the constructor call below and by the
# training cell, so change the values but keep the names.
embedding_dim = 64          # Embedding dimension (higher = more capacity)
epsilon = 1.0               # Privacy budget (lower = more privacy)
delta = 1e-5                # Privacy parameter
min_sim_threshold = 0.7     # Similarity threshold for matching (0-1)
# NOTE(review): learning_rate is not passed to the constructor below or to the
# train_and_link call in the training cell — confirm where (or whether) it is used.
learning_rate = 0.001       # Learning rate for training
epochs = 50                 # Number of training epochs
batch_size = 32             # Batch size for training

# noise_multiplier, l2_norm_clip, max_vocab_size and max_text_length are not
# passed here, so the constructor's defaults apply.
fpn_rl = FederatedEmbeddingLinkage(
    embedding_dim=embedding_dim,
    epsilon=epsilon,
    delta=delta,
    min_sim_threshold=min_sim_threshold
)

print(f"FPN-RL initialized with ε={epsilon}, embedding_dim={embedding_dim}")

# CHANGE 7: Sample Data Generation

In [None]:
def generate_sample_data_with_text(n_records: int = 100, match_rate: float = 0.3):
    """
    Generate two synthetic datasets with structured and free-text columns.

    Both global RNGs (NumPy and `random`) are re-seeded with 42 on every call,
    so the output is deterministic for a given (n_records, match_rate).

    Parameters:
    - n_records: Number of records to generate in each dataset
    - match_rate: Fraction of records that should match between datasets;
      int(n_records * match_rate) records are copied (with small
      perturbations) from the first dataset into the second

    Returns:
    - data1, data2: DataFrames with columns name, age, city, description
      (both always carry these columns, even when empty)
    - ground_truth: List of (index1, index2) tuples for true matches; matched
      records occupy the first rows of data2

    Raises:
    - ValueError: if the requested number of matches is negative or exceeds
      n_records
    """
    columns = ['name', 'age', 'city', 'description']

    # Sample data generation
    np.random.seed(42)
    random.seed(42)

    names = [f"Person_{i}" for i in range(n_records)]
    ages = np.random.randint(18, 80, n_records)
    cities = np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], n_records)

    professions = ['Doctor', 'Engineer', 'Teacher', 'Artist', 'Lawyer', 'Scientist']
    hobbies = ['reading', 'hiking', 'cooking', 'painting', 'music', 'sports']

    descriptions = []
    for i in range(n_records):
        prof = np.random.choice(professions)
        hobby1 = np.random.choice(hobbies)
        hobby2 = np.random.choice(hobbies)
        descriptions.append(f"{prof} who enjoys {hobby1} and {hobby2}. Lives in {cities[i]}.")

    # Create first dataset
    data1 = pd.DataFrame({
        'name': names,
        'age': ages,
        'city': cities,
        'description': descriptions
    })

    # Create second dataset with some modifications and matches
    n_matches = int(n_records * match_rate)
    if not 0 <= n_matches <= n_records:
        # random.sample would raise ValueError here anyway; say why.
        raise ValueError(
            f"match_rate={match_rate!r} requests {n_matches} matches "
            f"out of {n_records} records"
        )
    match_indices = random.sample(range(n_records), n_matches)

    data2_records = []
    ground_truth = []

    # Add matches with some noise. The order of the random draws below is
    # part of the deterministic output; do not reorder them.
    for i, orig_idx in enumerate(match_indices):
        # ~10% of names are abbreviated ("Person_7" -> "P_7")
        name_var = names[orig_idx] if random.random() > 0.1 else names[orig_idx].replace('Person', 'P')
        # Age drifts by at most two years in either direction
        age_var = ages[orig_idx] + random.randint(-2, 2)
        # ~5% of cities are replaced by a random one from a short list
        city_var = cities[orig_idx] if random.random() > 0.05 else random.choice(['New York', 'Los Angeles', 'Chicago'])
        desc_var = descriptions[orig_idx]

        # ~30% of descriptions get light rewording
        if random.random() < 0.3:
            desc_var = desc_var.replace('enjoys', 'likes').replace(' and ', ' & ')

        data2_records.append({
            'name': name_var,
            'age': age_var,
            'city': city_var,
            'description': desc_var
        })

        ground_truth.append((orig_idx, i))

    # Add non-matching records (cities here never occur in data1)
    remaining_slots = n_records - n_matches
    for i in range(remaining_slots):
        idx = n_matches + i
        data2_records.append({
            'name': f"NewPerson_{idx}",
            'age': np.random.randint(18, 80),
            'city': random.choice(['Boston', 'Seattle', 'Miami', 'Denver']),
            'description': f"{random.choice(professions)} from different dataset. Unique individual with various interests."
        })

    # Explicit columns keep data2's schema identical to data1's even when no
    # records were generated.
    data2 = pd.DataFrame(data2_records, columns=columns)

    return data1, data2, ground_truth

# Build the demo datasets: 50 records per side with a 0.4 match rate.
data1, data2, ground_truth = generate_sample_data_with_text(
    n_records=50,
    match_rate=0.4,
)

print("Sample datasets generated!")
for label, frame in (("1", data1), ("2", data2)):
    print(f"Dataset {label} shape: {frame.shape}")
print(f"Ground truth matches: {len(ground_truth)}")

# Show the first three rows of each dataset.
for label, frame in (("1", data1), ("2", data2)):
    print(f"\nSample from Dataset {label}:")
    print(frame.head(3))

# CHANGE 8: Generate Test Data

In [None]:
# Rebuild the same 50-record datasets (the generator re-seeds its RNGs, so
# the result is identical on every call).
data1, data2, ground_truth = generate_sample_data_with_text(
    n_records=50,
    match_rate=0.4,
)
for position, frame in enumerate((data1, data2), start=1):
    print(f"Generated {len(frame)} records in dataset {position}")
print(f"Ground truth matches: {len(ground_truth)}")

# CHANGE 9: Load CSV Datasets

**Modify dataset paths to test different data:**

In [None]:
# MODIFY THESE PATHS TO USE DIFFERENT DATASETS:
csv_dataset1_path = '../csv_files/Alice_numrec_100_corr_50.csv'
csv_dataset2_path = '../csv_files/Bob_numrec_100_corr_50.csv'

# Best-effort load: any failure (in either read or in the summary prints) is
# reported and BOTH frames are reset to None, even if the first file loaded.
# NOTE(review): csv_data1 / csv_data2 are not read by any later cell visible
# here — confirm they are still needed.
try:
    csv_data1 = pd.read_csv(csv_dataset1_path)
    csv_data2 = pd.read_csv(csv_dataset2_path)
    print(f"Loaded CSV data: {len(csv_data1)} and {len(csv_data2)} records")
    print(f"Columns: {csv_data1.columns.tolist()}")
except Exception as e:
    print(f"Error loading CSV: {e}")
    csv_data1, csv_data2 = None, None

# CHANGE 10: Train FPN-RL Model

In [None]:
import time

# Wall-clock timing around one full train-and-link run on the sample data.
# NOTE(review): train_and_link is not defined in the visible part of this
# notebook — confirm it is attached to the class before this cell runs.
start_time = time.time()
results = fpn_rl.train_and_link(data1, data2, ground_truth, epochs=epochs, batch_size=batch_size)
training_time = time.time() - start_time

print(f"\nTraining completed in {training_time:.2f} seconds")
# Report the three metrics read from the results mapping.
for label, key in (("Precision", "precision"), ("Recall", "recall"), ("F1 Score", "f1_score")):
    print(f"{label}: {results[key]:.4f}")

# CHANGE 11: Parameter Comparison Experiments

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def test_fpn_rl_parameter_variations(param_name, param_values, base_params, datasets_dict):
    """
    Sweep one parameter over several CSV dataset pairs and collect metrics.

    For every dataset pair and every value in `param_values`, a fresh
    FederatedEmbeddingLinkage is built from `base_params` with `param_name`
    overridden, train_and_link is run, and precision/recall/F1 are recorded.

    Parameters:
    - param_name: key to override; only 'embedding_dim', 'epsilon', 'delta',
      'min_sim_threshold', 'epochs' and 'batch_size' are read, so any other
      key has no effect
    - param_values: iterable of values to try for `param_name`
    - base_params: dict providing all six keys listed above (not mutated)
    - datasets_dict: mapping of dataset name -> (csv_path_1, csv_path_2);
      both CSVs must have 'first_name' and 'last_name' columns

    Returns:
    - DataFrame with columns dataset, param_value, precision, recall,
      f1_score; one row per successful run. The columns are present even when
      every run failed, so callers can filter on 'dataset' safely.
    """
    result_columns = ['dataset', 'param_value', 'precision', 'recall', 'f1_score']
    results = []

    for dataset_name, (path1, path2) in datasets_dict.items():
        try:
            d1 = pd.read_csv(path1)
            d2 = pd.read_csv(path2)

            d1['full_name'] = d1['first_name'] + ' ' + d1['last_name']
            d2['full_name'] = d2['first_name'] + ' ' + d2['last_name']

            # assumes row i of the first file matches row i of the second —
            # TODO confirm against how the CSV pairs were generated
            gt = [(i, i) for i in range(min(len(d1), len(d2)))]

            for param_val in param_values:
                params = base_params.copy()
                params[param_name] = param_val

                model = FederatedEmbeddingLinkage(
                    embedding_dim=params['embedding_dim'],
                    epsilon=params['epsilon'],
                    delta=params['delta'],
                    min_sim_threshold=params['min_sim_threshold']
                )

                res = model.train_and_link(
                    d1, d2, gt,
                    epochs=params['epochs'],
                    batch_size=params['batch_size']
                )

                results.append({
                    'dataset': dataset_name,
                    'param_value': param_val,
                    'precision': res['precision'],
                    'recall': res['recall'],
                    'f1_score': res['f1_score']
                })

        except Exception as e:
            # Deliberate best-effort: a failing dataset is reported and the
            # sweep moves on. Rows already recorded for it are kept; its
            # remaining parameter values are skipped.
            print(f"Error with dataset {dataset_name}: {e}")
            continue

    # Fixed columns: without them an all-failed sweep yields a column-less
    # frame and downstream `results['dataset']` lookups raise KeyError.
    return pd.DataFrame(results, columns=result_columns)

# Baseline configuration used by the parameter sweeps; each sweep overrides
# exactly one of these keys.
base_params_fpn = dict(
    embedding_dim=64,
    epsilon=1.0,
    delta=1e-5,
    min_sim_threshold=0.7,
    epochs=30,
    batch_size=32,
)

# Dataset pairs keyed as '<records>_corr_<corr>'; each value is the
# (Alice file, Bob file) path pair for that record count / corr setting.
datasets_fpn = {
    f'{size}_corr_{corr}': (
        f'../csv_files/Alice_numrec_{size}_corr_{corr}.csv',
        f'../csv_files/Bob_numrec_{size}_corr_{corr}.csv',
    )
    for size in (100, 500)
    for corr in (25, 50)
}

# CHANGE 12: Privacy Budget (Epsilon) Analysis

In [None]:
# Sweep the privacy budget over every configured dataset pair.
epsilon_values_fpn = [0.5, 1.0, 2.0, 5.0, 10.0]
epsilon_results_fpn = test_fpn_rl_parameter_variations(
    'epsilon', epsilon_values_fpn, base_params_fpn, datasets_fpn
)

# One panel per metric: (axis, results column, axis label).
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
metric_panels = list(zip(axes,
                         ('precision', 'recall', 'f1_score'),
                         ('Precision', 'Recall', 'F1 Score')))

# One line per dataset in every panel; datasets without results are skipped.
for dataset_name in datasets_fpn:
    data = epsilon_results_fpn[epsilon_results_fpn['dataset'] == dataset_name]
    if len(data) == 0:
        continue
    for ax, column, metric_label in metric_panels:
        ax.plot(data['param_value'], data[column], marker='o', label=dataset_name)

for ax, column, metric_label in metric_panels:
    ax.set_xlabel('Privacy Budget (ε)')
    ax.set_ylabel(metric_label)
    ax.set_title(f'FPN-RL: {metric_label} vs Privacy Budget')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('fpn_rl_epsilon_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print('\nEpsilon Results:')
print(epsilon_results_fpn.to_string())

# CHANGE 13: Embedding Dimension Analysis

In [None]:
# Sweep the embedding dimension over every configured dataset pair.
embedding_dim_values = [32, 64, 128, 256]
embedding_dim_results = test_fpn_rl_parameter_variations(
    'embedding_dim', embedding_dim_values, base_params_fpn, datasets_fpn
)

# One panel per metric: (axis, results column, axis label).
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
metric_panels = list(zip(axes,
                         ('precision', 'recall', 'f1_score'),
                         ('Precision', 'Recall', 'F1 Score')))

# One line per dataset in every panel; datasets without results are skipped.
for dataset_name in datasets_fpn:
    data = embedding_dim_results[embedding_dim_results['dataset'] == dataset_name]
    if len(data) == 0:
        continue
    for ax, column, metric_label in metric_panels:
        ax.plot(data['param_value'], data[column], marker='s', label=dataset_name)

for ax, column, metric_label in metric_panels:
    ax.set_xlabel('Embedding Dimension')
    ax.set_ylabel(metric_label)
    ax.set_title(f'FPN-RL: {metric_label} vs Embedding Dimension')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('fpn_rl_embedding_dim_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print('\nEmbedding Dimension Results:')
print(embedding_dim_results.to_string())

# CHANGE 14: Similarity Threshold Analysis

In [None]:
# Sweep the similarity threshold over every configured dataset pair.
threshold_values_fpn = [0.5, 0.6, 0.7, 0.8, 0.9]
threshold_results_fpn = test_fpn_rl_parameter_variations(
    'min_sim_threshold', threshold_values_fpn, base_params_fpn, datasets_fpn
)

# One panel per metric: (axis, results column, axis label).
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
metric_panels = list(zip(axes,
                         ('precision', 'recall', 'f1_score'),
                         ('Precision', 'Recall', 'F1 Score')))

# One line per dataset in every panel; datasets without results are skipped.
for dataset_name in datasets_fpn:
    data = threshold_results_fpn[threshold_results_fpn['dataset'] == dataset_name]
    if len(data) == 0:
        continue
    for ax, column, metric_label in metric_panels:
        ax.plot(data['param_value'], data[column], marker='^', label=dataset_name)

for ax, column, metric_label in metric_panels:
    ax.set_xlabel('Similarity Threshold')
    ax.set_ylabel(metric_label)
    ax.set_title(f'FPN-RL: {metric_label} vs Similarity Threshold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('fpn_rl_threshold_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print('\nThreshold Results:')
print(threshold_results_fpn.to_string())

# CHANGE 15: Combined Performance Comparison

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Each used panel shows F1 score for one sweep:
# (axis, sweep results, line marker, panel title, x-axis label).
f1_panels = (
    (axes[0, 0], epsilon_results_fpn, 'o',
     'F1 Score vs Privacy Budget (ε)', 'Privacy Budget (ε)'),
    (axes[0, 1], embedding_dim_results, 's',
     'F1 Score vs Embedding Dimension', 'Embedding Dimension'),
    (axes[1, 0], threshold_results_fpn, '^',
     'F1 Score vs Similarity Threshold', 'Similarity Threshold'),
)

# One line per dataset in every panel; datasets without results are skipped.
for dataset_name in datasets_fpn:
    for ax, sweep_results, marker, title, xlabel in f1_panels:
        subset = sweep_results[sweep_results['dataset'] == dataset_name]
        if len(subset) > 0:
            ax.plot(subset['param_value'], subset['f1_score'], marker=marker, label=dataset_name)

for ax, sweep_results, marker, title, xlabel in f1_panels:
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('F1 Score')
    ax.legend()
    ax.grid(True, alpha=0.3)

# The fourth panel of the 2x2 grid is unused; hide its axes.
axes[1, 1].axis('off')

plt.tight_layout()
plt.savefig('fpn_rl_combined_comparison.png', dpi=150, bbox_inches='tight')
plt.show()