In [2]:
import os
import random

import numpy as np
import pandas as pd
from scipy.stats import norm, bernoulli, poisson, t

# Section 1: Parameter Configuration 
## Seed for reproducibility

In [3]:
# Single seed shared by every random source used in this notebook
seed = 42
# Seed NumPy's global RNG (the np.random.* calls below draw from it)
np.random.seed(seed)
# Seed Python's built-in `random` module (used by random.choice / random.sample below)
random.seed(seed)

## Configurable parameters

In [8]:
# Number of clients
X = 10

# Number of features per client
Y = 20

# Names of the distributions that generate_data can sample from
distributions = ['normal', 'standard_normal', 'bernoulli', 'poisson', 't']

# Multiplier applied by add_outliers to the rows it selects as outliers
outlier_amplification = 5

# Variance setting for the additive Gaussian noise (passed to add_noise)
noise_variance = 0.1

# Fraction (0-1) of features that receive noise within a noisy client
percent_noisy_features = 0.2

# Fraction (0-1) of clients whose datasets are perturbed with noise
percent_noisy_clients = 0.3

# Section 2: Utility Functions

In [7]:
def generate_data(distribution, size):
    """Draw `size` samples from the distribution identified by name.

    Supported names and their fixed parameters:
      - 'normal'          -> Normal(loc=0, scale=1)
      - 'standard_normal' -> standard normal
      - 'bernoulli'       -> Bernoulli(p=0.5)
      - 'poisson'         -> Poisson(mu=3)
      - 't'               -> Student's t (df=10)

    Raises:
        ValueError: if `distribution` is not one of the names above.
    """
    # Samplers are wrapped in lambdas so that only the selected one runs
    # (and therefore only it advances the random state).
    samplers = (
        ('normal', lambda: np.random.normal(loc=0, scale=1, size=size)),
        ('standard_normal', lambda: np.random.standard_normal(size=size)),
        ('bernoulli', lambda: bernoulli.rvs(p=0.5, size=size)),
        ('poisson', lambda: poisson.rvs(mu=3, size=size)),
        ('t', lambda: t.rvs(df=10, size=size)),
    )
    for name, draw in samplers:
        if distribution == name:
            return draw()
    raise ValueError(f"Unsupported distribution: {distribution}")
    
def add_outliers(data, amplification):
    """Turn a random subset of entries along the first axis into outliers.

    One tenth of the entries along axis 0 (rounded down, but at least one) are
    chosen without replacement and multiplied by `amplification`. For a 2-D
    array this means whole rows are amplified.

    Args:
        data: NumPy array to modify. It is modified IN PLACE.
        amplification: Multiplier applied to the selected entries.

    Returns:
        The same `data` object that was passed in (after modification).
        Empty input is returned unchanged.
    """
    num_rows = len(data)
    if num_rows == 0:
        # max(1, 0) would request one sample from an empty population, which
        # makes np.random.choice raise; there is nothing to amplify anyway.
        return data

    num_outliers = max(1, num_rows // 10)  # 10% of the data, never fewer than one
    outlier_indices = np.random.choice(num_rows, size=num_outliers, replace=False)
    data[outlier_indices] *= amplification
    return data

def add_noise(data, variance, percent_features):
    """Return a copy of `data` with Gaussian noise added to a subset of columns.

    A random subset of the columns (features) is chosen without replacement and
    zero-mean Gaussian noise with the requested variance is added to every row
    of those columns.

    Args:
        data: 2-D array-like of shape (samples, features). It is NOT modified;
            the result is a new float array, so integer-valued inputs are
            supported as well.
        variance: Variance of the noise. Must be non-negative.
        percent_features: Fraction (0-1) of the columns that receive noise;
            the resulting column count is truncated towards zero.

    Returns:
        A new float ndarray with the same shape as `data`.

    Raises:
        ValueError: if `variance` is negative.
    """
    if variance < 0:
        raise ValueError("variance must be non-negative")

    # Work on a float copy: adding float noise in place fails for integer
    # arrays, and mutating the caller's array would silently perturb the
    # dataset the caller still considers "clean".
    noisy = np.array(data, dtype=float)

    num_features = noisy.shape[1]
    num_noisy_features = int(percent_features * num_features)
    noisy_features = np.random.choice(num_features, size=num_noisy_features, replace=False)

    # np.random.normal takes a standard deviation, so convert the variance.
    noise = np.random.normal(
        loc=0.0,
        scale=np.sqrt(variance),
        size=(noisy.shape[0], len(noisy_features)),
    )
    noisy[:, noisy_features] += noise
    return noisy

# Section 3: Dataset Creation

In [13]:
# X1: Each feature for a specific client comes from the same distribution
def create_X1():
    """Build one dataset per client where all features share one distribution.

    For each of the X clients a sample count in [50, 200] and a single
    distribution name are drawn; Y feature columns are generated from that
    distribution, stacked into a (samples, Y) array, and passed through
    add_outliers.

    Returns:
        List of X arrays, one per client.
    """
    def _build_client():
        # Draw order (sample count, then distribution, then the features) is
        # kept fixed so results stay reproducible for a given seed.
        n_samples = np.random.randint(50, 201)
        chosen = random.choice(distributions)
        columns = [generate_data(chosen, n_samples) for _ in range(Y)]
        return add_outliers(np.array(columns).T, outlier_amplification)

    return [_build_client() for _ in range(X)]

# X2: Features for a client may come from different distributions
def create_X2():
    """Build one dataset per client where each feature picks its own distribution.

    For each of the X clients a sample count in [50, 200] is drawn; then, for
    each of the Y features, a distribution name is drawn and a column is
    generated from it. The columns are stacked into a (samples, Y) array and
    passed through add_outliers.

    Returns:
        List of X arrays, one per client.
    """
    def _build_client():
        n_samples = np.random.randint(50, 201)
        # The distribution is re-drawn for every feature, immediately before
        # that feature's samples are generated.
        columns = [
            generate_data(random.choice(distributions), n_samples)
            for _ in range(Y)
        ]
        return add_outliers(np.array(columns).T, outlier_amplification)

    return [_build_client() for _ in range(X)]

# X1' and X2': Perturbed datasets
def perturb_datasets(datasets):
    """Return perturbed copies of the client datasets.

    A random subset of clients (fraction `percent_noisy_clients` of the number
    of datasets, truncated towards zero) has noise added via add_noise, using
    `noise_variance` and `percent_noisy_features`. The remaining clients are
    copied unchanged.

    Args:
        datasets: Sequence of 2-D arrays, one per client. Neither the sequence
            nor the arrays are modified.

    Returns:
        A new list, aligned with `datasets`, whose arrays share no memory with
        the inputs. Noisy clients are returned as float arrays.
    """
    # Size the selection from the input itself rather than the global client
    # count, so lists of any length are handled correctly.
    num_clients = len(datasets)
    num_noisy_clients = int(percent_noisy_clients * num_clients)
    noisy_clients = set(random.sample(range(num_clients), k=num_noisy_clients))

    perturbed = []
    for idx, data in enumerate(datasets):
        if idx in noisy_clients:
            # Hand add_noise a fresh float copy: this keeps the original
            # dataset intact and lets float noise be added to clients whose
            # data is integer-valued.
            data = add_noise(
                np.array(data, dtype=float), noise_variance, percent_noisy_features
            )
        else:
            # Copy so later edits to one list never leak into the other.
            data = np.array(data)
        perturbed.append(data)
    return perturbed

# Section 4: Generate and Save Datasets

In [14]:
# Create X1 and X2.
# Both generators draw from the globally seeded RNGs, so the order of these
# calls determines the values produced for a given seed.
X1 = create_X1()
X2 = create_X2()

# Create X1' and X2' (the noise-perturbed counterparts of X1 and X2)
X1_prime = perturb_datasets(X1)
X2_prime = perturb_datasets(X2)

def save_datasets(datasets, prefix, path="."):
    """Write each client dataset to its own CSV file.

    Files are named "<prefix>_client_<i>.csv", where <i> is the position of the
    dataset in `datasets`, and are written without the DataFrame index.

    Args:
        datasets: Iterable of array-likes accepted by pandas.DataFrame.
        prefix: File-name prefix identifying the dataset family.
        path: Output directory. It is created (including parents) when it does
            not exist yet. An empty string means the current directory.
    """
    if path:
        # Avoid a failure on the first to_csv call when the target directory
        # has not been created yet.
        os.makedirs(path, exist_ok=True)

    for i, data in enumerate(datasets):
        file_path = os.path.join(path, f"{prefix}_client_{i}.csv")
        pd.DataFrame(data).to_csv(file_path, index=False)


# Persist every dataset family under its own file-name prefix.
for _prefix, _family in (
    ("X1", X1),
    ("X1_prime", X1_prime),
    ("X2", X2),
    ("X2_prime", X2_prime),
):
    save_datasets(_family, _prefix)