In [29]:
import os
import random

import numpy as np
import pandas as pd
from scipy.stats import norm, bernoulli, poisson, t

# Section 1: Parameter Configuration 
## Seed for reproducibility

In [30]:
seed = 42
# Seed both generators used below: the standard-library `random` module and
# NumPy's global RNG (scipy's `rvs` helpers are called without their own state).
random.seed(seed)
np.random.seed(seed)

## Configurable parameters

In [31]:
# Number of clients (one dataset is generated per client)
X = 10

# Number of features (columns) per client
Y = 20

# Names of the distributions that generate_data can sample from
distributions = ['normal', 'standard_normal', 'bernoulli', 'poisson', 't']

# Multiplier applied to the entries that add_outliers selects as outliers
outlier_amplification = 5

# Noise level for the additive Gaussian noise (forwarded to add_noise as `variance`)
noise_variance = 0.1

# Fraction (0-1) of a noisy client's features that receive noise
percent_noisy_features = 0.2

# Fraction (0-1) of clients whose datasets are perturbed with noise
percent_noisy_clients = 0.3

# Inclusive range for the number of samples (rows) drawn per client
min_samples = 50
max_samples = 200

# Directory the generated CSV files are written to
path = "./Datasets"

# Section 2: Utility Functions

In [32]:
def generate_data(distribution, size):
    """Draw ``size`` samples from the distribution selected by name.

    Supported names: 'normal' (loc=0, scale=1), 'standard_normal',
    'bernoulli' (p=0.5), 'poisson' (mu=3) and 't' (df=10).

    Raises:
        ValueError: if ``distribution`` is not one of the supported names.
    """
    # Each entry is a zero-argument callable so only the requested
    # distribution is sampled (and only it consumes random state).
    samplers = {
        'normal': lambda: np.random.normal(loc=0, scale=1, size=size),
        'standard_normal': lambda: np.random.standard_normal(size=size),
        'bernoulli': lambda: bernoulli.rvs(p=0.5, size=size),
        'poisson': lambda: poisson.rvs(mu=3, size=size),
        't': lambda: t.rvs(df=10, size=size),
    }
    sampler = samplers.get(distribution)
    if sampler is None:
        raise ValueError(f"Unsupported distribution: {distribution}")
    return sampler()
    
def add_outliers(data, amplification):
    """Return a copy of ``data`` with about 10% of its entries amplified.

    Entries are selected along the first axis (whole rows of a 2-D array)
    without replacement and multiplied by ``amplification``. At least one
    entry is amplified whenever ``data`` is non-empty.

    Args:
        data: array-like of samples; it is not modified.
        amplification: multiplier applied to the selected entries.

    Returns:
        A new array with the same shape as ``data``. Its dtype is promoted
        when needed (e.g. integer data with a float multiplier).
    """
    data = np.asarray(data)
    num_rows = len(data)
    if num_rows == 0:
        # np.random.choice cannot draw from an empty population.
        return data.copy()

    num_outliers = max(1, num_rows // 10)  # Add outliers to 10% of the data
    outlier_indices = np.random.choice(num_rows, size=num_outliers, replace=False)

    # astype always copies, so the caller's array stays untouched; promoting
    # the dtype first keeps the in-place multiply from failing when integer
    # data (bernoulli/poisson) meets a float amplification.
    result = data.astype(np.result_type(data, amplification))
    result[outlier_indices] *= amplification
    return result

def add_noise(data, variance, percent_features):
    """Return a float copy of ``data`` with Gaussian noise on some features.

    Args:
        data: 2-D array-like of shape (samples, features); it is not modified.
        variance: variance of the zero-mean Gaussian noise (must be >= 0).
        percent_features: fraction (0-1) of the columns that receive noise;
            the resulting count is truncated towards zero.

    Returns:
        A new float array with noise added to the randomly chosen columns.

    Raises:
        ValueError: if ``variance`` is negative.
    """
    if variance < 0:
        raise ValueError(f"variance must be non-negative, got {variance}")

    # astype(float) copies and avoids a type mismatch for bernoulli/poisson data.
    noisy = np.asarray(data).astype(float)
    num_samples, num_features = noisy.shape[0], noisy.shape[1]
    num_noisy_features = int(percent_features * num_features)
    noisy_features = np.random.choice(num_features, size=num_noisy_features, replace=False)

    # np.random.normal takes a standard deviation, so convert the variance.
    noise = np.random.normal(
        loc=0,
        scale=np.sqrt(variance),
        size=(num_samples, len(noisy_features)),
    )
    noisy[:, noisy_features] += noise
    return noisy

def save_datasets(datasets, prefix, path=path):
    """Save each client's dataset to ``{path}/{prefix}_client_{i}.csv``.

    Args:
        datasets: iterable of 2-D arrays, one per client.
        prefix: file-name prefix identifying the dataset family (e.g. "X1").
        path: output directory; it is created if it does not exist.

    Columns are named Feature01, Feature02, ... according to each array's
    own number of columns.
    """
    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs(path, exist_ok=True)
    for i, data in enumerate(datasets):
        data = np.asarray(data)
        columns = [f"Feature{str(j + 1).zfill(2)}" for j in range(data.shape[1])]
        df = pd.DataFrame(data, columns=columns)
        df.to_csv(f"{path}/{prefix}_client_{i}.csv", index=False)

def save_distributions_X1(feature_distributions_list, path=path):
    """Save the per-client distribution of X1 to ``{path}/X1_distributions.csv``.

    Args:
        feature_distributions_list: one list of distribution names per client.
            Only the first entry of each list is written, since every feature
            of an X1 client shares the same distribution.
        path: output directory; it is created if it does not exist.
    """
    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs(path, exist_ok=True)
    df = pd.DataFrame({
        # Label clients from the list itself so both columns always match in length.
        "Client": [f"Client_{i}" for i in range(len(feature_distributions_list))],
        # Same for all features (per client); None if a client has no features.
        "Distribution": [dists[0] if len(dists) else None for dists in feature_distributions_list],
    })
    df.to_csv(f"{path}/X1_distributions.csv", index=False)

def save_distributions_X2(feature_distributions_list, path=path):
    """Save the per-feature distributions of X2 to ``{path}/X2_distributions.csv``.

    Args:
        feature_distributions_list: one list of distribution names per client,
            with one name per feature.
        path: output directory; it is created if it does not exist.

    The CSV has one row per client: a "Client" column followed by
    Feature01, Feature02, ... columns holding the distribution names.
    """
    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs(path, exist_ok=True)
    records = []
    # `client_dists` avoids shadowing the module-level `distributions` list.
    for client_idx, client_dists in enumerate(feature_distributions_list):
        record = {"Client": f"Client_{client_idx}"}
        for feature_idx, dist in enumerate(client_dists):
            record[f"Feature{str(feature_idx + 1).zfill(2)}"] = dist
        records.append(record)
    df = pd.DataFrame(records)
    df.to_csv(f"{path}/X2_distributions.csv", index=False)

# Section 3: Dataset Creation

In [33]:
# X1: Each feature for a specific client comes from the same distribution
def create_X1():
    """Build the X1 datasets: one distribution per client, shared by all features.

    Returns:
        A pair ``(datasets, feature_distributions_list)`` with one
        (samples, Y) array per client and, for each client, the chosen
        distribution name repeated Y times.
    """
    client_arrays = []
    client_distributions = []  # distribution names, one list per client
    for _ in range(X):
        # Sample count is drawn from the inclusive range [min_samples, max_samples].
        n_samples = np.random.randint(min_samples, max_samples + 1)
        chosen = random.choice(distributions)
        columns = [generate_data(chosen, n_samples) for _ in range(Y)]
        # Features are generated as rows; transpose to (samples, features).
        samples = add_outliers(np.array(columns).T, outlier_amplification)
        client_arrays.append(samples)
        client_distributions.append([chosen] * Y)
    return client_arrays, client_distributions

# X2: Features for a client may come from different distributions
def create_X2():
    """Build the X2 datasets: each feature of a client picks its own distribution.

    Returns:
        A pair ``(datasets, feature_distributions_list)`` with one
        (samples, Y) array per client and, for each client, the list of
        distribution names used for its Y features.
    """
    client_arrays = []
    client_distributions = []
    for _ in range(X):
        n_samples = np.random.randint(min_samples, max_samples + 1)
        # Names come from the `random` module while the samples come from
        # NumPy's global RNG; the two generators are independent, so picking
        # all names first yields the same draws as interleaving them.
        chosen = [random.choice(distributions) for _ in range(Y)]
        columns = [generate_data(name, n_samples) for name in chosen]
        # Features are generated as rows; transpose to (samples, features).
        samples = add_outliers(np.array(columns).T, outlier_amplification)
        client_arrays.append(samples)
        client_distributions.append(chosen)
    return client_arrays, client_distributions

# X1' and X2': Perturbed datasets
def perturb_datasets(datasets):
    """Return the datasets with additive noise applied to a random subset of clients.

    A fraction ``percent_noisy_clients`` of the clients (truncated towards
    zero) is drawn without replacement; each selected dataset is replaced by
    ``add_noise(data, noise_variance, percent_noisy_features)``. Datasets of
    the remaining clients are passed through unchanged (same objects).

    Args:
        datasets: iterable of per-client arrays.

    Returns:
        A list with one entry per input dataset, in the same order.
    """
    datasets = list(datasets)
    # Size the selection from the datasets actually passed in rather than the
    # global client count, so the chosen indices are always valid for this list.
    num_clients = len(datasets)
    num_noisy_clients = int(percent_noisy_clients * num_clients)
    noisy_clients = set(random.sample(range(num_clients), k=num_noisy_clients))
    return [
        add_noise(data, noise_variance, percent_noisy_features) if idx in noisy_clients else data
        for idx, data in enumerate(datasets)
    ]

# Section 4: Generate and Save Datasets

In [34]:
# Build the clean datasets together with the distributions used for them
X1, X1_distributions = create_X1()
X2, X2_distributions = create_X2()

# Derive the perturbed variants X1' and X2'
X1_prime = perturb_datasets(X1)
X2_prime = perturb_datasets(X2)

# Write every dataset family to disk, one CSV per client
for _prefix, _client_arrays in (
    ("X1", X1),
    ("X1_prime", X1_prime),
    ("X2", X2),
    ("X2_prime", X2_prime),
):
    save_datasets(_client_arrays, _prefix)

# Record which distributions generated each dataset family
save_distributions_X1(X1_distributions)
save_distributions_X2(X2_distributions)