# Import Necessary Modules

In [15]:
import numpy as np
import pandas as pd
import os
from pathlib import Path

# Configurable Parameters

In [23]:
# Number of clients and features
# NOTE: NUM_FEATURES sizes the per-client price weights, so it is expected to
# equal the number of entries in DISTRIBUTIONS below.
NUM_CLIENTS = 5
NUM_FEATURES = 8

# Sample size range (each client's row count is drawn from MIN..MAX inclusive)
MIN_SAMPLES = 50
MAX_SAMPLES = 200

# Number of rogue clients, i.e. clients whose numerical features get extra noise.
# (The identifier keeps its "ROUGE" spelling because other code refers to it.)
NUM_ROUGE_CLIENTS = 0

# Distribution configuration
# Each entry maps a feature name to a distribution 'type' plus the ranges from
# which that distribution's parameters are drawn once per client:
#   normal      -> 'mu_range', 'sigma_range'
#   uniform     -> 'low_range', 'high_range'
#   poisson     -> 'lambda_range'
#   categorical -> 'categories' and 'prob_range' (a list of candidate
#                  probability tuples, one of which is picked per client)
DISTRIBUTIONS = {
    'Square Meters': {'type': 'normal', 'mu_range': (100, 300), 'sigma_range': (10, 30)},
    'Year Built': {'type': 'normal', 'mu_range': (1950, 2020), 'sigma_range': (5, 20)},
    'Neighborhood Quality': {'type': 'categorical', 'categories': ['Low', 'Medium', 'High'], 'prob_range': [(0.5, 0.3, 0.2), (0.3, 0.4, 0.3)]},
    # NOTE(review): 'high_range' (5, 20) lies entirely below 'low_range'
    # (100, 1000), so the sampled upper bound is always smaller than the
    # sampled lower bound - confirm the intended ranges.
    'Distance to Amenities': {'type': 'uniform', 'low_range': (100, 1000), 'high_range': (5, 20)},
    'Number of Rooms': {'type': 'poisson', 'lambda_range': (2, 5)},
    'Lot Size': {'type': 'normal', 'mu_range': (500, 2000), 'sigma_range': (100, 300)},
    'House Style': {'type': 'categorical', 'categories': ['Single Family', 'Condo', 'Townhouse'], 'prob_range': [(0.6, 0.3, 0.1), (0.4, 0.4, 0.2)]},
    'Local Economic Index': {'type': 'normal', 'mu_range': (50, 150), 'sigma_range': (10, 30)}
}

# Utility Functions

In [17]:
def generate_feature(feature_name, dist_params, num_samples):
    """Generate samples for one feature from its configured distribution.

    The distribution's own parameters (mean, bounds, rate, ...) are first
    drawn at random from the ranges given in ``dist_params``; ``num_samples``
    values are then sampled using those parameters.

    Args:
        feature_name: Name of the feature. ``"Year Built"`` values are rounded
            and returned as integers.
        dist_params: Mapping with a ``'type'`` key (``'normal'``,
            ``'uniform'``, ``'categorical'`` or ``'poisson'``) and the range
            keys that belong to that type.
        num_samples: Number of values to generate.

    Returns:
        A ``(values, params)`` tuple: the sampled array and a dict holding the
        distribution parameters that were actually used.

    Raises:
        ValueError: If ``dist_params['type']`` is not a supported distribution.
    """
    dist_type = dist_params['type']
    if dist_type == 'normal':
        mu = np.random.uniform(*dist_params['mu_range'])
        sigma = np.random.uniform(*dist_params['sigma_range'])
        samples = np.random.normal(mu, sigma, num_samples)
        if feature_name == "Year Built":
            # Years are whole numbers.
            samples = np.round(samples).astype(int)
        return samples, {'mu': mu, 'sigma': sigma}
    elif dist_type == 'uniform':
        low = np.random.uniform(*dist_params['low_range'])
        high = np.random.uniform(*dist_params['high_range'])
        # The two bounds are drawn independently, so they can come out
        # inverted. NumPy documents uniform() as undefined for high < low,
        # and the reported metadata would mislabel the bounds, so order them.
        if high < low:
            low, high = high, low
        return np.random.uniform(low, high, num_samples), {'low': low, 'high': high}
    elif dist_type == 'categorical':
        # Pick one of the candidate probability vectors for this client.
        candidates = dist_params['prob_range']
        prob = candidates[np.random.randint(0, len(candidates))]
        return np.random.choice(dist_params['categories'], num_samples, p=prob), {'probabilities': prob}
    elif dist_type == 'poisson':
        lam = np.random.uniform(*dist_params['lambda_range'])
        return np.random.poisson(lam, num_samples), {'lambda': lam}
    else:
        raise ValueError(f"Unknown distribution type: {dist_type}")
    
def encode_categorical_features(client_data):
    """Encode string-valued features as integer codes.

    A feature is treated as categorical when its first value is a string.
    Its distinct labels are sorted and numbered from 0, so the same labels
    always receive the same codes regardless of set iteration order (which
    varies between interpreter runs for strings).

    Args:
        client_data: Mapping of feature name to a sequence of values.

    Returns:
        A ``(encoded_data, encoding_maps)`` tuple. ``encoded_data`` has the
        same keys as ``client_data`` with categorical columns replaced by
        integer arrays; other columns are passed through unchanged.
        ``encoding_maps`` maps each categorical feature name to its
        ``{label: code}`` dict.
    """
    encoded_data = {}
    encoding_maps = {}
    for feature_name, values in client_data.items():
        # An empty column has no first element to inspect; pass it through.
        is_categorical = len(values) > 0 and isinstance(values[0], str)
        if is_categorical:
            # Sorted labels give deterministic codes; plain ``str`` keys keep
            # the map readable when it is later written out as metadata.
            encoding_map = {
                str(label): code
                for code, label in enumerate(sorted(set(values)))
            }
            encoded_data[feature_name] = np.array([encoding_map[val] for val in values])
            encoding_maps[feature_name] = encoding_map
        else:
            encoded_data[feature_name] = values
    return encoded_data, encoding_maps

def decode_categorical_features(client_data, encoding_maps):
    """Turn integer-coded categorical columns back into their string labels.

    Args:
        client_data: Mapping of feature name to a sequence of values.
        encoding_maps: Mapping of feature name to its ``{label: code}`` dict.

    Returns:
        A new dict with the same keys as ``client_data``. Columns that have an
        entry in ``encoding_maps`` become lists of labels; all other columns
        are passed through unchanged.
    """
    restored = {}
    for name, column in client_data.items():
        if name not in encoding_maps:
            # Not a categorical feature: nothing to decode.
            restored[name] = column
            continue
        # Invert label -> code into code -> label for the lookup.
        code_to_label = dict(
            (code, label) for label, code in encoding_maps[name].items()
        )
        restored[name] = [code_to_label[code] for code in column]
    return restored

def generate_target_variable(features, weights, noise_std=1000):
    """Build the target (house price) as a noisy weighted sum of the features.

    Args:
        features: Sequence of equally long numeric arrays, one per feature.
        weights: Sequence of weights; the i-th weight multiplies the i-th
            feature.
        noise_std: Standard deviation of the zero-mean Gaussian noise added to
            every sample.

    Returns:
        The weighted sum of the features plus noise, one value per sample.
    """
    weighted_total = 0
    for position, weight in enumerate(weights):
        weighted_total = weighted_total + weight * features[position]
    # One noise value per sample; the first feature defines the sample count.
    sample_count = len(features[0])
    gaussian_noise = np.random.normal(0, noise_std, sample_count)
    return weighted_total + gaussian_noise

def add_noise_to_features(client_data, noise_level=0.5):
    """Perturb every numerical feature with zero-mean Gaussian noise.

    The noise standard deviation for a feature is ``noise_level`` times that
    feature's own standard deviation.

    Args:
        client_data: Mapping of feature name to a sequence of values.
        noise_level: Scale factor applied to each feature's standard deviation.

    Returns:
        A new dict with the same keys; numerical columns carry added noise,
        while ``'House Price'`` and non-numerical columns are passed through.
    """
    perturbed = {}
    for name, column in client_data.items():
        # The target column and non-numeric columns are left untouched.
        if name == 'House Price' or not np.issubdtype(type(column[0]), np.number):
            perturbed[name] = column
            continue
        spread = noise_level * np.std(column)
        jitter = np.random.normal(0, spread, len(column))
        perturbed[name] = column + jitter
    return perturbed

def save_to_csv(data, filename):
    """Write a DataFrame to a CSV file without the index column.

    Args:
        data: Object exposing ``to_csv`` (a pandas DataFrame in this file).
        filename: Destination passed straight through to ``to_csv``.
    """
    # index=False keeps the row index out of the written file.
    data.to_csv(filename, index=False)

# Dataset Creation


In [24]:
# Fail fast on an inconsistent configuration: one price weight is drawn per
# feature, so NUM_FEATURES must equal the number of configured distributions.
# A smaller value would silently leave trailing features out of the price; a
# larger one would raise an IndexError after the output directory was created.
if NUM_FEATURES != len(DISTRIBUTIONS):
    raise ValueError(
        f"NUM_FEATURES ({NUM_FEATURES}) does not match the number of "
        f"configured distributions ({len(DISTRIBUTIONS)})"
    )

# Output directories
output_dir = Path("house_pricing_datasets_0_rouge")
output_dir.mkdir(exist_ok=True)
metadata = []

# Select rogue clients (sampled without replacement from the client IDs)
rouge_clients = set(np.random.choice(range(1, NUM_CLIENTS + 1), NUM_ROUGE_CLIENTS, replace=False))

for client_id in range(1, NUM_CLIENTS + 1):
    # Random number of samples for this client (upper bound inclusive)
    num_samples = np.random.randint(MIN_SAMPLES, MAX_SAMPLES + 1)

    client_data = {}
    client_metadata = {'Client_ID': client_id, 'Is_Rouge': client_id in rouge_clients}

    # Generate features and record the distribution parameters that were used
    for feature_name, dist_params in DISTRIBUTIONS.items():
        feature_data, params = generate_feature(feature_name, dist_params, num_samples)
        client_data[feature_name] = feature_data
        client_metadata[feature_name] = params

    # Add noise for rogue clients. The metadata recorded above still describes
    # the clean distributions, not the perturbed values.
    if client_id in rouge_clients:
        client_data = add_noise_to_features(client_data)

    # Encode categorical features so they can enter the weighted sum
    encoded_client_data, encoding_maps = encode_categorical_features(client_data)

    # Generate target variable (house price) with fresh random weights per client
    weights = np.random.uniform(0.1, 1, NUM_FEATURES)
    features = [encoded_client_data[feature_name] for feature_name in DISTRIBUTIONS]
    encoded_client_data['House Price'] = generate_target_variable(features, weights)

    # Decode categorical features before saving so the CSV holds the labels
    decoded_client_data = decode_categorical_features(encoded_client_data, encoding_maps)

    # Save client dataset to CSV
    client_df = pd.DataFrame(decoded_client_data)
    save_to_csv(client_df, output_dir / f"client_{client_id}.csv")

    # Save metadata
    client_metadata['Encoding Maps'] = encoding_maps
    metadata.append(client_metadata)

# Save metadata to a CSV file
metadata_df = pd.DataFrame(metadata)
save_to_csv(metadata_df, output_dir / "metadata.csv")

# Summary
print(f"Generated datasets for {NUM_CLIENTS} clients, including {NUM_ROUGE_CLIENTS} rouge clients. Check the {output_dir} directory.")

Generated datasets for 5 clients, including 0 rouge clients. Check the house_pricing_datasets_0_rouge directory.
