# Data simulation

In [21]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Seaborn plotting context used for any figures in this notebook
sns.set_context(context='notebook')

# Seed NumPy's global RNG so repeated runs produce the same draws
np.random.seed(seed=79)

# Row count of a full-size subclass (subclasses 1 and 2 each get half of this)
n_samples = 200

In [3]:
# Simulate a dataset with four subclasses laid out as consecutive row blocks:
# subclasses 0 and 3 have n_samples rows each, subclasses 1 and 2 have
# n_samples // 2 rows each. Each informative feature is N(5, 1) everywhere
# except in one subclass, where its mean is 10:
#   Feature_1 -> subclass 2, Feature_2 -> subclass 1, Feature_3 -> subclass 3.

# Size of the two half-size subclasses. Computed once with floor division so the
# labels and every feature agree on it: round(n_samples / 2) rounds half to even
# and can differ from n_samples // 2 when n_samples is odd, which would make the
# columns unequal in length.
n_half = n_samples // 2

# Define labels (subclass id per row)
labels = np.concatenate([np.full(n_samples, 0),
                         np.full(n_half, 1),
                         np.full(n_half, 2),
                         np.full(n_samples, 3)])

# Feature 1: mean shifted to 10 in subclass 2 only
feature_1 = np.concatenate([np.random.normal(loc=5, scale=1, size=n_samples),
                            np.random.normal(loc=5, scale=1, size=n_half),
                            np.random.normal(loc=10, scale=1, size=n_half),
                            np.random.normal(loc=5, scale=1, size=n_samples)])

# Feature 2: mean shifted to 10 in subclass 1 only
feature_2 = np.concatenate([np.random.normal(loc=5, scale=1, size=n_samples),
                            np.random.normal(loc=10, scale=1, size=n_half),
                            np.random.normal(loc=5, scale=1, size=n_half),
                            np.random.normal(loc=5, scale=1, size=n_samples)])

# Noise features: drawn identically for every row, independent of the labels
noise_feature_1 = np.random.normal(loc=0, scale=1, size=len(labels))
noise_feature_2 = np.random.binomial(n=1, p=0.5, size=len(labels))

# Feature 3: mean shifted to 10 in subclass 3 only. Drawn as three consecutive
# row blocks; the middle block spans subclasses 1 and 2 together, hence
# 2 * n_half rows (equal to n_samples whenever n_samples is even).
normal_feature_subclass_1 = np.random.normal(loc=5, scale=1, size=n_samples)
normal_feature_subclass_2 = np.random.normal(loc=5, scale=1, size=2 * n_half)
normal_feature_subclass_3 = np.random.normal(loc=10, scale=1, size=n_samples)
# Combine the new normal feature
normal_feature = np.concatenate([normal_feature_subclass_1, normal_feature_subclass_2, normal_feature_subclass_3])

# Every column must line up row-for-row with the labels.
assert len(feature_1) == len(feature_2) == len(normal_feature) == len(labels)

# Binary class label: labels 2 and 3 are remapped to 1, so subclasses 1, 2 and 3
# all end up in class 1 while subclass 0 stays class 0.
combined_labels = np.where(np.isin(labels, [2, 3]), 1, labels)

# Create a DataFrame
data = pd.DataFrame({
    'Feature_1': feature_1,
    'Feature_2': feature_2,
    'Feature_3': normal_feature,
    'Noise_feature_1': noise_feature_1,
    'Noise_feature_2': noise_feature_2,
    'Subclass': labels,
    'Class': combined_labels
})


Save the data: 

In [5]:
# Write the simulated table to a CSV file, leaving out the DataFrame row index
data.to_csv(path_or_buf='simulated_data.csv', index=False)