In [1]:
import numpy as np
import pandas as pd

# Set random seed for reproducibility. Every draw below uses the legacy global
# NumPy RNG, so the order of the draws determines the generated values.
np.random.seed(42)

n_samples = 10000

# Ordered levels shared by both categorical features (X5 and X6).
LEVELS = ['low', 'medium', 'high']

# Level probabilities for X6, conditional on the target class.
X6_PROBS_POSITIVE = [0.6, 0.2, 0.2]  # y == 1: 'low' is the most frequent level
X6_PROBS_NEGATIVE = [0.3, 0.3, 0.4]  # y == 0


def _levels_from_uniforms(uniforms, probs):
    """Map uniform draws in [0, 1) to positions in LEVELS.

    Parameters
    ----------
    uniforms : array-like of float
        Uniform random numbers, one per sample.
    probs : sequence of float
        Probability of each level, in the same order as LEVELS.

    Returns
    -------
    numpy.ndarray of int
        For each draw, the index of the first cumulative-probability bin
        that exceeds it (inverse-CDF sampling).
    """
    cdf = np.cumsum(probs)
    cdf /= cdf[-1]  # normalise so the last bin edge is exactly 1.0
    return cdf.searchsorted(uniforms, side='right')


# Simulate numerical variables: four independent standard-normal features
X1 = np.random.normal(0, 1, n_samples)
X2 = np.random.normal(0, 1, n_samples)
X3 = np.random.normal(0, 1, n_samples)
X4 = np.random.normal(0, 1, n_samples)

# Simulate categorical variables: X5 is drawn without reference to the target
X5 = pd.Categorical(
    np.random.choice(LEVELS, size=n_samples, p=[0.5, 0.2, 0.3]),
    categories=LEVELS,
    ordered=True,
)

# Simulate the target variable y (imbalanced: 10% positive, 90% negative)
y = np.random.choice([0, 1], size=n_samples, p=[0.9, 0.1])

# Add signal to X1 for positive class (shifts those rows by +2)
X1[y == 1] += 2

# Simulate X6: categorical, with 'low' more frequent when y=1.
# One uniform number is drawn per row and pushed through the cumulative
# distribution of that row's class. This replaces a Python loop that called
# np.random.choice once per row. It is intended to reproduce the same seeded
# values as that loop (one uniform per row, same CDF lookup) -- compare with
# a previous export if exact continuity matters.
X6_uniforms = np.random.random_sample(n_samples)
X6_codes = np.where(
    y == 1,
    _levels_from_uniforms(X6_uniforms, X6_PROBS_POSITIVE),
    _levels_from_uniforms(X6_uniforms, X6_PROBS_NEGATIVE),
)
X6_values = np.asarray(LEVELS)[X6_codes]  # array of level labels, one per row
X6 = pd.Categorical(X6_values, categories=LEVELS, ordered=True)

# Create DataFrame: one column per simulated feature plus the target
df = pd.DataFrame({
    'X1': X1,
    'X2': X2,
    'X3': X3,
    'X4': X4,
    'X5': X5,
    'X6': X6,
    'y': y
})

In [2]:
# Write the simulated dataset to a CSV file in the working directory,
# leaving the DataFrame's row index out of the file.
csv_path = 'example_data.csv'
df.to_csv(csv_path, index=False)