In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification

# Seed NumPy's global RNG so that later np.random.* draws are repeatable;
# the same seed is handed to make_classification below.
seed = 42
np.random.seed(seed)

# Settings for the synthetic two-class problem: 1000 rows by 10 features,
# of which 5 are informative and 2 are redundant.
dataset_params = dict(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=3,
    weights=None,
    flip_y=0.01,
    random_state=seed,
)

# X is the feature matrix, y the class labels
X, y = make_classification(**dataset_params)

# Column names: the first 6 columns stay numeric, the last 4 are binned
# into categories further down.
numerical_features = [f'numerical_feature_{idx}' for idx in range(6)]
categorical_features = [f'categorical_feature_{idx}' for idx in range(4)]
all_features = [*numerical_features, *categorical_features]

# Wrap the feature matrix in a labelled DataFrame
df = pd.DataFrame(X, columns=all_features)

# Replace each categorical column with its quartile bin, labelled 'A'..'D'
quartile_labels = ['A', 'B', 'C', 'D']
for name in categorical_features:
    binned = pd.qcut(df[name], q=4, labels=quartile_labels)
    df[name] = binned

# Append the class labels as the final column
df['target'] = y

# Blank out roughly 5% of the entries in every feature column
missing_rate = 0.05
row_count = len(df)
for feature in df.columns[:-1]:  # the last column is the target; leave it intact
    # One uniform draw per row from NumPy's global RNG decides which rows to blank
    drop_rows = np.random.rand(row_count) < missing_rate
    df.loc[drop_rows, feature] = np.nan

# Print an overview of the dataset: schema, first rows, summary statistics
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()
print("\nSample of the dataset:")
print(df.head())
print("\nSummary statistics:")
# With default arguments, describe() on a mixed-dtype frame summarizes only
# the numeric columns.
print(df.describe())

# Write the dataset to a CSV file in the working directory, without the row index
output_path = 'customer_churn_data.csv'
df.to_csv(output_path, index=False)

# Show how often each category label occurs in every categorical column
print("\nValue counts for categorical columns:")
for name in categorical_features:
    label_counts = df[name].value_counts()
    # Header line and counts are separated by a newline, matching two print calls
    print(f"\n{name}:", label_counts, sep="\n")

# Show the relative frequency of each target class
class_shares = df['target'].value_counts(normalize=True)
print("\nClass distribution:")
print(class_shares)