In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

In [3]:
def _resolve_zkip_mixture(true_gamma, true_delta, pi1, pi2, pi3):
    """
    Resolve the three latent-class probabilities and their log-odds.

    Returns the tuple (true_gamma, true_delta, true_pi1, true_pi2, true_pi3),
    where gamma = log(pi1 / pi3) and delta = log(pi2 / pi3).
    """
    if pi1 is not None and pi2 is not None:
        # Direct probability specification; pi3 is optional and derived if absent.
        if pi3 is not None:
            if not np.isclose(pi1 + pi2 + pi3, 1.0):
                raise ValueError("pi1 + pi2 + pi3 must sum to 1")
        else:
            if pi1 + pi2 >= 1.0:
                raise ValueError("pi1 + pi2 must be less than 1")
            pi3 = 1 - pi1 - pi2

        # Fail early with a clear message instead of a late error inside
        # np.random.choice (negative p) or a ZeroDivisionError (pi3 == 0).
        if pi1 < 0 or pi2 < 0:
            raise ValueError("pi1 and pi2 must be non-negative")
        if pi3 <= 0:
            raise ValueError("pi3 must be strictly positive")

        # Back-calculate gamma and delta for consistency; a zero pi1/pi2
        # legitimately maps to -inf, so silence the divide warning.
        with np.errstate(divide="ignore"):
            true_gamma = np.log(pi1 / pi3)
            true_delta = np.log(pi2 / pi3)
        return true_gamma, true_delta, pi1, pi2, pi3

    if true_gamma is None or true_delta is None:
        # Default values (used whenever gamma/delta are not BOTH supplied)
        true_gamma = 0.5
        true_delta = 0.5

    # Multinomial-logit parameterisation with the Poisson class as reference
    true_pi3 = 1 / (1 + np.exp(true_gamma) + np.exp(true_delta))
    true_pi1 = np.exp(true_gamma) * true_pi3
    true_pi2 = np.exp(true_delta) * true_pi3
    return true_gamma, true_delta, true_pi1, true_pi2, true_pi3


def generate_zkip_data(n=1000, k_inflated=3, true_beta=None, true_gamma=None, true_delta=None, 
                      pi1=None, pi2=None, pi3=None, seed=42):
    """
    Generate synthetic ZKIP (Zero-K Inflated Poisson) data with flexible probability control

    Each observation is assigned to one of three latent classes: a point mass
    at 0, a point mass at ``k_inflated``, or a Poisson draw whose rate is
    ``exp(intercept + covariates @ slopes)``. Covariates are i.i.d. standard
    normal.
    
    Parameters:
    -----------
    n : int
        Number of samples
    k_inflated : int
        The value at which inflation occurs (default: 3)
    true_beta : array-like
        Coefficients for Poisson component [intercept, x1, x2, ...].
        Defaults to [0.5, -0.3, 0.8]. One covariate column is generated per
        non-intercept coefficient.
    true_gamma : float
        Log-odds parameter for zero-inflation (alternative to pi1)
    true_delta : float
        Log-odds parameter for k-inflation (alternative to pi2)
    pi1 : float
        Probability of zero-inflation class (if specified, overrides true_gamma)
    pi2 : float
        Probability of k-inflation class (if specified, overrides true_delta)
    pi3 : float
        Probability of Poisson class (if specified, must satisfy pi1 + pi2 + pi3 = 1)
    seed : int
        Random seed. NOTE: this reseeds NumPy's global random state.

    Precedence: if both pi1 and pi2 are given they are used (pi3 is derived
    when omitted). Otherwise, if both true_gamma and true_delta are given they
    are used. Otherwise gamma = delta = 0.5. A partially specified pair
    (e.g. only pi1, or only true_gamma) is ignored.
    
    Returns:
    --------
    X_data : array
        Covariate matrix (without intercept), shape (n, len(true_beta) - 1)
    y : array
        Generated counts (float array of length n)
    metadata : dict
        Dictionary with true parameters and probabilities. 'class_counts' and
        'class_proportions' always have three entries, ordered
        (zero class, k class, Poisson class).

    Raises:
    -------
    ValueError
        If the probabilities are inconsistent, negative, or leave no mass for
        the Poisson class, or if true_beta is not a non-empty 1-D sequence.
    """
    np.random.seed(seed)
    
    # Default beta coefficients
    if true_beta is None:
        true_beta = np.array([0.5, -0.3, 0.8])  # intercept, x1, x2

    beta = np.asarray(true_beta, dtype=float)
    if beta.ndim != 1 or beta.size == 0:
        raise ValueError("true_beta must be a non-empty 1-D sequence [intercept, x1, ...]")
    n_covariates = beta.size - 1
    
    # Generate covariates: intercept column followed by one N(0, 1) column per
    # slope, drawn in order so the default (2 covariates) stream is unchanged.
    X = np.column_stack(
        [np.ones(n)] + [np.random.normal(0, 1, n) for _ in range(n_covariates)]
    )
    
    # Handle probability specification
    true_gamma, true_delta, true_pi1, true_pi2, true_pi3 = _resolve_zkip_mixture(
        true_gamma, true_delta, pi1, pi2, pi3
    )
    
    # Calculate lambda for Poisson component
    lambd = np.exp(X @ beta)
    
    # Generate latent class indicators (0: zero class, 1: k class, 2: Poisson)
    z = np.random.choice([0, 1, 2], size=n, p=[true_pi1, true_pi2, true_pi3])
    
    # Generate counts based on latent class; class 0 stays at the initial 0
    y = np.zeros(n)
    
    # Class 1: Degenerate at k
    y[z == 1] = k_inflated
    
    # Class 2: Poisson
    mask_poisson = (z == 2)
    y[mask_poisson] = np.random.poisson(lambd[mask_poisson])
    
    # Remove intercept from X for modeling
    X_data = X[:, 1:]

    # minlength keeps three entries even when a class was never sampled
    class_counts = np.bincount(z, minlength=3)
    
    return X_data, y, {
        'true_beta': true_beta,
        'true_gamma': true_gamma,
        'true_delta': true_delta,
        'true_pi1': true_pi1,
        'true_pi2': true_pi2,
        'true_pi3': true_pi3,
        'k_inflated': k_inflated,
        'class_counts': class_counts,
        'class_proportions': class_counts / n
    }

# Example usage: four ways of specifying the latent-class probabilities.
if __name__ == "__main__":
    # Shared report line; filled from the metadata dict returned by each run.
    _prob_line = "Probabilities: π1={true_pi1:.3f}, π2={true_pi2:.3f}, π3={true_pi3:.3f}"

    # Method 1: log-odds (gamma/delta) parameterisation -- the original interface
    X1, y1, meta1 = generate_zkip_data(n=1000, true_gamma=1.0, true_delta=0.5)
    print("Method 1 - Gamma/Delta:")
    print(_prob_line.format(**meta1))

    # Method 2: all three class probabilities given explicitly
    X2, y2, meta2 = generate_zkip_data(n=1000, pi1=0.3, pi2=0.2, pi3=0.5)
    print("\nMethod 2 - Direct probabilities:")
    print(_prob_line.format(**meta2))

    # Method 3: only pi1 and pi2 given; pi3 is derived as 1 - pi1 - pi2
    X3, y3, meta3 = generate_zkip_data(n=1000, pi1=0.4, pi2=0.3)
    print("\nMethod 3 - pi1 and pi2 only:")
    print(_prob_line.format(**meta3))

    # Method 4: extreme case dominated by the zero-inflation class
    X4, y4, meta4 = generate_zkip_data(n=1000, pi1=0.7, pi2=0.1, pi3=0.2)
    print("\nMethod 4 - High zero inflation:")
    print(_prob_line.format(**meta4))
    print("Class counts:", meta4['class_counts'])

Method 1 - Gamma/Delta:
Probabilities: π1=0.506, π2=0.307, π3=0.186

Method 2 - Direct probabilities:
Probabilities: π1=0.300, π2=0.200, π3=0.500

Method 3 - pi1 and pi2 only:
Probabilities: π1=0.400, π2=0.300, π3=0.300

Method 4 - High zero inflation:
Probabilities: π1=0.700, π2=0.100, π3=0.200
Class counts: [697 109 194]


In [8]:
def data_sets(n=1000, k_inflated=3, true_beta=None, true_gamma=None, true_delta=None, 
                      pi1=None, pi2=None, pi3=None, seed=42, output_dir='DataSets'):
    """
    Generate a synthetic ZKIP dataset, split it 80/20 and write it to CSV.

    All generation arguments are forwarded unchanged to ``generate_zkip_data``.
    The first 80% of rows form the training set and the remaining rows the
    test set (no shuffling). Four CSV files are written to ``output_dir``:
    covariates and targets for each split, with the full covariate-matrix
    shape embedded in the file name, e.g. ``X_train_0k_inflated (500, 2).csv``.

    Parameters:
    -----------
    n, k_inflated, true_beta, true_gamma, true_delta, pi1, pi2, pi3, seed
        Passed through to ``generate_zkip_data``.
    output_dir : str or path-like
        Directory receiving the CSV files (default: 'DataSets'). It is
        created if it does not exist.

    Returns:
    --------
    None. The function prints the written file names and the data shape.
    """
    # Step 1: Generate Synthetic Data
    print("\n1. GENERATING SYNTHETIC DATA")
    print("-" * 30)

    # Forward every argument; gamma/delta/seed were previously hard-coded here,
    # which made the corresponding parameters of this function dead.
    X, y, _ = generate_zkip_data(n=n, k_inflated=k_inflated, true_beta=true_beta,
                                 true_gamma=true_gamma, true_delta=true_delta,
                                 pi1=pi1, pi2=pi2, pi3=pi3, seed=seed)

    # Create train/test split (sequential: first 80% train, rest test)
    split_idx = int(0.8 * len(X))
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]

    # One column name per generated covariate: X1, X2, ...
    feature_names = [f'X{j}' for j in range(1, X.shape[1] + 1)]

    # File name -> frame, in the order they are written and reported
    outputs = [
        (f'X_train_0k_inflated {X.shape}.csv', pd.DataFrame(X_train, columns=feature_names)),
        (f'X_test_0k_inflated {X.shape}.csv', pd.DataFrame(X_test, columns=feature_names)),
        (f'y_train {X.shape}.csv', pd.DataFrame(y_train, columns=['Y'])),
        (f'y_test {X.shape}.csv', pd.DataFrame(y_test, columns=['Y'])),
    ]

    # Make sure the target directory exists before writing
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    for file_name, frame in outputs:
        frame.to_csv(out_dir / file_name, index=False)

    # Report exactly the names that were written
    for file_name, _frame in outputs:
        print(file_name)
    
    print(f"Data shape: {X.shape}")

In [5]:
# 50 samples: 40% zero class, 40% k class (k=3), remainder Poisson.
data_sets(n=50, k_inflated=3, pi1=0.4, pi2=0.4, seed=42)


1. GENERATING SYNTHETIC DATA
------------------------------
Data shape: (50, 2)


In [6]:
# 200 samples: 40% zero class, 40% k class (k=3), remainder Poisson.
data_sets(n=200, k_inflated=3, pi1=0.4, pi2=0.4, seed=42)


1. GENERATING SYNTHETIC DATA
------------------------------
Data shape: (200, 2)


In [9]:
# 500 samples: 40% zero class, 40% k class (k=3), remainder Poisson.
data_sets(n=500, k_inflated=3, pi1=0.4, pi2=0.4, seed=42)


1. GENERATING SYNTHETIC DATA
------------------------------
X_train_0k_inflated (500, 2).csv
X_test_0k_inflated (500, 2).csv
y_train (500, 2).csv.csv
y_test (500, 2).csv
Data shape: (500, 2)


In [10]:
# 1000 samples: 40% zero class, 40% k class (k=3), remainder Poisson.
data_sets(n=1000, k_inflated=3, pi1=0.4, pi2=0.4, seed=42)


1. GENERATING SYNTHETIC DATA
------------------------------
X_train_0k_inflated (1000, 2).csv
X_test_0k_inflated (1000, 2).csv
y_train (1000, 2).csv.csv
y_test (1000, 2).csv
Data shape: (1000, 2)


In [11]:
# 2000 samples: 40% zero class, 40% k class (k=3), remainder Poisson.
data_sets(n=2000, k_inflated=3, pi1=0.4, pi2=0.4, seed=42)


1. GENERATING SYNTHETIC DATA
------------------------------
X_train_0k_inflated (2000, 2).csv
X_test_0k_inflated (2000, 2).csv
y_train (2000, 2).csv.csv
y_test (2000, 2).csv
Data shape: (2000, 2)


In [12]:
# 5000 samples: 40% zero class, 40% k class (k=3), remainder Poisson.
data_sets(n=5000, k_inflated=3, pi1=0.4, pi2=0.4, seed=42)


1. GENERATING SYNTHETIC DATA
------------------------------
X_train_0k_inflated (5000, 2).csv
X_test_0k_inflated (5000, 2).csv
y_train (5000, 2).csv.csv
y_test (5000, 2).csv
Data shape: (5000, 2)


In [13]:
# 10000 samples: 40% zero class, 40% k class (k=3), remainder Poisson.
data_sets(n=10000, k_inflated=3, pi1=0.4, pi2=0.4, seed=42)


1. GENERATING SYNTHETIC DATA
------------------------------
X_train_0k_inflated (10000, 2).csv
X_test_0k_inflated (10000, 2).csv
y_train (10000, 2).csv.csv
y_test (10000, 2).csv
Data shape: (10000, 2)
