In [4]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from statsmodels.genmod.generalized_linear_model import SET_USE_BIC_LLF

# Import custom modules and functions
from generate_hnb import generate_hnb
from generate_ZI import generate_ZI
from AIC_BIC import calculate_aic_bic

# Import model classes
from models.HurdlePoisson import ZeroKInflatedPoisson as ZKIHurdle
from models.ZINB import ZINB_EM, predict_mean as ZINB_pred_mean
from models.ZIP import ZIP_EM, predict_mean as ZIP_pred_mean
from models.ZKINB import ZkINB_EM
from models.ZKIP import ZKIP_EM
from models.ZkICMP import ZkICMP


class ModelEvaluator:
    """Score several count-data models against one shared train/test split.

    Every call to ``evaluate_model`` stores a metrics dictionary in
    ``self.results`` under the given model name, so the models can be
    compared after all of them have been scored.
    """

    def __init__(self, X_train, X_test, y_train, y_test):
        # Hold on to the split so each model is judged on the same test data.
        self.X_train, self.X_test = X_train, X_test
        self.y_train, self.y_test = y_train, y_test
        # Number of training rows; forwarded to calculate_aic_bic.
        self.n_train = X_train.shape[0]
        # model name -> metrics dict, filled in by evaluate_model.
        self.results = {}

    def evaluate_model(self, model_name, y_pred, llf, k_params):
        """Compute test-set metrics for one model and record them.

        Args:
            model_name: Key under which the metrics are stored in
                ``self.results`` (an existing entry is overwritten).
            y_pred: Predictions compared against ``self.y_test``.
            llf: Log-likelihood value reported for the model; stored as-is
                and forwarded to ``calculate_aic_bic``.
            k_params: Parameter count forwarded to ``calculate_aic_bic``.

        Returns:
            The metrics dict with keys ``mse``, ``mae``, ``r2``, ``llf`` and
            ``aic_bic`` (the latter is whatever ``calculate_aic_bic`` returns).
        """
        observed = self.y_test
        summary = dict(
            mse=mean_squared_error(observed, y_pred),
            mae=mean_absolute_error(observed, y_pred),
            r2=r2_score(observed, y_pred),
            llf=llf,
            aic_bic=calculate_aic_bic(self.n_train, llf, k_params),
        )
        self.results[model_name] = summary
        return summary


def generate_and_prepare_data(n=300, k=3):
    """Simulate a dataset with ``generate_ZI`` and shape it for modelling.

    Args:
        n: Forwarded to ``generate_ZI`` as ``n``.
        k: Forwarded to ``generate_ZI`` as ``k``.

    Returns:
        A ``(X, y_target)`` tuple of DataFrames: ``X`` has the columns
        ``intercept`` and ``x``; ``y_target`` has the single column ``y``.
    """
    # All remaining generator settings are fixed for this experiment.
    simulated = generate_ZI(
        n=n, k=k, beta0=-1, beta1=1, gamma0=-2, gamma1=0.3,
        alpha0=0.5, alpha1=1, r=10, cov_type="nbinary"
    )

    # Pull the covariate and the response out as plain arrays.
    covariate = simulated.loc[:, 'x'].values
    response = simulated.loc[:, 'y'].values

    # add_constant puts the constant column in front of the covariate.
    design = pd.DataFrame(
        sm.add_constant(covariate), columns=['intercept', 'x']
    )
    target = pd.DataFrame(response, columns=['y'])

    return design, target


def define_model_parameters():
    """Return the parameter count assumed for each model.

    The mapping is keyed by a short model identifier; the values are the
    ``k_params`` figures later used when computing AIC/BIC. The breakdowns
    in the comments are carried over from the original notes.
    """
    counts = dict(
        poisson=2,
        nb=3,            # original note: 2 if r is known
        zip=4,           # 2 + 2
        zinb=5,          # 2 + 2 + 1 (original note: "if r known")
        zkhurdle_poi=4,  # 2 + 2
        zkip=6,          # 2 + 2 + 2
        zkinb=7,         # 2 + 2 + 2 + 1 (original note: "if r known")
        zkicmp=7,        # 2 + 2 + 2 + 1
    )
    return counts


def main(n=300, k=3):
    """Fit eight count-data models on one simulated split and report metrics.

    The data are simulated once, split 70/30 with a fixed seed, and every
    model is fitted on the training part and scored on the test part via
    ``ModelEvaluator``. A text report is printed at the end.

    Args:
        n: Number of observations requested from ``generate_and_prepare_data``.
        k: The ``k`` handed to the data generator and to every k-inflated
            model, so the simulation and the models always agree.

    Returns:
        dict mapping model name to the metrics dict produced by
        ``ModelEvaluator.evaluate_model``.
    """
    # Configuration: opt in to statsmodels' log-likelihood based BIC for GLM.
    SET_USE_BIC_LLF(True)

    # Generate data
    X, y_target = generate_and_prepare_data(n, k)

    # Split data (fixed seed so repeated runs score the same test rows)
    X_train, X_test, y_train, y_test = train_test_split(
        X[['intercept', 'x']], y_target['y'],
        test_size=0.3, random_state=42
    )

    # Get parameter counts
    param_counts = define_model_parameters()

    # Initialize evaluator
    evaluator = ModelEvaluator(X_train, X_test, y_train, y_test)

    # 1. Poisson Model
    print("Fitting Poisson model...")
    poisson_model = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
    poisson_pred = poisson_model.predict(X_test)
    evaluator.evaluate_model(
        'poisson', poisson_pred, poisson_model.llf, param_counts['poisson']
    )

    # 2. Negative Binomial Model
    print("Fitting Negative Binomial model...")
    nb_model = sm.NegativeBinomial(y_train, X_train).fit()
    nb_pred = nb_model.predict(X_test)
    evaluator.evaluate_model(
        'negative_binomial', nb_pred, nb_model.llf, param_counts['nb']
    )

    # 3. Zero-K Inflated Poisson Hurdle Model
    print("Fitting Zero-K Inflated Poisson Hurdle model...")
    # Use the shared k instead of a literal 3 so this model cannot drift
    # away from the data generator and the other k-inflated models.
    zkihurdle_model = ZKIHurdle(k=k)
    zkihurdle_model.fit(X_train, y_train)
    zkihurdle_ll = zkihurdle_model.loglikelihood(X_train, y_train)
    zkihurdle_pred = zkihurdle_model.predict_mean(X_test)
    evaluator.evaluate_model(
        'zk_hurdle_poisson', zkihurdle_pred, zkihurdle_ll, param_counts['zkhurdle_poi']
    )

    # 4. ZINB Model
    print("Fitting ZINB model...")
    # Fixed value passed to ZINB_EM; presumably 1/r for the r=10 used by the
    # data generator -- TODO confirm.
    alpha = 1/10
    zinb_beta, zinb_gamma, zinb_ll = ZINB_EM(
        y_train.values, X_train.values, X_train.values, alpha
    )
    zinb_pred = ZINB_pred_mean(X_test.values, X_test.values, zinb_beta, zinb_gamma)
    evaluator.evaluate_model(
        'zinb', zinb_pred, zinb_ll, param_counts['zinb']
    )

    # 5. ZIP Model
    print("Fitting ZIP model...")
    zip_beta, zip_gamma, zip_ll = ZIP_EM(
        y_train.values, X_train.values, X_train.values
    )
    zip_pred = ZIP_pred_mean(X_test.values, X_test.values, zip_beta, zip_gamma)
    evaluator.evaluate_model(
        'zip', zip_pred, zip_ll, param_counts['zip']
    )

    # 6. ZKINB Model
    print("Fitting ZKINB model...")
    zkinb_model = ZkINB_EM()
    # NOTE(review): y_train is passed as a pandas Series here while the other
    # EM models receive y_train.values -- confirm fit_em does not rely on
    # label-based indexing of the shuffled index.
    zkinb_res = zkinb_model.fit_em(y_train, X_train.values, X_train.values, k)
    zkinb_pred = zkinb_model.predict(X_test.values, X_test.values)
    evaluator.evaluate_model(
        'zkinb', zkinb_pred, zkinb_res['final_loglik'], param_counts['zkinb']
    )

    # 7. ZKIP Model
    print("Fitting ZKIP model...")
    zkip_model = ZKIP_EM(k_inflated=k)
    zkip_res = zkip_model.fit(X_train.values, y_train.values)
    zkip_pred = zkip_model.predict_expected(X_test.values)
    evaluator.evaluate_model(
        'zkip', zkip_pred, zkip_res.final_loglik, param_counts['zkip']
    )

    # 8. ZkICMP Model
    print("Fitting ZkICMP model...")
    zkicmp_model = ZkICMP(k=k)
    zkicmp_res = zkicmp_model.fit(X_train.values, y_train.values)
    # predict returns a 4-tuple; only its third element is scored.
    _, _, zkicmp_pred, _ = zkicmp_model.predict(X_test.values)
    # NOTE(review): the sign flip assumes ZkICMP stores a *negative*
    # log-likelihood in final_loglik -- confirm against models.ZkICMP.
    evaluator.evaluate_model(
        'zkicmp', zkicmp_pred, -zkicmp_res.final_loglik, param_counts['zkicmp']
    )

    # Print results
    _print_results(evaluator.results)

    return evaluator.results


def _print_results(results):
    """Print one block of metrics per model, in insertion order."""
    print("\n" + "="*80)
    print("MODEL COMPARISON RESULTS")
    print("="*80)

    for model_name, metrics in results.items():
        print(f"\n{model_name.upper()}:")
        print(f"  MSE: {metrics['mse']:.4f}")
        print(f"  MAE: {metrics['mae']:.4f}")
        print(f"  R²:  {metrics['r2']:.4f}")
        print(f"  LLF: {metrics['llf']:.4f}")
        print(f"  AIC/BIC: {metrics['aic_bic']}")


# Run the full comparison when executed as a script or notebook cell, and keep
# the returned metrics in a module-level name for later inspection.
if __name__ == "__main__":
    results = main()

Fitting Poisson model...
Fitting Negative Binomial model...
Optimization terminated successfully.
         Current function value: 1.522362
         Iterations: 8
         Function evaluations: 9
         Gradient evaluations: 9
Fitting Zero-K Inflated Poisson Hurdle model...
Fitting ZINB model...
Fitting ZIP model...
Fitting ZKINB model...
Fitting ZKIP model...
Fitting ZkICMP model...

MODEL COMPARISON RESULTS

POISSON:
  MSE: 3.8334
  MAE: 1.4101
  R²:  -0.0003
  LLF: -375.2094
  AIC/BIC: (754.4187977950703, 761.1130128565053)

NEGATIVE_BINOMIAL:
  MSE: 3.8828
  MAE: 1.4276
  R²:  -0.0132
  LLF: -319.6961
  AIC/BIC: (645.3921555851194, 655.4334781772718)

ZK_HURDLE_POISSON:
  MSE: 4.4250
  MAE: 1.5505
  R²:  -0.1547
  LLF: -291.5611
  AIC/BIC: (591.1221910564129, 604.5106211792828)

ZINB:
  MSE: 3.7975
  MAE: 1.4086
  R²:  0.0091
  LLF: -300.0158
  AIC/BIC: (610.0315118764296, 626.767049530017)

ZIP:
  MSE: 3.8296
  MAE: 1.4163
  R²:  0.0007
  LLF: -300.8488
  AIC/BIC: (609.697692141