In [None]:
from mssm.models import *
from mssmViz.sim import *
from mssmViz.plot import *
import pickle
import copy
import os
from src.utils import GAMLSSGENSMOOTHFamily,llk_gamm_fun,init_lambda

# Create the complete output directory tree used by the simulations below.
# Each directory is created on its own with ``exist_ok=True`` so that the
# setup is idempotent: re-running the cell on a partially existing tree still
# creates whatever is missing, and genuine errors (e.g. missing permissions)
# are raised instead of being reported as "already exists".
result_dirs = (
    "./results",
    "./results/sim",
    "./results/data",
    "./results/plots",
    "./results/data/sim1",
    "./results/data/sim2",
    "./results/data/sim3",
    "./results/data/sim1/plots",
    "./results/data/sim3/plots",
    "./results/data/sim4",
    "./results/data/sim5",
)

for result_dir in result_dirs:
    os.makedirs(result_dir, exist_ok=True)

In [None]:
############################# Simulation 2 #############################
# For every combination of the ``correlate`` flag and response family, simulate
# ``n_sim`` data sets with ``sim3`` and fit four models to each of them:
#   1. GAMM with a single inner iteration (max_inner=1)
#   2. GAMM with repeated inner iterations (max_inner=500)
#   3. GAMMLSS ("Newton")
#   4. GSMM fitted with the qEFS update
# For each data set and model we store the MSE between the model's linear
# predictor and the simulated "eta" column, the value of ``model.info.iter``
# and a failure flag. Results are pickled after every simulated data set.
n_sim = 500

sim_fams = [Gaussian(),Gamma(),Binomial()]
mod_fams = [Gaussian(),Gamma(),Binomial()]
mod2_fams = [GAUMLSS([Identity(),LOGb(-0.01)]),GAMMALS([LOG(),LOGb(-0.01)]),MULNOMLSS(1)]
fam_names = ["Gaussian","Gamma", "Binom"]

for should_correlate in [True,False]:

    for fam_name, mod_fam, mod_fam2, sim_fam in zip(fam_names,mod_fams,mod2_fams,sim_fams):
        
        # Set up storage for current sim (one column per fitted model)
        eta_mses = np.zeros((n_sim,4))
        n_lam_updt = np.zeros((n_sim,4))
        Failures = np.zeros((n_sim,4))

        # Prevent gamm initialization for gammlss: start all coefficients at 1e-4.
        # The non-binomial families get one extra coefficient for the
        # intercept-only second formula.
        if fam_name != "Binom":
            mod_fam2.init_coef = lambda models: np.array([1e-4 for _ in range(models[0].formula.n_coef + 1)]).reshape(-1,1)
        else:
            mod_fam2.init_coef = lambda models: np.array([1e-4 for _ in range(models[0].formula.n_coef)]).reshape(-1,1)

        mod_fam2.init_lambda = init_lambda

        # General smooth family wrapping a copy of the GAMMLSS family (two
        # formulas unless the family is binomial, which only has one)
        gsmm_fam = GAMLSSGENSMOOTHFamily(2 if fam_name != "Binom" else 1,copy.deepcopy(mod_fam2.links),llk_gamm_fun,copy.deepcopy(mod_fam2))
        
        if fam_name != "Binom":
            gsmm_fam.init_coef = lambda models: np.array([1e-4 for _ in range(models[0].formula.n_coef+1)]).reshape(-1,1)
        else:
            gsmm_fam.init_coef = lambda models: np.array([1e-4 for _ in range(models[0].formula.n_coef)]).reshape(-1,1)
        
        gsmm_fam.init_lambda = init_lambda
  
        iterator = tqdm(range(n_sim),desc="Simulating",leave=True)
        for sim_i in iterator:

            # The simulation index doubles as the seed, so every data set is reproducible
            sim_dat = sim3(500,2,c=1,seed=sim_i,family=sim_fam,binom_offset = -5 if fam_name == "Binom" else 0,correlate=should_correlate)
            sim_dat.to_csv(f'./results/data/sim2/sim_size:{n_sim}_fam:{fam_name}_corr:{should_correlate}_set:{sim_i}.csv',index=False)

            # We need to model the mean: \mu_i
            sim_formula_m = Formula(lhs("y"),
                                [i(),f(["x0"]),f(["x1"]),f(["x2"]),f(["x3"])],
                                data=sim_dat)
            
            # And for sd - here constant
            sim_formula_sd = Formula(lhs("y"),
                                [i()],
                                data=sim_dat)
            
            if fam_name != "Binom":
                sim_formulas = [sim_formula_m,sim_formula_sd]
            else:
                sim_formulas = [sim_formula_m]

            ############################# Fit model with single inner iteration #############################
            sim_i_failed = [False, False, False, False]

            # All fits below catch ``Exception`` rather than using a bare
            # ``except`` so that KeyboardInterrupt/SystemExit still stop the
            # run instead of being recorded as a model failure.
            try:
                # NOTE(review): unlike the other fits, mod_fam is passed without a
                # deepcopy here; if GAMM keeps a reference to it, the assignment
                # below mutates the shared family object - confirm this is intended.
                model_efs1 = GAMM(copy.deepcopy(sim_formula_m),mod_fam)
                model_efs1.family.is_canonical = True # Force full step-length control for all models

                model_efs1.fit(max_outer=1000,extend_lambda=False,method="Chol",progress_bar=False,max_inner=1)
                model_efs1.info.eps = 0 
                
            except Exception:
                sim_i_failed[0] = True
                model_efs1 = None
            

            ############################# Fit model with repeated inner iterations #############################
            try:
                model_efs2 = GAMM(copy.deepcopy(sim_formula_m),copy.deepcopy(mod_fam))
                model_efs2.family.is_canonical = True # Force full step-length control for all models

                model_efs2.fit(max_outer=1000,extend_lambda=False,method="Chol",progress_bar=False,max_inner=500)
                model_efs2.info.eps = 0 
                
            except Exception:
                sim_i_failed[1] = True
                model_efs2 = None
            
            ############################# Fit model with Newton #############################
            try:
                model_efs3 = GAMMLSS(copy.deepcopy(sim_formulas),copy.deepcopy(mod_fam2))
                
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    model_efs3.fit(seed=sim_i,max_outer=200,max_inner=500,min_inner=500,method="QR/Chol",progress_bar=False,extend_lambda=False,control_lambda=2,should_keep_drop=False,repara=True,prefit_grad=True)

                # Expose the first set of predictions under the attribute name
                # read by the MSE loop below
                model_efs3.pred = model_efs3.overall_preds[0]

            except Exception:
                sim_i_failed[2] = True
                model_efs3 = None
            
            ############################# Fit model with qEFS #############################
            
            try:
                
                gsmm_model = GSMM(formulas=copy.deepcopy(sim_formulas),family=copy.deepcopy(gsmm_fam))

                # Fit with qEFS update without initialization
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    bfgs_opt={"gtol":1e-9,
                              "ftol":1e-9,
                              "maxcor":30,
                              "maxls":200,
                              "maxfun":1e7}
                    
                    gsmm_model.fit(init_coef=None,method='qEFS',extend_lambda=False,
                                   control_lambda=False,max_outer=200,max_inner=500,min_inner=500,
                                   seed=sim_i,qEFSH='SR1',max_restarts=0,overwrite_coef=False,qEFS_init_converge=False,prefit_grad=True,
                                   progress_bar=False,repara=True,**bfgs_opt)
                

                # Get linear predictors for gsmm_model
                if fam_name != "Binom":
                    split_coef = np.split(gsmm_model.overall_coef,gsmm_model.coef_split_idx)
                else:
                    split_coef = [gsmm_model.overall_coef]

                # Linear predictor of the first formula: model matrix times its coefficients
                Xs = gsmm_model.get_mmat()
                gsmm_model.pred = Xs[0]@split_coef[0].reshape(-1,1)

                gsmm_model.info.eps = 0 # Not used but set to None, which messes up loop below
            except Exception:
                sim_i_failed[3] = True
                gsmm_model = None
            

            ######################################## Collect MSEs ####################################
            models = [model_efs1,model_efs2,model_efs3,gsmm_model]

            for mi, model in enumerate(models):

                # Fit raised an exception: flag the failure and store NaN placeholders
                if sim_i_failed[mi]:
                    print(f"Model {mi+1} failed at {sim_i}")
                    Failures[sim_i,mi] = 1
                    n_lam_updt[sim_i,mi] = np.nan
                    eta_mses[sim_i,mi] = np.nan
                    continue

                # Not converged but not failed outright
                if model.info.code > 0:
                    Failures[sim_i,mi] = 1
                
                n_lam_updt[sim_i,mi] = model.info.iter

                # Mean squared difference between the model's predictor and the simulated "eta"
                pred_diff = model.pred.flatten() - sim_dat["eta"].values
                eta_mses[sim_i,mi] = np.dot(pred_diff,pred_diff)/len(pred_diff)
            
            # Show the running (mean, sd) of the MSEs per model in the progress bar
            iterator.set_description_str(desc=f"MSE.: {[(float(np.round(m,decimals=4)),float(np.round(sd,decimals=2))) for m,sd in zip(np.mean(eta_mses[:(sim_i+1),:],axis=0),np.std(eta_mses[:(sim_i+1),:],axis=0))]}", refresh=True)
            ###################################### Save in progress results ######################################
            res = {"eta_mses":eta_mses,
                   "n_lam_updt":n_lam_updt,
                   "Failures":Failures
                   }
            
            # Overwritten after every data set so partial results survive an aborted run
            with open(f'./results/sim/sim_2_size:{n_sim}_fam:{fam_name}_corr:{should_correlate}.pickle', 'wb') as file:
                pickle.dump(res,file, protocol=pickle.HIGHEST_PROTOCOL)
        
        iterator.close()

In [None]:
############################# Simulation 2 #############################
# Proportional hazards variant: for each setting of the ``correlate`` flag,
# simulate ``n_sim`` data sets with ``sim3`` using a ``PropHaz`` family and fit
# two GSMMs to each of them - one with method "QR/Chol" ("Newton") and one with
# the qEFS update. For each data set and model we store the MSE between the
# model's linear predictor and the simulated "eta" column, the value of
# ``model.info.iter`` and a failure flag. Results are pickled after every data set.
n_sim = 500

for should_correlate in [True,False]:

        
    # Set up storage for current sim (one column per fitted model)
    eta_mses = np.zeros((n_sim,2))
    n_lam_updt = np.zeros((n_sim,2))
    Failures = np.zeros((n_sim,2))

    iterator = tqdm(range(n_sim),desc="Simulating",leave=True)
    for sim_i in iterator:

        # The simulation index doubles as the seed, so every data set is reproducible
        sim_dat = sim3(500,2,c=1,seed=sim_i,family=PropHaz([0],[0]),binom_offset = 0.1,correlate=should_correlate)
        
        # Prep everything for prophaz model: sort rows by "y" in descending order
        sim_dat = sim_dat.sort_values(['y'],ascending=[False])
        sim_dat = sim_dat.reset_index(drop=True)

        # ut: unique "y" values in descending order; r: for every row, the
        # index of its "y" value within ut
        u,inv = np.unique(sim_dat["y"],return_inverse=True)
        ut = np.flip(u)
        r = np.abs(inv - max(inv))
        
        sim_dat.to_csv(f'./results/data/sim2/sim_size:{n_sim}_fam:{"PropHaz"}_corr:{should_correlate}_set:{sim_i}.csv',index=False)

        # We only need to model the mean: \mu_i
        sim_formula_m = Formula(lhs("delta"),
                            [f(["x0"]),f(["x1"]),f(["x2"]),f(["x3"])],
                            data=sim_dat)
        
        sim_formulas = [sim_formula_m]

        sim_i_failed = [False, False,]
        
        ############################# Fit model with Newton #############################
        # Both fits below catch ``Exception`` rather than using a bare
        # ``except`` so that KeyboardInterrupt/SystemExit still stop the run
        # instead of being recorded as a model failure.
        try:
            gsmm_newton_fam = PropHaz(ut,r)
            gsmm_newton_fam.init_lambda = init_lambda
            gsmm_newton = GSMM(copy.deepcopy(sim_formulas),gsmm_newton_fam)
            
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                gsmm_newton.fit(init_coef=None,method="QR/Chol",extend_lambda=False,
                                control_lambda=False,max_outer=200,seed=sim_i,max_inner=500,
                                min_inner=500,prefit_grad=True,repara=True,progress_bar=False)
            
            # Get linear predictors for gsmm_model
            split_coef = [gsmm_newton.overall_coef]

            # Linear predictor: model matrix times the estimated coefficients
            Xs = gsmm_newton.get_mmat()
            gsmm_newton.pred = Xs[0]@split_coef[0].reshape(-1,1)

        except Exception:
            sim_i_failed[0] = True
            gsmm_newton = None
        
        ############################# Fit model with qEFS #############################
        
        try:
            gsmm_qefs_fam = PropHaz(ut,r)
            gsmm_qefs_fam.init_lambda = init_lambda
            gsmm_qefs = GSMM(copy.deepcopy(sim_formulas),gsmm_qefs_fam)

            # Fit with qEFS update without initialization
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                bfgs_opt={"gtol":1e-9,
                            "ftol":1e-9,
                            "maxcor":30,
                            "maxls":200,
                            "maxfun":1e7}
                
                gsmm_qefs.fit(init_coef=None,method='qEFS',extend_lambda=False,
                              control_lambda=False,max_outer=200,max_inner=500,min_inner=500,
                              seed=sim_i,qEFSH='SR1',max_restarts=0,overwrite_coef=False,qEFS_init_converge=False,prefit_grad=True,
                              progress_bar=False,repara=True,**bfgs_opt)
            
            # Get linear predictors for gsmm_qefs
            split_coef = [gsmm_qefs.overall_coef]

            Xs = gsmm_qefs.get_mmat()
            gsmm_qefs.pred = Xs[0]@split_coef[0].reshape(-1,1)

            gsmm_qefs.info.eps = 0 # Not used but set to None, which messes up loop below
        except Exception:
            sim_i_failed[1] = True
            gsmm_qefs = None
        
        ######################################## Collect MSEs ####################################
        models = [gsmm_newton,gsmm_qefs]

        for mi, model in enumerate(models):

            # Fit raised an exception: flag the failure and store NaN placeholders
            if sim_i_failed[mi]:
                print(f"Model {mi+1} failed at {sim_i}")
                Failures[sim_i,mi] = 1
                n_lam_updt[sim_i,mi] = np.nan
                eta_mses[sim_i,mi] = np.nan
                continue

            # Not converged but not failed outright
            if model.info.code > 0:
                Failures[sim_i,mi] = 1
            
            n_lam_updt[sim_i,mi] = model.info.iter

            # Mean squared difference between the model's predictor and the simulated "eta"
            pred_diff = model.pred.flatten() - sim_dat["eta"].values
            eta_mses[sim_i,mi] = np.dot(pred_diff,pred_diff)/len(pred_diff)
        
        # Show the running (mean, sd) of the MSEs per model in the progress bar
        iterator.set_description_str(desc=f"MSE.: {[(float(np.round(m,decimals=4)),float(np.round(sd,decimals=2))) for m,sd in zip(np.mean(eta_mses[:(sim_i+1),:],axis=0),np.std(eta_mses[:(sim_i+1),:],axis=0))]}", refresh=True)
        ###################################### Save in progress results ######################################
        res = {"eta_mses":eta_mses,
                "n_lam_updt":n_lam_updt,
                "Failures":Failures
                }
        
        # Overwritten after every data set so partial results survive an aborted run
        with open(f'./results/sim/sim_2_size:{n_sim}_fam:{"PropHaz"}_corr:{should_correlate}.pickle', 'wb') as file:
            pickle.dump(res,file, protocol=pickle.HIGHEST_PROTOCOL)
        
    iterator.close()