# Import Required Libraries
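This cell imports the Python libraries used in the analysis: the custom `functions_main` module, `numpy` and `numpy.random` for numerical calculations, `cvxpy` for convex optimization, `scipy.stats` for statistical distributions, and `joblib` and `tqdm` for parallel execution with a progress bar.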

In [4]:
from functions_main import *  
import numpy as np
import cvxpy as cp
import numpy.random as rgt
from scipy.stats import norm
from joblib import Parallel, delayed 
from tqdm.notebook import tqdm  

# Model for Low-Dimensional Data 
We simulate low-dimensional data based on the model:


$$Y = X\beta + b^{1/2}\varepsilon$$


where the parameter vector $\beta$ has the form $\{a, -a, a, -a, \ldots\}$, with each entry equal to either $a$ or $-a$ (in the code the signs are drawn at random under a fixed seed), and the parameter $b$ controls the strength of the noise.

## Inputs required

n/p: sample size/dimension
 
a: magnitude of β(coefficient); β=\{a, -a, a, -a, ...\}

b: scale of the noise ε; here, different a/b can reflect the different signal-to-noise ratios 

Gaussian_covariate: (1) True, $X \sim N(0,1)$; (2) False, $X \sim U(-\sqrt{3}, \sqrt{3})$

Gaussian_error: (1) True, $\varepsilon \sim N(0,1)$; (2) False, $\varepsilon \sim t_{2.25}$

In [5]:
####################### 1. GDP estimation of standard deviation ########################
def St_GDP(Y, epsilon):

    """
    ε-GDP estimate of the standard deviation of Y (clipping + Gaussian mechanism).

    The responses are clipped to [-log(n), log(n)], the first and second
    empirical moments of the clipped values are each perturbed with Gaussian
    noise, and the square root of the resulting noisy variance is returned.

    Parameters
    ----------
    Y : n by 1 numpy array of response variables.
    epsilon : privacy parameter (total budget of this estimator).

    Returns
    -------
    The noisy standard deviation if the noisy variance is positive,
    otherwise the fallback value 2.0.

    Notes
    -----
    Draws exactly two values from the global `numpy.random` state
    (first-moment noise first, second-moment noise second).
    """

    n_obs = Y.size
    clip_level = np.log(n_obs)
    clipped = np.clip(np.asarray(Y, dtype=float), -clip_level, clip_level)

    first_moment = clipped.mean()
    second_moment = np.mean(clipped**2)

    # The budget is shared equally by the two moments: each gets epsilon/sqrt(2).
    eps_each = epsilon / np.sqrt(2)

    # Noise std = sensitivity / budget.  Clipped values lie in [-c, c], so the
    # mean moves by at most 2c/n and the mean of squares by at most c^2/n.
    sd_first = (2 * clip_level / n_obs) / float(eps_each)
    sd_second = ((clip_level**2) / n_obs) / float(eps_each)

    # Keep this draw order: it determines the random stream seen by callers.
    first_moment_dp = first_moment + rgt.normal(0.0, sd_first)
    second_moment_dp = second_moment + rgt.normal(0.0, sd_second)

    variance_dp = second_moment_dp - first_moment_dp**2
    return np.sqrt(variance_dp) if variance_dp > 0 else 2.0

#####################  2. huber & ridge initialization ##################### 
def huber_ridge_GDP(X, Y, tau, epsilon): 

    '''
    Huber + ridge regression & output perturbation

    Solves the ridge-penalised Huber regression of Y on [1, clipped X] with
    cvxpy and releases the minimiser plus isotropic Gaussian noise.

    Parameters
    ---------
    X : n by p0 numpy array of covariates; each row is an observation vector. (no intercept)
    Y : n by 1 numpy array of response variables. 
    tau : (GDP) robust parameter for huber regression.
    epsilon : privacy parameter.   

    Returns
    -------
    Length p0 + 1 numpy array: noisy coefficients, intercept first.

    Raises
    ------
    RuntimeError
        If the solver returns no solution.
    '''

    n, p0 = X.shape
    gamma = np.sqrt(p0+1)/6 
    # clipping_l2 comes from functions_main; presumably it clips each row of X
    # to l2-norm <= gamma (TODO confirm against its definition).
    X_t = clipping_l2(X, gamma) 
 
    Z = np.hstack([np.ones((n, 1)), X_t])   # [1, X̃]
    # Flatten so an (n, 1) response lines up with the (n,) fitted values.
    y = np.asarray(Y, dtype=float).ravel()
    beta = cp.Variable(p0 + 1) 

    resid = y - Z @ beta
    lam = 0.2 # Ridge regularization parameter
    # cvxpy's huber(x, M) equals x^2 for |x| <= M and 2M|x| - M^2 otherwise,
    # i.e. twice the standard Huber loss.  The factor 0.5 restores the standard
    # loss, whose slope is bounded by tau -- the bound the noise scale below
    # relies on.  Without it the added noise is too small by a factor of 2.
    obj = (1.0 / n) * cp.sum(0.5 * cp.huber(resid, tau)) + 0.5 * lam * cp.sum_squares(beta) 

    prob = cp.Problem(cp.Minimize(obj)) 
    prob.solve(solver="SCS", verbose=False, warm_start=True)

    if beta.value is None:
        # Fail loudly instead of propagating None into the noise addition.
        raise RuntimeError(
            "huber_ridge_GDP: solver returned no solution (status: %s)" % prob.status
        )

    b = np.asarray(beta.value).ravel() 
    # Norm bound of a design row [1, x̃] when the l2-norm of x̃ is at most gamma.
    B = np.sqrt(1 + gamma**2)
    # Output perturbation: sensitivity 2*B*tau/(n*lam) divided by the budget.
    noise_scale = 2*B*tau/(n*epsilon*lam)
    bDP = b + noise_scale * rgt.standard_normal(p0 + 1)
    return bDP  

 

## Simulations (different SNRs)

### Main function

In [None]:
def main_lowdim_GDP(n,p,a,b,Gaussian_covariate= True, Gaussian_error=True, repetitions=300):
    """
    Monte Carlo comparison of private and non-private Huber regression.

    For each repetition a data set is simulated, the private estimator is run
    for the three privacy levels 0.3, 0.5 and 0.9, and the non-private
    estimator is run once.

    Parameters
    ----------
    n : sample size.
    p : dimension of beta (one intercept plus p - 1 covariates).
    a : magnitude of every entry of the true beta.
    b : noise scale; the errors are multiplied by sqrt(b).
    Gaussian_covariate : True for N(0,1) covariates, False for U(-sqrt(3), sqrt(3)).
    Gaussian_error : True for N(0,1) errors, False for t_{2.25} errors.
    repetitions : number of Monte Carlo repetitions (default 300).

    Returns
    -------
    Array of shape (4, repetitions) holding the log relative l2-errors.
    Rows 0-2 are the private estimates for epsilon = 0.3, 0.5, 0.9 and
    row 3 is the non-private estimate.

    Notes
    -----
    Reseeds the global `numpy.random` state (seed 0 for beta, seed m + 10 for
    repetition m).  `Huber`, `St_GDP` and `huber_ridge_GDP` must be in scope.
    """
    # true beta: every entry is +a or -a, with signs drawn once under a fixed seed
    rgt.seed(0) # set seed
    beta = a*np.ones(p)*(2*rgt.binomial(1, 0.5, size=p)-1)
    beta_norm = beta.dot(beta)**0.5 # the l2 norm of true beta

    # privacy parameters
    epsilon = np.array([.3, .5, .9])
    epsilon_init = epsilon/(2*np.sqrt(2))  # privacy budget for initialization
    epsilon_est = np.sqrt(7)*epsilon/(2*np.sqrt(2)) # privacy budget for estimation
    epsilon_tau = epsilon_init/np.sqrt(2)  # privacy budget for GDP standard deviation estimation
    epsilon_ridge = epsilon_init/np.sqrt(2) # privacy budget for GDP huber+ridge estimation
    n_eps = epsilon.size

    # quantities that depend only on (n, p), hoisted out of the repetition loop
    gamma = 0.5*(p + np.log(n))**0.5
    T = int(np.ceil(2*np.log(n)))
    sigma_scale = 2*gamma*np.sqrt(T)/epsilon_est
    lr = 0.2 # learning rate of the private estimation

    rela_l2 = np.zeros(repetitions)  # non-private log relative l2-errors
    priv_rela_l2 = np.zeros((n_eps, repetitions))  # private errors, one row per epsilon

    for m in range(repetitions):
        rgt.seed(m+10) # set seed

        ############################################
        ############# generate data ################
        ############################################
        if Gaussian_covariate:
            X = rgt.normal(0, 1, size=(n,p-1))
        else:
            X = rgt.uniform(-np.sqrt(3),np.sqrt(3), size=(n,p-1))
        if Gaussian_error:
            err = rgt.normal(0,1, n)
        else:
            err = rgt.standard_t(2.25, n)
        Y = beta[0] + X.dot(beta[1:]) + np.sqrt(b)*err

        ###############################################
        #-------------  Private (proposed)  -----------
        ###############################################

        #---- initialization ----
        # All initializations run before any noisy gradient descent so that the
        # sequence of random draws matches the original unrolled code.
        tau_init = []
        beta_0DP = []
        for k in range(n_eps):
            tau_k = St_GDP(Y, epsilon=epsilon_tau[k])
            tau_init.append(tau_k)
            beta_0DP.append(huber_ridge_GDP(X, Y, tau=tau_k, epsilon=epsilon_ridge[k]))

        #---- estimation ----
        model = Huber(X, Y,intercept=True)
        for k in range(n_eps):
            tau_priv = .04* tau_init[k] * (n * epsilon[k]  /(p+np.log(n)))**0.5
            out_priv = model.noisygd(tau=tau_priv, lr=lr, beta0=beta_0DP[k], B=gamma,
                                     sigma_scale=sigma_scale[k], T=T)
            priv_rela_l2[k, m] = np.log(np.sum((out_priv['beta'] - beta)**2)**0.5 / beta_norm)

        ###############################################
        #----------------  Non-private  ---------------
        ###############################################
        model = Huber(X, Y,intercept=True)
        tau0 = np.sqrt(np.mean(Y**2)-(np.mean(Y)**2))
        tau_np = .2 *tau0* (n/(p+np.log(n)))**0.5
        out = model.gd(tau=tau_np, lr=0.5, beta0=np.array([]), T=T)
        rela_l2[m] = np.log(np.sum((out['beta'] - beta)**2)**0.5 / beta_norm)

    # Same layout as before: private rows first, non-private row last.
    return np.vstack([priv_rela_l2, rela_l2])

In [None]:
# Quick check of the serial driver on one configuration:
# Gaussian covariates with heavy-tailed (t_{2.25}) errors.
re_GDP_SNR = main_lowdim_GDP(
    n=5000,
    p=10,
    a=1,
    b=1,
    Gaussian_covariate=True,
    Gaussian_error=False,
)
#print(np.mean(re_GDP_SNR, axis=1))

#### Main function adapted for parallel computing

In [None]:
def main_GDP_SNR_parallel(m ):
    """
    Run repetition `m` of the simulation for every configuration in the grid.

    The grid is the nested product (outermost first) of covariate law
    {Gaussian, uniform}, error law {Gaussian, t_{2.25}}, p in {5, 10, 20},
    a in {0.5, 1, 2}, b in {0.5, 1, 2} and n in {2500, 5000, 10000},
    i.e. 324 configurations.

    Parameters
    ----------
    m : repetition index; the data of every configuration are generated
        after seeding the global `numpy.random` state with m + 10.

    Returns
    -------
    List with one length-4 numpy array per configuration, in loop order:
    [non-private, private eps=0.3, private eps=0.5, private eps=0.9],
    each entry being a log relative l2-error.
    """
    sample_sizes = np.array([2500, 5000, 10000])
    dimensions = np.array([5, 10, 20])
    signal_levels = np.array([0.5, 1, 2])
    noise_levels = np.array([0.5, 1, 2])
    law_flags = np.array([True, False])

    # Privacy parameters (identical for every configuration).
    eps_total = np.array([.3, .5, .9])
    eps_init = eps_total/(2*np.sqrt(2))            # budget for initialization
    eps_est = np.sqrt(7)*eps_total/(2*np.sqrt(2))  # budget for estimation
    eps_tau = eps_init/np.sqrt(2)     # budget for GDP standard deviation estimation
    eps_ridge = eps_init/np.sqrt(2)   # budget for GDP huber+ridge estimation
    n_eps = eps_total.size
    step_size = 0.2   # learning rate of the private estimation

    results = []
    for gaussian_x in law_flags:
        for gaussian_err in law_flags:
            for p in dimensions:
                for a in signal_levels:
                    for b in noise_levels:
                        for n in sample_sizes:
                            # True beta: entries +a / -a, signs fixed by seed 0.
                            rgt.seed(0)
                            beta = a*np.ones(p)*(2*rgt.binomial(1, 0.5, size=p)-1)
                            beta_norm = beta.dot(beta)**0.5

                            def log_rel_err(estimate):
                                # log of the l2-error relative to the norm of beta
                                return np.log(np.sum((estimate - beta)**2)**0.5 / beta_norm)

                            # ---------------- generate data ----------------
                            rgt.seed(m+10)
                            if gaussian_x:
                                X = rgt.normal(0, 1, size=(n, p-1))
                            else:
                                X = rgt.uniform(-np.sqrt(3), np.sqrt(3), size=(n, p-1))
                            if gaussian_err:
                                err = rgt.normal(0, 1, n)
                            else:
                                err = rgt.standard_t(2.25, n)
                            Y = beta[0] + X.dot(beta[1:]) + np.sqrt(b)*err

                            # ------------- private (proposed) -------------
                            # Initialization for every epsilon comes first, then the
                            # noisy gradient descents: same draw order as the
                            # unrolled version.
                            tau_starts = []
                            beta_starts = []
                            for k in range(n_eps):
                                tau_k = St_GDP(Y, epsilon=eps_tau[k])
                                tau_starts.append(tau_k)
                                beta_starts.append(
                                    huber_ridge_GDP(X, Y, tau=tau_k, epsilon=eps_ridge[k])
                                )

                            gamma = 0.5*(p + np.log(n))**0.5
                            T = int(np.ceil(2*np.log(n)))
                            sigma_scale = 2*gamma*np.sqrt(T)/eps_est

                            model = Huber(X, Y, intercept=True)
                            private_errors = []
                            for k in range(n_eps):
                                tau_priv = .04* tau_starts[k] * (n * eps_total[k]  /(p+np.log(n)))**0.5
                                fit = model.noisygd(tau=tau_priv, lr=step_size,
                                                    beta0=beta_starts[k], B=gamma,
                                                    sigma_scale=sigma_scale[k], T=T)
                                private_errors.append(log_rel_err(fit['beta']))

                            # ----------------- non-private -----------------
                            model = Huber(X, Y, intercept=True)
                            tau0 = np.sqrt(np.mean(Y**2)-(np.mean(Y)**2))
                            tau_np = .2 *tau0* (n/(p+np.log(n)))**0.5
                            fit = model.gd(tau=tau_np, lr=0.5, beta0=np.array([]), T=T)
                            nonprivate_error = log_rel_err(fit['beta'])

                            results.append(np.array([nonprivate_error] + private_errors))

    return results

In [6]:
# Full simulation (about 6 hours according to the original note).
# One joblib task per repetition; each task sweeps the whole configuration grid.
cuda_cores = 20   # number of parallel joblib workers (passed as n_jobs)
M = 300           # number of Monte Carlo repetitions
re_GDP_SNR = np.array(
    Parallel(n_jobs=cuda_cores)(
        delayed(main_GDP_SNR_parallel)(m) for m in tqdm(range(M))
    )
)
# Average over repetitions (axis 0): one row per configuration, one column per estimator.
results_GDP_SNR_array = re_GDP_SNR.mean(axis=0)
#print(results_GDP_SNR_array)

  0%|          | 0/300 [00:00<?, ?it/s]

In [7]:
import pandas as pd
# Persist the averaged errors: one row per configuration (in the loop order of
# main_GDP_SNR_parallel), columns = [non-private, eps=0.3, eps=0.5, eps=0.9].
df_GDP = pd.DataFrame(data=results_GDP_SNR_array)
df_GDP.to_csv(path_or_buf="results_lowdim_GDP.csv", index=False)