In [21]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from scipy.spatial import distance
from sklearn import metrics
import math


In [22]:
#=============================specify the model to fit============================
# Binomial GLM family with a logit link, i.e. logistic regression.
# It is handed to fitVFL, which forwards it to sm.GLM as the `family` argument.
fitmodel = sm.families.Binomial(link = sm.families.links.logit())

#=============================inverse logit link (scalar and element-wise)============================
def invLink(lp):
    """Map a scalar linear predictor ``lp`` to a probability, 1 / (1 + exp(-lp)).

    ``math.exp`` raises ``OverflowError`` when ``-lp`` is too large to be
    represented as a float (very negative ``lp``); that error is not caught here
    and propagates to the caller.
    """
    expNeg = math.exp(-lp)
    return 1/(1 + expNeg)

# element-wise wrapper so invLink can be applied to numpy arrays
invLinkVec = np.vectorize(invLink)
#========================Define the log-likelihood for evaluation========================
# logistic regression log-likelihood
def loglikeli(YEval, predCombined):
    """Mean Bernoulli log-likelihood of the labels given predicted probabilities.

    Parameters
    ----------
    YEval : array-like of 0/1 labels.
    predCombined : array-like of predicted probabilities, same shape as ``YEval``.

    Returns
    -------
    The average of ``y*log(p) + (1-y)*log(1-p)`` over all observations.

    ``xlogy`` is used instead of ``y * np.log(p)`` so that a term with a zero
    label weight and a saturated probability (0 * log(0)) contributes 0 rather
    than ``nan``. A confidently wrong prediction (e.g. y = 1, p = 0) still
    yields ``-inf``.
    """
    # function-scope import: scipy.special is not among the notebook's top-level imports
    from scipy.special import xlogy

    perObs = xlogy(YEval, predCombined) + xlogy(1 - YEval, 1 - predCombined)
    return np.mean(perObs)

In [23]:
#=============================read data=====================================
import pickle
fname = "data_splitted_dic.p"
# NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# The context manager closes the file even if unpickling raises.
with open(fname, 'rb') as infile:
    new_dict = pickle.load(infile)

In [24]:
# Pull the pre-split arrays out of the loaded dictionary.
# responses for the train / validation / test splits
YTrain, YVal, YTest = (new_dict[key] for key in ("YTrain", "YVal", "YTest"))

# XA* and XB* feature blocks for the same three splits
XATrain, XAVal, XATest = (new_dict[key] for key in ("XATrain", "XAVal", "XATest"))
XBTrain, XBVal, XBTest = (new_dict[key] for key in ("XBTrain", "XBVal", "XBTest"))

In [25]:
# Merge the train and validation splits row-wise into one fitting set for AE-AL.
Y = np.concatenate([YTrain, YVal])
XA = np.concatenate([XATrain, XAVal])
XB = np.concatenate([XBTrain, XBVal])

# Join the A and B feature blocks column-wise to obtain the full design matrices.
X = np.concatenate([XA, XB], axis=1)
XTest = np.concatenate([XATest, XBTest], axis=1)

In [26]:
# function to fit and evaluate the VFL
# when batchsize=0, apply batch gradient descent instead of minibatch gradient descent
def fitVFL(Model, batchsize, Q, mu, eta0, etaFun, TrainPattern, Y, XA, XB, YEval, XEval, kNum, fitmodel, loglikeli, reportAUC, reportll):
    """Fit a two-party (A / B) vertically partitioned model and track its progress.

    An "oracle" model is fitted on the column-wise concatenation of ``XA`` and
    ``XB``. Party A is initialised from a fit on ``XA`` alone, party B from
    zeros, and the two coefficient vectors are then updated for ``kNum`` rounds
    of ``Q`` local gradient steps each. After the initialisation and after every
    round the combined coefficients are compared with the oracle.

    Parameters
    ----------
    Model : str
        ``"logcosh"`` selects the log-cosh loss (Newton fit for the oracle and
        initial values, tanh-based gradients in the rounds). Any other value
        uses ``sm.GLM`` with ``fitmodel`` for the oracle / initial fits and
        ``invLinkVec``-based gradients with a proximal term in the rounds.
    batchsize : int
        0 uses all rows in every round; otherwise that many rows are drawn
        without replacement via ``np.random.choice`` in every round.
    Q : int
        Number of local gradient steps per party and round.
    mu : float
        Weight of the proximal term (only used when ``Model != "logcosh"``).
    eta0, etaFun
        The step size of round ``k`` is ``etaFun(k, eta0)``.
    TrainPattern : str
        ``"sequential"``: A uses B's linear predictor computed after B's
        update. Any other value: A uses B's linear predictor from before B's
        update.
    Y, XA, XB
        Training response and the two feature blocks. Assumed to be numpy
        arrays with matching row counts and a 1-d ``Y`` -- TODO confirm.
    YEval, XEval
        Evaluation response and features; ``XEval`` columns must be ordered as
        the columns of ``XA`` followed by the columns of ``XB``.
    kNum : int
        Number of rounds.
    fitmodel
        Family object passed to ``sm.GLM``.
    loglikeli : callable
        ``loglikeli(YEval, pred)`` returning a scalar.
    reportAUC, reportll : bool
        Whether to compute AUC / log-likelihood on the evaluation data.

    Returns
    -------
    Depends on the report flags (each list has ``kNum + 1`` entries, the first
    one belonging to the initial values):

    * both flags:      ``AUCList, EuDisList, llList, AUC_oracle, ll_oracle``
    * ``reportAUC``:   ``AUCList, EuDisList, AUC_oracle``
    * ``reportll``:    ``EuDisList, llList, ll_oracle``
    * neither:         ``EuDisList``

    For ``Model == "logcosh"`` the oracle is not evaluated, so ``AUC_oracle``
    and ``ll_oracle`` are ``None``.
    """
    X = np.concatenate((XA, XB), axis=1)

    # design matrices with the intercept column added by statsmodels
    Xintercept = sm.add_constant(X)
    XAintercept = sm.add_constant(XA)
    XBintercept = sm.add_constant(XB)
    XEvalintercept = sm.add_constant(XEval)

    pB = XB.shape[1]
    needPred = reportAUC or reportll

    def _evaluate(betaOracle, betaTemp):
        """Return ``(EuDis, AUC, ll)`` for ``betaTemp``; AUC / ll are None unless requested.

        betaOracle: 1-d array of oracle coefficients
        betaTemp:   1-d array of combined (intercept, A, B) coefficients
        """
        AUC = None
        ll = None
        if needPred:
            # fitted probabilities on the evaluation data
            predCombined = invLinkVec(np.dot(XEvalintercept, betaTemp))
        # Euclidean distance between the oracle and the current coefficients
        EuDis = distance.euclidean(betaOracle, betaTemp)
        if reportAUC:
            fpr, tpr, _ = metrics.roc_curve(YEval, predCombined)
            AUC = metrics.auc(fpr, tpr)
        if reportll:
            ll = loglikeli(YEval, predCombined)
        return EuDis, AUC, ll

    if Model == "logcosh":
        #========================define functions for fitting log-cosh====================
        logcoshA = 0.3

        def gradCalAssisted(Y, XAintercept, offset, beta0):
            """Mean of tanh(a * residual) * x over rows; residual = Y - X beta - offset."""
            resid = Y - np.dot(XAintercept, beta0) - offset
            grad = np.mean((np.tanh(logcoshA * resid).reshape(XAintercept.shape[0], -1)) * XAintercept, axis=0)
            return grad

        def gradCal(Y, XAintercept, beta0):
            """Same as gradCalAssisted without an offset."""
            return gradCalAssisted(Y, XAintercept, 0.0, beta0)

        def HessCal(Y, XAintercept, beta0):
            """Weighted cross-product matrix used as the curvature in the Newton step."""
            resid = Y - np.dot(XAintercept, beta0)
            # NOTE(review): the exact second derivative of log-cosh would weight by
            # cosh(...)**(-2); the **(-1) weighting is kept as in the original. It gives
            # a more conservative step with the same stationary point -- confirm intent.
            weights = np.cosh(logcoshA * resid)**(-1)
            nObs = XAintercept.shape[0]
            # X^T diag(w) X via broadcasting; avoids materialising an n x n diagonal matrix
            Hess = logcoshA * nObs**(-1) * np.dot(np.transpose(XAintercept) * weights, XAintercept)
            return Hess

        def fitLogCosh(Y, XAintercept):
            """Newton iterations for the log-cosh fit, started from a GLM fit."""
            # initial value: GLM fit with the supplied family
            beta0 = sm.GLM(endog=Y, exog=XAintercept, family=fitmodel).fit().params

            # convergence threshold on the mean squared gradient
            thre = 1e-15
            # cap on the number of Newton steps so the loop always terminates
            maxIter = 100

            grad = gradCal(Y, XAintercept, beta0)
            gradL2 = np.mean(grad**2)
            ct = 0
            while gradL2 > thre and ct < maxIter:
                Hess = HessCal(Y, XAintercept, beta0)
                # solve H d = g instead of forming the explicit inverse
                beta0 = beta0 + np.linalg.solve(Hess, grad)
                grad = gradCal(Y, XAintercept, beta0)
                gradL2 = np.mean(grad**2)
                ct = ct + 1
            return beta0

    #========================fit oracle model========================
    if Model == "logcosh":
        betaOracle = fitLogCosh(Y, Xintercept)
    else:
        betaOracle = sm.GLM(endog=Y, exog=Xintercept, family=fitmodel).fit().params

    # evaluation metrics for the oracle model (not computed for log-cosh)
    AUC_oracle = None
    ll_oracle = None
    if needPred and Model != "logcosh":
        _, AUC_oracle, ll_oracle = _evaluate(betaOracle, betaOracle)

    #========================fit the initial model by A========================
    if Model == "logcosh":
        betaA = fitLogCosh(Y, XAintercept)
    else:
        betaA = sm.GLM(endog=Y, exog=XAintercept, family=fitmodel).fit().params

    # initialize betaB (intercept + pB slopes) by zeros
    betaB = np.zeros(pB + 1)

    #===============================Evaluation for the initial value=====================
    # combined coefficients at the start: A's fit followed by zeros for B's slopes
    betaTemp = np.concatenate((betaA, np.repeat(0, pB)))
    EuDis, AUC, ll = _evaluate(betaOracle, betaTemp)

    # per-round histories; the AUC / ll lists are only returned when requested
    EuDisList = [EuDis]
    AUCList = [AUC]
    llList = [ll]

    #===============================start fitting the assisted learning model with kNum iteration=====================
    for k in range(kNum):
        # calculate the current step size
        eta = etaFun(k, eta0)
        # most recent synchronized betas, anchors of the proximal terms
        betaB_old = betaB
        betaA_old = betaA

        if batchsize == 0:
            XBintercept_minibatch = XBintercept
            XAintercept_minibatch = XAintercept
            Y_minibatch = Y
        else:
            # sample a minibatch
            minibatch_ind = np.random.choice(X.shape[0], size=batchsize, replace=False)

            XBintercept_minibatch = XBintercept[minibatch_ind, :]
            XAintercept_minibatch = XAintercept[minibatch_ind, :]
            Y_minibatch = Y[minibatch_ind]

        # exchanged values: each party's linear predictor before any update in this round
        lpA = np.dot(XAintercept_minibatch, betaA)
        lpB = np.dot(XBintercept_minibatch, betaB)

        #===============================update of B===============================
        for _ in range(Q):
            if Model == "logcosh":
                Grad_B = gradCalAssisted(Y_minibatch, XBintercept_minibatch, lpA, betaB)
            else:
                # fitted probabilities from A's exchanged predictor plus B's current one
                lpB_Q = np.dot(XBintercept_minibatch, betaB)
                predCombined_B = invLinkVec(lpA + lpB_Q)
                # gradient with the proximal term
                Grad_B = np.dot((Y_minibatch - predCombined_B), XBintercept_minibatch)/len(Y_minibatch) - mu * (betaB - betaB_old)
            betaB = betaB + eta * Grad_B

        # if the pattern is sequential, A gets the updated lpB from B. Otherwise, it gets the lpB before B's update
        if TrainPattern == "sequential":
            lpB = np.dot(XBintercept_minibatch, betaB)

        #===============================update of A===============================
        for _ in range(Q):
            if Model == "logcosh":
                Grad_A = gradCalAssisted(Y_minibatch, XAintercept_minibatch, lpB, betaA)
            else:
                # fitted probabilities from A's current predictor plus B's exchanged one
                lpA_Q = np.dot(XAintercept_minibatch, betaA)
                predCombined_A = invLinkVec(lpA_Q + lpB)
                # gradient with the proximal term
                Grad_A = np.dot((Y_minibatch - predCombined_A), XAintercept_minibatch)/len(Y_minibatch) - mu * (betaA - betaA_old)
            betaA = betaA + eta * Grad_A

        # combine the two coefficient vectors: the intercepts add up, slopes are stacked A then B
        betaTemp = np.concatenate(([betaA[0] + betaB[0]], betaA[1:], betaB[1:]))

        # evaluate the performance of the current model
        EuDis, AUC, ll = _evaluate(betaOracle, betaTemp)
        EuDisList.append(EuDis)
        AUCList.append(AUC)
        llList.append(ll)

    # return the evaluation results
    if reportAUC:
        if reportll:
            return AUCList, EuDisList, llList, AUC_oracle, ll_oracle
        return AUCList, EuDisList, AUC_oracle
    if reportll:
        return EuDisList, llList, ll_oracle
    return EuDisList

In [27]:
# ---- run configuration for the VFL fits ----
kNum = 50                          # number of training rounds passed to fitVFL
reportAUC, reportll = True, False  # which evaluation metrics fitVFL reports

Model = "logistic"
TrainPattern = "parallel"
# batch sizes to try; 0 means full-batch gradient descent
batchsizeList = [0, 32]

# grid of initial step sizes: numEta0s values, evenly spaced on the log scale
# between minval and maxval (ascending order)
numEta0s = 20
minval = 10**(-5)
maxval = 0.1
eta0List = math.e**(np.linspace(math.log(minval), math.log(maxval), num=numEta0s))
# decaying learning-rate schedule, where k is the (0-based) iteration number
def etaFun(k, eta0):
    """Return the step size for iteration ``k``: ``eta0 / sqrt(k + 1)``."""
    return eta0 / math.sqrt(k + 1)



In [28]:
#=========================================training of FedBCD==============================

muList = [0, 0.1]
# FedSGD (Q = 1) is not trained together with FedBCD, so every mu uses the same Q grid.
QList = [5, 10, 25]
for batchsize in batchsizeList:
    for mu in muList:
        for Q in QList:
            # eta0List is ascending: once one step size diverges, the larger ones are skipped
            for eta0 in eta0List:
                try:
                    AUCList, EuDisList, AUC_oracle = fitVFL(Model, batchsize, Q, mu, eta0, etaFun, TrainPattern, Y, XA, XB, YTest, XTest, kNum, fitmodel, loglikeli, reportAUC, reportll)
                except (OverflowError, FloatingPointError, ValueError) as err:
                    # Only numerical blow-ups are treated as "step size too large":
                    # math.exp overflow in the inverse link, or non-finite predictions
                    # rejected by the metric code. Anything else (including
                    # KeyboardInterrupt and I/O errors) propagates.
                    print('The step size is too large, stop and try other settings'
                          + ' (' + type(err).__name__ + ': ' + str(err) + ')')
                    break

                #==============================================export the results===============================================
                result = {"EuDis": EuDisList, "AUC": AUCList,  "AUC_oracle": AUC_oracle}
                outName = "VFL_result_batchsize" + str(batchsize) + "_Q_" + str(Q) + "_mu_" + str(mu) + "_eta0_" + str(eta0) + "TrainPattern" + TrainPattern + "_dic.p"
                # context manager closes (and flushes) the file after every dump
                with open(outName, "wb") as outfile:
                    pickle.dump(result, outfile)




            

The step size is too large, stop and try other settings
The step size is too large, stop and try other settings
The step size is too large, stop and try other settings
The step size is too large, stop and try other settings
The step size is too large, stop and try other settings
The step size is too large, stop and try other settings
The step size is too large, stop and try other settings
The step size is too large, stop and try other settings
The step size is too large, stop and try other settings
The step size is too large, stop and try other settings
The step size is too large, stop and try other settings
The step size is too large, stop and try other settings


In [29]:
#=========================================training of FedSGD==============================
# step-size grid for FedSGD
# NOTE(review): this range is currently identical to (minval, maxval) used for FedBCD;
# raise maxval2 if a set of larger step sizes is intended.
minval2 = 10**(-5)
maxval2 = 0.1
eta0List2 = math.e**(np.linspace(math.log(minval2), math.log(maxval2), num=numEta0s))

# FedSGD: a single local step per round and no proximal term
Q = 1
mu = 0
for batchsize in batchsizeList:
    # eta0List2 is ascending: once one step size diverges, the larger ones are skipped
    for eta0 in eta0List2:
        try:
            AUCList, EuDisList, AUC_oracle = fitVFL(Model, batchsize, Q, mu, eta0, etaFun, TrainPattern, Y, XA, XB, YTest, XTest, kNum, fitmodel, loglikeli, reportAUC, reportll)
        except (OverflowError, FloatingPointError, ValueError) as err:
            # Only numerical blow-ups are treated as "step size too large":
            # math.exp overflow in the inverse link, or non-finite predictions
            # rejected by the metric code. Anything else (including
            # KeyboardInterrupt and I/O errors) propagates.
            print('The step size is too large, stop and try other settings'
                  + ' (' + type(err).__name__ + ': ' + str(err) + ')')
            break

        #==============================================export the results===============================================
        result = {"EuDis": EuDisList, "AUC": AUCList,  "AUC_oracle": AUC_oracle}
        outName = "VFL_result_batchsize" + str(batchsize) + "_Q_" + str(Q) + "_mu_" + str(mu) + "_eta0_" + str(eta0) + "TrainPattern" + TrainPattern + "_dic.p"
        # context manager closes (and flushes) the file after every dump
        with open(outName, "wb") as outfile:
            pickle.dump(result, outfile)

The step size is too large, stop and try other settings
The step size is too large, stop and try other settings


In [None]:
# store the bounds of both step-size grids next to the result files
resultVals = {"maxval": maxval, "minval": minval, "maxval2": maxval2, "minval2": minval2}

# context manager closes (and flushes) the file once the dump is done
with open("minmaxVals_dic.p", "wb") as outfile:
    pickle.dump(resultVals, outfile)

