In [95]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from scipy.spatial import distance
from sklearn import metrics
import math


In [96]:
#=============================specify the model to fit============================
# Binomial family with a logit link, i.e. logistic regression.
# The capitalised `Logit` class is used because the lower-case `logit` alias is
# deprecated in recent statsmodels releases; `Logit` is available in old and new ones.
fitmodel = sm.families.Binomial(link = sm.families.links.Logit())

#=============================function to calculate the inverse logit link============================
def invLink(lp):
    """Return the inverse logit (sigmoid) of a scalar linear predictor.

    Parameters
    ----------
    lp : scalar
        Linear predictor value.

    Returns
    -------
    float
        1 / (1 + exp(-lp)), computed so that neither branch can overflow.
    """
    # For lp >= 0, exp(-lp) <= 1, so the textbook form is safe.
    if lp >= 0:
        return 1/(1 + math.exp(-lp))
    # For lp < 0, exp(-lp) may exceed the float range (OverflowError below
    # roughly -709), so use the algebraically equivalent exp(lp) / (1 + exp(lp)).
    expLp = math.exp(lp)
    return expLp/(1 + expLp)

# vectorize the function; the explicit output type lets it accept empty arrays
# (np.vectorize cannot infer the output type from a size-0 input).
invLinkVec = np.vectorize(invLink, otypes = [float])
#========================Define the log-likelihood for evaluation========================
# logistic regression log-likelihood
def loglikeli(YEval, predCombined):
    """Mean Bernoulli log-likelihood of the responses under the predicted probabilities.

    Parameters
    ----------
    YEval : array-like
        Observed responses (0/1).
    predCombined : array-like
        Predicted probabilities, broadcastable against ``YEval``.

    Returns
    -------
    numpy.float64
        mean(Y * log(p) + (1 - Y) * log(1 - p)), using the convention
        0 * log(0) = 0 so that predictions of exactly 0 or 1 that agree with the
        response contribute 0 instead of nan. A prediction of exactly 0 or 1
        that contradicts the response still yields -inf.
    """
    y = np.asarray(YEval, dtype = float)
    p = np.asarray(predCombined, dtype = float)
    # np.where evaluates both branches, so silence the log(0) warnings that
    # arise in entries which are masked out anyway.
    with np.errstate(divide = "ignore", invalid = "ignore"):
        posTerm = np.where(y == 0, 0.0, y * np.log(p))
        negTerm = np.where(y == 1, 0.0, (1 - y) * np.log(1 - p))
    ll = np.mean(posTerm + negTerm)
    return ll

In [97]:
#=============================read data=====================================
import pickle

fname = "data_splitted_dic.p"
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that come from a trusted source.
# The context manager closes the file even if unpickling raises.
with open(fname, 'rb') as infile:
    new_dict = pickle.load(infile)

In [98]:
# Responses for the train / validation / test splits.
YTrain, YVal, YTest = new_dict["YTrain"], new_dict["YVal"], new_dict["YTest"]

# Feature blocks A and B for each split.
XATrain, XBTrain = new_dict["XATrain"], new_dict["XBTrain"]
XAVal, XBVal = new_dict["XAVal"], new_dict["XBVal"]
XATest, XBTest = new_dict["XATest"], new_dict["XBTest"]

In [99]:
# concatenate train and val for AE-AL
# Rows of the train and validation splits are stacked (axis 0 is the default).
Y = np.concatenate([YTrain, YVal])
XA = np.concatenate([XATrain, XAVal])
XB = np.concatenate([XBTrain, XBVal])

# Full design matrices: block A columns followed by block B columns.
X = np.concatenate([XA, XB], axis = 1)
XTest = np.concatenate([XATest, XBTest], axis = 1)

In [100]:
def fitAssisted(Y, XA, XB, YEval, XEval, kNum, fitmodel, loglikeli, reportAUC, reportll):
    """Fit an oracle GLM and an iteratively assisted GLM, tracking evaluation metrics.

    The oracle model is one GLM fitted on the column-wise concatenation of
    ``XA`` and ``XB``. The assisted model starts from a GLM fitted on ``XA``
    alone and then alternates ``kNum`` times: a GLM on ``XB`` is fitted with
    A's current linear predictor as offset, then a GLM on ``XA`` is refitted
    with B's linear predictor as offset. After the initial fit and after every
    iteration the combined coefficient vector is evaluated on ``(XEval, YEval)``.

    Parameters
    ----------
    Y : array-like
        Response used to fit every model.
    XA, XB : 2-d arrays
        The two feature blocks; they are concatenated along axis 1.
    YEval : array-like
        Response used for evaluation.
    XEval : 2-d array
        Evaluation features with columns ordered as ``[XA | XB]``.
    kNum : int
        Number of assisted-learning iterations.
    fitmodel :
        Passed as ``family`` to ``sm.GLM``.
    loglikeli : callable
        ``loglikeli(YEval, predCombined)``; only called when ``reportll`` is true.
    reportAUC, reportll : bool
        Select which metrics are computed and returned.

    Returns
    -------
    The shape of the result depends on the flags:

    * both true:  ``(AUCList, EuDisList, llList, AUC_oracle, ll_oracle)``
    * AUC only:   ``(AUCList, EuDisList, AUC_oracle)``
    * ll only:    ``(EuDisList, llList, ll_oracle)``
    * neither:    ``EuDisList``

    Every list has ``kNum + 1`` entries (initial value followed by one entry
    per iteration). ``EuDisList`` holds the Euclidean distance between the
    assisted and the oracle coefficient vectors.
    """
    X = np.concatenate((XA, XB), axis = 1)

    # Design matrices with an intercept column prepended.
    # NOTE(review): per the statsmodels docs, add_constant defaults to
    # has_constant='skip', i.e. it adds nothing when the input already contains
    # a constant column. The coefficient layout below assumes index 0 is the
    # intercept -- confirm the inputs carry no constant column.
    Xintercept = sm.add_constant(X)
    XAintercept = sm.add_constant(XA)
    XBintercept = sm.add_constant(XB)
    XEvalintercept = sm.add_constant(XEval)

    def _score(predCombined):
        """Return (AUC, ll) on the evaluation data; a metric that is not requested is None."""
        AUC = ll = None
        if reportAUC:
            fpr, tpr, _ = metrics.roc_curve(YEval, predCombined)
            AUC = metrics.auc(fpr, tpr)
        if reportll:
            ll = loglikeli(YEval, predCombined)
        return AUC, ll

    #========================fit oracle model========================
    oracle_results = sm.GLM(endog = Y, exog = Xintercept, family = fitmodel).fit()
    betaOracle = oracle_results.params

    # fitted probabilities of the oracle model on the evaluation data
    predCombined_oracle = invLinkVec(np.dot(XEvalintercept, betaOracle))
    AUC_oracle, ll_oracle = _score(predCombined_oracle)

    #========================fit assisted model========================
    # initial value: A fits alone
    resultsA = sm.GLM(endog = Y, exog = XAintercept, family = fitmodel).fit()
    lpA = np.dot(XAintercept, resultsA.params)

    # coefficients of the initial value; B's coefficients are taken as 0
    betaTemp = np.concatenate((resultsA.params, np.repeat(0, XB.shape[1])))
    predCombined = invLinkVec(np.dot(XEvalintercept, betaTemp))
    AUC, ll = _score(predCombined)

    # Metric histories. AUCList / llList only hold meaningful values (and are
    # only returned) when the corresponding flag is set.
    EuDisList = [distance.euclidean(betaOracle, betaTemp)]
    AUCList = [AUC]
    llList = [ll]

    #===============================assisted learning with kNum iterations=====================
    for _ in range(kNum):
        # B fits with A's linear predictor as offset
        resultsB = sm.GLM(endog = Y, exog = XBintercept, offset = lpA, family = fitmodel).fit()
        lpB = np.dot(XBintercept, resultsB.params)

        # A refits with B's linear predictor as offset
        resultsA = sm.GLM(endog = Y, exog = XAintercept, offset = lpB, family = fitmodel).fit()
        lpA = np.dot(XAintercept, resultsA.params)

        # combine the coefficients: the two intercepts are summed, followed by
        # A's slopes and then B's slopes
        betaTemp = np.concatenate(([resultsA.params[0] + resultsB.params[0]], resultsA.params[1:], resultsB.params[1:]))

        # linear predictor on the evaluation data, capped to [-100, 100]
        lp = np.clip(np.dot(XEvalintercept, betaTemp), -10**2, 10**2)
        predCombined = invLinkVec(lp)

        AUC, ll = _score(predCombined)
        AUCList.append(AUC)
        llList.append(ll)
        EuDisList.append(distance.euclidean(betaOracle, betaTemp))

    # return the evaluation results
    if reportAUC and reportll:
        return AUCList, EuDisList, llList, AUC_oracle, ll_oracle
    if reportAUC:
        return AUCList, EuDisList, AUC_oracle
    if reportll:
        return EuDisList, llList, ll_oracle
    return EuDisList

In [101]:
# fit AE-AL
kNum = 50
# Report the AUC but not the log-likelihood, so three values come back.
reportAUC, reportll = True, False
AUCList, EuDisList, AUC_oracle = fitAssisted(
    Y, XA, XB, YTest, XTest, kNum, fitmodel, loglikeli, reportAUC, reportll
)

In [103]:
#==============================================export the results===============================================
result = {"EuDis": EuDisList, "AUC": AUCList,  "AUC_oracle": AUC_oracle}

# Use a context manager so the output file is flushed and closed
# deterministically instead of relying on garbage collection.
with open("AE_AL_result_dic.p", "wb") as outfile:
    pickle.dump(result, outfile)
