In [1]:
from all_functions import * 
from sklearn.decomposition import PCA
import copy
from scipy.linalg import eigh
from sklearn.linear_model import LinearRegression

In [2]:
"""
This new functions takes the following inputs: 

    name -- the name of the Y variables, should be within double quote
    ncomps -- number of principal components to be used 
    error_comps -- number of error components to be used 
    outlier_remove -- a logical variable indicating whether to remove outliers 
    
This function select the Y variable based on the input name, and set X as 
all other variables. 

"""
def FRED_MD_DATA_PREP(name, ncomps = 5, error_comps = 5, outlier_remove = False):
   
    """
    Read the entire data
    """
    
    data = pd.read_csv('Transformed_na_removed.csv')
    
    """
    Set the Y axis to be the column of given name in the input, 
    remove that column and set the rest of the matrix as X. 
    Finally, do PCA on the X matrix. 
    
    """
    
    Y_axis = name
    X = copy.deepcopy(data)
    X.pop(name)
    X = X.to_numpy()
    n_data = X.shape[0]
    Y = data[name].to_numpy()
    W, V = eigh(X @ X.T)
    F_hat = np.sqrt(n_data) * V[:,-ncomps:] # select last ncomps columns of V
    B_hat = (1/n_data) * (X.T @ F_hat)
    
    if error_comps > 0:
        
        """
        First we project out the space of principal components
        to estimate the idiosyncratic errors. 
        """
        
#         loadings = X.T @ X_pca @ np.linalg.inv(X_pca.T @ X_pca) # OLS of X on X.pca
#         errors = X - X_pca @ loadings.T # OLS residuals
        U_hat = X - (F_hat @ B_hat.T)
        model = LinearRegression()
        model.fit(F_hat, Y)
        Y_u = Y - model.predict(F_hat)
        
        
        """
            Next we apply conditional SURE independent screening to select the 
            idiosyncratic components.
        """
       
        def cor_cal(x):
            return np.corrcoef(x, Y_u)[0, 1]
        
        cor_vec = np.apply_along_axis(cor_cal, 0, U_hat)
        idx = np.argsort(-np.abs(cor_vec))
        errors_to_add = U_hat[:,idx[:error_comps]]
        Data = np.hstack([F_hat, errors_to_add, Y.reshape(-1, 1)])
    
    else:
        Data = np.hstack([F_hat, Y.reshape(-1, 1)])
    
    if outlier_remove:
        Q1 = np.percentile(Y, 25, method = 'midpoint')
        Q3 = np.percentile(Y, 75, method = 'midpoint')
        IQR = Q3 - Q1
        upper=Q3+1.5*IQR
        upper_array=np.array(Y<=upper)
        lower=Q1-1.5*IQR
        lower_array=np.array(Y>=lower)
        index_keep = upper_array & lower_array
        Data = Data[index_keep[:,0].tolist(),:]
        print("The number of data being removed is ", Y.shape[0]-Data.shape[0])
        
    return Data, Y_axis

In [3]:
def UTOPIA_FRED_MD_MULTIVARIATE(Data, shrink = True):
    train_idx = int(Data.shape[0] * 0.8)
    train_data, test_data = np.split(Data, [train_idx])
    X_pre = train_data[:,:-1]
    Y_pre = train_data[:,-1].reshape(-1,1)
    X_opt = train_data[:,:-1]
    Y_opt = train_data[:,-1].reshape(-1,1)
    X_adj = train_data[:,:-1]
    Y_adj = train_data[:,-1].reshape(-1,1)
    X_t = test_data[:,:-1]
    Y_t = test_data[:,-1].reshape(-1,1)


    n_pre = len(Y_pre)
    n_opt = len(Y_opt)
    n_adj = len(Y_adj)
    n_t = len(Y_t)

    # Obtain mean estimator
    known_mean = "False"
    if known_mean == "True":
        M_pre = np.zeros(n_pre).reshape(-1,1)
        M_opt = np.zeros(n_opt).reshape(-1,1)
        M_adj = np.zeros(n_adj).reshape(-1,1)
        M_t = np.zeros(n_t).reshape(-1,1)
    else:
        est_type = "NN1"
        M_pre, M_opt, M_adj, M_t = mean_est(est_type,X_pre,Y_pre,X_opt,X_adj,X_t)

    # Obtain variance estimator
    var_opt, var_adj, var_t = var_est(X_pre,Y_pre,M_pre,X_opt,X_adj,X_t,est_type ="NN1")


    # Obtain quantile estimators
    quantile = [0.05,0.35,0.65,0.95]
    m1,Q1_opt,Q1_adj,Q1_t = est_quantile("NN1",quantile[0],X_pre,Y_pre,X_opt,X_adj,X_t)
    m2,Q2_opt,Q2_adj,Q2_t = est_quantile("NN2",quantile[1],X_pre,Y_pre,X_opt,X_adj,X_t)
    m3,Q3_opt,Q3_adj,Q3_t = est_quantile("qrf",quantile[2],X_pre,Y_pre,X_opt,X_adj,X_t)
    m4,Q4_opt,Q4_adj,Q4_t = est_quantile("gb",quantile[3],X_pre,Y_pre,X_opt,X_adj,X_t)

    # construct estimator matrix
    E_opt = np.hstack(((Q1_opt-M_opt)**2, (Q2_opt-M_opt)**2, (Q3_opt-M_opt)**2, (Q4_opt-M_opt)**2, var_opt))
    E_opt = E_opt.T
    E_adj = np.hstack(((Q1_adj-M_adj)**2, (Q2_adj-M_adj)**2, (Q3_adj-M_adj)**2, (Q4_adj-M_adj)**2, var_adj))
    E_adj = E_adj.T
    E_t = np.hstack(((Q1_t-M_t)**2, (Q2_t-M_t)**2, (Q3_t-M_t)**2, (Q4_t-M_t)**2, var_t))
    E_t = E_t.T


    # solve optimization problem

    optimal_weight, V100_adj, V100_t= solve_opt(X_opt,Y_opt, M_opt, M_adj, M_t, X_adj, X_t, "aug", E_opt, E_adj, E_t)
    # opt_sol, V100_adj, V100_t = solve_opt(X_opt,Y_opt, M_opt, M_adj, M_t, X_adj, X_t, "rkhs_poly", degree = 2)
    # opt_sol, V100_adj, V100_t = solve_opt(X_opt,Y_opt, M_opt, M_adj, M_t, X_adj, X_t, "rkhs_rbf", sigma = 1)

    # adjust interval
    if shrink: 
        alpha = 0.05
        delta = interval_adj(X_adj,Y_adj,M_adj,V100_adj,alpha,stepsize = 0.001)

        # plot
        V_alpha_t = delta*V100_t
    else:
        V_alpha_t = V100_t
    
    coverage = (np.power(Y_t[:,0]-M_t[:,0], 2) <= V_alpha_t[:,0]).mean()
    bandwidth = np.mean(V_alpha_t[:,0])
    print("The overall coverage for UTOPIA is", coverage)
    print("Mean bandwidth on test data for UTOPIA is", bandwidth)
    return X_t,Y_t,M_t,V_alpha_t,coverage,bandwidth

In [4]:
def LQR_FRED_MD_MULTIVARIATE(Data):
    train_idx = int(Data.shape[0] * 0.8)
    train_data, test_data = np.split(Data, [train_idx])

    X_lin = train_data[:,:-1]
    Y_lin = train_data[:,-1].reshape(-1,1)
    y_lin = Y_lin[:,0]
    X_quantile = train_data[:,:-1]
    Y_quantile = train_data[:,-1].reshape(-1,1)
    X_test = test_data[:,:-1]
    Y_test = test_data[:,-1].reshape(-1,1)

    # Estimate the mean using NN1
    est_type = "NN1"
    M_quantile, M_test = mean_est_others(est_type,X_lin,Y_lin,X_quantile,X_test)

    # Estimate the quantile
    alpha = 0.05

    model_quantile = QuantileRegressor(quantile=1-(alpha/2), alpha=0, solver = 'highs')
    model_quantile.fit(X_quantile, (Y_quantile-M_quantile).reshape(-1))
    Q_test = model_quantile.predict(X_test)


    V_test = Q_test**2
    V_test = V_test.reshape(-1,1)
    coverage = (np.power(Y_test[:,0]-M_test[:,0], 2) <= V_test[:,0]).mean()
    bandwidth = np.mean(V_test[:,0])
    print("The overall coverage for LQR is", coverage)
    print("Mean bandwidth on test data for LQR is", bandwidth)
    return X_test,Y_test,M_test,V_test, coverage, bandwidth

In [5]:
def SplitCF_FRED_MD_MULTIVARIATE(Data): 
    train_idx = int(Data.shape[0] * 0.8)
    train_data, test_data = np.split(Data, [train_idx])


    X_lin = train_data[:,:-1]
    Y_lin = train_data[:,1].reshape(-1,1)
    y_lin = Y_lin[:,0]
    X_res = train_data[:,:-1]
    Y_res = train_data[:,1].reshape(-1,1)
    y_res = Y_res[:,0]
    X_test = test_data[:,:-1]
    Y_test = test_data[:,1].reshape(-1,1)
    y_test = Y_test[:,0]

    # Estimate the mean using NN1
    est_type = "NN1"
    Y_res_pred, M_test = mean_est_others(est_type,X_lin,Y_lin,X_res,X_test)
    y_res_pred = Y_res_pred[:,0]

    # Calculate the conformity scores
    residuals = np.abs(y_res - y_res_pred)

    alpha = 0.05  # 95% confidence level
    k = int((1 - alpha) * len(y_lin))
    residuals_sorted = np.sort(residuals)
    threshold = residuals_sorted[k]

    # Calculate the prediction interval
    v_test = (threshold**2)*np.ones(len(y_test))
    V_test = v_test.reshape(-1,1)
    coverage = (np.power(Y_test[:,0]-M_test[:,0], 2) <= V_test[:,0]).mean()
    bandwidth = np.mean(V_test[:,0])
    print("The overall coverage for SplitCF is", coverage)
    print("Mean bandwidth on test data for SplitCF is", bandwidth)
    return X_test,Y_test,M_test,V_test,coverage,bandwidth

In [6]:

Data, Y_axis = FRED_MD_DATA_PREP('FEDFUNDS', ncomps=2, error_comps=2)


X_t,Y_t,M_t,V_alpha_t,covarage,bandwidth = UTOPIA_FRED_MD_MULTIVARIATE(Data, shrink = True)
X_test,Y_test,M_test,V_test,covarage,bandwidth = LQR_FRED_MD_MULTIVARIATE(Data)
X_test,Y_test,M_test,V_test,coverage,bandwidth = SplitCF_FRED_MD_MULTIVARIATE(Data)

The overall coverage for UTOPIA is 0.989010989010989
Mean bandwidth on test data for UTOPIA is 0.05000081006534005
The overall coverage for LQR is 0.4175824175824176
Mean bandwidth on test data for LQR is 0.002460689001458032
The overall coverage for SplitCF is 0.6043956043956044
Mean bandwidth on test data for SplitCF is 0.004344519265782366


In [7]:
data = pd.read_csv('Transformed_na_removed.csv')
col = data.columns

In [8]:
output_coverage = np.zeros((len(data.columns)-1, 3))
output_bandwidth = np.zeros((len(data.columns)-1, 3))

for j in range(1, len(data.columns)):
    idx = data.columns[j]
    Data, Y_axis = FRED_MD_DATA_PREP(idx, ncomps=2, error_comps=2)
    
    X_t,Y_t,M_t,V_alpha_t, coverage_utopia, bandwidth_utopia = UTOPIA_FRED_MD_MULTIVARIATE(Data, shrink = True)
    X_test,Y_test,M_test,V_test,coverage_lqr, bandwidth_lqr = LQR_FRED_MD_MULTIVARIATE(Data)
    X_test,Y_test,M_test,V_test, coverage_split, bandwidth_split = SplitCF_FRED_MD_MULTIVARIATE(Data)
    
    output_coverage[j,] = np.array([coverage_utopia, coverage_lqr, coverage_split]).reshape(-1)
    output_bandwidth[j,] = np.array([bandwidth_utopia, bandwidth_lqr, bandwidth_split]).reshape(-1)
    print(j, "th index is done!")

The overall coverage for UTOPIA is 0.6703296703296703
Mean bandwidth on test data for UTOPIA is 0.00018222621644607435
The overall coverage for LQR is 0.6153846153846154
Mean bandwidth on test data for LQR is 0.000152136549915781
The overall coverage for SplitCF is 0.5054945054945055
Mean bandwidth on test data for SplitCF is 0.002352426249221485
1 th index is done!




The overall coverage for UTOPIA is 0.6153846153846154
Mean bandwidth on test data for UTOPIA is 0.00017330117484767128
The overall coverage for LQR is 0.6373626373626373
Mean bandwidth on test data for LQR is 0.00018883164663082753
The overall coverage for SplitCF is 0.4945054945054945
Mean bandwidth on test data for SplitCF is 0.002344698801502436
2 th index is done!
The overall coverage for UTOPIA is 0.6263736263736264
Mean bandwidth on test data for UTOPIA is 0.0001913887330110964
The overall coverage for LQR is 0.4065934065934066
Mean bandwidth on test data for LQR is 0.00010354854292050874
The overall coverage for SplitCF is 0.46153846153846156
Mean bandwidth on test data for SplitCF is 0.002342474412501553
3 th index is done!
The overall coverage for UTOPIA is 0.8571428571428571
Mean bandwidth on test data for UTOPIA is 0.00034666384385599684
The overall coverage for LQR is 0.7142857142857143
Mean bandwidth on test data for LQR is 0.0002087958056166355
The overall coverage for Sp



The overall coverage for UTOPIA is 0.6373626373626373
Mean bandwidth on test data for UTOPIA is 0.0001917439449883254
The overall coverage for LQR is 0.5164835164835165
Mean bandwidth on test data for LQR is 0.00015594872958674447
The overall coverage for SplitCF is 0.5274725274725275
Mean bandwidth on test data for SplitCF is 0.00253358134283988
7 th index is done!
The overall coverage for UTOPIA is 0.6153846153846154
Mean bandwidth on test data for UTOPIA is 0.00017423132111903225
The overall coverage for LQR is 0.4945054945054945
Mean bandwidth on test data for LQR is 0.00011653714248864585
The overall coverage for SplitCF is 0.5604395604395604
Mean bandwidth on test data for SplitCF is 0.002937484375050804
8 th index is done!
The overall coverage for UTOPIA is 0.6153846153846154
Mean bandwidth on test data for UTOPIA is 0.00017597967070776224
The overall coverage for LQR is 0.6043956043956044
Mean bandwidth on test data for LQR is 0.0001956459362664035
The overall coverage for Spli



The overall coverage for UTOPIA is 0.6043956043956044
Mean bandwidth on test data for UTOPIA is 0.00019176768709165524
The overall coverage for LQR is 0.6043956043956044
Mean bandwidth on test data for LQR is 0.0001650301298554606
The overall coverage for SplitCF is 0.46153846153846156
Mean bandwidth on test data for SplitCF is 0.002225867159971563
32 th index is done!
The overall coverage for UTOPIA is 0.5604395604395604
Mean bandwidth on test data for UTOPIA is 0.0001707406724241567
The overall coverage for LQR is 0.6373626373626373
Mean bandwidth on test data for LQR is 0.00017647150864270173
The overall coverage for SplitCF is 0.4725274725274725
Mean bandwidth on test data for SplitCF is 0.00228077117561467
33 th index is done!
The overall coverage for UTOPIA is 0.8021978021978022
Mean bandwidth on test data for UTOPIA is 0.0005915822157648058
The overall coverage for LQR is 0.5934065934065934
Mean bandwidth on test data for LQR is 0.00019506804028770973
The overall coverage for Sp



The overall coverage for UTOPIA is 0.3956043956043956
Mean bandwidth on test data for UTOPIA is 0.0001809992844129827
The overall coverage for LQR is 0.5824175824175825
Mean bandwidth on test data for LQR is 0.00018605690666764478
The overall coverage for SplitCF is 0.4725274725274725
Mean bandwidth on test data for SplitCF is 0.0022949339188943767
36 th index is done!
The overall coverage for UTOPIA is 0.6813186813186813
Mean bandwidth on test data for UTOPIA is 0.0001756647745579485
The overall coverage for LQR is 0.6043956043956044
Mean bandwidth on test data for LQR is 0.00015301781254212582
The overall coverage for SplitCF is 0.46153846153846156
Mean bandwidth on test data for SplitCF is 0.002302944557336688
37 th index is done!
The overall coverage for UTOPIA is 0.4725274725274725
Mean bandwidth on test data for UTOPIA is 0.0001922697259056958
The overall coverage for LQR is 0.5054945054945055
Mean bandwidth on test data for LQR is 0.000161452814228166
The overall coverage for Sp



The overall coverage for UTOPIA is 0.8681318681318682
Mean bandwidth on test data for UTOPIA is 0.00021183629683086568
The overall coverage for LQR is 0.5164835164835165
Mean bandwidth on test data for LQR is 0.00013414281373989184
The overall coverage for SplitCF is 0.45054945054945056
Mean bandwidth on test data for SplitCF is 0.002189421361870305
41 th index is done!
The overall coverage for UTOPIA is 0.7472527472527473
Mean bandwidth on test data for UTOPIA is 0.00018633996147584213
The overall coverage for LQR is 0.6043956043956044
Mean bandwidth on test data for LQR is 0.0001637744703339571
The overall coverage for SplitCF is 0.42857142857142855
Mean bandwidth on test data for SplitCF is 0.002142533078551305
42 th index is done!
The overall coverage for UTOPIA is 0.5934065934065934
Mean bandwidth on test data for UTOPIA is 0.0001820779624825388
The overall coverage for LQR is 0.5714285714285714
Mean bandwidth on test data for LQR is 0.0001608566644647923
The overall coverage for 



The overall coverage for UTOPIA is 0.8131868131868132
Mean bandwidth on test data for UTOPIA is 0.00018716442195082693
The overall coverage for LQR is 0.4945054945054945
Mean bandwidth on test data for LQR is 0.00012237729620911286
The overall coverage for SplitCF is 0.5164835164835165
Mean bandwidth on test data for SplitCF is 0.0025423051530061914
72 th index is done!
The overall coverage for UTOPIA is 0.8351648351648352
Mean bandwidth on test data for UTOPIA is 0.00029209324976607257
The overall coverage for LQR is 0.7582417582417582
Mean bandwidth on test data for LQR is 0.00030705423316092917
The overall coverage for SplitCF is 0.5384615384615384
Mean bandwidth on test data for SplitCF is 0.0025444478911147905
73 th index is done!
The overall coverage for UTOPIA is 0.8351648351648352
Mean bandwidth on test data for UTOPIA is 0.0003646368719815483
The overall coverage for LQR is 0.6373626373626373
Mean bandwidth on test data for LQR is 0.00014610048634262787
The overall coverage fo



The overall coverage for UTOPIA is 0.6923076923076923
Mean bandwidth on test data for UTOPIA is 0.0001814053763959844
The overall coverage for LQR is 0.5274725274725275
Mean bandwidth on test data for LQR is 0.000130123366011963
The overall coverage for SplitCF is 0.4945054945054945
Mean bandwidth on test data for SplitCF is 0.002306691203823418
111 th index is done!




The overall coverage for UTOPIA is 0.9230769230769231
Mean bandwidth on test data for UTOPIA is 0.0002034282713646593
The overall coverage for LQR is 0.4835164835164835
Mean bandwidth on test data for LQR is 0.00011987542674033671
The overall coverage for SplitCF is 0.4945054945054945
Mean bandwidth on test data for SplitCF is 0.0023203090369158927
112 th index is done!
The overall coverage for UTOPIA is 0.7362637362637363
Mean bandwidth on test data for UTOPIA is 0.00021804565915999944
The overall coverage for LQR is 0.7362637362637363
Mean bandwidth on test data for LQR is 0.00021166104690329316
The overall coverage for SplitCF is 0.6593406593406593
Mean bandwidth on test data for SplitCF is 0.004059242754133903
113 th index is done!
The overall coverage for UTOPIA is 0.6043956043956044
Mean bandwidth on test data for UTOPIA is 0.000174256439065513
The overall coverage for LQR is 0.5274725274725275
Mean bandwidth on test data for LQR is 0.0001291381529610944
The overall coverage for 

IndexError: index 126 is out of bounds for axis 0 with size 126

In [11]:
idx = np.where(output_coverage[:,0] > 0.945)[0]
print(data.columns[idx])
print(output_coverage[idx,:])
print(output_bandwidth[idx,:])

Index(['IPFUELS', 'UEMP15OV', 'HOUST', 'HOUSTNE', 'HOUSTMW', 'HOUSTS',
       'PERMIT', 'PERMITNE', 'PERMITMW', 'PERMITS', 'TOTRESNS',
       'S.P.div.yield', 'S.P.PE.ratio', 'FEDFUNDS', 'CP3Mx', 'TB3MS', 'TB6MS',
       'GS10', 'AAA', 'BAA', 'TB3SMFFM', 'AAAFFM', 'BAAFFM', 'PPICMM',
       'DTCOLNVHFNM'],
      dtype='object')
[[0.96703297 0.58241758 0.1978022 ]
 [0.94505495 0.8021978  0.46153846]
 [0.94505495 0.         0.46153846]
 [0.96703297 0.08791209 0.46153846]
 [0.95604396 0.         0.53846154]
 [0.96703297 0.         0.46153846]
 [0.97802198 0.         0.46153846]
 [0.98901099 0.07692308 0.45054945]
 [0.96703297 0.         0.57142857]
 [0.96703297 0.         0.46153846]
 [0.95604396 1.         0.49450549]
 [0.94505495 0.59340659 0.65934066]
 [0.97802198 1.         0.47252747]
 [0.98901099 0.41758242 0.6043956 ]
 [0.94505495 0.45054945 0.68131868]
 [0.94505495 0.8021978  0.68131868]
 [0.96703297 0.43956044 0.58241758]
 [0.97802198 0.81318681 0.62637363]
 [0.97802198 0.9780219