In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import gaussian_kde
from xgboost import XGBRegressor
from joblib import Parallel, delayed
from tqdm import tqdm

# ----- Helper functions -----


def l1(data, x, kde):
    """Evaluate a Gaussian kernel centred on every element of ``data`` at ``x``.

    Parameters
    ----------
    data : array-like of numbers
        Sample points; one kernel value is produced per element.
    x : float
        Location at which each kernel is evaluated.
    kde : object exposing a 2-D ``covariance`` attribute
        The bandwidth is taken as the square root of its ``[0, 0]`` entry.

    Returns
    -------
    numpy.ndarray
        ``exp(-0.5 * ((data - x) / h) ** 2) / (h * sqrt(2 * pi))`` with ``h``
        the bandwidth described above.
    """
    h = np.sqrt(kde.covariance)[0, 0]
    # Standardised distance of every sample point from the evaluation point.
    z = (data - x) / h
    gaussian_norm = h * np.sqrt(2 * np.pi)
    return np.exp(-0.5 * z ** 2) / gaussian_norm

def generate_data(mean, cov, n, m):
    """Draw one labeled and one unlabeled Gaussian sample and halve each.

    Parameters
    ----------
    mean, cov : array-like
        Mean vector and covariance matrix passed to
        ``np.random.multivariate_normal``. Column 0 of each draw is treated as
        the response and the remaining columns as predictors.
    n : int
        Number of labeled rows to draw.
    m : int
        Number of unlabeled rows to draw (only their predictors are kept).

    Returns
    -------
    tuple
        ``(X1, Y1, X2, Y2, X1_unlabel, X2_unlabel)`` where the labeled and the
        unlabeled predictors are each split 50/50 by ``train_test_split`` with
        ``random_state=42``.
    """
    # Labeled sample first, so the global RNG is consumed in the same order
    # regardless of how the result is used afterwards.
    labeled = np.random.multivariate_normal(mean, cov, n)
    response, predictors = labeled[:, 0], labeled[:, 1:]
    X1, X2, Y1, Y2 = train_test_split(
        predictors, response, test_size=0.5, random_state=42
    )

    # Unlabeled sample: the response column is discarded.
    unlabeled_predictors = np.random.multivariate_normal(mean, cov, m)[:, 1:]
    X1_unlabel, X2_unlabel = train_test_split(
        unlabeled_predictors, test_size=0.5, random_state=42
    )

    return X1, Y1, X2, Y2, X1_unlabel, X2_unlabel

def calculate_Uss(Y, f1_label, f2_label, f1_unlabel, f2_unlabel, kde_value):
    """Shift ``kde_value`` by the gap between pooled and labeled prediction means.

    Parameters
    ----------
    Y : unused
        Kept only so the signature matches the other ``calculate_*`` helpers.
    f1_label, f2_label : 1-D arrays
        Predictions on the two labeled folds.
    f1_unlabel, f2_unlabel : 1-D arrays
        Predictions on the two unlabeled folds.
    kde_value : float or array
        Baseline estimate to be adjusted.

    Returns
    -------
    ``kde_value - mean(labeled predictions) + mean(labeled and unlabeled predictions)``.
    """
    labeled_mean = np.mean(np.concatenate((f1_label, f2_label)))
    pooled_mean = np.mean(
        np.concatenate((f1_label, f2_label, f1_unlabel, f2_unlabel))
    )
    # Subtract first, then add, to keep the original floating-point order.
    adjusted = kde_value - labeled_mean
    return adjusted + pooled_mean

def calculate_Uss_optimal(Y, f1_label, f2_label, f1_unlabel, f2_unlabel, ell1_Y1, ell1_Y2, kde_value):
    """Adjust ``kde_value`` using per-fold rescaled predictions.

    For each fold a scalar weight is computed as the empirical covariance
    between the fold's ``ell1`` values and its labeled predictions, divided by
    the variance of that fold's labeled and unlabeled predictions combined.
    The predictions are multiplied by their fold weight before the same
    "minus labeled mean, plus pooled mean" correction is applied.

    Parameters
    ----------
    Y : unused
        Present for signature compatibility only.
    f1_label, f2_label, f1_unlabel, f2_unlabel : 1-D numpy arrays
        Predictions on the labeled / unlabeled halves of folds 1 and 2.
    ell1_Y1, ell1_Y2 : 1-D numpy arrays
        Values aligned element-wise with ``f1_label`` and ``f2_label``.
    kde_value : float or array
        Baseline estimate to be adjusted.

    Returns
    -------
    The adjusted estimate, same shape as ``kde_value``.
    """

    def _fold_weight(ell, f_lab, f_unlab):
        # Empirical covariance (1/n normalisation) over the pooled variance.
        cross_moment = np.mean((ell - np.mean(ell)) * (f_lab - np.mean(f_lab)))
        return cross_moment / np.var(np.concatenate((f_lab, f_unlab)))

    gamma1 = _fold_weight(ell1_Y1, f1_label, f1_unlabel)
    gamma2 = _fold_weight(ell1_Y2, f2_label, f2_unlabel)

    scaled_labeled = np.concatenate((gamma1 * f1_label, gamma2 * f2_label))
    scaled_pooled = np.concatenate(
        (scaled_labeled, gamma1 * f1_unlabel, gamma2 * f2_unlabel)
    )

    adjusted = kde_value - np.mean(scaled_labeled)
    return adjusted + np.mean(scaled_pooled)

def calculate_Uss_optimal_pooled(Y, f1_label_1, f2_label_1, f1_unlabel_1, f2_unlabel_1, 
                                 f1_label_2, f2_label_2, f1_unlabel_2, f2_unlabel_2, 
                                 f1_label_3, f2_label_3, f1_unlabel_3, f2_unlabel_3, 
                                 ell1_Y1, ell1_Y2, kde_value):
    """Adjust ``kde_value`` with a weighted combination of three predictors.

    For each fold the three predictors' labeled and unlabeled predictions are
    stacked, their 3x3 sample covariance (``np.cov``) is inverted, and the
    inverse is applied to the vector of empirical covariances between the
    fold's ``ell1`` values and each predictor's labeled predictions. The
    resulting weights scale each predictor's predictions before the
    "minus labeled mean, plus pooled mean" correction is accumulated over the
    three predictors.

    Parameters
    ----------
    Y : unused
        Present for signature compatibility only.
    f{1,2}_label_{1,2,3}, f{1,2}_unlabel_{1,2,3} : 1-D numpy arrays
        Predictions of predictor 1/2/3 on the labeled / unlabeled halves of
        fold 1 / fold 2.
    ell1_Y1, ell1_Y2 : 1-D numpy arrays
        Values aligned element-wise with the fold-1 / fold-2 labeled predictions.
    kde_value : float or array
        Baseline estimate to be adjusted.

    Returns
    -------
    The adjusted estimate.

    Raises
    ------
    numpy.linalg.LinAlgError
        If either 3x3 covariance matrix is singular.
    """
    # (labeled, unlabeled) prediction pairs, one per predictor, for each fold.
    fold1 = ((f1_label_1, f1_unlabel_1),
             (f1_label_2, f1_unlabel_2),
             (f1_label_3, f1_unlabel_3))
    fold2 = ((f2_label_1, f2_unlabel_1),
             (f2_label_2, f2_unlabel_2),
             (f2_label_3, f2_unlabel_3))

    def _fold_weights(ell, pairs):
        # Covariance of the predictors over labeled + unlabeled points.
        stacked = np.vstack([np.concatenate(pair) for pair in pairs])
        precision = np.linalg.inv(np.cov(stacked))
        # Covariance of ell with each predictor over the labeled points only.
        ell_centered = ell - np.mean(ell)
        cross = np.array([[np.mean(ell_centered * (lab - np.mean(lab)))
                           for lab, _ in pairs]])
        return precision @ cross.T  # column vector, shape (3, 1)

    gamma1 = _fold_weights(ell1_Y1, fold1)
    gamma2 = _fold_weights(ell1_Y2, fold2)

    kde_value_ss = kde_value
    for k in range(3):
        w1, w2 = gamma1[k], gamma2[k]
        (lab1, unlab1), (lab2, unlab2) = fold1[k], fold2[k]
        scaled_labeled = np.concatenate((w1 * lab1, w2 * lab2))
        scaled_pooled = np.concatenate((scaled_labeled, w1 * unlab1, w2 * unlab2))
        # Same left-to-right accumulation as writing the six terms out by hand.
        kde_value_ss = kde_value_ss - np.mean(scaled_labeled) + np.mean(scaled_pooled)
    return kde_value_ss

def fit_and_predict(X1, Y1, X2, Y2, X1_unlabel, X2_unlabel, modelA, modelB, ell1_Y1, ell1_Y2):
    """Cross-fit two models on ``ell1`` targets and predict on the opposite fold.

    ``modelA`` is fitted on ``(X1, ell1_Y1)`` and ``modelB`` on
    ``(X2, ell1_Y2)``. Fold-1 inputs are then scored by ``modelB`` and fold-2
    inputs by ``modelA``, so no model predicts on the rows it was fitted on.

    Parameters
    ----------
    X1, X2 : labeled predictors of fold 1 and fold 2.
    Y1, Y2 : unused; kept for signature compatibility.
    X1_unlabel, X2_unlabel : unlabeled predictors assigned to fold 1 and fold 2.
    modelA, modelB : objects with ``fit(X, y)`` and ``predict(X)``; both are
        fitted in place.
    ell1_Y1, ell1_Y2 : regression targets for fold 1 and fold 2.

    Returns
    -------
    tuple
        ``(f1_label, f1_unlabel, f2_label, f2_unlabel)``.
    """
    modelA.fit(X1, ell1_Y1)
    modelB.fit(X2, ell1_Y2)

    # Fold 1 is scored by the model trained on fold 2, and vice versa.
    f1_label, f1_unlabel = (modelB.predict(block) for block in (X1, X1_unlabel))
    f2_label, f2_unlabel = (modelA.predict(block) for block in (X2, X2_unlabel))

    return f1_label, f1_unlabel, f2_label, f2_unlabel


# ----- Simulation settings -----

n = 1000          # number of labeled samples
m = 50000         # number of unlabeled samples
d = 101          # total dimensions: 1 response + 100 predictors
num_iterations = 1000  # Monte Carlo replications run for each rho value
rho_values = [0, 0.25, 0.5, 0.9]  # off-diagonal covariance values to sweep over
x0 = 1  # point at which the density of the response is estimated
# Ground truth used in the error calculations: the standard normal density at x0.
true = np.exp(-0.5 * x0 ** 2) / np.sqrt(2 * np.pi)

# The overall mean is zero.
mean = np.zeros(d)

# ----- Loop over different rho values -----

results_summary = {}

for rho in rho_values:
    print(f"\nRunning simulations for rho = {rho} ...")
    
    # Build an equicorrelated covariance matrix: every pair of distinct
    # coordinates (the response and all predictors alike) has covariance rho,
    # and every coordinate has unit variance.
    cov = np.full((d, d), rho)
    np.fill_diagonal(cov, 1)  # set all variances to 1

    # Define a version of run_iteration that uses the current cov and mean.
    def run_iteration(_):
        """Run one Monte Carlo replication and return eight squared errors.

        The argument is the iteration index and is ignored. The returned tuple
        is ``(base, rf, xgb, linear, rf_optimal, xgb_optimal, linear_optimal,
        optimal_pool)``: the squared error of the plain KDE estimate at ``x0``
        followed by the squared errors of the seven adjusted estimates, all
        measured against ``true``.

        NOTE(review): generate_data draws from NumPy's global RNG and no seed
        is set in the visible code, so individual runs are not reproducible.
        """
        X1, Y1, X2, Y2, X1_unlabel, X2_unlabel = generate_data(mean, cov, n, m)
        Y = np.concatenate((Y1, Y2))

        kde = gaussian_kde(Y, bw_method='silverman')
        
        # Per-observation kernel values at x0, split back into the two folds.
        # Y is (Y1, Y2) concatenated, so the split point is len(Y1) rather
        # than an assumed n // 2.
        ell1 = l1(Y, x0, kde)
        n_fold1 = len(Y1)
        ell1_Y1 = ell1[:n_fold1]
        ell1_Y2 = ell1[n_fold1:]
        
        # Baseline: the labeled-only KDE estimate of the density at x0.
        kde_at_x = kde.evaluate(x0)

        # ----- Random Forest -----
        modelA_rf = RandomForestRegressor(n_estimators=100, random_state=42)
        modelB_rf = RandomForestRegressor(n_estimators=100, random_state=42)
        f1_label_rf, f1_unlabel_rf, f2_label_rf, f2_unlabel_rf = \
            fit_and_predict(X1, Y1, X2, Y2, X1_unlabel, X2_unlabel, modelA_rf, modelB_rf, ell1_Y1, ell1_Y2)
        result_rf = calculate_Uss(Y, f1_label_rf, f2_label_rf, f1_unlabel_rf, f2_unlabel_rf, kde_at_x)
        result_rf_optimal = calculate_Uss_optimal(Y, f1_label_rf, f2_label_rf, f1_unlabel_rf, f2_unlabel_rf, 
                                                   ell1_Y1, ell1_Y2, kde_at_x)
        
        # ----- XGBoost -----
        modelA_xgb = XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
        modelB_xgb = XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
        f1_label_xgb, f1_unlabel_xgb, f2_label_xgb, f2_unlabel_xgb = \
            fit_and_predict(X1, Y1, X2, Y2, X1_unlabel, X2_unlabel, modelA_xgb, modelB_xgb, ell1_Y1, ell1_Y2)
        result_xgb = calculate_Uss(Y, f1_label_xgb, f2_label_xgb, f1_unlabel_xgb, f2_unlabel_xgb, kde_at_x)
        result_xgb_optimal = calculate_Uss_optimal(Y, f1_label_xgb, f2_label_xgb, f1_unlabel_xgb, f2_unlabel_xgb, 
                                                   ell1_Y1, ell1_Y2, kde_at_x)
        
        # ---- First Coordinate ----
        # No model is fitted here: the first predictor column itself plays the
        # role of the prediction.
        f1_label_linear = X1[:, 0]
        f2_label_linear = X2[:, 0]
        f1_unlabel_linear = X1_unlabel[:, 0]
        f2_unlabel_linear = X2_unlabel[:, 0]
        result_linear = calculate_Uss(Y, f1_label_linear, f2_label_linear, f1_unlabel_linear, f2_unlabel_linear, kde_at_x)
        result_linear_optimal = calculate_Uss_optimal(Y, f1_label_linear, f2_label_linear, f1_unlabel_linear, f2_unlabel_linear,
                                                        ell1_Y1, ell1_Y2, kde_at_x) 
        
        # ----- Optimal Pool -----
        result_optimal_pool = calculate_Uss_optimal_pooled(
            Y, 
            f1_label_rf, f2_label_rf, f1_unlabel_rf, f2_unlabel_rf, 
            f1_label_xgb, f2_label_xgb, f1_unlabel_xgb, f2_unlabel_xgb, 
            f1_label_linear, f2_label_linear, f1_unlabel_linear, f2_unlabel_linear,
            ell1_Y1, ell1_Y2, kde_at_x)

        # Squared error of every estimate against the known density value.
        base = np.abs(kde_at_x - true) ** 2
        mse_rf = np.abs(result_rf - true) ** 2
        mse_xgb = np.abs(result_xgb - true) ** 2
        mse_linear = np.abs(result_linear - true) ** 2
        mse_rf_optimal = np.abs(result_rf_optimal - true) ** 2
        mse_xgb_optimal = np.abs(result_xgb_optimal - true) ** 2
        mse_linear_optimal = np.abs(result_linear_optimal - true) ** 2
        mse_optimal_pool = np.abs(result_optimal_pool - true) ** 2

        return base, mse_rf, mse_xgb, mse_linear, mse_rf_optimal, mse_xgb_optimal, mse_linear_optimal, mse_optimal_pool

    # Run the iterations in parallel
    results = Parallel(n_jobs=-1)(delayed(run_iteration)(_) for _ in tqdm(range(num_iterations), desc="Iterations"))
    
    # Transpose the list of per-iteration tuples into one list per estimator.
    (results_base, results_rf, results_xgb, results_linear,
     results_rf_optimal, results_xgb_optimal, results_linear_optimal,
     results_optimal_pool) = (list(column) for column in zip(*results))

    mean_base = np.mean(results_base)
    mean_rf = np.mean(results_rf)
    mean_xgb = np.mean(results_xgb)
    mean_linear = np.mean(results_linear)
    mean_rf_optimal = np.mean(results_rf_optimal)
    mean_xgb_optimal = np.mean(results_xgb_optimal)
    mean_linear_optimal = np.mean(results_linear_optimal)
    mean_optimal_pool = np.mean(results_optimal_pool)
    
    # Save results for this rho value (MSE of each estimator relative to the
    # MSE of the plain KDE baseline; values below 1 mean an improvement).
    results_summary[rho] = {
        "RF/Base": mean_rf / mean_base,
        "XGB/Base": mean_xgb / mean_base,
        "Linear/Base": mean_linear / mean_base,
        "RF Optimal/Base": mean_rf_optimal / mean_base,
        "XGB Optimal/Base": mean_xgb_optimal / mean_base,
        "Linear Optimal/Base": mean_linear_optimal / mean_base,
        "Optimal Pool/Base": mean_optimal_pool / mean_base,
    }
    
    # Print the results for this rho
    print(f"Results for rho = {rho}:")
    print(f"  RF / Base:         {results_summary[rho]['RF/Base']:.4f}")
    print(f"  XGB / Base:        {results_summary[rho]['XGB/Base']:.4f}")
    print(f"  Linear / Base:      {results_summary[rho]['Linear/Base']:.4f}")
    print(f"  RF Optimal / Base: {results_summary[rho]['RF Optimal/Base']:.4f}")
    print(f"  XGB Optimal / Base:{results_summary[rho]['XGB Optimal/Base']:.4f}")
    print(f"  Linear Optimal / Base:{results_summary[rho]['Linear Optimal/Base']:.4f}")
    print(f"  Optimal Pool / Base:{results_summary[rho]['Optimal Pool/Base']:.4f}")


# Print one LaTeX table row per rho, once, after all rho values have been run.
# This loop sits outside the simulation loop and uses its own loop variable so
# that it neither re-prints earlier rows after every rho nor rebinds `rho`.
for summary_rho in results_summary:
    print(f" {results_summary[summary_rho]['RF/Base']:.4f} & {results_summary[summary_rho]['XGB/Base']:.4f} & "
          f"{results_summary[summary_rho]['Linear/Base']:.4f} & {results_summary[summary_rho]['RF Optimal/Base']:.4f} & "
          f"{results_summary[summary_rho]['XGB Optimal/Base']:.4f} & {results_summary[summary_rho]['Linear Optimal/Base']:.4f} & "
          f"{results_summary[summary_rho]['Optimal Pool/Base']:.4f} \\\\")



Running simulations for rho = 0 ...


Iterations: 100%|██████████| 1000/1000 [16:44<00:00,  1.00s/it]


Results for rho = 0:
  RF / Base:         1.0943
  XGB / Base:        1.1649
  Linear / Base:      6.0221
  RF Optimal / Base: 0.9982
  XGB Optimal / Base:1.0031
  Linear Optimal / Base:1.0020
  Optimal Pool / Base:1.0029
 1.0943 & 1.1649 & 6.0221 & 0.9982 & 1.0031 & 1.0020 & 1.0029 \\

Running simulations for rho = 0.25 ...


Iterations: 100%|██████████| 1000/1000 [12:08<00:00,  1.37it/s]


Results for rho = 0.25:
  RF / Base:         1.0393
  XGB / Base:        1.1416
  Linear / Base:      6.0904
  RF Optimal / Base: 0.9865
  XGB Optimal / Base:0.9866
  Linear Optimal / Base:0.9979
  Optimal Pool / Base:0.9888
 1.0943 & 1.1649 & 6.0221 & 0.9982 & 1.0031 & 1.0020 & 1.0029 \\
 1.0393 & 1.1416 & 6.0904 & 0.9865 & 0.9866 & 0.9979 & 0.9888 \\

Running simulations for rho = 0.5 ...


Iterations: 100%|██████████| 1000/1000 [09:28<00:00,  1.76it/s]


Results for rho = 0.5:
  RF / Base:         0.8728
  XGB / Base:        0.9914
  Linear / Base:      4.8788
  RF Optimal / Base: 0.8722
  XGB Optimal / Base:0.9166
  Linear Optimal / Base:0.9429
  Optimal Pool / Base:0.8711
 1.0943 & 1.1649 & 6.0221 & 0.9982 & 1.0031 & 1.0020 & 1.0029 \\
 1.0393 & 1.1416 & 6.0904 & 0.9865 & 0.9866 & 0.9979 & 0.9888 \\
 0.8728 & 0.9914 & 4.8788 & 0.8722 & 0.9166 & 0.9429 & 0.8711 \\

Running simulations for rho = 0.9 ...


Iterations: 100%|██████████| 1000/1000 [07:11<00:00,  2.31it/s]


Results for rho = 0.9:
  RF / Base:         0.5060
  XGB / Base:        0.5861
  Linear / Base:      4.1903
  RF Optimal / Base: 0.5112
  XGB Optimal / Base:0.5774
  Linear Optimal / Base:0.8185
  Optimal Pool / Base:0.5215
 1.0943 & 1.1649 & 6.0221 & 0.9982 & 1.0031 & 1.0020 & 1.0029 \\
 1.0393 & 1.1416 & 6.0904 & 0.9865 & 0.9866 & 0.9979 & 0.9888 \\
 0.8728 & 0.9914 & 4.8788 & 0.8722 & 0.9166 & 0.9429 & 0.8711 \\
 0.5060 & 0.5861 & 4.1903 & 0.5112 & 0.5774 & 0.8185 & 0.5215 \\
