# In [None]:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from joblib import Parallel, delayed
from tqdm import tqdm
from scipy.stats import t, norm

# ----- Helper functions -----

def gini_mean_difference(data):
    """Return the Gini mean difference of a one-dimensional sample.

    The statistic is the average of ``|x_i - x_j|`` over all ordered pairs
    with ``i != j``, i.e. ``sum_{i != j} |x_i - x_j| / (n * (n - 1))``.

    It is evaluated through the order statistics, using
    ``sum_{i < j} (x_(j) - x_(i)) = sum_i (2 * i - n + 1) * x_(i)`` with a
    0-based index ``i``.  This costs O(n log n) time and O(n) memory rather
    than materialising the full n-by-n matrix of pairwise differences.

    Parameters
    ----------
    data : array_like
        Sample of real numbers; it is converted to a flat float array.

    Returns
    -------
    float
        The Gini mean difference, or ``0.0`` when fewer than two
        observations are supplied (no pair exists).
    """
    values = np.asarray(data, dtype=float).ravel()
    n = values.size
    if n < 2:
        return 0.0

    ordered = np.sort(values)
    # Weight of the i-th order statistic: (#smaller values) - (#larger values).
    weights = 2.0 * np.arange(n) - (n - 1)
    # The weighted sum covers each unordered pair once; double it for ordered pairs.
    return 2.0 * np.dot(weights, ordered) / (n * (n - 1))

def l1(y, data):
    """Return the mean absolute deviation of ``data`` from the point ``y``.

    Computes ``mean(|y - data_i|)`` over all entries of ``data``.
    """
    absolute_gaps = np.abs(y - data)
    return np.mean(absolute_gaps)

def generate_data(mean, cov, n, m):
    """Simulate labeled and unlabeled samples and split each one in half.

    ``n`` labeled rows and ``m`` unlabeled rows are drawn from a multivariate
    normal with the given ``mean`` and ``cov``.  Every coordinate is then
    pushed through the standard normal CDF followed by the t(5) quantile
    function.  Column 0 is the response and the remaining columns are the
    predictors; the response of the unlabeled draw is discarded.

    Both samples are split 50/50 by ``train_test_split`` with
    ``random_state=42``.

    Returns
    -------
    tuple
        ``(X1, Y1, X2, Y2, X1_unlabel, X2_unlabel)``: predictors and
        responses of the two labeled halves, followed by the predictors of
        the two unlabeled halves.
    """
    t5_quantile = t(df=5).ppf

    def draw(size):
        # Gaussian draw mapped coordinate-wise through norm.cdf, then the t(5) quantile.
        gaussian = np.random.multivariate_normal(mean, cov, size)
        return t5_quantile(norm.cdf(gaussian))

    # Labeled sample: response in column 0, predictors in the remaining columns.
    labeled = draw(n)
    response, predictors = labeled[:, 0], labeled[:, 1:]
    X1, X2, Y1, Y2 = train_test_split(
        predictors, response, test_size=0.5, random_state=42
    )

    # Unlabeled sample: only the predictor columns are kept.
    unlabeled_predictors = draw(m)[:, 1:]
    X1_unlabel, X2_unlabel = train_test_split(
        unlabeled_predictors, test_size=0.5, random_state=42
    )

    return X1, Y1, X2, Y2, X1_unlabel, X2_unlabel

def calculate_Uss(Y, f1_label, f2_label, f1_unlabel, f2_unlabel, mmd_value):
    """Shift ``mmd_value`` by the gap between pooled and labeled prediction means.

    Returns ``mmd_value - mean(labeled predictions) + mean(all predictions)``,
    where the labeled predictions are ``f1_label`` and ``f2_label`` and "all"
    additionally includes ``f1_unlabel`` and ``f2_unlabel``.

    ``Y`` is accepted for signature compatibility but is not used.
    """
    labeled_mean = np.mean(np.concatenate((f1_label, f2_label)))
    pooled_mean = np.mean(
        np.concatenate((f1_label, f2_label, f1_unlabel, f2_unlabel))
    )
    return mmd_value - labeled_mean + pooled_mean

def calculate_Uss_optimal(Y, f1_label, f2_label, f1_unlabel, f2_unlabel, ell1_Y1, ell1_Y2, mmd_value):
    """Apply the mean-shift correction with a data-driven weight per half.

    For each half ``k`` the weight is

        gamma_k = mean((ell1_Yk - mean(ell1_Yk)) * (fk_label - mean(fk_label)))
                  / var(concatenate(fk_label, fk_unlabel))

    and the returned value is ``mmd_value`` minus the mean of the weighted
    labeled predictions plus the mean of the weighted labeled and unlabeled
    predictions.

    ``Y`` is accepted for signature compatibility but is not used.
    """

    def weight(ell1, f_label, f_unlabel):
        # Labeled-sample covariance over the variance of all predictions.
        covariance = np.mean((ell1 - np.mean(ell1)) * (f_label - np.mean(f_label)))
        return covariance / np.var(np.concatenate((f_label, f_unlabel)))

    gamma1 = weight(ell1_Y1, f1_label, f1_unlabel)
    gamma2 = weight(ell1_Y2, f2_label, f2_unlabel)

    labeled = np.concatenate((gamma1 * f1_label, gamma2 * f2_label))
    pooled = np.concatenate(
        (gamma1 * f1_label, gamma2 * f2_label, gamma1 * f1_unlabel, gamma2 * f2_unlabel)
    )
    return mmd_value - np.mean(labeled) + np.mean(pooled)

def calculate_Uss_optimal_pooled(Y, f1_label_1, f2_label_1, f1_unlabel_1, f2_unlabel_1, 
                                 f1_label_2, f2_label_2, f1_unlabel_2, f2_unlabel_2, 
                                 f1_label_3, f2_label_3, f1_unlabel_3, f2_unlabel_3, 
                                 ell1_Y1, ell1_Y2, mmd_value):
    """Combine three predictors with jointly estimated weights to correct ``mmd_value``.

    For each half ``k`` in {1, 2} a weight vector ``gamma_k`` of length 3 is
    obtained by solving ``V_k @ gamma_k = c_k`` where

    * ``V_k`` is the 3x3 covariance matrix of the three predictors evaluated
      on the labeled and unlabeled points of that half, and
    * ``c_k[j]`` is the labeled-sample covariance between ``ell1_Yk`` and the
      ``j``-th predictor.

    Both are normalised by the sample size (``bias=True`` / ``np.mean``), the
    same convention ``calculate_Uss_optimal`` uses via ``np.var``.

    The result is ``mmd_value`` plus, for every predictor ``j``, the mean of
    the weighted predictions over all points minus their mean over the
    labeled points.

    Parameters
    ----------
    Y : unused
        Accepted for signature compatibility.
    f{k}_label_{j}, f{k}_unlabel_{j} : ndarray
        Predictions of predictor ``j`` on the labeled / unlabeled points of
        half ``k``.
    ell1_Y1, ell1_Y2 : ndarray
        Targets for the two labeled halves; each must have the same length
        as the corresponding ``f{k}_label_{j}`` arrays.
    mmd_value : float
        Estimate to be corrected.

    Returns
    -------
    float
        The corrected estimate.

    Raises
    ------
    numpy.linalg.LinAlgError
        If a predictor covariance matrix is singular (e.g. collinear or
        constant predictions).
    """
    half1 = ((f1_label_1, f1_unlabel_1), (f1_label_2, f1_unlabel_2), (f1_label_3, f1_unlabel_3))
    half2 = ((f2_label_1, f2_unlabel_1), (f2_label_2, f2_unlabel_2), (f2_label_3, f2_unlabel_3))

    def solve_weights(ell1, predictors):
        # One row per predictor, evaluated on labeled followed by unlabeled points.
        stacked = np.vstack([np.concatenate(pair) for pair in predictors])
        # bias=True divides by N, matching the 1/n normalisation of the
        # cross-covariances below; the default (N - 1) would mix conventions.
        var_matrix = np.cov(stacked, bias=True)
        centred_ell1 = ell1 - np.mean(ell1)
        cross_cov = np.array(
            [np.mean(centred_ell1 * (f_label - np.mean(f_label))) for f_label, _ in predictors]
        )
        # Solve the linear system directly instead of forming an explicit inverse.
        return np.linalg.solve(var_matrix, cross_cov)

    gamma1 = solve_weights(ell1_Y1, half1)
    gamma2 = solve_weights(ell1_Y2, half2)

    mmd_value_ss = mmd_value
    for j, ((f1_label, f1_unlabel), (f2_label, f2_unlabel)) in enumerate(zip(half1, half2)):
        labeled = np.concatenate((gamma1[j] * f1_label, gamma2[j] * f2_label))
        pooled = np.concatenate((labeled, gamma1[j] * f1_unlabel, gamma2[j] * f2_unlabel))
        mmd_value_ss = mmd_value_ss - np.mean(labeled) + np.mean(pooled)
    return mmd_value_ss

def fit_and_predict(X1, Y1, X2, Y2, X1_unlabel, X2_unlabel, modelA, modelB, ell1_Y1, ell1_Y2):
    """Fit two models on opposite halves and predict each half with the other model.

    ``modelA`` is fitted on ``(X1, ell1_Y1)`` and ``modelB`` on
    ``(X2, ell1_Y2)``.  The first half (``X1`` and ``X1_unlabel``) is then
    predicted by ``modelB`` and the second half (``X2`` and ``X2_unlabel``)
    by ``modelA``.

    ``Y1`` and ``Y2`` are accepted for signature compatibility but are not
    used.

    Returns
    -------
    tuple
        ``(f1_label, f1_unlabel, f2_label, f2_unlabel)``.
    """
    # Both fits happen before any prediction is requested.
    for model, features, target in ((modelA, X1, ell1_Y1), (modelB, X2, ell1_Y2)):
        model.fit(features, target)

    # Each half is scored by the model trained on the other half.
    f1_label, f1_unlabel = (modelB.predict(block) for block in (X1, X1_unlabel))
    f2_label, f2_unlabel = (modelA.predict(block) for block in (X2, X2_unlabel))
    return f1_label, f1_unlabel, f2_label, f2_unlabel


# ----- Simulation settings -----

# Sample sizes per replication.
n = 1_000     # labeled observations
m = 50_000    # unlabeled observations

# Each simulated row has 1 response followed by 100 predictors.
d = 101

# Monte Carlo design.
num_iterations = 1_000         # replications per value of s
s_values = [2, 10, 50, 100]    # values of s the simulation is repeated for

# Reference value the estimators are compared against: the Gini mean
# difference of a t(5) variable, 35 * sqrt(5) / (18 * pi).
true = 35 * np.sqrt(5) / (18 * np.pi)

# Every coordinate of the underlying Gaussian has mean zero.
mean = np.zeros(d)

# ----- Loop over different s values -----

# Maps each s to the ratio of every estimator's mean squared error to the
# mean squared error of the plain (labeled-only) Gini mean difference.
results_summary = {}

for s in s_values:
    print(f"\nRunning simulations for s = {s} ...")

    # Build the covariance matrix: the first s coordinates (the response and
    # the first s - 1 predictors) share a pairwise covariance of 0.9, every
    # other off-diagonal entry is 0, and all variances are 1.
    cov = np.full((d, d), 0.0)
    cov[:s, :s] = 0.9
    np.fill_diagonal(cov, 1)  # set all variances to 1

    # Defined inside the loop so that it uses the current cov (together with
    # the module-level mean, n, m and true).
    def run_iteration(_):
        """Run one replication and return the squared error of each estimator.

        The returned tuple is ordered as: base, RF, XGB, first coordinate,
        RF optimal, XGB optimal, first coordinate optimal, optimal pool.
        """
        X1, Y1, X2, Y2, X1_unlabel, X2_unlabel = generate_data(mean, cov, n, m)
        Y = np.concatenate((Y1, Y2))

        # Estimate the ell_1 function on each labeled half: for every y the
        # mean absolute deviation of that half from y.
        ell1_Y1 = np.array([l1(y, Y1) for y in Y1])
        ell1_Y2 = np.array([l1(y, Y2) for y in Y2])

        # Labeled-only Gini mean difference; this is the baseline estimator.
        gini_mean_diff = gini_mean_difference(Y)

        # ----- Random Forest -----
        modelA_rf = RandomForestRegressor(n_estimators=100, random_state=42)
        modelB_rf = RandomForestRegressor(n_estimators=100, random_state=42)
        f1_label_rf, f1_unlabel_rf, f2_label_rf, f2_unlabel_rf = fit_and_predict(
            X1, Y1, X2, Y2, X1_unlabel, X2_unlabel, modelA_rf, modelB_rf, ell1_Y1, ell1_Y2)
        result_rf = calculate_Uss(
            Y, f1_label_rf, f2_label_rf, f1_unlabel_rf, f2_unlabel_rf, gini_mean_diff)
        result_rf_optimal = calculate_Uss_optimal(
            Y, f1_label_rf, f2_label_rf, f1_unlabel_rf, f2_unlabel_rf,
            ell1_Y1, ell1_Y2, gini_mean_diff)

        # ----- XGBoost -----
        modelA_xgb = XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
        modelB_xgb = XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
        f1_label_xgb, f1_unlabel_xgb, f2_label_xgb, f2_unlabel_xgb = fit_and_predict(
            X1, Y1, X2, Y2, X1_unlabel, X2_unlabel, modelA_xgb, modelB_xgb, ell1_Y1, ell1_Y2)
        result_xgb = calculate_Uss(
            Y, f1_label_xgb, f2_label_xgb, f1_unlabel_xgb, f2_unlabel_xgb, gini_mean_diff)
        result_xgb_optimal = calculate_Uss_optimal(
            Y, f1_label_xgb, f2_label_xgb, f1_unlabel_xgb, f2_unlabel_xgb,
            ell1_Y1, ell1_Y2, gini_mean_diff)

        # ---- First Coordinate ----
        # No model is fitted here: the first predictor column itself is used
        # as the "prediction".
        f1_label_linear = X1[:, 0]
        f2_label_linear = X2[:, 0]
        f1_unlabel_linear = X1_unlabel[:, 0]
        f2_unlabel_linear = X2_unlabel[:, 0]
        result_linear = calculate_Uss(
            Y, f1_label_linear, f2_label_linear, f1_unlabel_linear, f2_unlabel_linear,
            gini_mean_diff)
        result_linear_optimal = calculate_Uss_optimal(
            Y, f1_label_linear, f2_label_linear, f1_unlabel_linear, f2_unlabel_linear,
            ell1_Y1, ell1_Y2, gini_mean_diff)

        # ----- Optimal Pool -----
        result_optimal_pool = calculate_Uss_optimal_pooled(
            Y,
            f1_label_rf, f2_label_rf, f1_unlabel_rf, f2_unlabel_rf,
            f1_label_xgb, f2_label_xgb, f1_unlabel_xgb, f2_unlabel_xgb,
            f1_label_linear, f2_label_linear, f1_unlabel_linear, f2_unlabel_linear,
            ell1_Y1, ell1_Y2, gini_mean_diff)

        def squared_error(estimate):
            return np.abs(estimate - true) ** 2

        return tuple(
            squared_error(estimate)
            for estimate in (
                gini_mean_diff,
                result_rf, result_xgb, result_linear,
                result_rf_optimal, result_xgb_optimal, result_linear_optimal,
                result_optimal_pool,
            )
        )

    # Run the iterations in parallel
    results = Parallel(n_jobs=-1)(
        delayed(run_iteration)(i) for i in tqdm(range(num_iterations), desc="Iterations")
    )

    # Transpose the per-iteration tuples into one list per estimator.
    (results_base, results_rf, results_xgb, results_linear,
     results_rf_optimal, results_xgb_optimal, results_linear_optimal,
     results_optimal_pool) = (list(column) for column in zip(*results))

    mean_base = np.mean(results_base)
    mean_rf = np.mean(results_rf)
    mean_xgb = np.mean(results_xgb)
    mean_linear = np.mean(results_linear)
    mean_rf_optimal = np.mean(results_rf_optimal)
    mean_xgb_optimal = np.mean(results_xgb_optimal)
    mean_linear_optimal = np.mean(results_linear_optimal)
    mean_optimal_pool = np.mean(results_optimal_pool)

    # Save results for this s value
    results_summary[s] = {
        "RF/Base": mean_rf / mean_base,
        "XGB/Base": mean_xgb / mean_base,
        "Linear/Base": mean_linear / mean_base,
        "RF Optimal/Base": mean_rf_optimal / mean_base,
        "XGB Optimal/Base": mean_xgb_optimal / mean_base,
        "Linear Optimal/Base": mean_linear_optimal / mean_base,
        "Optimal Pool/Base": mean_optimal_pool / mean_base,
    }

    # Print the results for this s
    print(f"Results for s = {s}:")
    print(f"  RF / Base:         {results_summary[s]['RF/Base']:.4f}")
    print(f"  XGB / Base:        {results_summary[s]['XGB/Base']:.4f}")
    print(f"  Linear / Base:      {results_summary[s]['Linear/Base']:.4f}")
    print(f"  RF Optimal / Base: {results_summary[s]['RF Optimal/Base']:.4f}")
    print(f"  XGB Optimal / Base:{results_summary[s]['XGB Optimal/Base']:.4f}")
    print(f"  Linear Optimal / Base:{results_summary[s]['Linear Optimal/Base']:.4f}")
    print(f"  Optimal Pool / Base:{results_summary[s]['Optimal Pool/Base']:.4f}")

# Print one LaTeX table row per s.  This runs once, after every setting has
# finished, and uses its own loop variable so the table is not re-printed
# after each s and the simulation loop's `s` is not overwritten.
for ratios in results_summary.values():
    print(f" {ratios['RF/Base']:.4f} & {ratios['XGB/Base']:.4f} & "
          f"{ratios['Linear/Base']:.4f} & {ratios['RF Optimal/Base']:.4f} & "
          f"{ratios['XGB Optimal/Base']:.4f} & {ratios['Linear Optimal/Base']:.4f} & "
          f"{ratios['Optimal Pool/Base']:.4f} \\\\")