## Gaussian Process Regression

Load the GP predictions obtained from multiple data resamples and calculate the ensemble average of the predictions.

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

In [2]:
# Locations used by this notebook. All of them are relative paths, so they
# resolve against the kernel's current working directory.
repo_root = '../..'
# NOTE(review): presumably the directory of the trained model checkpoint; it is
# not referenced by the cells shown below -- confirm whether it is still needed.
checkpoint_path = '../../output/checkpoint__synapseclr__so3__second_stage'
# Directory holding the per-seed imputed tables that are read below; the
# ensemble-averaged CSV is written into the same directory.
output_root = '../../output/gp/synapse_simclr_production'

In [3]:
# Read one imputed-metadata table per seed, in the order the seeds are listed.
# Only the seed (the `s=` part of the file name) differs between the files; the
# first CSV column is used as the row index.
# NOTE(review): seed 44 is absent from the list -- presumably intentional; confirm.
df_list = [
    pd.read_csv(
        os.path.join(output_root, f'imputed_meta__rbf__300__c=0.000__s={seed}.csv'),
        index_col=0)
    for seed in [40, 41, 42, 43, 45]
]

In [4]:
def effective_normal_parametrs(
        mu_bk: np.ndarray,
        sigma_bk: np.ndarray):
    """Collapse several normal predictions per entry into one effective normal.

    For every row ``b`` the ``k`` columns are treated as an equally weighted
    mixture of normals ``N(mu_bk[b, k], sigma_bk[b, k] ** 2)``. The returned
    parameters are the mean and standard deviation of that mixture (moment
    matching).

    Args:
        mu_bk: component means, a 2-D array with one row per entry and one
            column per prediction.
        sigma_bk: component standard deviations, same shape as ``mu_bk``.

    Returns:
        A tuple ``(eff_mu_b, eff_sigma_b)`` of 1-D arrays with one value per
        row: the mixture mean and the mixture standard deviation.

    Raises:
        ValueError: if ``mu_bk`` is not two-dimensional.
        AssertionError: if ``sigma_bk`` and ``mu_bk`` differ in shape.
    """
    mu_bk = np.asarray(mu_bk)
    sigma_bk = np.asarray(sigma_bk)

    if mu_bk.ndim != 2:
        raise ValueError(
            f'mu_bk must be 2-dimensional (entries x predictions), got shape {mu_bk.shape}')
    assert sigma_bk.shape == mu_bk.shape, \
        f'sigma_bk has shape {sigma_bk.shape}, expected {mu_bk.shape}'

    eff_mu_b = np.mean(mu_bk, axis=-1)

    # Law of total variance: Var = E[sigma^2] + Var[mu]. This is algebraically
    # the same as E[mu^2 + sigma^2] - E[mu]^2, but it is a sum of non-negative
    # terms, so floating-point cancellation cannot produce a slightly negative
    # variance (which np.sqrt would turn into NaN).
    eff_var_b = np.mean(sigma_bk ** 2, axis=-1) + np.var(mu_bk, axis=-1)
    eff_sigma_b = np.sqrt(eff_var_b)

    return eff_mu_b, eff_sigma_b

In [5]:
# One row per imputed trait:
#   (trait key, trait type, number of categories, control trait)
# - the number of categories is None for continuous traits;
# - the control trait is None unless the trait names another trait in this table
#   (NOTE(review): presumably the indicator that gates the zero-inflated `_zi`
#   traits -- confirm).
_trait_table = [
    ('cleft_size_log1p_zscore',           'continuous',  None, None),
    ('presyn_soma_dist_log1p_zscore',     'continuous',  None, None),
    ('postsyn_soma_dist_log1p_zscore',    'continuous',  None, None),
    ('mito_size_pre_vx_log1p_zscore_zi',  'continuous',  None, 'has_mito_pre'),
    ('mito_size_post_vx_log1p_zscore_zi', 'continuous',  None, 'has_mito_post'),
    ('pre_and_post_cell_types',           'categorical', 4,    None),
    ('pre_cell_type',                     'categorical', 2,    None),
    ('post_cell_type',                    'categorical', 2,    None),
    ('has_mito_pre',                      'categorical', 2,    None),
    ('has_mito_post',                     'categorical', 2,    None),
]

# Unzip the table into the four parallel lists, which therefore always have the
# same length and stay aligned by position.
trait_key_list, trait_type_list, trait_num_categories_list, trait_control_list = (
    list(column) for column in zip(*_trait_table))

In [6]:
from collections import OrderedDict

# Column name -> ensemble-averaged values; the insertion order becomes the
# column order of the resulting DataFrame.
eff_df_data = OrderedDict()

# prepend synapse_id (taken from the first table)
eff_df_data['synapse_id'] = df_list[0]['synapse_id'].values

# Values are averaged row by row across the tables, so every table must list
# the same synapses in the same order. Fail loudly rather than silently
# averaging predictions that belong to different synapses.
for df_index, df in enumerate(df_list):
    if not np.array_equal(df['synapse_id'].values, eff_df_data['synapse_id']):
        raise ValueError(
            f'synapse_id column of table {df_index} does not match table 0; '
            f'rows cannot be averaged across tables')

# aux
n_entries = len(eff_df_data['synapse_id'])
n_dfs = len(df_list)

# The trait description lists are consumed in lockstep; zip() would silently
# truncate to the shortest one.
assert len(trait_key_list) == len(trait_type_list) == len(trait_num_categories_list)

for trait_key, trait_type, trait_num_categories in zip(trait_key_list, trait_type_list, trait_num_categories_list):

    if trait_type == 'continuous':
        mean_col_name = f'imputed__{trait_key}__mean'
        std_col_name = f'imputed__{trait_key}__std'
        # one column per table -> arrays of shape (n_entries, n_dfs)
        mu_bk = np.stack([df[mean_col_name].values for df in df_list], axis=-1)
        std_bk = np.stack([df[std_col_name].values for df in df_list], axis=-1)
        assert mu_bk.shape == (n_entries, n_dfs)
        # collapse the per-table normals into a single effective normal per entry
        eff_mu_b, eff_std_b = effective_normal_parametrs(mu_bk, std_bk)
        eff_df_data[mean_col_name] = eff_mu_b
        eff_df_data[std_col_name] = eff_std_b

    elif trait_type == 'categorical':
        # plain average over tables of each per-class column
        # (presumably class probabilities -- confirm)
        for j in range(trait_num_categories):
            col_name = f'imputed__{trait_key}__class_{j}'
            data_bk = np.stack([df[col_name].values for df in df_list], axis=-1)
            eff_df_data[col_name] = np.mean(data_bk, axis=-1)

    else:
        raise ValueError(f'unknown trait type {trait_type!r} for trait {trait_key!r}')

eff_df = pd.DataFrame(eff_df_data)

In [7]:
# Show the ensemble-averaged table; as the last expression of the cell it is
# rendered as the cell output.
eff_df

Unnamed: 0,synapse_id,imputed__cleft_size_log1p_zscore__mean,imputed__cleft_size_log1p_zscore__std,imputed__presyn_soma_dist_log1p_zscore__mean,imputed__presyn_soma_dist_log1p_zscore__std,imputed__postsyn_soma_dist_log1p_zscore__mean,imputed__postsyn_soma_dist_log1p_zscore__std,imputed__mito_size_pre_vx_log1p_zscore_zi__mean,imputed__mito_size_pre_vx_log1p_zscore_zi__std,imputed__mito_size_post_vx_log1p_zscore_zi__mean,...,imputed__pre_and_post_cell_types__class_2,imputed__pre_and_post_cell_types__class_3,imputed__pre_cell_type__class_0,imputed__pre_cell_type__class_1,imputed__post_cell_type__class_0,imputed__post_cell_type__class_1,imputed__has_mito_pre__class_0,imputed__has_mito_pre__class_1,imputed__has_mito_post__class_0,imputed__has_mito_post__class_1
0,1000004,0.353060,0.280311,0.057786,0.314275,-0.530573,0.466851,0.355470,0.241811,-0.058009,...,0.820416,1.421129e-01,0.022224,0.977776,0.753984,0.246016,0.041953,0.958047,0.003175,0.996825
1,1000270,-0.508038,0.250857,0.180006,0.446762,0.709085,0.408213,-1.444437,0.557580,-0.067514,...,0.000513,2.767836e-05,0.997182,0.002818,0.994486,0.005514,0.974392,0.025608,0.830716,0.169284
2,1001064,2.427951,0.224710,-0.277008,0.670462,0.461000,0.437646,-0.804745,0.516344,0.068529,...,0.000037,3.078269e-07,0.998227,0.001773,0.999937,0.000063,0.002030,0.997970,0.954449,0.045551
3,1001916,0.088627,0.553806,-0.284920,0.807036,-0.217735,0.541268,0.275995,0.957244,-0.279717,...,0.581755,3.297013e-01,0.047546,0.952454,0.570163,0.429837,0.076963,0.923037,0.002104,0.997896
4,1001959,0.564563,0.598241,-0.304952,0.533632,-1.422242,0.408024,0.499894,0.344937,0.324444,...,0.890711,1.078349e-01,0.000909,0.999091,0.963397,0.036603,0.001034,0.998966,0.043088,0.956912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94869,999825,0.932100,0.300337,0.238375,0.459148,0.367410,0.467859,-2.091767,0.407382,0.069641,...,0.000030,4.557014e-06,0.999143,0.000857,0.999060,0.000940,0.912987,0.087013,0.901715,0.098285
94870,999871,-0.880295,0.497696,0.056011,0.697705,0.730940,0.751113,-0.955930,0.647155,-0.195676,...,0.034478,2.554275e-04,0.725779,0.274221,0.994298,0.005702,0.933356,0.066644,0.695179,0.304821
94871,999892,-0.507733,0.770560,-0.037969,1.069154,0.657038,1.023221,-1.728502,0.984160,0.741565,...,0.142861,7.656909e-02,0.576315,0.423685,0.132569,0.867431,0.775982,0.224018,0.017129,0.982871
94872,999934,-0.361869,0.301072,0.536208,0.515692,0.818825,0.397894,-1.284641,0.502909,-0.086433,...,0.007009,3.618794e-03,0.987027,0.012973,0.810629,0.189371,0.860396,0.139604,0.745032,0.254968


In [8]:
# Write the ensemble-averaged table into the directory the per-seed tables were
# read from. to_csv keeps its default of writing the row index as the first,
# unnamed column, which is the layout the per-seed tables are read with
# (index_col=0).
output_filename = os.path.join(output_root, 'imputed_meta__rbf__synapse_simclr__ensemble.csv')
eff_df.to_csv(output_filename)