In [20]:
from os import path
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from reconstruction_utils import run_kNeighbors, reorder_FA
from selfregulation.utils.utils import get_recent_dataset
from selfregulation.utils.result_utils import load_results, get_info
from selfregulation.utils.r_to_py_utils import psychFA

In [21]:
def k_nearest_close_reconstruction(results, drop_regex, num_available_measures=None,
                             pseudo_pop_size=60, n_reps=100, 
                             k_list=None, EFA_rotation='oblimin', 
                             metric='correlation',
                             verbose=True,
                             weightings = ['distance']):
    """
    Extension of k_nearest_reconstruction that uses a omnipotent "closeness" algorithm 
    to select a subset of variables
    """
    def run_EFA(data, c, rotation, orig_loading):
        fa, out = psychFA(data, c, rotate=EFA_rotation)
        loadings = pd.DataFrame(out['loadings'], index=data.columns)
        loadings = reorder_FA(orig_loadings, loadings)
        return loadings
    
    def get_closest(data, target, n_tasks=5, metric='correlation'):
        index = data.columns.get_loc(target)
        distances = squareform(pdist(data.T, metric=metric))
        sort_vars = data.columns[np.argsort(distances[index])]
        # get closest tasks until tasks are filled up
        tasks = set()
        for var in sort_vars:
            task, *_ = var.split('.')
            tasks.add(task)
            if len(tasks) == n_tasks:
                break
        # get variables from tasks
        neighbors = data.filter(regex='|'.join(tasks)).columns
        return neighbors
    c = results.EFA.results['num_factors']
    orig_loadings = results.EFA.get_loading(c, rotate=EFA_rotation)
    full_data = results.data
    drop_vars = list(full_data.filter(regex=drop_regex).columns)
    subset = full_data.drop(drop_vars, axis=1)
    full_loadings = run_EFA(subset, c, EFA_rotation, orig_loadings)
    if full_loadings is None:
        return
    var_reconstructions = pd.DataFrame()
    for var in drop_vars:
        # imagine we have a good estimate of one measure tomap is related to
        target = full_data.corr()[var].drop(drop_vars).idxmax()
        # get a neighborhood around that target
        available_vars = get_closest(full_loadings.T, target, n_tasks=num_available_measures,
                                    metric=metric)

        # get dataset and loadings
        data = full_data.loc[:, set(available_vars) | set(drop_vars)]
        loadings = full_loadings.loc[available_vars,:]
        for rep in range(n_reps):
            random_subset = data.sample(pseudo_pop_size)
            distances = pd.DataFrame(squareform(pdist(random_subset.T, metric=metric)), 
                                     index=data.columns, 
                                     columns=data.columns).drop(drop_vars, axis=1)
            tmp_reconstruction = run_kNeighbors(distances, loadings, [var], weightings, 
                                                [min(loadings.shape[0], 13)])
            tmp_reconstruction['label'] = "closest_reconstruction"
            tmp_reconstruction['num_available_measures'] = num_available_measures
            tmp_reconstruction['rep'] = rep
            var_reconstructions = pd.concat([var_reconstructions, tmp_reconstruction])
    return var_reconstructions

In [22]:
# Rotation of the EFA solution; also used as the last component of output_dir.
EFA_rotation = 'oblimin'
dataset = get_recent_dataset()

# Work with the 'task' entry of the loaded results.
results = load_results(dataset)['task']
output_dir = path.join(
    get_info('results_directory'),
    'ontology_reconstruction',
    dataset,
    results.ID,
    EFA_rotation,
)
# Unique measure names: the part of each column name before the first '.'.
measure_list = np.unique(
    [column.partition('.')[0] for column in results.data.columns]
)

In [37]:
# Sweep over neighborhood size, pseudo-population size and held-out measure.
# Each regex matches exactly one measure: '^name' followed by '.' or end of
# string, so a measure whose name is a prefix of another cannot drop both.
regex_list = ['^' + m + r'(\.|$)' for m in measure_list]
# Collect per-call frames and concatenate once; growing a DataFrame with
# pd.concat inside the loops copies all accumulated rows on every iteration.
reconstruction_frames = []
for num_available_measures in range(1,11):
    for pop_size in [30, 50, 100, 400]:
        for drop_regex in regex_list:
            # pass the notebook-level rotation explicitly so the reconstruction
            # matches the rotation that output_dir is keyed by
            var_reconstruction = k_nearest_close_reconstruction(
                results, drop_regex, num_available_measures,
                pseudo_pop_size=pop_size, n_reps=50,
                EFA_rotation=EFA_rotation)
            if var_reconstruction is None:
                continue
            var_reconstruction['pop_size'] = pop_size
            reconstruction_frames.append(var_reconstruction)
if not reconstruction_frames:
    raise RuntimeError('k_nearest_close_reconstruction returned no results '
                       'for any parameter combination')
full_reconstruction = (pd.concat(reconstruction_frames)
                       .sort_values(by='var')
                       .reset_index(drop=True))

In [41]:
# get reconstruction scores
# Score against the same loading solution the reconstructions were aligned to
# inside k_nearest_close_reconstruction (num_factors, EFA_rotation) instead of
# relying on get_loading()'s implicit defaults.
loadings = results.EFA.get_loading(results.EFA.results['num_factors'],
                                   rotate=EFA_rotation)
factor_names = list(loadings.columns)
# estimated embeddings (one row per reconstruction) and the matching true
# embeddings, pulled out as plain arrays to avoid a slow iterrows() loop
estimated_embeddings = full_reconstruction[factor_names].to_numpy(dtype=float)
onto_embeddings = loadings.loc[full_reconstruction['var'], 
                               factor_names].to_numpy(dtype=float)
# Pearson correlation between each estimated and true embedding
scores = [np.corrcoef(estimated, onto)[0,1]
          for estimated, onto in zip(estimated_embeddings, onto_embeddings)]
full_reconstruction['corr_score'] = scores

# of components not specified, using BIC determined #


In [42]:
# Mean and (sample, ddof=1) standard deviation of the reconstruction score for
# every variable / population size / neighborhood size combination.
# String aggregator names are used instead of np.mean / np.std: recent pandas
# deprecates passing numpy callables and will eventually call them directly,
# which would silently switch the std to ddof=0.
summary = (full_reconstruction
           .groupby(['var','pop_size','num_available_measures'])
           .corr_score.agg(['mean', 'std'])
           .reset_index())
summary.to_pickle(path.join(output_dir, 'KNNRclosest_correlation_summary.pkl'))