In [None]:
import numpy as np
from os import path
import pandas as pd
import pickle

from dimensional_structure.EFA_plots import get_communality, get_adjusted_communality
from selfregulation.utils.plot_utils import format_num
from selfregulation.utils.r_to_py_utils import get_attr
from selfregulation.utils.result_utils import load_results
from selfregulation.utils.utils import get_recent_dataset, get_retest_data


In [None]:
# Load the stored analysis results for the most recent dataset.
# `results` is indexed by 'survey' and 'task' in the cells below.
results = load_results(get_recent_dataset())

In [None]:
def print_result(s, survey_val, task_val):
    """Print a labelled survey/task pair of values.

    Both values are passed through ``format_num`` with 3 as the second
    argument before being printed under the heading ``s``.
    """
    survey_str, task_str = (format_num(val, 3) for val in (survey_val, task_val))
    print('%s:\nsurvey: %s\ntask: %s' % (s, survey_str, task_str))

# Main Paper Numeric Results

## EFA bootstrapped factor reliability

In [None]:
# Collapse the 'sds' entry of the bootstrap stats to a single number per
# measurement type by taking the mean twice (presumably over variables and
# factors of a loading-SD table; confirm against get_boot_stats).
survey_boot_sds, task_boot_sds = (
    results[measure_type].EFA.get_boot_stats()['sds'].mean().mean()
    for measure_type in ('survey', 'task')
)

print_result('Loading Average SDs across bootstrap samples', survey_boot_sds, task_boot_sds)

## Variance Explained by EFA models

Get the variance explained for each factor analytic model, extracted from the output of the `fa` function in R's psych package. This is equivalent to taking the mean of the individual variable communalities.

In [None]:
# Fitted R factor-analysis outputs stored under 'factor_tree_Rout_oblimin';
# the keys 12 (survey) and 5 (task) presumably select the number of factors
# of the chosen solution (TODO confirm).
survey_EFA_rout = results['survey'].EFA.results['factor_tree_Rout_oblimin'][12]
task_EFA_rout = results['task'].EFA.results['factor_tree_Rout_oblimin'][5]

# Element [2, -1] of the 'Vaccounted' table: third row, last column
# (presumably the cumulative variance across all factors; confirm against
# the psych `fa` output layout).
survey_EFA_cummvar, task_EFA_cummvar = (
    get_attr(rout, 'Vaccounted')[2, -1]
    for rout in (survey_EFA_rout, task_EFA_rout)
)

print_result('Variance Expalined', survey_EFA_cummvar, task_EFA_cummvar)

"Adjusted" variance explained, taking into account the reliability of each measure. We first drop variables with very low reliability (<.2) before conducting this analysis, as they can have extreme effects on the communality of a variable.

To ensure that the effect of adjustment isn't explained by dropping these variables, we also show the variance explained for the reliable-measure subsets. It is clear that subsetting is not meaningfully changing the variance explained values for surveys or tasks.

In [None]:
def return_adjusted(results):
    """Return the adjusted communality for one results object.

    The communality of ``results.EFA`` is computed, the retest data for the
    matching dataset (dataset name with 'Complete' swapped for 'Retest') is
    loaded and reindexed to the communality's index, and both are handed to
    ``get_adjusted_communality``. Only the first value returned by that
    helper is kept; any further return values are discarded.
    """
    efa_communality = get_communality(results.EFA)
    retest_name = results.dataset.replace('Complete','Retest')
    # put the retest rows in the same order as the communality index
    aligned_retest = get_retest_data(dataset=retest_name).loc[efa_communality.index]
    adjusted_communality, *_ = get_adjusted_communality(efa_communality, aligned_retest)
    return adjusted_communality

# adjusted communalities for each measurement type
survey_adjusted, task_adjusted = (
    return_adjusted(results[measure_type]) for measure_type in ('survey', 'task')
)
print_result('Adjusted Variance Expalined', survey_adjusted.mean(), task_adjusted.mean())

# for comparison: mean unadjusted communality restricted to the variables
# that remain in the adjusted result
unadjusted_survey, unadjusted_task = (
    get_communality(results[measure_type].EFA)[adjusted.index].mean()
    for measure_type, adjusted in (('survey', survey_adjusted), ('task', task_adjusted))
)
print_result('Unadjusted Variance Expalined for reliable subset',
             unadjusted_survey, unadjusted_task)

## Factor Correlations

In [None]:
# pairwise correlations between the survey EFA scores (displayed as cell output)
results['survey'].EFA.get_scores().corr()

In [None]:
# pairwise correlations between the task EFA scores (displayed as cell output)
results['task'].EFA.get_scores().corr()

## Prediction Results

In [None]:
def get_pred_summary(predictions, scores='scores_cv'):
    """Summarise the R2 values stored in a mapping of prediction results.

    Parameters
    ----------
    predictions : mapping
        Each value must support ``value[scores][0]['R2']``.
    scores : str
        Key selecting which score set to read (default ``'scores_cv'``).

    Returns
    -------
    tuple
        ``(mean, min, max, r2_array)`` where ``r2_array`` is a numpy array of
        the individual R2 values in the mapping's iteration order.
    """
    r2_values = np.array([entry[scores][0]['R2'] for _, entry in predictions.items()])
    return np.mean(r2_values), np.min(r2_values), np.max(r2_values), r2_values



In [None]:
for classifier in ('ridge', 'svm'):
    for EFA in (True, False):
        # EFA=True is labelled 'Ontology' (EFA factors as predictors);
        # EFA=False is labelled 'Raw' (presumably the raw measures; confirm)
        EFA_tag = {True: 'Ontology', False: 'Raw'}[EFA]
        print('*'*30, EFA_tag, classifier, '*'*30)

        # stored prediction results for this classifier / predictor-set pair
        survey_prediction_results = results['survey'].load_prediction_object(
            classifier=classifier, EFA=EFA)['data']
        task_prediction_results = results['task'].load_prediction_object(
            classifier=classifier, EFA=EFA)['data']

        # each summary is (mean, min, max, array of individual R2s);
        # the first three entries are printed here
        # NOTE(review): print_result hands these 3-tuples to format_num;
        # confirm format_num accepts sequences
        print('Mean, Min, Max Performance')
        survey_out = get_pred_summary(survey_prediction_results)
        task_out = get_pred_summary(task_prediction_results)
        print_result('Prediction Performance', survey_out[:-1], task_out[:-1])

        # degree of overestimation from in-sample prediction: mean difference
        # between the 'scores_insample' R2s and the default-score R2s
        survey_insample = get_pred_summary(survey_prediction_results,
                                           scores='scores_insample')[-1]
        task_insample = get_pred_summary(task_prediction_results,
                                         scores='scores_insample')[-1]

        survey_exaggeration = (survey_insample - survey_out[-1]).mean()
        task_exaggeration = (task_insample - task_out[-1]).mean()
        print_result('\nInsample exaggeration Absolute', survey_exaggeration, task_exaggeration)