## Notebook to clean training and test datasets 

The procedure involves the following steps:
* load variables
* transform with power transform 
* remove outliers 
* in the case of self-regulation variables only: remove correlated variables
* save clean variables
* impute variables 
* save imputed variables


In [1]:
import warnings
# Install blanket 'ignore' filters so no warning is shown by later cells.
# NOTE(review): both calls add an ignore-everything filter, so the second looks
# redundant; a blanket filter also hides deprecation warnings -- confirm this is intended.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
from selfregulation.utils.utils import get_info, get_admin_data, get_behav_data, get_recent_dataset,get_save_directory_train_test
from selfregulation.utils.data_preparation_utils import  remove_correlated_task_variables, remove_outliers
from selfregulation.utils.data_preparation_utils import  fit_transform_pt, transform_pt
from selfregulation.utils.r_to_py_utils import missForest
from selfregulation.utils.plot_utils import format_num
import datetime
from os import makedirs, path
from math import ceil
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

def clean_dataset(selected_var_pt=None, directory=None, suffix=None):
    """Remove outliers, then correlated task variables, from transformed data.

    Parameters
    ----------
    selected_var_pt
        Transformed variables; handed to ``remove_outliers``.
    directory, suffix
        Currently unused by this function.

    Returns
    -------
    tuple
        ``(outliers_removed, outliers_and_correlated_removed)``: the output of
        ``remove_outliers`` and the output of
        ``remove_correlated_task_variables`` applied to it.
    """
    without_outliers = remove_outliers(selected_var_pt)
    without_correlated = remove_correlated_task_variables(without_outliers)
    return without_outliers, without_correlated

def drop_not_common_vars(orig_data, common_vars = None):
    """Return a copy of ``orig_data`` restricted to the columns in ``common_vars``.

    The input frame is NOT modified: columns are dropped on a new object, so
    re-running a cell that calls this function (or keeping another reference
    to ``orig_data``) never sees a silently shrunken frame.

    Parameters
    ----------
    orig_data : pandas.DataFrame
        Frame whose columns are filtered.
    common_vars : iterable of column labels
        Columns to keep. Labels that are not columns of ``orig_data`` are ignored.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the requested columns, with columns sorted by label.

    Raises
    ------
    TypeError
        If ``common_vars`` is None (nothing to compare against).
    """
    if common_vars is None:
        raise TypeError('common_vars must be an iterable of column names, not None')

    wanted = set(common_vars)
    # Preserve the frame's own column order while collecting what to discard.
    columns_to_drop = [col for col in orig_data.columns if col not in wanted]
    return orig_data.drop(columns_to_drop, axis=1).sort_index(axis=1)

def get_missing_values_data(data):
    """Print a missing-value summary for ``data`` and return its two headline figures.

    Prints the overall percentage of missing cells, the percentage of columns
    without any missing value, and the per-column missing fraction for every
    column whose fraction exceeds 0.10 (sorted from highest to lowest).

    Returns
    -------
    tuple
        ``(percent_missing, perc_var_no_missing)``: percentage of missing
        cells and percentage of columns with no missing values.
    """
    is_missing = data.isnull()
    missing_rate = is_missing.mean()  # fraction of missing rows per column

    # Overall share of missing cells, in percent.
    total_cells = data.shape[1] * data.shape[0]
    percent_missing = (is_missing.sum().sum() * 100) / total_cells
    print('Missing values: %', percent_missing.round(2))

    # Share of columns that are fully observed, in percent.
    n_columns = len(data.columns)
    n_complete_columns = n_columns - np.sum(missing_rate > 0)
    perc_var_no_missing = n_complete_columns * 100 / n_columns
    print('Var without missing values: %', perc_var_no_missing.round(2))

    # Columns with more than 10% of their rows missing.
    print('Var with missing values above 10%:')
    heavily_missing = is_missing.loc[:, missing_rate > 0.10]
    print(heavily_missing.mean().sort_values(ascending=False))

    return percent_missing, perc_var_no_missing



In [3]:
# Resolve the folders used by this notebook:
#   data_dir   -> passed to get_admin_data for the subject-split JSON files
#   dir_master -> where the cleaned/imputed master datasets are written
#   dir_covid  -> where the cleaned/imputed covid datasets are written
data_dir    = get_info('data_directory')
dir_master  = get_save_directory_train_test('Data_master')
dir_covid   = get_save_directory_train_test('Data')
print(dir_master)
print(dir_covid)

/SRO/Data_master/Complete_02-16-2019
/SRO/Data/Complete_Covid_03-26-2021


## Clean stress/mindset and psych variables

In [4]:
# Base names (without extension) of the datasets cleaned in this section.
list_files = [
    'meaningful_variables_stress_mindset',
    'meaningful_variables_psych',
]

for list_file in list_files:
    file_name = list_file + '.csv'

    print('*' * 79)
    print('Cleaning this dataset')

    # Load -> power transform -> remove outliers.
    selected_var = get_behav_data(file=file_name, verbose=True)
    vars_pt, pt = fit_transform_pt(selected_var, drop_failed=True)
    vars_pt_clean = remove_outliers(vars_pt)

    # Save the cleaned (not yet imputed) variables.
    clean_csv = path.join(dir_covid, list_file + '_pt_clean.csv')
    vars_pt_clean.to_csv(clean_csv)

    # Impute with missForest and save the imputed variables.
    vars_pt_imputed, error = missForest(vars_pt_clean)
    imputed_csv = path.join(dir_covid, list_file + '_pt_imputed.csv')
    vars_pt_imputed.to_csv(imputed_csv)
    

*******************************************************************************
Cleaning this dataset
Getting dataset: /SRO/Data/Complete_Covid_03-26-2021...:
file: meaningful_variables_stress_mindset.csv 
 
**********************************************************************
Training dataset
**********************************************************************
* Number of variables still skewed: 0
* Successfully transformed 10 variables
* 0 variables could not be transformed successfully
Dropping 0 skewed data that could not be transformed successfully:

**********************************************************************
  missForest iteration
 
1
 
in progress...
done!

  missForest iteration
 
2
 
in progress...
done!

*******************************************************************************
Cleaning this dataset
Getting dataset: /SRO/Data/Complete_Covid_03-26-2021...:
file: meaningful_variables_psych.csv 
 
***************************************************************

## Clean self-regulation variables

In [5]:
# Load meaningful_variables.csv twice: once from the default dataset (the covid
# collection, per the printed path) and once from the 'Data_master' subset.
selected_var_covid_test  = get_behav_data(verbose=True)
selected_var_master      = get_behav_data(data_subset = 'Data_master', verbose=True)

Getting dataset: /SRO/Data/Complete_Covid_03-26-2021...:
file: meaningful_variables.csv 
 
Getting dataset: /SRO/Data_master/Complete_02-16-2019...:
file: meaningful_variables.csv 
 


In [6]:
#Some variables from master have different names, as a different layout of the survey was used
#1.holt_laury_survey                             holt_laury_survey_correctlayout
#2.selection_optimization_compensation_survey    selection_optimization_compensation_survey_correctlayout
#3.sensation_seeking_survey                      sensation_seeking_survey_correctlayout
surveys_to_relabel = ('holt_laury_survey',
                      'selection_optimization_compensation_survey',
                      'sensation_seeking_survey')

def add_correctlayout_suffix(column, surveys = surveys_to_relabel, suffix = '_correctlayout'):
    """Return ``column`` with ``suffix`` appended to any survey name listed in ``surveys``.

    A survey name that already carries the suffix is left alone, so applying
    the function twice gives the same result as applying it once. This keeps
    the cell safe to re-run without producing '..._correctlayout_correctlayout'.
    """
    for survey in surveys:
        relabelled = survey + suffix
        if survey in column and relabelled not in column:
            column = column.replace(survey, relabelled)
    return column

selected_var_master.columns = [add_correctlayout_suffix(i) for i in selected_var_master]

**Split the master dataset into subjects whose data were collected and analyzed only before the onset of COVID (master, N = 386) and subjects whose data were collected and analyzed both before (master) and after (covid) the onset of COVID (N = 107)**

In [7]:
# Load the train / test subject definitions for the master dataset from data_dir.
# Their keys are used as subject ids in the following cells.
dict_train_master   = get_admin_data(data_dir, 'sro_train_turkers_master.json')
dict_test_master    = get_admin_data(data_dir, 'sro_test_turkers_master.json')

In [8]:
# Subject ids of each split = keys of the loaded definitions.
train_master = list(dict_train_master.keys()) 
test_master  = list(dict_test_master.keys()) 

In [9]:
# Sort the id lists in place so the row order of the split frames is deterministic.
train_master.sort()
test_master.sort()

In [10]:
# Select the rows of each split by subject id; rows come back in the order of the id lists.
selected_var_master_train = selected_var_master.loc[train_master]
selected_var_master_test = selected_var_master.loc[test_master]

print("Number of subject for master training, those tested only once:", selected_var_master_train.shape[0])
print("Number of subject for master testing:", selected_var_master_test.shape[0])


Number of subject for master training, those tested only once: 386
Number of subject for master testing: 107


In [11]:
# Sanity checks: the training frame holds exactly the requested subjects, in the same order.
assert selected_var_master_train.shape[0] == len(train_master), 'data has wrong number of subjects'
assert list(selected_var_master_train.index) == train_master, 'data has wrong id subjects'

In [12]:
# Sanity checks: the test frame holds exactly the requested subjects, in the same order.
assert selected_var_master_test.shape[0] == len(test_master), 'data has wrong number of subjects'
assert list(selected_var_master_test.index) == test_master, 'data has wrong id subjects'

In [13]:
# Column counts of the three raw datasets (one column per meaningful variable).
print('Number of meaningful variables, master train:', selected_var_master_train.shape[1])
print('Number of meaningful variables, master test:', selected_var_master_test.shape[1])
print('Number of meaningful variables, covid test:', selected_var_covid_test.shape[1])


Number of meaningful variables, master train: 204
Number of meaningful variables, master test: 204
Number of meaningful variables, covid test: 204


**Transform variables with Power Transform**

In [14]:
# Fit the power transform on the master training set only, then reuse the fitted
# transformer `pt` for both test sets.
# With drop_failed=True each call can drop a different set of variables (the
# printed logs report 3 dropped for training and 4 for testing), so the three
# frames do not necessarily share the same columns after this cell.
master_train_pt, pt  = fit_transform_pt(selected_var_master_train, drop_failed=True )
master_test_pt       = transform_pt(pt, selected_var_master_test, drop_failed=True )
covid_test_pt        = transform_pt(pt, selected_var_covid_test, drop_failed=True )

**********************************************************************
Training dataset
**********************************************************************
* Number of variables still skewed: 4
* Successfully transformed 201 variables
* 3 variables could not be transformed successfully
Dropping 3 skewed data that could not be transformed successfully:
bickel_titrator.hyp_discount_rate_medium
bickel_titrator.hyp_discount_rate_small
bickel_titrator.hyp_discount_rate_large
**********************************************************************
**********************************************************************
Testing dataset
**********************************************************************
* Number of variables still skewed: 10
* Successfully transformed 200 variables
* 4 variables could not be transformed successfully
Dropping 4 skewed data that could not be transformed successfully:
bickel_titrator.hyp_discount_rate_medium
bickel_titrator.hyp_discount_rate_small
bickel_titrat

**Remove outliers and correlated variables**

In [15]:
# Clean each dataset independently.
#   *_pt_out   : output of remove_outliers
#   *_pt_clean : *_pt_out after remove_correlated_task_variables
# The set of correlated variables dropped can differ between datasets (see the printed logs).
master_train_pt_out, master_train_pt_clean = clean_dataset(selected_var_pt = master_train_pt)
master_test_pt_out, master_test_pt_clean   = clean_dataset(selected_var_pt = master_test_pt)
covid_test_pt_out, covid_test_pt_clean     = clean_dataset(selected_var_pt = covid_test_pt)

**************************************************
Dropping 17 variables with correlations above 0.85
**************************************************
angling_risk_task_always_sunny.release_score.pt
angling_risk_task_always_sunny.keep_score.pt
holt_laury_survey_correctlayout.risk_aversion.pt
kirby.percent_patient_small.pt
kirby.percent_patient_medium.pt
kirby.percent_patient_large.pt
kirby.percent_patient.pt
probabilistic_selection.value_sensitivity.pt
stim_selective_stop_signal.hddm_thresh.pt
stim_selective_stop_signal.hddm_drift.pt
stim_selective_stop_signal.reactive_control_hddm_drift.pt
stim_selective_stop_signal.hddm_non_decision.pt
stim_selective_stop_signal.SSRT.pt
tower_of_london.num_extra_moves.pt
tower_of_london.planning_time.pt
tower_of_london.num_optimal_solutions.pt
tower_of_london.avg_move_time.pt
**************************************************
Dropping 15 variables with correlations above 0.85
**************************************************
angling_risk_task_alwa

**Keep only variables common to all three datasets**

In [16]:
# Variables that survived cleaning in every one of the three datasets.
common_vars = set(master_train_pt_clean) & set(master_test_pt_clean) & set(covid_test_pt_clean)

# Restrict each frame to the shared variables; the helper returns columns sorted by name.
master_train_pt_clean = drop_not_common_vars(master_train_pt_clean, common_vars)
master_test_pt_clean = drop_not_common_vars(master_test_pt_clean, common_vars)
covid_test_pt_clean = drop_not_common_vars(covid_test_pt_clean, common_vars)

In [17]:
# Number of variables left in each dataset after restricting to the common set.
print(master_train_pt_clean.shape[1])
print(master_test_pt_clean.shape[1])
print(covid_test_pt_clean.shape[1])

178
178
178


**Save cleaned common variables**

In [18]:
# Write the cleaned (not imputed) variables: master train/test go to dir_master,
# the covid test set goes to dir_covid.
# NOTE(review): to_csv needs the target folders to exist -- presumably
# get_save_directory_train_test takes care of that; confirm.
master_train_pt_clean.to_csv(path.join(dir_master , 'meaningful_variables_pt_clean_'  + 'train' + '.csv'))
master_test_pt_clean.to_csv(path.join(dir_master ,  'meaningful_variables_pt_clean_'   + 'test' + '.csv'))
covid_test_pt_clean.to_csv(path.join(dir_covid ,    'meaningful_variables_pt_clean_'   + 'test' + '.csv'))

**Impute datasets**

In [19]:
# Impute each dataset on its own with missForest.
# `error` is overwritten by every call, so only the value returned for the
# covid test set is still available after this cell.
# NOTE(review): no random seed is set in this notebook -- check whether missForest
# seeds itself if reproducible imputations are required.
master_train_pt_imputed, error  = missForest(master_train_pt_clean)
master_test_pt_imputed, error   = missForest(master_test_pt_clean)
covid_test_pt_imputed, error    = missForest(covid_test_pt_clean)

  missForest iteration
 
1
 
in progress...
done!

  missForest iteration
 
2
 
in progress...
done!

  missForest iteration
 
3
 
in progress...
done!

  missForest iteration
 
4
 
in progress...
done!

  missForest iteration
 
5
 
in progress...
done!

  missForest iteration
 
1
 
in progress...
done!

  missForest iteration
 
2
 
in progress...
done!

  missForest iteration
 
3
 
in progress...
done!

  missForest iteration
 
4
 
in progress...
done!

  missForest iteration
 
1
 
in progress...
done!

  missForest iteration
 
2
 
in progress...
done!

  missForest iteration
 
3
 
in progress...
done!

  missForest iteration
 
4
 
in progress...
done!

  missForest iteration
 
5
 
in progress...
done!

  missForest iteration
 
6
 
in progress...
done!



**Save imputed common variables**

In [20]:
# Write the imputed variables: master train/test go to dir_master,
# the covid test set goes to dir_covid.
master_train_pt_imputed.to_csv(path.join(dir_master , 'meaningful_variables_pt_imputed_'  + 'train' + '.csv'))
master_test_pt_imputed.to_csv(path.join(dir_master ,  'meaningful_variables_pt_imputed_'   + 'test' + '.csv'))
covid_test_pt_imputed.to_csv(path.join(dir_covid ,    'meaningful_variables_pt_imputed_'   + 'test' + '.csv'))

In [21]:
# Missing-value summary of the cleaned (pre-imputation) datasets.
# perc_missing / perc_var_no_missing keep the values of the last dataset reported.
datasets_to_report = [
    ('Master train', master_train_pt_clean),
    ('Master test', master_test_pt_clean),
    ('Covid test', covid_test_pt_clean),
]
for dataset_label, dataset in datasets_to_report:
    print('*' * 79)
    print(dataset_label)
    perc_missing, perc_var_no_missing = get_missing_values_data(dataset)

*******************************************************************************
Master train
Missing values: % 3.49
Var without missing values: % 35.39
Var with missing values above 10%:
probabilistic_selection.positive_learning_bias.pt                                   0.334197
motor_selective_stop_signal.hddm_non_decision.pt                                    0.318653
motor_selective_stop_signal.hddm_thresh.pt                                          0.303109
motor_selective_stop_signal.reactive_control_hddm_drift.pt                          0.300518
motor_selective_stop_signal.proactive_control_hddm_drift.pt                         0.300518
motor_selective_stop_signal.hddm_drift.pt                                           0.300518
motor_selective_stop_signal.SSRT.pt                                                 0.300518
discount_titrate.percent_patient.pt                                                 0.300518
two_stage_decision.model_free.pt                                     