This notebook provides a summary of the predictive analyses using task or survey data to predict demographic/health measures.  

In [2]:
import os,glob,sys
import pickle
import numpy,pandas
pandas.options.display.max_colwidth = 0
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display, HTML
import seaborn as sns
import scipy.stats
from statsmodels.sandbox.stats.multicomp import multipletests
%load_ext rpy2.ipython
from scipy.cluster.hierarchy import dendrogram,ward,cut_tree,leaves_list
from scipy.spatial.distance import pdist
from sklearn.preprocessing import StandardScaler

import selfregulation.prediction.behavpredict_V1 as behavpredict

# Classifier whose collapsed prediction results are summarized in this notebook.
clf='lasso'
# Load the (accuracy, feature-weight) pair produced by the prediction runs.
# The file is opened in a context manager so the handle is closed after loading.
# NOTE: pickle can execute arbitrary code on load -- only use result files you generated yourself.
with open('singularity_analyses/wrangler/%s_data_collapsed.pkl'%clf,'rb') as pklfile:
    acc,features=pickle.load(pklfile)
cont_measure='r2' # use r^2 or MAE for non-binary variables

Check all variables to make sure they have the correct number of observations (1000 for baseline and baseline_shuffle, 100 for all others), and create tables summarizing data.

In [3]:
# allvars maps each outcome variable to the accuracy measure used for it;
# datasets lists the keys of acc that actually contain results.
allvars={}
datasets=[]
for k in acc:
    if not len(acc[k]):
        print('no data for',k)
    else:
        datasets.append(k)
        # variables scored with the continuous measure first, then AUROC variables,
        # so a variable present in both ends up labelled 'AUROC'
        allvars.update(dict.fromkeys(acc[k][cont_measure]['scores_cv'],cont_measure))
        allvars.update(dict.fromkeys(acc[k]['AUROC']['scores_cv'],'AUROC'))

# Empty summary tables, one per measure, filled in further down this cell.
alldata={'r2':pandas.DataFrame(),'MAE':pandas.DataFrame(),'AUROC':pandas.DataFrame(),
        'r2_pval':pandas.DataFrame()}
# dataset names seen in earlier runs (kept for reference only):
#['baseline_shuffle','baseline','task','survey','discounting',
#          'stopping','intelligence',
#          'impulsivity','big5','risktaking','grit','emotion','bisbas',
#          'drift','nondecision','thresh']

# target_n[d]: expected number of CV scores per variable for dataset d
# goodcount[d][v]: number of finite CV scores actually present for variable v
target_n={}
goodcount={}
for d in datasets:
    # Guard on the dataset being processed. The previous version tested the stale
    # loop variable `k` left over from the loop above, so one empty dataset at the
    # end of `acc` would have silently skipped every dataset here.
    if len(acc[d])==0:
        print('no data for',d)
        continue

    goodcount[d]={}
    # baseline and baseline_shuffle are expected to have 1000 runs, all others 100
    target_n[d]=1000 if d.find('baseline')>-1 else 100
    # report the number of feature columns, using the first variable as an example
    examplefeature=list(features[d].keys())[0]
    print(d,features[d][examplefeature].shape[1])

    # NOTE(review): only variables listed under the 'r2' scores are checked here,
    # so variables that appear only under 'AUROC' are not counted -- confirm intent.
    for v in acc[d]['r2']['scores_cv']:
        if not v in acc[d][allvars[v]]['scores_cv']:
            goodcount[d][v]=0
        else:
            goodcount[d][v]=numpy.isfinite(acc[d][allvars[v]]['scores_cv'][v]).sum()
        # flag variables with fewer finite scores than expected
        if goodcount[d][v]<target_n[d]:
            print(d,v,goodcount[d][v],features[d][v].shape[1])

# Build the summary tables (rows: variables, columns: datasets) holding the mean
# cross-validated score of each variable in each dataset.
# The per-dataset means are computed once per measure and the rows are selected with
# reindex, instead of growing each table one row at a time with DataFrame.append
# (removed in pandas 2.0, and quadratic in the number of variables).
continuous_vars=[v for v in allvars if allvars[v]==cont_measure]
other_vars=[v for v in allvars if allvars[v]!=cont_measure]
# variables labelled with cont_measure fill both the 'r2' and 'MAE' tables;
# all remaining variables fill the 'AUROC' table
for measure,rows in (('r2',continuous_vars),('MAE',continuous_vars),('AUROC',other_vars)):
    # datasets lacking this measure are left out, as before
    dataset_means={k:acc[k][measure]['scores_cv'].mean() for k in datasets if measure in acc[k]}
    # variables missing from a dataset show up as NaN in that dataset's column
    alldata[measure]=pandas.DataFrame(dataset_means).reindex(rows)
   


no data for emotion
intelligence 5
stroop 6
nondecision 15
stopping 14
stop_signal 8
bisbas 6
grit 3
discounting 8
threebytwo 6
attention_network_task 8
motor_selective_stop_signal 2
discount_titrate 3
thresh 15
risktaking 21
baseline_shuffle 2
baseline_shuffle DivorceCount 999 2
tower_of_london 6
dot_pattern_expectancy 9
baseline 2
kirby 5
big5 7
task 107
task HowOftenGuiltRemorseDrinking 33 107
task AlcoholHowOften6Drinks 40 107
task HowOftenFailedActivitiesDrinking 24 107
impulsivity 13
survey 67
drift 27
columbia_card_task_hot 7


Compute p values

In [4]:
target='baseline'
null='baseline_shuffle'
def get_pval(target,null,allvars,datasets,acc):
    """Compare cross-validated scores of one dataset against a null dataset.

    Parameters
    ----------
    target, null : keys of ``acc`` naming the dataset of interest and the
        dataset whose score distribution serves as the null.
    allvars : dict mapping variable name -> measure name (key into acc[dataset]).
    datasets : unused; kept so existing calls keep working.
    acc : nested mapping read as acc[dataset][measure]['scores_cv'][variable]
        and acc[dataset][measure]['scores_insample'][variable].

    Returns
    -------
    pandas.DataFrame indexed by the sorted variable names with columns
    'Measure', 'Target mean', 'Null Mean', 'Effect size', 'In-sample', 'p_unc'.
    Variables (or whole measures) missing from either dataset get NaN in all
    numeric columns.

    Notes
    -----
    p_unc is 1 minus the percentile of the target mean within the null scores,
    so it is small when the target mean exceeds most null scores.
    NOTE(review): that direction presumably only suits measures where larger is
    better -- confirm before using it with an error measure such as MAE.
    """
    data=[]
    varnames=sorted(allvars.keys())
    for v in varnames:
        measure=allvars[v]
        # a measure may be absent from a dataset altogether (plotvars guards for
        # the same case); report NaNs instead of raising KeyError
        if (measure not in acc[target] or measure not in acc[null]
                or v not in acc[target][measure]['scores_cv']
                or v not in acc[null][measure]['scores_cv']):
            data.append([measure,numpy.nan,numpy.nan,numpy.nan,numpy.nan,numpy.nan])
            continue
        targdist=acc[target][measure]['scores_cv'][v].dropna()
        targmean=targdist.mean()
        nulldist=acc[null][measure]['scores_cv'][v].dropna()
        nullmean=nulldist.mean()
        targstd=targdist.std()
        pval=1-scipy.stats.percentileofscore(nulldist,targmean)/100.
        if targstd>0:
            # raw difference in means; the standardized version
            # (targmean-nullmean)/targstd was tried and disabled
            es=targmean-nullmean
        else:
            es=numpy.nan
        insample=acc[target][measure]['scores_insample'][v].mean()
        data.append([measure,targmean,nullmean,es,insample,pval])
    df=pandas.DataFrame(data,index=varnames,columns=['Measure','Target mean','Null Mean','Effect size','In-sample','p_unc'])
    return(df)
        
# pvals maps (dataset, null dataset) -> table returned by get_pval.
pvals={}
# baseline model against its label-shuffled counterpart
pvals[('baseline','baseline_shuffle')]=get_pval('baseline','baseline_shuffle',allvars,datasets,acc)
# every other dataset against the baseline model
for d in datasets:
    if d.find('baseline')>-1 or len(acc[d])==0:
        continue
    print(d)
    pvals[(d,'baseline')]=get_pval(d,'baseline',allvars,datasets,acc)

pvals_fdr={}
for k in pvals:
    p_unc=pvals[k]['p_unc']
    # variables with no scores have NaN p-values; leave them out of the correction
    # so they neither count as tests nor propagate NaN
    tested=p_unc.notnull()
    p_fdr=pandas.Series(numpy.nan,index=p_unc.index)
    if tested.any():
        # request Benjamini-Hochberg FDR explicitly: multipletests defaults to
        # method='hs' (Holm-Sidak), which is not what the 'p_fdr' column name says
        p_fdr[tested]=multipletests(p_unc[tested],method='fdr_bh')[1]
    pvals[k]['p_fdr']=p_fdr

intelligence
stroop
nondecision
stopping
stop_signal
bisbas
grit
discounting
threebytwo
attention_network_task
motor_selective_stop_signal
discount_titrate
thresh
risktaking
tower_of_london
dot_pattern_expectancy
kirby
big5
task
impulsivity
survey
drift
columbia_card_task_hot


In [5]:
def get_importances(v,dt,features,nfeats=3):
    """Return the most frequently used features for variable ``v`` in dataset ``dt``.

    'importance' is the fraction of rows of features[dt][v] in which a feature
    is nonzero; 'mean' is the feature's mean over those rows. At most ``nfeats``
    features are returned, ordered by decreasing importance, and features that
    are never nonzero are dropped. Returns None (after printing a message) when
    ``v`` has no entry in features[dt].
    """
    if v not in features[dt]:
        print(v,'is not in features for',dt)
        return None

    weights=features[dt][v]
    ranked=pandas.DataFrame({'importance':(weights.abs()>0).mean(0)})
    ranked['mean']=weights.mean(0)
    ranked=ranked.sort_values(by='importance',ascending=False)

    # never ask for more rows than there are features
    n_keep=min(nfeats,ranked.shape[0])
    return ranked.iloc[:n_keep].query('importance>0')

def get_importance_list(sigp,dt,features):
    """Tabulate the top feature names for every variable in ``sigp``.

    For each label in sigp.index, the top features for dataset ``dt`` are looked
    up with get_importances. Returns a DataFrame indexed like ``sigp`` with one
    column, 'top features', whose cells hold a one-element list wrapping the list
    of feature names (same nesting as before). Variables that have no features
    for ``dt`` get an empty name list instead of raising AttributeError.
    """
    implist=[]
    for v in sigp.index:
        topfeats=get_importances(v,dt,features)
        # get_importances returns None when v is absent from features[dt]
        names=[] if topfeats is None else list(topfeats.index)
        implist.append([names])
    df=pandas.DataFrame({'top features':implist})
    df.index=sigp.index
    return df

# plot var for all datasets
def plotvars(v,pvals,datasets,allvars,plotcutoff=True,
            plotbaseline=False):
    """Bar plot of the mean cross-validated score of variable ``v`` per dataset.

    Scores are read from the notebook-level ``acc``; datasets that lack the
    variable or its measure are left out. ``pvals`` is accepted but not used.
    With ``plotcutoff`` a dashed line marks the 0.95 quantile of the baseline
    scores; with ``plotbaseline`` a dashed line marks the baseline mean.

    NOTE(review): the error bars are standard deviations (``.std()``) although
    the axis label reads 'SE' -- confirm which one is intended.
    """
    measure=allvars[v]
    labels=[]
    means=[]
    spreads=[]
    for k in datasets:
        # skip datasets without this measure or without this variable
        if measure not in acc[k] or v not in acc[k][measure]['scores_cv']:
            continue
        scores=acc[k][measure]['scores_cv'][v].dropna()
        labels.append(k)
        means.append(scores.mean())
        spreads.append(scores.std())
    mean_df=pandas.DataFrame({'mean':means},index=labels)
    err_df=pandas.DataFrame({'mean':spreads},index=labels)
    bar_args={'yerr':err_df,'legend':False}
    if measure=='AUROC':
        # start the axis just below 0.5 for AUROC
        bar_args['ylim']=(0.45,numpy.max(mean_df.values)*1.1)
    mean_df.plot.bar(**bar_args)
    plt.title(v)
    plt.ylabel(measure+' +/- SE across CV runs')
    if plotcutoff:
        cutoff=acc['baseline'][measure]['scores_cv'][v].dropna().quantile(0.95)
        plt.plot([0,1000],[cutoff,cutoff],'k--',linewidth=0.5)
    if plotbaseline:
        baseline=acc['baseline'][measure]['scores_cv'][v].dropna().mean()
        plt.plot([0,1000],[baseline,baseline],'k--',linewidth=0.5)


### Assess survey variables in terms of their overall predictive utility


In [6]:
# Summarize how often each survey predictor is used across all outcome variables.
# k is assigned here but not read anywhere in this cell.
k=('survey','baseline')
# df: one column per outcome variable, holding each predictor's mean over axis 0
# absfeat: one column per outcome variable, holding the fraction of rows in which
#          each predictor is nonzero
df=pandas.DataFrame()
absfeat=pandas.DataFrame()

# presumably each features['survey'][v] has one row per fitted model and one
# column per predictor -- TODO confirm against the prediction code
for v in features['survey']:
    df[v]=features['survey'][v].mean(0)
    absfeat[v]=(features['survey'][v].abs()>0).mean()
    
# average across outcome variables (axis 1)
mean_imp=df.mean(1)
# predictors ranked by their mean nonzero fraction, highest first
meanabs_survey=pandas.DataFrame({'meanabs':absfeat.mean(1)}).sort_values(by='meanabs',ascending=False)


In [7]:
# Same summary as the survey cell, for the task predictors.
# NOTE(review): this cell repeats the survey logic and reassigns df, absfeat and
# mean_imp, so the survey versions of those three are no longer available afterwards.
# df: one column per outcome variable, holding each predictor's mean over axis 0
# absfeat: one column per outcome variable, holding the fraction of rows in which
#          each predictor is nonzero
df=pandas.DataFrame()
absfeat=pandas.DataFrame()

for v in features['task']:
    df[v]=features['task'][v].mean(0)
    absfeat[v]=(features['task'][v].abs()>0).mean()
    
# average across outcome variables (axis 1)
mean_imp=df.mean(1)
# predictors ranked by their mean nonzero fraction, highest first
meanabs_task=pandas.DataFrame({'meanabs':absfeat.mean(1)}).sort_values(by='meanabs',ascending=False)


## Visualize structure of demographic target variables

In [8]:
# Build the project's BehavPredict helper and load the demographic outcome data;
# bp.demogdata is read by the next cell.
# The four skip_vars are reported as dropped in this cell's output.
# NOTE(review): the meaning of drop_na_thresh, add_baseline_vars and freq_threshold
# is defined in selfregulation.prediction.behavpredict_V1, not in this notebook --
# confirm there before changing them.
bp=behavpredict.BehavPredict(verbose=True,
     drop_na_thresh=100,
     skip_vars=['RetirementPercentStocks',
     'HowOftenFailedActivitiesDrinking',
     'HowOftenGuiltRemorseDrinking',
     'AlcoholHowOften6Drinks'],
     add_baseline_vars=True,
     freq_threshold=0.1)
bp.load_demog_data()
bp.get_demogdata_vartypes()


replacing bad WeightPounds value for Index(['s028'], dtype='object')
replacing bad HeightInches value for Index(['s462', 's513', 's517'], dtype='object')
replacing bad CaffienatedSodaCansPerDay value for Index(['s108'], dtype='object')
dropping categorical variable: HispanicLatino
dropping categorical variable: Race
dropping categorical variable: DiseaseDiagnoses
dropping categorical variable: DiseaseDiagnosesOther
dropping categorical variable: MotivationForParticipation
dropping categorical variable: MotivationOther
dropping categorical variable: NeurologicalDiagnoses
dropping categorical variable: NeurologicalDiagnosesDescribe
dropping categorical variable: OtherDebtSources
dropping categorical variable: OtherDrugs
dropping categorical variable: OtherRace
dropping categorical variable: OtherTobaccoProducts
dropping categorical variable: PsychDiagnoses
dropping categorical variable: PsychDiagnosesOther
dropping skipped variable: RetirementPercentStocks
dropping skipped variable: HowO

In [9]:
# Work on a copy and keep only variables that have task or survey features.
demogdata=bp.demogdata.copy()
for i in demogdata.columns:
    if i in features['task'] or i in features['survey']:
        continue
    del demogdata[i]
    print('removing',i)

# Transpose so that variables are rows, then mark variables with fewer than
# 10 missing values as usable.
demogdata=demogdata.T
demogdata['goodvar']=demogdata.isnull().sum(1)<10
demogdata_clean=demogdata[demogdata['goodvar']==True]
print(demogdata.shape)
# drop the helper column and go back to one column per variable
demogdata_clean=demogdata_clean.drop('goodvar',axis=1).T

# these are bad vars that don't have features
dropvars=['HowOftenCantStopDrinking',
          'HowOftenFailedActivitiesDrinking',
          'HowOftenGuiltRemorseDrinking',
          'AlcoholHowOften6Drinks']

for v in dropvars:
    if v not in demogdata_clean:
        continue
    del demogdata_clean[v]
    print('removing',v)
from sklearn.linear_model import LinearRegression
from fancyimpute import SimpleFill

def residualize_baseline(df):
    """Remove the linear effect of Age and Sex from every other column of ``df``.

    Each remaining column is regressed on ['Age','Sex'] with LinearRegression and
    replaced by its residuals. Missing values in a column are filled with that
    column's mean before fitting. The mean fill is done with pandas instead of
    fancyimpute's ``SimpleFill().complete`` (an API that newer fancyimpute releases
    replaced with ``fit_transform``), which also keeps the regression target 1-D
    on both code paths.
    NOTE(review): assumes SimpleFill was used with its default mean fill, as the
    original call passed no arguments.

    Age and Sex themselves are not imputed, so they must be complete.
    Returns a new DataFrame without the Age and Sex columns; ``df`` is not modified.
    """
    baseline=df[['Age','Sex']]
    # drop returns a new frame, so the caller's data is left untouched
    data=df.drop(['Age','Sex'],axis=1)
    lr=LinearRegression()
    for v in data:
        y=data[v]
        if y.isnull().any():
            y=y.fillna(y.mean())
        lr.fit(baseline,y)
        data[v]=y-lr.predict(baseline)
    return data
df_resid=residualize_baseline(demogdata_clean)


removing GamblingProblem
removing TrafficTicketsLastYearCount
removing HowOftenDrinkMorning
removing HowOftenCantStopCannabis
removing HowOftenFailedActivitiesCannabis
removing HowOftenDevotedTimeCannabis
removing HowOftenMemoryConcentrationProblemCannabis
removing HowOftenHazardousCannabis
removing CannabisConsideredReduction
removing AbuseMoreThanOneDrugAtATime
removing BlackoutFlashbackDrugUse
removing FeelBadGuiltyDrugUse
removing SpouseParentsComplainDrugUse
removing NeglectedFamilyDrugUse
removing EngagedInIllegalActsToObtainDrugs
removing WidthdrawalSymptoms
removing MedicalProblemsDueToDrugUse
removing DoctorVisitsLastMonth
(51, 523)
removing HowOftenCantStopDrinking


In [11]:
# Write the Age/Sex-residualized demographic variables to disk.
# NOTE(review): the path is relative to the notebook's working directory and names a
# specific data release; this cell's execution count (11) is also out of order.
df_resid.to_csv('../Data/Derived_Data/Complete_10-08-2017/demog_residAgeSex.csv')

In [9]:
# Hierarchical clustering of the residualized outcome variables.
# dthresh: height at which the dendrogram is cut into clusters
dthresh=2.0
# dissimilarity between variables: 1 - |Spearman correlation|
dist=1-numpy.abs(df_resid.corr(method='spearman'))
# NOTE(review): ward() is given a 2-D array here. SciPy's linkage functions treat a
# 2-D input as an observation matrix (rows = points), not as a distance matrix, so
# this clusters the rows of the upper-triangular matrix rather than the distances
# in `dist`. The clusters shown in the output are contiguous runs of columns,
# which is consistent with that. A condensed distance vector
# (scipy.spatial.distance.squareform) is probably what was intended; changing it
# means re-tuning dthresh and re-deriving matchdesc below -- confirm before relying
# on these clusters.
k=ward(numpy.triu(dist))
c=cut_tree(k,height=dthresh)
ll=leaves_list(k)

# matches: cluster id -> variable names
# matchnums: cluster id -> column positions in df_resid
# clustdict: variable name -> cluster id
matches={}
matchnums={}
clustdict={}
for i in numpy.unique(c):
    matches[i]=[]
    matchnums[i]=[]
    for j in numpy.where(c==i)[0]:
        matches[i].append(df_resid.columns[j])
        clustdict[df_resid.columns[j]]=i
        matchnums[i].append(j)

# Hand-written labels for the clusters shown in the output below; they are tied to
# this particular clustering result and must be revisited if it changes.
matchdesc={0:'education/height/weight',1:'relationships',2:'domestic',3:'financial/coffee',
          4:'caffeine',5:'legal problems',6:'smoking',7:'alcohol use',
          8:'alcohol/drug problems',9:'mental health',10:'obesity'}

matches

{0: ['HighestEducation', 'HeightInches', 'WeightPounds'],
 1: ['RelationshipStatus', 'DivorceCount', 'LongestRelationship'],
 2: ['RelationshipNumber', 'ChildrenNumber', 'HouseholdIncome'],
 3: ['RetirementAccount', 'RentOwn', 'CoffeeCupsPerDay'],
 4: ['TeaCupsPerDay',
  'CaffienatedSodaCansPerDay',
  'CaffieneOtherSourcesDayMG'],
 5: ['TrafficAccidentsLifeCount', 'ArrestedChargedLifeCount'],
 6: ['LifetimeSmoke100Cigs',
  'HowLongSmoked',
  'SmokeEveryDay',
  'CigsPerDay',
  'HowSoonSmokeAfterWaking'],
 7: ['AlcoholHowOften',
  'AlcoholHowManyDrinksDay',
  'HowOftenUnableRememberDrinking'],
 8: ['InjuredDrinking',
  'RelativeFriendConcernedDrinking',
  'CannabisPast6Months',
  'AbleToStopDrugs'],
 9: ['Nervous',
  'Hopeless',
  'RestlessFidgety',
  'Depressed',
  'EverythingIsEffort',
  'Worthless',
  'Last30DaysUsual'],
 10: ['BMI', 'Obese']}

## Clustering on predictor loadings


Exploratory: the resulting clustering looks poor.

In [None]:
# Collect the mean feature loadings exported per outcome variable for the survey models.
# NOTE(review): absolute, user-specific path -- this cell only runs on the original
# author's machine; point it at a configurable directory before reuse.
surveyfiles=glob.glob('/Users/poldrack/code/Self_Regulation_Ontology/prediction_analyses/R_exports_lasso/features/survey*')
dropvars=['Age','Sex']
loadingdata={'survey':None}
include_task=False

for f in surveyfiles:
    # file names look like survey_<variable>_...; take the variable part
    varname=f.split('survey')[1].split('_')[1]
    # Skip the Age/Sex outcome files. The previous inner loop's `continue` only
    # affected that inner loop, so these files were loaded and then deleted below.
    if varname in dropvars:
        continue
    sdata=pandas.read_csv(f).mean(0)
    if include_task:
        tf=f.replace('features/survey_','features/task_')
        if not os.path.exists(tf):
            print('skipping',varname)
            continue
        tdata=pandas.read_csv(tf).mean(0)
        combined=pandas.concat((tdata,sdata))
    else:
        combined=sdata
    if loadingdata['survey'] is None:
        loadingdata['survey']=pandas.DataFrame({varname:combined})
    else:
        loadingdata['survey'][varname]=combined

if loadingdata['survey'] is None:
    # fail with a clear message instead of an AttributeError on None below
    raise IOError('no survey feature files were loaded')

# remove the Age/Sex predictor rows, and any Age/Sex outcome columns if present
loadingdata['survey']=loadingdata['survey'].drop(dropvars)
loadingdata['survey']=loadingdata['survey'].drop(dropvars,axis=1,errors='ignore')
# Where a variable exists both as-is and as '<name>.binarized', keep only the former.
# Local names are used here (not allvars / alldata / c) so that this cell no longer
# overwrites the notebook-level dict `allvars`, the `alldata` tables, or the
# cluster assignments `c` defined in earlier cells.
loading_vars=[i for i in list(loadingdata['survey'].columns) if not i.find('.binarized')>-1]
for colname in loading_vars:
    if '%s.binarized'%colname in loadingdata['survey']:
        del loadingdata['survey']['%s.binarized'%colname]

## Factor analysis on outcome measures 
Exploratory - don't use this

In [None]:
%%R -i df_resid -o scores,loadings,varnames

# Exploratory factor analysis of the residualized outcome variables.
# Three variables are excluded before fitting.
dropvars <- names(df_resid) %in% c("HeightInches", "WeightPounds", "CigsPerDay") 
print(dropvars)
df <- df_resid[,!dropvars]
# varnames is requested as an output of this cell (-o) but was never assigned,
# which makes the cell fail when outputs are pulled back into Python.
# Export the names of the variables that entered the factor analysis.
varnames <- names(df)


library(psych)
library(semPlot)
# choose the number of factors (up to 16) that minimizes BIC
vss.result=VSS(df,16,fm='mle',plot=FALSE)
#print(vss.result)
nfactor=which.min(vss.result$vss.stats$BIC)
fa.result=fa(df,nfactors=nfactor,fm='mle')
loadings=fa.result$loadings
print(fa.result,cut=0.2,sort=TRUE)
scores=factor.scores(df,fa.result,method='tenBerge')$scores
semPaths(fa.result)
#clst=iclust(df_resid)
#clst=iclust(df_resid)

In [None]:
# Labels given to the factor-score columns, in column order.
# NOTE(review): six labels are hard-coded while the number of factors is chosen
# from the data in the R cell -- the labels presumably describe one particular
# run and need checking whenever the factor solution changes.
factor_labels=['smoking severity','mental illness',
               'smoking','obesity',
               'alcohol','domestic']
scores_df=pandas.DataFrame(scores,index=df_resid.index,columns=factor_labels)
scores_df.to_csv("../Data/Derived_Data/Complete_10-08-2017/factor_scores.csv")

In [None]:
%%R

library(psych)
library(MASS)
library(semPlots)