# Description/How to Use

Please message Susan with any issues.

**Making sure that the code is up to date with my code**
- git remote add upstream https://github.com/susanhao/DPA_Project.git
- git remote -v
- git fetch upstream
- git checkout master
- git merge upstream/master

**Directory Setup**
- you should be using git to directly pull and push the folders Analysis and Data
- in the Data folder --> behavioral logs will go into the behavioral folder.  
 -Put the log files from google drive in the appropriate folders.  
   -download whole behavioral logs folder from google drive
 -Download the qualtrics csv from the behavioral session and put it in the correct folder.
- Once you have all the data in the Data folder up to date, you can start running this script.

**Running this Script**
- you should have jupyter notebook already installed (that's how you can view this)
- you should also have nbextensions installed
 - https://github.com/ipython-contrib/jupyter_contrib_nbextensions
 - please make sure that TOC (table of contents) is turned on
 - in the script you can press the TOC icon at the top to see the contents
- please run every cell of this script
 - you can press SHIFT + ENTER to run a cell
- at the end of the script, we will save our results 

**Pushing to Github**
- git add .
- git commit -m "message (ie what subject you analyzed)"
- git push -u origin master


# Setup

In [1]:
import csv
import os
import pandas as pd
import numpy as np
import re
import itertools

#visualization imports
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.animation
import seaborn as sns
sns.set_style("white")
import matplotlib.pyplot as plt


#stats imports
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn import feature_selection
from sklearn.linear_model import LinearRegression

# pandas display/warning configuration
# Setting chained_assignment to None turns off the chained-assignment warning.
pd.options.mode.chained_assignment = None  # default='warn'
# Show up to 70 rows when a DataFrame is displayed.
pd.options.display.max_rows = 70


  import pandas.util.testing as tm


In [None]:
# Data locations: behavioral log folders and the answer keys.
dir_logs = '../../Data/behavioral/behavioral logs/'
dir_answer = 'answer_keys/'

# Score tables that the task cells below fill in.
raw_scores_df = pd.DataFrame()
perc_scores_df = pd.DataFrame()

# Key: text before the first underscore of each entry in dir_logs; value: the full entry name.
task_logs = {
    entry.partition('_')[0]: entry
    for entry in os.listdir(dir_logs)
}

# Excluded/withdrawn subjects go here.
excluded_subjs = [
    'DPA_F016', 'DPA_F039', 'DPA_F046', 'DPA_F047', 'DPA_F050', 'DPA_F051',
    'DPA_F057', 'DPA_F061', 'DPA_F063', 'DPA_F064', 'DPA_F069', 'DPA_F070',
    'DPA_F072',
]

In [2]:
#basic imports
import csv
import os
import pandas as pd
import numpy as np
import re
import itertools

#visualization imports
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.animation
import seaborn as sns
sns.set_style("white")
import matplotlib.pyplot as plt


#stats imports
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn import feature_selection
from sklearn.linear_model import LinearRegression

# pandas display/warning configuration (same settings as the first setup cell)
# Setting chained_assignment to None turns off the chained-assignment warning.
pd.options.mode.chained_assignment = None  # default='warn'
# Show up to 70 rows when a DataFrame is displayed.
pd.options.display.max_rows = 70


# Get Task Scores

## Bhisma's Emotion Task

In [None]:
# Score Bhisma's Emotion Task: for every non-excluded subject, match each
# response to the answer key by stimulus name and store raw / percent correct.
answer_df = pd.read_csv(dir_answer + 'BET_answers.csv', names=['Stim', 'Answer'])
# NOTE(review): 'BET_score_raw' is declared here but never filled; the raw
# score is written to the 'BET' column below.
bet_scores_df = pd.DataFrame(columns=['BET_score_raw', 'BET_score_perc'])
# loop thru log files
for file in os.listdir(dir_logs + task_logs['BET']):
    if 'BET' in file:
        subj = re.search('DPA_F[0-9]{3}', file).group(0)

        if subj not in excluded_subjs:
            clean_log = cleanup_logs(dir_logs + task_logs['BET'] + '/' + file, ['Stim.*', 'Final.*'])
            clean_log.rename({0: 'Stim', 1: 'Response'}, axis=1, inplace=True)

            # Replace everything up to the last '/'. The pattern is a regular
            # expression, so regex=True is passed explicitly: pandas >= 2.0
            # treats the pattern as a literal string by default.
            clean_log['Stim'] = clean_log['Stim'].str.replace(r'.*/', '', regex=True)
            # The response prefix is plain text, so it is replaced literally.
            clean_log['Response'] = clean_log['Response'].str.replace(
                'Final button press: ', '', regex=False).str.lower()

            # merge clean log and answer key on stim
            subj_df = clean_log.merge(answer_df, how='left', left_on='Stim', right_on='Stim')
            subj_df['Correct'] = (subj_df['Response'] == subj_df['Answer']).astype(int)

            bet_scores_df.loc[subj, 'BET'] = subj_df['Correct'].sum()
            bet_scores_df.loc[subj, 'BET_score_perc'] = (subj_df['Correct'].sum()/len(subj_df['Correct']))*100

# BET is the first task, so it starts the two score tables.
raw_scores_df = bet_scores_df[['BET']]
perc_scores_df = bet_scores_df[['BET_score_perc']]

## Brad's Identity Matching

In [None]:
# Brad's identity-matching task: raw and percent-correct score per subject.
answer_df = pd.read_csv(
    dir_answer + 'Brad_shuffled_answer.csv',
    names=['Target', 'Stim1', 'Stim2', 'Stim3', 'Answer'],
)
brad_scores_df = pd.DataFrame(columns=['brad_score_raw', 'brad_score_perc'])

for file in os.listdir(dir_logs + task_logs['Brads']):
    # Skip anything that is not a Brad log or belongs to an excluded subject.
    if 'Brad' not in file:
        continue
    subj = re.search('DPA_F[0-9]{3}', file).group(0)
    if subj in excluded_subjs:
        continue

    clean_log = cleanup_logs(dir_logs + task_logs['Brads'] + '/' + file, ['Prompt.*', 'KEYPRESSED.*'])
    # The first 3 rows are practice trials, so they are dropped before naming the columns.
    clean_log = clean_log.iloc[3:].rename({0: 'Stim', 1: 'Response'}, axis=1)

    # Strip the key-press prefix from the response.
    clean_log['Response'] = clean_log['Response'].str.replace('KEYPRESSED: ', '')

    # The answer key is joined on the row index, not on the stimulus.
    subj_df = clean_log.join(answer_df)
    subj_df['Correct'] = (subj_df['Response'].astype(int) == subj_df['Answer']).astype(int)

    n_correct = subj_df['Correct'].sum()
    brad_scores_df.loc[subj, 'BFMT'] = n_correct
    brad_scores_df.loc[subj, 'brad_score_perc'] = (n_correct/len(subj_df['Correct']))*100

raw_scores_df = raw_scores_df.join(brad_scores_df[['BFMT']], how='outer')
perc_scores_df = perc_scores_df.join(brad_scores_df[['brad_score_perc']], how='outer')

## Films Emotion Task

In [None]:
# Films emotion task: raw and percent-correct score per subject.
answer_df = pd.read_csv(
    dir_answer + 'Films_answers.csv',
    names=['Target', 'Stim1', 'Stim2', 'Stim3', 'Answer'],
)
films_scores_df = pd.DataFrame(columns=['films_score_raw', 'films_score_perc'])

for file in os.listdir(dir_logs + task_logs['films']):
    # Skip anything that is not a Films log or belongs to an excluded subject.
    if 'Films' not in file:
        continue
    subj = re.search('DPA_F[0-9]{3}', file).group(0)
    if subj in excluded_subjs:
        continue

    clean_log = cleanup_logs(dir_logs + task_logs['films'] + '/' + file, ['Prompt.*', 'KEYPRESSED.*'])
    # The first 3 rows are practice trials, so they are dropped before naming the columns.
    clean_log = clean_log.iloc[3:].rename({0: 'Stim', 1: 'Response'}, axis=1)

    # Strip the key-press prefix from the response.
    clean_log['Response'] = clean_log['Response'].str.replace('KEYPRESSED: ', '')

    # The answer key is joined on the row index, not on the stimulus.
    subj_df = clean_log.join(answer_df)
    subj_df['Correct'] = (subj_df['Response'].astype(int) == subj_df['Answer']).astype(int)

    n_correct = subj_df['Correct'].sum()
    films_scores_df.loc[subj, 'Films'] = n_correct
    films_scores_df.loc[subj, 'films_score_perc'] = (n_correct/len(subj_df['Correct']))*100

raw_scores_df = raw_scores_df.join(films_scores_df[['Films']], how='outer')
perc_scores_df = perc_scores_df.join(films_scores_df[['films_score_perc']], how='outer')

## CFMT

In [None]:
# CFMT: raw and percent-correct score per subject.
answer_df = pd.read_csv(dir_answer + 'CFMT_key.csv', names=['keypress', 'prompt'])
CFMT_scores_df = pd.DataFrame(columns=['CFMT_score_raw', 'CFMT_score_perc'])

for file in os.listdir(dir_logs + task_logs['CFMT']):
    # Skip anything that is not a CFMT log or belongs to an excluded subject.
    if 'CFMT' not in file:
        continue
    subj = re.search('DPA_F[0-9]{3}', file).group(0)
    if subj in excluded_subjs:
        continue

    clean_log = cleanup_logs(dir_logs + task_logs['CFMT'] + '/' + file, ['Prompt.*', 'Final.*'])
    # The first 3 rows are practice trials, so they are dropped before naming the columns.
    clean_log = clean_log.iloc[3:].rename({0: 'Stim', 1: 'Response'}, axis=1)

    # Strip the log prefixes from the response and the prompt.
    clean_log['Response'] = clean_log['Response'].str.replace('Final key press: ', '')
    clean_log['Stim'] = clean_log['Stim'].str.replace('Prompt: ', '')

    # The answer key is joined on the row index; the response is compared with its 'prompt' column.
    subj_df = clean_log.join(answer_df)
    subj_df['Correct'] = (subj_df['Response'].astype(int) == subj_df['prompt']).astype(int)

    n_correct = subj_df['Correct'].sum()
    CFMT_scores_df.loc[subj, 'CFMT'] = n_correct
    CFMT_scores_df.loc[subj, 'CFMT_score_perc'] = (n_correct/len(subj_df['Correct']))*100

raw_scores_df = raw_scores_df.join(CFMT_scores_df[['CFMT']], how='outer')
perc_scores_df = perc_scores_df.join(CFMT_scores_df[['CFMT_score_perc']], how='outer')

## Famous Faces

In [None]:
# Famous Faces: percentage of known faces that the subject got right.
# The same percentage is stored in both the raw ('FF') and the percent column.
ff_scores_df = pd.DataFrame(columns=['FF_score_raw', 'FF_score_perc'])

# Self-report answers that count a face as known to the subject.
ff_right = 'Yes! I got it right'
ff_wrong_but_familiar = 'I got it wrong and I am familiar with this face.'

# loop thru log files
for file in os.listdir(dir_logs + task_logs['FF']):
    if 'FF' in file:
        subj = re.search('DPA_F[0-9]{3}', file).group(0)

        if subj not in excluded_subjs:
            clean_log = cleanup_logs(dir_logs + task_logs['FF'] + '/' + file, ['Prompt.*'])
            clean_log.rename({0: 'Response'}, axis=1, inplace=True)

            # replace prompt answer
            clean_log['Response'] = clean_log['Response'].str.replace('Prompt Answer: ', '')

            # Count each answer once. .get(..., 0) covers a subject who never
            # gave one of the answers; direct indexing raised KeyError there.
            response_counts = clean_log['Response'].value_counts()
            correct = response_counts.get(ff_right, 0)
            total_known = correct + response_counts.get(ff_wrong_but_familiar, 0)

            # No known faces at all: the score is undefined rather than a division by zero.
            ff_percent = (correct/total_known)*100 if total_known else np.nan

            ff_scores_df.loc[subj, 'FF'] = ff_percent
            ff_scores_df.loc[subj, 'FF_score_perc'] = ff_percent

raw_scores_df = raw_scores_df.join(ff_scores_df[['FF']], how='outer')
perc_scores_df = perc_scores_df.join(ff_scores_df[['FF_score_perc']], how='outer')

## Mind in the Eyes

In [None]:
# Mind in the Eyes: raw and percent-correct score per subject.
answer_df = pd.read_csv(dir_answer + 'MOE_answers.csv', names=['Target', 'Answer'])
MOE_scores_df = pd.DataFrame(columns=['MOE_score_raw', 'MOE_score_perc'])

for file in os.listdir(dir_logs + task_logs['MOE']):
    # Skip anything that is not a MOE log or belongs to an excluded subject.
    if 'MOE' not in file:
        continue
    subj = re.search('DPA_F[0-9]{3}', file).group(0)
    if subj in excluded_subjs:
        continue

    clean_log = cleanup_logs(dir_logs + task_logs['MOE'] + '/' + file, ['Test.*', 'Final.*'])
    # The first row is a practice trial, so it is dropped before naming the columns.
    clean_log = clean_log.iloc[1:].rename({0: 'Stim', 1: 'Response'}, axis=1)

    # Strip the log prefixes from the stimulus and the response.
    clean_log['Stim'] = clean_log['Stim'].str.replace('Test: stims_MOE/', '')
    clean_log['Response'] = clean_log['Response'].str.replace('Final button press: ', '')

    # The answer key is joined on the row index, not on the stimulus.
    subj_df = clean_log.join(answer_df)
    subj_df['Correct'] = (subj_df['Response'] == subj_df['Answer'])

    n_correct = subj_df['Correct'].sum()
    MOE_scores_df.loc[subj, 'MIE'] = n_correct
    MOE_scores_df.loc[subj, 'MOE_score_perc'] = (n_correct/len(subj_df['Correct']))*100

raw_scores_df = raw_scores_df.join(MOE_scores_df[['MIE']], how='outer')
perc_scores_df = perc_scores_df.join(MOE_scores_df[['MOE_score_perc']], how='outer')

## Emotion Hexagon

The log for DPA_F047 is incomplete (not every trial was recorded), but this subject is excluded anyway.

In [None]:
# Emotion Hexagon: match each response to the answer key by stimulus name,
# drop stimuli without an answer, and store raw / percent correct per subject.
answer_df = pd.read_csv(dir_answer + 'EH_answers.csv', names=['Stim', 'Emotion1', 'Emotion2', 'Answer'])
EH_scores_df = pd.DataFrame(columns=['EH_score_raw', 'EH_score_perc'])

# loop thru log files
for file in os.listdir(dir_logs + task_logs['EH']):
    if 'EH' in file:
        subj = re.search('DPA_F[0-9]{3}', file).group(0)
        if subj not in excluded_subjs:
            clean_log = cleanup_logs(dir_logs + task_logs['EH'] + '/' + file, ['Stim.*', 'Final.*'])
            clean_log.rename({0: 'Stim', 1: 'Response'}, axis=1, inplace=True)

            # Replace strings. regex=False makes every replacement literal; under
            # regex matching the '.' in '.jpg' would match any character, and the
            # default of the regex flag differs between pandas versions.
            clean_log['Stim'] = clean_log['Stim'].str.replace('Stim: stims/', '', regex=False)
            clean_log['Stim'] = clean_log['Stim'].str.replace('.jpg', '', regex=False)
            clean_log['Response'] = clean_log['Response'].str.replace('Final button press: ', '', regex=False)
            clean_log['Response'] = clean_log['Response'].str.lower()

            # merge clean log and answer key on stim
            subj_df = clean_log.merge(answer_df, how='left', left_on='Stim', right_on='Stim')

            # get rid of 50/50 rows - nan
            subj_df = subj_df[~subj_df['Answer'].isna()]

    #         #comparing diff morphs
    #         morphs_90_10 = subj_df[subj_df['Emotion1'].str.contains('90') | subj_df['Emotion1'].str.contains('10')]
    #         morphs_70_30 = subj_df[subj_df['Emotion1'].str.contains('30') | subj_df['Emotion1'].str.contains('70')]
    #         morphs_90_10['Correct'] = (morphs_90_10['Response'] == morphs_90_10['Answer'])
    #         morphs_70_30['Correct'] = (morphs_70_30['Response'] == morphs_70_30['Answer'])
    #         EH_90_10.loc[subj] = morphs_90_10['Correct'].sum()/len(morphs_90_10['Correct'])
    #         EH_70_30.loc[subj] = morphs_70_30['Correct'].sum()/len(morphs_70_30['Correct'])

            subj_df['Correct'] = (subj_df['Response'] == subj_df['Answer'])
            EH_scores_df.loc[subj, 'EH'] = subj_df['Correct'].sum()
            EH_scores_df.loc[subj, 'EH_score_perc'] = (subj_df['Correct'].sum()/len(subj_df['Correct'])) * 100

raw_scores_df = raw_scores_df.join(EH_scores_df[['EH']], how='outer')
perc_scores_df = perc_scores_df.join(EH_scores_df[['EH_score_perc']], how='outer')

## CCMT

In [None]:
# CCMT (log folder key 'CFMTcars'): raw and percent-correct score per subject.
answer_df = pd.read_csv(dir_answer + 'CCMT_answers.csv', names=['prompt'])
CCMT_scores_df = pd.DataFrame(columns=['CCMT_score_raw', 'CCMT_score_perc'])

for file in os.listdir(dir_logs + task_logs['CFMTcars']):
    # Skip anything that is not a CFMTcars log or belongs to an excluded subject.
    if 'CFMTcars' not in file:
        continue
    subj = re.search('DPA_F[0-9]{3}', file).group(0)
    if subj in excluded_subjs:
        continue

    clean_log = cleanup_logs(dir_logs + task_logs['CFMTcars'] + '/' + file, ['Prompt.*', 'Final.*'])
    # The first 3 rows are practice trials, so they are dropped before naming the columns.
    clean_log = clean_log.iloc[3:].rename({0: 'Stim', 1: 'Response'}, axis=1)

    # Strip the log prefixes from the response and the prompt.
    clean_log['Response'] = clean_log['Response'].str.replace('Final key press: ', '')
    clean_log['Stim'] = clean_log['Stim'].str.replace('Prompt: ', '')

    # Here the join key is the cleaned 'Stim' value, matched against the answer
    # key's index. NOTE(review): the other tasks join on row position instead;
    # confirm the two index types line up.
    subj_df = clean_log.set_index('Stim').join(answer_df)
    subj_df['Correct'] = (subj_df['Response'].astype(int) == subj_df['prompt']).astype(int)

    n_correct = subj_df['Correct'].sum()
    CCMT_scores_df.loc[subj, 'CCMT'] = n_correct
    CCMT_scores_df.loc[subj, 'CCMT_score_perc'] = (n_correct/len(subj_df['Correct']))*100

raw_scores_df = raw_scores_df.join(CCMT_scores_df[['CCMT']], how='outer')
perc_scores_df = perc_scores_df.join(CCMT_scores_df[['CCMT_score_perc']], how='outer')

## Gaze Direction

In [None]:
# Gaze Direction: match each response to the answer key by stimulus name and
# store raw / percent correct per subject.
answer_df = pd.read_csv(dir_answer + 'GD_answers.csv', names=['Stim', 'Answer'])
GD_scores_df = pd.DataFrame(columns=['GD_score_raw', 'GD_score_perc'])

# loop thru log files
for file in os.listdir(dir_logs + task_logs['GD']):
    if 'GD' in file:
        subj = re.search('DPA_F[0-9]{3}', file).group(0)

        if subj not in excluded_subjs:
            clean_log = cleanup_logs(dir_logs + task_logs['GD'] + '/' + file, ['Stim.*', 'Final.*'])
            clean_log.rename({0: 'Stim', 1: 'Response'}, axis=1, inplace=True)

            # first 3 rows are practice so discard
            clean_log = clean_log.iloc[3:]

            # Replace everything up to the last '/'. The pattern is a regular
            # expression, so regex=True is passed explicitly: pandas >= 2.0
            # treats the pattern as a literal string by default.
            clean_log['Stim'] = clean_log['Stim'].str.replace(r'.*/', '', regex=True)
            # The response prefix is plain text, so it is replaced literally.
            clean_log['Response'] = clean_log['Response'].str.replace(
                'Final button press: ', '', regex=False).str.lower()
            # NOTE(review): the row at position 84 is always dropped and this raises
            # IndexError for logs with fewer than 85 rows; the reason is not visible here.
            clean_log.drop(clean_log.index[84], inplace=True)

            # merge clean log and answer key on stim
            subj_df = clean_log.merge(answer_df, how='left', left_on='Stim', right_on='Stim')
            subj_df['Correct'] = (subj_df['Response'] == subj_df['Answer']).astype(int)

            GD_scores_df.loc[subj, 'GD'] = subj_df['Correct'].sum()
            GD_scores_df.loc[subj, 'GD_score_perc'] = (subj_df['Correct'].sum()/len(subj_df['Correct']))*100

raw_scores_df = raw_scores_df.join(GD_scores_df[['GD']], how='outer')
perc_scores_df = perc_scores_df.join(GD_scores_df[['GD_score_perc']], how='outer')

NameError: name 'dir_answer' is not defined

## Implicit Association Task

In [32]:
# Load one IAT log and keep the CRITICAL-level messages of blocks 3, 4, 6 and 7.
log_df = pd.read_csv('iat_DPA_F020_black_1st.log', delimiter='\t', names=['time', 'level', 'msg'])
# drop() returns a new frame, so the filtered slice is not modified in place.
data_df = log_df[log_df['level'] == 'CRITICAL '].drop(columns=['level'])

# The block counter goes up on every message that contains 'Instructions'.
block = 0
# Rows are collected in a list and the frame is built once; DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
score_rows = []
for index, row in data_df.iterrows():
    if 'Instructions' in row["msg"]:
        block += 1
    if block in [3, 4, 6, 7]:
        score_rows.append({'time': row['time'], 'msg': row['msg']})
score_blocks_df = pd.DataFrame(score_rows, columns=['time', 'msg'])

# `test` is an alias of the same frame; a later cell reads it.
test = score_blocks_df
score_blocks_df

Unnamed: 0,time,msg
0,100.8715,***Block 3 Instructions***
1,106.9424,***Block 3 Experiment***
2,106.9424,Stim: bf23_nc.jpg
3,107.8777,Final keypress: e
4,108.1434,Stim: Terrible
...,...,...
243,254.1990,Final keypress: e
244,254.4645,Stim: wm1_nc.jpg
245,255.0662,Final keypress: e
246,255.3318,Stim: wf6_nc.jpg


In [50]:
# Row positions of the '***Block 7 Instructions***' message (the result is not stored).
test.index[test['msg']=='***Block 7 Instructions***'].tolist()
# Positional slices of score_blocks_df, one per scored block.
# NOTE(review): the offsets are hard-coded and look specific to this log;
# the two-row gaps between slices presumably skip each block's
# Instructions/Experiment marker rows - confirm before reusing on another file.
block_3 = score_blocks_df.iloc[2:42]
block_4 = score_blocks_df.iloc[44:124]
block_6 = score_blocks_df.iloc[126:166]
block_7 = score_blocks_df.iloc[168:]

def array_maker1(length):
    """Return `length` pair labels: 1, 1, 2, 2, 3, 3, ...

    Consecutive positions share a label, so rows 0-1 get 1, rows 2-3 get 2,
    and so on. The result always has exactly `length` entries; for an odd
    `length` the last label appears once, so the list can still be assigned
    to a frame of that length (the old loop returned `length + 1` labels).
    """
    return [position // 2 + 1 for position in range(length)]

array_maker1(6)

[1, 1, 2, 2, 3, 3]

In [69]:
def ammd(series):
    """Return the spread of `series`: its largest value minus its smallest."""
    highest = max(series)
    lowest = min(series)
    return highest - lowest

def retrieve_stimu(series):
    """Return the first element of `series` by position."""
    first_value = series.iloc[0]
    return first_value

# Give every two consecutive rows of block 3 the same label and index by it.
block_3_ind = array_maker1(len(block_3))
block_3['ind'] = block_3_ind
block_3.index = block_3['ind']
# First row of each pair (its time and message).
block_3_stimu = block_3[['time', 'msg']].groupby(['ind']).agg(retrieve_stimu)
# Time span of each pair. Only 'time' is aggregated: max - min is undefined
# for the string 'msg' column, and pandas >= 2.0 raises instead of silently
# dropping that column.
block_3_time = block_3[['time', 'msg']].groupby(['ind'])['time'].agg(ammd)
block_3_stimu['time_diff'] = block_3_time
b3_time_diff = block_3_stimu[['msg', 'time_diff']]
# Pairs with a time span below 0.3 or above 10.
b3_time_diff.index[b3_time_diff['time_diff'] < 0.3]
b3_time_diff.index[b3_time_diff['time_diff'] > 10]

Int64Index([], dtype='int64', name='ind')

In [68]:
# Give every two consecutive rows of block 4 the same label and index by it.
block_4_ind = array_maker1(len(block_4))
block_4['ind'] = block_4_ind
block_4.index = block_4['ind']
# First row of each pair (its time and message).
block_4_stimu = block_4[['time', 'msg']].groupby(['ind']).agg(retrieve_stimu)
# Time span of each pair. Only 'time' is aggregated: max - min is undefined
# for the string 'msg' column, and pandas >= 2.0 raises instead of silently
# dropping that column.
block_4_time = block_4[['time', 'msg']].groupby(['ind'])['time'].agg(ammd)
block_4_stimu['time_diff'] = block_4_time
b4_time_diff = block_4_stimu[['msg', 'time_diff']]
# Pairs with a time span below 0.3 or above 10.
b4_time_diff.index[b4_time_diff['time_diff'] < 0.3]
b4_time_diff.index[b4_time_diff['time_diff'] > 10]

Int64Index([], dtype='int64', name='ind')

In [67]:
# Give every two consecutive rows of block 6 the same label and index by it.
block_6_ind = array_maker1(len(block_6))
block_6['ind'] = block_6_ind
block_6.index = block_6['ind']
# First row of each pair (its time and message).
block_6_stimu = block_6[['time', 'msg']].groupby(['ind']).agg(retrieve_stimu)
# Time span of each pair. Only 'time' is aggregated: max - min is undefined
# for the string 'msg' column, and pandas >= 2.0 raises instead of silently
# dropping that column.
block_6_time = block_6[['time', 'msg']].groupby(['ind'])['time'].agg(ammd)
block_6_stimu['time_diff'] = block_6_time
b6_time_diff = block_6_stimu[['msg', 'time_diff']]
# Pairs with a time span below 0.3 or above 10.
b6_time_diff.index[b6_time_diff['time_diff'] < 0.3]
b6_time_diff.index[b6_time_diff['time_diff'] > 10]

Int64Index([], dtype='int64', name='ind')

In [66]:
# Give every two consecutive rows of block 7 the same label and index by it.
block_7_ind = array_maker1(len(block_7))
block_7['ind'] = block_7_ind
block_7.index = block_7['ind']
# First row of each pair (its time and message).
block_7_stimu = block_7[['time', 'msg']].groupby(['ind']).agg(retrieve_stimu)
# Time span of each pair. Only 'time' is aggregated: max - min is undefined
# for the string 'msg' column, and pandas >= 2.0 raises instead of silently
# dropping that column.
block_7_time = block_7[['time', 'msg']].groupby(['ind'])['time'].agg(ammd)
block_7_stimu['time_diff'] = block_7_time
b7_time_diff = block_7_stimu[['msg', 'time_diff']]
# Pairs with a time span below 0.3 or above 10.
b7_time_diff.index[b7_time_diff['time_diff'] < 0.3]
b7_time_diff.index[b7_time_diff['time_diff'] > 10]

Int64Index([], dtype='int64', name='ind')

In [70]:
# Bare expressions: in a notebook cell only the last one (b7_time_diff) is displayed.
b3_time_diff
b4_time_diff
b6_time_diff
b7_time_diff

Unnamed: 0_level_0,msg,time_diff
ind,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Stim: Lovely,0.7685
2,Stim: Horrible,0.7351
3,Stim: bf14_nc.jpg,0.6016
4,Stim: Painful,0.7351
5,Stim: Glorious,0.7852
6,Stim: Superb,0.9686
7,Stim: bm14_nc.jpg,0.5183
8,Stim: wf2_nc.jpg,0.619
9,Stim: Humiliate,0.635
10,Stim: bm56_nc.jpg,0.4849


## Online

In [None]:
# Read the first worksheet of the "Subject Log" Google Sheet into df_subj_qual.
# NOTE(review): `gspread` and `ServiceAccountCredentials` are not imported in
# any visible cell - confirm they are imported before this cell runs.
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']

credentials = ServiceAccountCredentials.from_json_keyfile_name(
         '../online/DPAProject-083039688e6a.json', scope) # path to your service-account json key file

gc = gspread.authorize(credentials)
wks = gc.open("Subject Log").sheet1

# The first returned row is used as the header; the remaining rows are the data.
data = wks.get_all_values()
headers = data.pop(0)

df_subj_qual = pd.DataFrame(data, columns=headers)

In [None]:
# Read the second worksheet (index 1) of "Subject Log" into df_online.
wks = gc.open("Subject Log").worksheets()[1]

# The first returned row is used as the header; the remaining rows are the data.
data = wks.get_all_values()
headers = data.pop(0)

df_online = pd.DataFrame(data, columns=headers)

In [None]:
# Subject IDs whose 'Date Finished' cell is the empty string.
# NOTE(review): confirm that empty (rather than non-empty) is the intended filter.
all_subjs = df_subj_qual[df_subj_qual['Date Finished'] == ""]["Subject ID"]
# Rows of the online sheet that belong to those subjects.
all_subjs_df = df_online[df_online['Subject ID'].isin(all_subjs)]

In [None]:
# Add the questionnaire scores to the raw task scores, turn the '-' and ''
# placeholders into NaN, cast everything to float and shorten the column names.
questionnaire_cols = ['AQ Score', 'EQ Score', 'STAI Y1', 'STAI Y2', 'CFMT - Australian', 'PI 20']
short_names = {
    'AQ Score': 'AQ',
    'EQ Score': 'EQ',
    'STAI Y1': 'STAI_Y1',
    'STAI Y2': 'STAI_Y2',
    'CFMT - Australian': 'CFMT_aus',
    'PI 20': 'PI20',
}
raw_scores_df = (
    raw_scores_df
    .join(all_subjs_df.set_index('Subject ID')[questionnaire_cols])
    .replace('-', np.nan)
    .replace('', np.nan)
    .astype(float)
    .rename(columns=short_names)
)

## Export

In [None]:
# Write both score tables to CSV. The output folder is created first so the
# export does not fail when 'results' does not exist yet.
os.makedirs('results', exist_ok=True)
raw_scores_df.to_csv('results/raw_scores_32220.csv')
perc_scores_df.to_csv('results/perc_scores_32220.csv')

# Analysis

## Factor Analysis

In [None]:
from factor_analyzer import FactorAnalyzer
# Factor-analysis input: every column up to and including 'GD', plus 'CFMT_aus'.
x_factor = raw_scores_df.loc[:, :'GD'].join(raw_scores_df['CFMT_aus'])
# Unrotated fit; the next cell reads its eigenvalues.
# NOTE(review): x_factor may contain NaN for subjects missing a task - confirm the fit handles that.
fa = FactorAnalyzer(rotation=None)
fa.fit(x_factor)

ModuleNotFoundError: No module named 'factor_analyzer'

In [None]:
# Scree plot: eigenvalues against their position, as points joined by a line.
eig, v = fa.get_eigenvalues()
eig_positions = range(0, len(eig))
plt.scatter(eig_positions, eig)
plt.plot(eig_positions, eig)

In [None]:
# Refit with a promax rotation and 4 factors; this replaces the unrotated `fa`.
fa = FactorAnalyzer(rotation='promax', n_factors =4)
fa.fit(x_factor)

In [None]:
# Loadings table: one row per input column of x_factor, one column per factor.
factor_df = pd.DataFrame(index = x_factor.columns, columns = ['factor1', 'factor2', 'factor3', 'factor4'], data = fa.loadings_)
# Optional row ordering for display, currently disabled:
# factor_df.reindex(['CFMT', 'CFMT_aus', 'BFMT', 'FF', 'CCMT', 'BET', 'EH', 'MIE', 'Films', 'GD'])

In [None]:
def label_point_from_df(x_label, y_label, df, ax):
    """Label every row of `df` on `ax` with the row's index value.

    Each label is placed at the row's (`x_label`, `y_label`) coordinates;
    the created text objects are then passed to `adjust_text`.
    """
    texts = [
        ax.text(point[x_label], point[y_label], point.name)
        for _, point in df.iterrows()
    ]
    adjust_text(texts)

In [None]:
from adjustText import adjust_text


# Scatter of the loadings on factor1 vs factor2, each point labelled with its row name.
plt.rcParams.update({'font.size': 13})
ax = sns.scatterplot(x= 'factor1', y='factor2', data = factor_df)
label_point_from_df('factor1', 'factor2', factor_df, ax)

In [None]:
# Scatter of the loadings on factor2 vs factor3, each point labelled with its row name.
plt.rcParams.update({'font.size': 13})
ax = sns.scatterplot(x= 'factor2', y='factor3', data = factor_df)
label_point_from_df('factor2', 'factor3', factor_df, ax)

In [None]:
# Scatter of the loadings on factor3 vs factor4, each point labelled with its row name.
plt.rcParams.update({'font.size': 13})
ax = sns.scatterplot(x= 'factor3', y='factor4', data = factor_df)
label_point_from_df('factor3', 'factor4', factor_df, ax)

In [None]:
# Output of fa.transform(x_factor): one row per row of x_factor, one column per factor.
transform_df = pd.DataFrame(index = x_factor.index, columns = ['factor1', 'factor2', 'factor3', 'factor4'], data = fa.transform(x_factor))
# Add the questionnaire columns so they can be correlated with the factors.
transform_df = transform_df.join(raw_scores_df[['AQ', 'EQ', 'STAI_Y1', 'STAI_Y2']])

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of transform_df.
plt.figure(figsize = (15, 10))
sns.heatmap(transform_df.corr(), annot=True)

In [None]:
# Permutation p-value for every ordered pair of transform_df columns: the share of
# 1000 permuted correlations whose absolute value exceeds the observed one.
# p_values = calculate_pvalues(transform_df.astype(float)).values

transform_df = transform_df.astype(float)
perm_pvalue = []
for col_1 in transform_df.columns:
    row_pvalue = []
    for col_2 in transform_df.columns:
        print(col_1, col_2)
        perm_df = permutation_test(transform_df, col_1, col_2, 1000)
        # observed correlation between the two columns
        corr_par = transform_df[[col_1, col_2]].corr().iloc[0, 1]
        more_extreme = np.abs(perm_df['perm_corr']) > np.abs(corr_par)
        row_pvalue.append(np.sum(more_extreme)/1000)
    perm_pvalue.append(row_pvalue)
        

In [None]:
# Square table of the permutation p-values, labelled by transform_df's columns on both axes.
pvalues_df = pd.DataFrame(perm_pvalue, columns = transform_df.columns, index = transform_df.columns)

In [None]:
# Correlation heatmap of transform_df with a black outline around every cell
# whose permutation p-value is below 0.05.
# Rectangle is used below but is not imported in any visible cell.
from matplotlib.patches import Rectangle

plt.figure(figsize=(15, 15))
p_values_df = pvalues_df.loc[transform_df.columns, transform_df.columns]
labels = transform_df.columns

# mask = np.triu(np.ones_like(transform_df.astype(float).corr(), dtype=np.bool))
# ax = sns.heatmap(transform_df.astype(float).corr(), mask=mask, annot = True, cmap = 'RdBu', xticklabels=labels, yticklabels=labels)
ax = sns.heatmap(transform_df.astype(float).corr(), annot=True, cmap='RdBu_r', xticklabels=labels, yticklabels=labels)
for y_i, row in enumerate(p_values_df.values):
    for x_i, cell in enumerate(row):
        if cell < 0.05:
            # A unit square anchored at (column position, row position).
            ax.add_patch(Rectangle((x_i, y_i), 1, 1, fill=False, edgecolor='black', lw=3))
plt.show()

## PCA

In [None]:
from sklearn.decomposition import PCA
# Work on a copy: a later cell adds pc1..pc3 columns to pca_df, and with a
# plain alias those columns would also appear in x_factor (and in D when this
# cell is re-run).
pca_df = x_factor.copy()
D = pca_df.values
pca_n = D.shape[0]
# Center each column on its mean and scale by sqrt(number of rows).
pca_means = np.mean(D, axis=0)
X = (D - pca_means)/np.sqrt(pca_n)

pca = PCA()
pcs = pca.fit_transform(X)

In [None]:
# Store the first three columns of pcs as pc1, pc2 and pc3.
for pc_idx, pc_name in enumerate(['pc1', 'pc2', 'pc3']):
    pca_df[pc_name] = pcs[:, pc_idx]

In [None]:
# Components scaled by the square root of their explained variance, one row per
# input column (the last three columns of pca_df are left out of the index).
pc_names = ['pc1', 'pc2', 'pc3']
features_df = pd.DataFrame(columns=pc_names, index=pca_df.columns[:-3])
t_features = pca.components_.T * np.sqrt(pca.explained_variance_)
for pc_idx, pc_name in enumerate(pc_names):
    features_df[pc_name] = t_features[:, pc_idx]




In [None]:
from adjustText import adjust_text


# Scatter of features_df on pc1 vs pc2, each point labelled with its row name.
plt.rcParams.update({'font.size': 13})
ax = sns.scatterplot(x= 'pc1', y='pc2', data = features_df)
label_point_from_df('pc1', 'pc2', features_df, ax)

In [None]:
# Scatter of features_df on pc2 vs pc3, each point labelled with its row name.
plt.rcParams.update({'font.size': 13})
ax = sns.scatterplot(x= 'pc2', y='pc3', data = features_df)
label_point_from_df('pc2', 'pc3', features_df, ax)

In [None]:
# Correlations, rounded to 2 decimals, among the columns from 'pc1' onward after joining the questionnaire scores.
pca_df.join(raw_scores_df[['AQ', 'EQ', 'STAI_Y1', 'STAI_Y2']]).loc[:, 'pc1':].corr().round(2)

## Correlations

In [None]:
from scipy.stats import pearsonr
import pandas as pd

def calculate_pvalues(df):
    """Return the matrix of Pearson-correlation p-values for `df`.

    Rows with any missing value are dropped first, then only numeric
    (and boolean) columns are kept. The result is a square float DataFrame
    indexed and labelled by those columns; each entry is the two-sided
    p-value from `scipy.stats.pearsonr`, rounded to 4 decimals.
    """
    numeric = df.dropna().select_dtypes(include=['number', 'bool'])
    cols = numeric.columns
    pvalues = pd.DataFrame(index=cols, columns=cols, dtype=float)
    for r in cols:
        for c in cols:
            # .loc writes straight into the frame; the chained form
            # pvalues[r][c] = ... can end up assigning to a temporary copy.
            pvalues.loc[c, r] = round(pearsonr(numeric[r], numeric[c])[1], 4)
    return pvalues

In [None]:
from sklearn.utils import shuffle


def permutation_test(df, y, x, times_shuffle):
    """Build a null distribution of correlations by permuting column `y`.

    On each of `times_shuffle` rounds the values of `df[y]` are shuffled,
    paired by position with `df[x]`, and their correlation is recorded.

    Returns a DataFrame with one float column, 'perm_corr', holding one
    correlation per round (the column exists even when `times_shuffle` is 0).

    The shuffles use a private RandomState seeded with 1000, so results are
    reproducible without reseeding NumPy's global generator on every call.
    """
    rng = np.random.RandomState(1000)
    x_df = df[[x]].reset_index(drop=True)
    perm_corrs = []
    for _ in range(times_shuffle):
        shuffle_df = x_df.copy()
        # Both sides get a fresh 0..n-1 index so values are paired by position.
        shuffle_df[y] = shuffle(df[y], random_state=rng).reset_index(drop=True)
        perm_corrs.append(shuffle_df[[y, x]].corr().iloc[0, 1])
    # One frame built at the end instead of growing it row by row.
    return pd.DataFrame({'perm_corr': perm_corrs}, dtype=float)

In [None]:
# Permutation p-value for every ordered pair of raw_scores_df columns: the share of
# 1000 permuted correlations whose absolute value exceeds the observed one.
# p_values = calculate_pvalues(raw_scores_df.astype(float)).values

raw_scores_df = raw_scores_df.astype(float)
perm_pvalue = []
for col_1 in raw_scores_df.columns:
    row_pvalue = []
    for col_2 in raw_scores_df.columns:
        print(col_1, col_2)
        perm_df = permutation_test(raw_scores_df, col_1, col_2, 1000)
        # observed correlation between the two columns
        corr_par = raw_scores_df[[col_1, col_2]].corr().iloc[0, 1]
        more_extreme = np.abs(perm_df['perm_corr']) > np.abs(corr_par)
        row_pvalue.append(np.sum(more_extreme)/1000)
    perm_pvalue.append(row_pvalue)
        

In [None]:
# Square table of the permutation p-values, labelled by raw_scores_df's columns on both axes.
pvalues_df = pd.DataFrame(perm_pvalue, columns = raw_scores_df.columns, index = raw_scores_df.columns)

In [None]:
# Correlation heatmap of the raw scores with a black outline around every cell
# whose permutation p-value is below 0.05.
# Rectangle is used below but is not imported in any visible cell.
from matplotlib.patches import Rectangle

plt.figure(figsize=(15, 15))
# Fix the column order; `labels` below lists display names in the same order.
raw_scores_df = raw_scores_df[['CFMT', 'CFMT_aus', 'BFMT', 'FF', 'CCMT', 'PI20', 'BET', 'EH', 'MIE', 'Films', 'GD', 'AQ', 'EQ', 'STAI_Y1', 'STAI_Y2']]
p_values_df = pvalues_df.loc[raw_scores_df.columns, raw_scores_df.columns]
labels = ['CFMT', 'CFMT Australian', 'Brads FMT', 'Famous Faces', 'CCMT', 'PI20', 'Bhismas Emotion Task', 'Emotion Hexagon', 'Mind in the Eyes', 'Films Task', 'Gaze Direction', 'AQ', 'EQ', 'STAI_Y1', 'STAI_Y2']

# mask = np.triu(np.ones_like(raw_scores_df.astype(float).corr(), dtype=np.bool))
# ax = sns.heatmap(raw_scores_df.astype(float).corr(), mask=mask, annot = True, cmap = 'RdBu', xticklabels=labels, yticklabels=labels)
ax = sns.heatmap(raw_scores_df.astype(float).corr(), annot=True, cmap='RdBu_r', xticklabels=labels, yticklabels=labels)
for y_i, row in enumerate(p_values_df.values):
    for x_i, cell in enumerate(row):
        if cell < 0.05:
            # A unit square anchored at (column position, row position).
            ax.add_patch(Rectangle((x_i, y_i), 1, 1, fill=False, edgecolor='black', lw=3))
plt.show()

In [None]:
# Regression plot of AQ against CFMT_aus, plus the pearsonr result for the same two columns.
# NOTE(review): pearsonr receives the columns without dropping NaN - confirm both are complete.
sns.regplot(x = 'AQ', y = 'CFMT_aus', data = raw_scores_df.astype(float))
print (pearsonr(raw_scores_df['AQ'].astype(float), raw_scores_df['CFMT_aus'].astype(float)))

In [None]:
# Regression plot of STAI_Y2 against CFMT_aus, plus the pearsonr result for
# the same pair. The statistic previously used 'CFMT', which did not match
# the plotted column.
# NOTE(review): pearsonr receives the columns without dropping NaN - confirm both are complete.
sns.regplot(x='STAI_Y2', y='CFMT_aus', data=raw_scores_df.astype(float))
print(pearsonr(raw_scores_df['STAI_Y2'].astype(float), raw_scores_df['CFMT_aus'].astype(float)))

In [None]:
# p-value matrix from calculate_pvalues, shown as an unlabelled DataFrame.
p_values = calculate_pvalues(raw_scores_df.astype(float)).values
pd.DataFrame(data= p_values)

## Histograms

In [None]:
# One 10-bin histogram per column of raw_scores_df on a 3 x 5 grid of axes.
fig, ax = plt.subplots(3, 5)
fig.set_size_inches(30, 20)
ax = ax.ravel()
for i, col in enumerate(raw_scores_df.columns):
    panel = ax[i]
    panel.hist(raw_scores_df[col], bins=10)
    panel.set_title(col)