# Summarize Performance per subject
---------------------
Author:  G.FragaGonzalez for SINON study

Description: read individual Gorilla outputs, preprocesss, summarize (Correctness per level, block , noise, etc) and save summary per file

---------------------

## Gather individual stats
### Import libraries and define paths
Use location of this file to define relative paths

In [99]:
from datetime import date
today = date.today()
print("Today's date:", today)
import sys
import os
import glob
import re
import pandas as pd
import numpy as np

# filepaths - Use current script path as reference 
thisScriptDir = os.path.dirname(os.path.abspath("__file__"))
baseDir = thisScriptDir[:thisScriptDir.find("Scripts")]
dirinput = os.path.join(baseDir + 'Data', 'preprocessed','pilot_2','data_exp_116083-v2')
diroutput = os.path.join(baseDir + 'Analysis', 'pilot_2')

Today's date: 2023-11-03


### Find files 

In [120]:
validfiles = [
    files for files in glob.glob(dirinput + '/**/**/*.csv', recursive=True) if 
    'gathered/Concat' not in files and re.search(r'\d+\.csv', files) and 
    (os.path.basename(files).startswith('2FC') or os.path.basename(files).startswith('PM'))
]

print('Found ', len(validfiles), ' valid files')

Found  87  valid files


### Preprocess (loop per file)

In [122]:
df_list = []
for fileinput in validfiles: 
    # read data
    dat = pd.read_csv(fileinput)      
    print('Read table size: ', dat.shape )

    #Rename variable to use as subject id
    dat.rename(columns={'Participant Private ID':'SubjectID'}, inplace=True)
    dat.SubjectID = pd.Series(dat.SubjectID,dtype="object")

    # Keep only rows with trial response(always the one after 'audio play requested')
    idx_resp = dat.index[(dat.Response == 'AUDIO PLAY REQUESTED')] + 1    
    df = dat.iloc[idx_resp]
    df = df[df.display.str.contains('trial_')]

    # Replace Correct responses of 'miss' trials by NAs
    df.loc[df['Timed Out']==1,'Correct'] = 'miss'
    df.loc[df['Timed Out']==1,'Reaction Time'] = np.nan

    #% Find the presented audio based on the columns indicating list
    # First rename column with the list name (the exact column varies depending on  task, i.e., across csv files )
    cols2search = [i for i,val in enumerate(df.columns.str.contains('counterbalance*')) if val] 
    colWithList = [cols2search[i] for i,val in enumerate(cols2search) if any(df.iloc[:,val].astype(str).str.contains('list*'))]      
    df.insert(2,"STIMLIST",df.iloc[:,colWithList]) #Add conveniently named column with list info    

    # % create a variable 'audio' with the filename of presented audio
    audio=[]
    for row in range(len(df)):
        audio.append(df[df.iloc[row]['STIMLIST']].iloc[row])
    df.insert(len(df.columns),'AUDIO',audio)           

    # %% Extract trial type info  from the audiofilenames  
    df.insert(2,"TYPE",df['AUDIO'].str.split('norm').str[0].str.split('_').str[0])

    # RECODE Levels of degradation
    df.insert(2,"LV",df['AUDIO'].str.split('norm').str[1].str.replace('_','',regex=True).str.replace('.wav','',regex=True)) # use string split from filenames
    replace_map = {'-10db': 'L1','-7.5db': 'L2', '-5db': 'L3', '-2.5db': 'L4','0db': 'L5',\
                           '0.7p': 'L1','0.75p': 'L2', '0.8p': 'L3', '0.85p': 'L4','0.9p': 'L5'}
    df['LV'] = df['LV'].map(replace_map)

    # Some Renaming
    df.rename(columns={'Trial Number': 'trial'}, inplace=True)    
    df.rename(columns={'Task Name': 'task'}, inplace=True)    
    df.rename(columns={'Reaction Time': 'RT'}, inplace=True)    
    df.rename(columns={'Timed Out': 'Miss'}, inplace=True)    
    df.rename(columns={'Correct': 'Correctness'}, inplace=True)    
    df['task'] = df['task'].str.replace('SINON_task_','')

    # some  formatting        
    df['block'] = df['block'].astype('object')
    df['LV'] = df['LV'].astype('object')
    df['RT'] = df['RT'].astype('float64')
    df['Correctness'] = df['Correctness'].astype('str')
    df['Correctness'].replace({'1.0':'cor','0.0':'inc','miss':'miss'}, inplace=True)     

    # RECODE blocks for clarity in plots (1 and 2 value only)
    df['block'].replace({1:1,2.0:1,3.0:2,4.0:2}, inplace=True)     
     
    # Select columns to save 
    #--------------------------
    df = df.loc[:,['SubjectID','task','STIMLIST', 'set','block','trial', 'AUDIO', 'LV','TYPE','Correctness','RT']]
    #Reset index 
    df = df.reset_index(drop=True)
     
    # add to list
    df_list.append(df)

print('Created list with ' , len(df_list), ' preprocessed files')

Read table size:  (1170, 84)
Read table size:  (1175, 84)
Read table size:  (1171, 84)
Read table size:  (1180, 84)
Read table size:  (1177, 84)
Read table size:  (1170, 84)
Read table size:  (1170, 84)
Read table size:  (1173, 84)
Read table size:  (1171, 84)
Read table size:  (1170, 84)
Read table size:  (2350, 84)
Read table size:  (1170, 84)
Read table size:  (1174, 84)
Read table size:  (1170, 84)
Read table size:  (1938, 87)
Read table size:  (1938, 87)
Read table size:  (1938, 87)
Read table size:  (1947, 87)
Read table size:  (1943, 87)
Read table size:  (1939, 87)
Read table size:  (1957, 87)
Read table size:  (1938, 87)
Read table size:  (1938, 87)
Read table size:  (1938, 87)
Read table size:  (1944, 87)
Read table size:  (1938, 87)
Read table size:  (1945, 87)
Read table size:  (1938, 87)
Read table size:  (1939, 87)
Read table size:  (1170, 84)
Read table size:  (1175, 84)
Read table size:  (1171, 84)
Read table size:  (1180, 84)
Read table size:  (1177, 84)
Read table siz

#### Describe  

In [103]:
summary_list = []
for i, df in enumerate(df_list):    
     
    # Get number of trials per degradation level in a block (changes across tasks)
    nblocks = 2 
    nreps = len(df.trial.unique()) / (nblocks*len(df.LV.unique()) * len(df.TYPE.unique()))   
    counts = df['TYPE'].value_counts()

    # DESCRIPTIVE STATS per block, type and level (averaging trials)          
    #----------------------------------------------------------------
    vars2summarize = ['SubjectID','task', 'block', 'TYPE', 'LV','Correctness']

    # Accuracy summary 
    df.groupby(vars2summarize)['trial'].agg(['count']).reset_index()
    accu = df.groupby(vars2summarize)['trial'].agg(['count']).reset_index()
    accu['propTrials'] = round(accu['count']/nreps,ndigits=2)
 
    #Fix header (join by '-')
    rts = df.groupby(vars2summarize)[['RT']].agg(['mean', 'std']).reset_index()
    rts.columns  =  ['_'.join(i) if len(i[1]) else ''.join(i) for i in rts.columns.tolist() ]

    summary = pd.merge(accu, rts, on=vars2summarize)

    # % Expand with all combinations of the variables 
    unique_categories = [summary[col].unique() for col in vars2summarize]    
    multiindex = pd.MultiIndex.from_product(unique_categories, names=vars2summarize)

    # reindexing
    summary = (summary
                 .set_index(vars2summarize) 
                 .reindex(multiindex,fill_value= '')
                 .reset_index())


    summary['SubjectID'] = summary['SubjectID'].astype('object')
    summary['block'] = summary['block'].astype('object')

    
    # add to list
    summary_list.append(summary)
    
print('Created list with ', len(summary_list), ' summary tables')

Created list with  14  summary tables


### Concatenate and save 

In [124]:
concat_df = pd.concat(summary_list, ignore_index=True)
print('Concatenated all into a data frame of dimensions:', concat_df.shape)

# round numeric 
concat_df = concat_df.round(4)

# Save the concatenated DataFrame to a new CSV file
concat_df.to_csv(os.path.join(diroutput,'Gathered_summary_long.csv'), index=False)
print('saved in ', os.path.join(diroutput,'Gathered_summary_long.csv'))

Concatenated all into a data frame of dimensions: (800, 10)
saved in  V:\Projects\Spinco\SINON\Data\analysis\Gathered_summary_long.csv
