In [5]:
import os
import pandas as pd
import numpy as np
import pingouin as pg

# Extract behavioral data from log files

# Define paths
src_dir = r'E:\MeMoSLAP\data\logs_task'
output_file = r'E:\MeMoSLAP\data\behav_data.tsv'

# Define subject IDs
subject_nums = [0, 2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 15, 17, 18, 19, 20, 21, 23, 24]
subjects = [f'sub-20{num:02}' for num in subject_nums]

results = []

# Process each subject
for subject in subjects:
    for session in [0, 1, 2]:
        # Construct session directory and filename pattern
        if session == 0:
            session_dir = os.path.join(src_dir, f'{subject}_ses-base')
            pattern = f'{subject}_tDCS_TWMD_ses-baseline'
        else:
            session_dir = os.path.join(src_dir, f'{subject}_ses-task')
            pattern = f'{subject}_tDCS_TWMD_ses-{session:02d}'

        if not os.path.exists(session_dir):
            continue

        # List and filter log files
        log_files = [
            os.path.join(session_dir, fname)
            for fname in os.listdir(session_dir)
            if pattern in fname and fname.endswith('.tsv')
        ]

        valid_logs = []
        performance_scores = []

        # Filter logs by length (expecting 48 trials)
        for log_path in log_files:
            df = pd.read_csv(log_path, sep='\t')
            if len(df) != 48:
                continue
            valid_logs.append(log_path)
            if session == 0:
                performance_scores.append(df['Correct'].mean())

        print(f'{len(valid_logs)} valid logs found for {subject}, session {session}')

        # Select the best baseline run based on performance
        if session == 0 and performance_scores:
            best_idx = np.argmax(performance_scores)
            valid_logs = [valid_logs[best_idx]]

        # Process valid log files
        for run_idx, log_path in enumerate(valid_logs):
            df = pd.read_csv(log_path, sep='\t')
            percent_correct = df['Correct'].mean()
            run = 0 if session == 0 else run_idx + 1

            results.append({
                'subject': subject,
                'session': session,
                'run': run,
                'percent_correct': percent_correct
            })

# Save results to TSV
df_output = pd.DataFrame(results)
df_output.to_csv(output_file, sep='\t', index=False)

print(f"\nBehavioral data written to: {output_file}")


2 valid logs found for sub-2000, session 0
4 valid logs found for sub-2000, session 1
4 valid logs found for sub-2000, session 2
2 valid logs found for sub-2002, session 0
4 valid logs found for sub-2002, session 1
4 valid logs found for sub-2002, session 2
1 valid logs found for sub-2003, session 0
4 valid logs found for sub-2003, session 1
4 valid logs found for sub-2003, session 2
1 valid logs found for sub-2005, session 0
4 valid logs found for sub-2005, session 1
4 valid logs found for sub-2005, session 2
1 valid logs found for sub-2006, session 0
4 valid logs found for sub-2006, session 1
4 valid logs found for sub-2006, session 2
1 valid logs found for sub-2007, session 0
4 valid logs found for sub-2007, session 1
4 valid logs found for sub-2007, session 2
1 valid logs found for sub-2009, session 0
4 valid logs found for sub-2009, session 1
4 valid logs found for sub-2009, session 2
1 valid logs found for sub-2010, session 0
4 valid logs found for sub-2010, session 1
4 valid log

In [7]:
# Load data and get summary stats

data = pd.read_csv('E:/MeMoSLAP/data/behav_data.tsv', sep='\t')

In [13]:
subdata = data[data['subject'].isin(subjects) & data['session'].isin([1, 2])]
subdata.head()

Unnamed: 0,subject,session,run,percent_correct
1,sub-2000,1,1,0.541667
2,sub-2000,1,2,0.645833
3,sub-2000,1,3,0.729167
4,sub-2000,1,4,0.666667
5,sub-2000,2,1,0.541667


In [15]:
mean_over_runs = subdata.groupby(['subject', 'session'])['percent_correct'].mean().reset_index()

# Calculate the mean and SD for each session across subjects
mean_sd_per_session = mean_over_runs.groupby('session')['percent_correct'].agg(['mean', 'std']).reset_index()
print("Mean and SD for each session:")
print(round(mean_sd_per_session*100,2))

# Calculate the overall mean and SD across all subjects and sessions
overall_mean = mean_over_runs['percent_correct'].mean()
overall_sd = mean_over_runs['percent_correct'].std()
print(f"\nOverall: {round(overall_mean*100,2)} +- {round(overall_sd*100,2)}")

Mean and SD for each session:
   session   mean   std
0      100  61.21  8.84
1      200  62.42  8.35

Overall: 61.81 +- 8.5


In [22]:
# Control analysis: Performance by run / session

# 2x4 repeated measures ANOVA
anova_results = pg.rm_anova(
    dv='percent_correct',
    within=['session', 'run'],
    subject='subject',
    data=subdata,
    detailed=True
)

print(anova_results)

          Source        SS  ddof1  ddof2        MS         F     p-unc  \
0        session  0.005528      1     18  0.005528  0.411594  0.529247   
1            run  0.027972      3     54  0.009324  1.061054  0.373320   
2  session * run  0.002901      3     54  0.000967  0.193258  0.900545   

   p-GG-corr       ng2       eps  
0   0.529247  0.003047  1.000000  
1   0.368189  0.015228  0.870514  
2   0.840180  0.001601  0.717662  


In [19]:
# Control analyses: order of stimuli

nses = 1

# Initialize the performance arrays (4 columns for the 4 runs)
perf_first = np.full((len(subjects), 4), np.nan)
perf_second = np.full((len(subjects), 4), np.nan)
perf_before = np.full((len(subjects), 4), np.nan)
perf_after = np.full((len(subjects), 4), np.nan)

# Iterate through the subjects
for sc, s in enumerate(subjects):

    # Construct the log file path for the subject and session
    subject_folder = os.path.join(src_dir, f'{s}_ses-task')
    logfile_pattern = f'{s}_tDCS_TWMD_ses-{nses:02d}_run'

    # Find all the matching log files in the folder (4 runs expected)
    logfile = [f for f in os.listdir(subject_folder) if logfile_pattern in f and f.endswith('.tsv')]

    # Ensure there are exactly 4 runs (this is important for the correct indexing)
    if len(logfile) != 4:
        print(f"Warning: Expected 4 runs for subject {s}, but found {len(logfile)}.")
        continue  # Skip this subject if the number of runs is not 4

    # Iterate through the runs (assuming there are exactly 4 runs)
    for r, lf_name in enumerate(logfile):
        # Construct the full path of the log file
        lf_path = os.path.join(subject_folder, lf_name)

        # Read the TSV file into a pandas DataFrame
        sublog = pd.read_csv(lf_path, sep='\t')

        # Calculate the performance values based on the conditions
        perf_first[sc, r] = sublog[sublog['Target Stimulus'] == 1]['Correct'].mean()
        perf_second[sc, r] = sublog[sublog['Target Stimulus'] == 2]['Correct'].mean()
        perf_before[sc, r] = sublog[sublog['Stimulus 3'] == 'T']['Correct'].mean()
        perf_after[sc, r] = sublog[sublog['Stimulus 3'] == 'F']['Correct'].mean()



In [20]:
# Assuming these are your performance data
perf_before = perf_before.mean(axis=1)  # Replace with actual data
perf_after = perf_after.mean(axis=1)
perf_first = perf_first.mean(axis=1)
perf_second = perf_second.mean(axis=1)

# Create a DataFrame in long format
df = pd.DataFrame({
    'Subject': [i+1 for i in range(19)] * 4,  # Repeat subject 4 times for each condition
    'Time': ['Before', 'After', 'Before', 'After'] * 19,  # Time condition for each subject
    'Memory': ['First', 'First', 'Second', 'Second'] * 19,  # Memory condition for each subject
    'Performance': list(perf_before) + list(perf_after) + list(perf_first) + list(perf_second)
})

# Check if the data looks okay
print(df.head())

   Subject    Time  Memory  Performance
0        1  Before   First     0.708333
1        2   After   First     0.562500
2        3  Before  Second     0.656250
3        4   After  Second     0.614583
4        5  Before   First     0.437500


In [21]:
# Perform a repeated measures ANOVA
anova = pg.rm_anova(dv='Performance', within=['Time', 'Memory'], subject='Subject', data=df, detailed=True)

# Show the results
print(anova)

          Source        SS  ddof1  ddof2        MS         F     p-unc  \
0           Time  0.001462      1     18  0.001462  0.158875  0.694883   
1         Memory  0.014854      1     18  0.014854  1.647013  0.215643   
2  Time * Memory  0.004477      1     18  0.004477  2.245703  0.151319   

   p-GG-corr       ng2  eps  
0   0.694883  0.001576  1.0  
1   0.215643  0.015788  1.0  
2   0.151319  0.004812  1.0  
