# Solution to assignment

_By: Your Name_

_Date:_

## Import statements, function definitions and default variables

In [1]:
from os.path import join, basename
import string
from fddhs import grep  # grep for a string in a file
from fddhs import find  # find files matching a pattern

from fddhs import read_log_file, mean, median

# Directory containing the raw log files (relative to this notebook)
logfile_dir = '../src/logs'

# take a look at this CSV-file to determine its structure
subject_codes = join(logfile_dir, 'subj_codes.csv')

output_dir = 'assignment'  # did you remember to create it??
output_file = join(output_dir, 'solution_YOUR_NAME.csv')

# Number of leading characters of a log file's basename that make up the
# subject ID. The IDs shown in the results table at the end of this notebook
# (e.g. '0010_BQR') are 8 characters long. Defining N here, in the
# configuration cell, lets every later cell that slices `basename(...)[:N]`
# run on a fresh kernel instead of failing with a NameError.
N = 8

## Find all the log files, place into a list

In [2]:
# Wildcard pattern that selects the log files inside `logfile_dir`
log_pattern = '*.log'  # what pattern/wildcard to use?

# Collect every file matching the pattern; `find` comes from the fddhs module
all_logs = find(logfile_dir, log_pattern)

Take a look at the first file name: the _subject ID_ is the first N characters of the _basename_ of the file (i.e., the file name with the directory part removed).

In [None]:
# Inspect the first log file: full path, file name only, and subject ID
first_log = all_logs[0]
first_log_name = basename(first_log)  # strip the directory part of the path

print(first_log)
print(first_log_name)
print(first_log_name[:N])  # print the first N characters of the basename

## Loop over logs, writing out results table as you go

In [4]:
# Write one results table for all subjects: a header line followed by two
# lines per log file (one for the frequent, one for the rare stimuli).
delimiter = ','  # or whatever you like

# the opposite of 'split' is 'join', which has a slightly odd syntax
header = delimiter.join(['Subjid', 'Group', 'Cond', 'Mean RT', 'Median RT', 'Accuracy'])

# Open the output file in a `with`-block: the file is flushed and closed
# automatically when the block ends, even if reading one of the logs raises
# an error half-way through the loop (so no explicit close() is needed).
with open(output_file, 'wt') as outfile:
    # write out the header-line first
    outfile.write(header + '\n')  # remember to add the newline-character!

    # loop over all log files
    for log in all_logs:
        # N: number of characters in the subject ID
        # (must be defined before this cell is run)
        subj_ID = basename(log)[:N]

        # Is the subject a patient or a control?
        # first get the line that contains the subject code
        group_line = grep(subject_codes, subj_ID)
        # then we split the line on the delimiter (;) and take the second element;
        # strip() removes any surrounding whitespace/newline that would otherwise
        # end up inside the CSV field (a no-op if the line is already clean)
        group = group_line.split(';')[1].strip()

        # Now we can simply call our single-logfile function and get the results!
        (mean_rt_freq, median_rt_freq, accuracy_freq,
         mean_rt_rare, median_rt_rare, accuracy_rare) = read_log_file(log)

        # One output line per condition: first the frequent, then the rare stimuli
        results_by_cond = [
            ('Freq', (mean_rt_freq, median_rt_freq, accuracy_freq)),
            ('Rare', (mean_rt_rare, median_rt_rare, accuracy_rare)),
        ]
        for cond, values in results_by_cond:
            # NB: the values are numbers (floats); to write them into a text file,
            # we must first convert them into string-objects (with 2 decimal precision)
            values_str = ['{:.2f}'.format(value) for value in values]
            line_out = delimiter.join([subj_ID, group, cond] + values_str)
            outfile.write(line_out + '\n')

# Optional exercise 1: summary statistics

Can we reproduce the paper's findings?

## Install `pandas`

We'll use a Python-module called [pandas](https://pandas.pydata.org) for this, which we've forgotten to include in the `environment.yaml`-file! But fear not, `conda` is your friend.

* on Windows: open 'Anaconda Prompt' & execute: `activate fddhs`
* on Mac/Linux: open a Terminal app & execute: `source activate fddhs`
* execute the following command in the Prompt/Terminal:

`conda install pandas`

and answer 'y'.

___You'll also need to restart `jupyter lab` for the module to be found.___

In [5]:
import pandas as pd

In [15]:
# Load the results table written above into a DataFrame; the first line of
# the file becomes the column names. Yes, reading a csv-file _really_ can be
# this easy...
df = pd.read_csv(output_file, sep=delimiter)

In [16]:
# show the first 5 rows of the table
df.head(n=5)

Unnamed: 0,Subjid,Group,Cond,Mean RT,Median RT,Accuracy
0,0010_BQR,Patient,Freq,581.16,549.3,94.53
1,0010_BQR,Patient,Rare,658.55,639.2,85.16
2,0011_XYJ,Control,Freq,501.09,469.2,97.27
3,0011_XYJ,Control,Rare,583.5,557.1,88.67
4,0012_WCT,Patient,Freq,587.64,555.9,93.75


In [19]:
# group the numerical values by Group and Condition,
# and display the mean of each.
# numeric_only=True restricts the average to the numeric columns; without it,
# recent pandas versions (2.0 and later) raise a TypeError when trying to
# average the text column 'Subjid' instead of silently dropping it.
df.groupby(by=['Group', 'Cond']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Mean RT,Median RT,Accuracy
Group,Cond,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Control,Freq,498.775,466.405,96.108
Control,Rare,572.325,542.625,89.0235
Patient,Freq,580.759,542.835,94.5265
Patient,Rare,681.1435,649.65,84.9605
