# Data Exclusion
Experimental data is excluded below based on the following criteria:
1. A subject is excluded if they self-report to have not understood the task instructions in the post-task questionnaire
2. A musical set (one of the test conditions each subject interacts with) is excluded from a subject's data if there's insufficient responses to perform statistics on the data (fewer than 10 responses)

## Imports

In [1]:
import math

import seaborn as sns
import StudyII_All_5_note_Sets.paths as StudyII_paths
import StudyI_Pentatonic_vs_Chromatic.paths as StudyI_paths
import Study_Likert.paths as Study_likert_paths
import pandas as pd

In [2]:
studyI_GL = pd.read_csv(StudyI_paths.processed_dir + 'group_level_results.csv')
studyII_GL = pd.read_csv(StudyII_paths.processed_dir + 'group_level_results.csv')
decoys = pd.read_csv(StudyII_paths.processed_dir + 'group_level_decoy_results.csv')
study_likert = pd.read_csv(Study_likert_paths.processed_dir + 'set_level_results.csv')

## Statistics scripts (do not change data, but report the impact of the exclusion on the data)

In [3]:
# Standard Error of the Mean
def get_SEM(pentatonic_only=True):
    temp = studyII_GL
    if(pentatonic_only):
        temp = temp[temp['set']=="0 2 4 7 9"]
    SEM = temp.groupby("set")['rate shifted - rate swapped (NN)'].sem().reset_index()['rate shifted - rate swapped (NN)'].median()
    return SEM

# Standard Deviation
def get_STD(pentatonic_only=True):
    temp = studyII_GL
    if(pentatonic_only):
        temp = temp[temp['set']=="0 2 4 7 9"]
    STD = temp.groupby("set")['rate shifted - rate swapped (NN)'].std().reset_index()['rate shifted - rate swapped (NN)'].median()
    return STD

# Mean
def get_MEAN(pentatonic_only=True):
    temp = studyII_GL
    if(pentatonic_only):
        temp = temp[temp['set']=="0 2 4 7 9"]
    MEAN = temp.groupby("set")['rate shifted - rate swapped (NN)'].mean().reset_index()['rate shifted - rate swapped (NN)'].median()
    return MEAN

## Exclude subjects who "didn't understand" the task
### Study I

In [4]:
#Study I: remove subjects that selected "didn't understand the study"
temp = studyI_GL
before_removal = studyI_GL.groupby('subject').count().shape[0]
studyI_GL = studyI_GL[studyI_GL['understood task']==True]
after_removal = studyI_GL.groupby('subject').count().shape[0]
print("Removed {} subjects".format(before_removal-after_removal))

Removed 4 subjects


### Study II

In [5]:
#Study II: remove subjects that selected "didn't understand the study"
before_removal = studyII_GL.groupby('subject').count().shape[0]
SEM_before = get_SEM()
STD_before = get_STD()
MEAN_before = get_MEAN()
studyII_GL = studyII_GL[studyII_GL['understood task']==True]
after_removal = studyII_GL.groupby('subject').count().shape[0]
SEM_after = get_SEM()
STD_after = get_STD()
MEAN_after = get_MEAN()
print("There are {} subjects in total prior to exclusion".format(before_removal))
print("Removed {} subjects".format(before_removal-after_removal))
print("SEM changed from {} to {} (a {} % diff)".format(SEM_before,SEM_after,SEM_before/SEM_after))
print("STD changed from {} to {} (a {} % diff)".format(STD_before,STD_after,STD_before/STD_after))
print("MEAN changed from {} to {} (a {} % diff)".format(MEAN_before,MEAN_after,MEAN_before/MEAN_after))

There are 843 subjects in total prior to exclusion
Removed 51 subjects
SEM changed from 0.035818657571183396 to 0.038105163956511666 (a 0.9399948419605859 % diff)
STD changed from 0.3101986738610064 to 0.3165252642215655 (a 0.9800123684404208 % diff)
MEAN changed from 0.3263694705707089 to 0.33173443630099975 (a 0.9838275284588697 % diff)


## Exclude sets with too few responses
### Study I

In [6]:
min_responses = 15

In [7]:
#Study I: remove sets with fewer than 15 no-neither responses
before_removal = studyI_GL.groupby(['subject','set']).count().shape[0]
studyI_GL = studyI_GL[studyI_GL['# no_neither_trials']>=min_responses].reset_index()
after_removal = studyI_GL.groupby(['subject','set']).count().shape[0]
print("Removed {} sets".format(before_removal-after_removal))

Removed 1 sets


### Study II

In [8]:
#Study II: remove sets with fewer than 15 no-neither responses
before_removal_sets = studyII_GL.groupby(['subject','set']).count().shape[0]
before_removal_subs = studyII_GL.groupby(['subject']).count().shape[0]
SEM_before = get_SEM()
STD_before = get_STD()
MEAN_before = get_MEAN()
studyII_GL = studyII_GL[studyII_GL['# no_neither_trials']>=min_responses].reset_index()
SEM_after = get_SEM()
STD_after = get_STD()
MEAN_after = get_MEAN()
after_removal_sets = studyII_GL.groupby(['subject','set']).count().shape[0]
after_removal_subs = studyII_GL.groupby(['subject']).count().shape[0]
print("Removed {} musical sets".format(before_removal_sets-after_removal_sets))
print("Resulting in the removal of {} task-sets".format(before_removal_subs-after_removal_subs))
print("There are {} subjects left after exclusion".format(after_removal_subs))
print("SEM changed from {} to {} (a {} % diff)".format(SEM_before,SEM_after,SEM_before/SEM_after))
print("STD changed from {} to {} (a {} % diff)".format(STD_before,STD_after,STD_before/STD_after))
print("MEAN changed from {} to {} (a {} % diff)".format(MEAN_before,MEAN_after,MEAN_before/MEAN_after))

Removed 2339 musical sets
Resulting in the removal of 162 task-sets
There are 630 subjects left after exclusion
SEM changed from 0.038105163956511666 to 0.05184918713043115 (a 0.7349230733483787 % diff)
STD changed from 0.3165252642215655 to 0.32379806984812654 (a 0.9775390704769419 % diff)
MEAN changed from 0.33173443630099975 to 0.32046982085681774 (a 1.035150315914505 % diff)


## Save data post-exlusion
### Study I

In [9]:
# Save Study I post-exclusion
studyI_GL.to_pickle(StudyI_paths.processed_dir + StudyI_paths.post_exclusion_data_pickle_filename)

### Study II

In [10]:
# Save Study II post-exclusion
studyII_GL.to_pickle(StudyII_paths.processed_dir + StudyII_paths.post_exclusion_data_pickle_filename)