# Data Exclusion
Experimental data is excluded below based on the following criteria:
1. A subject is excluded if they self-report to have not understood the task instructions in the post-task questionnaire
2. A musical set (one of the test conditions each subject interacts with) is excluded from a subject's data if there's insufficient responses to perform statistics on the data (fewer than 10 responses)

## Imports

In [97]:
import math

import seaborn as sns
import StudyII_All_5_note_Sets.paths as StudyII_paths
import StudyI_Pentatonic_vs_Chromatic.paths as StudyI_paths
import Study_Likert.paths as Study_likert_paths
import Study_Uniform.paths as Study_uniform_paths
import pandas as pd

In [98]:
studyI_GL = pd.read_csv(StudyI_paths.processed_dir + 'group_level_results.csv')
studyII_GL = pd.read_csv(StudyII_paths.processed_dir + 'group_level_results.csv')
study_uniform = pd.read_csv(Study_uniform_paths.processed_dir + 'group_level_results.csv')
decoys = pd.read_csv(StudyII_paths.processed_dir + 'group_level_decoy_results.csv')
study_likert = pd.read_csv(Study_likert_paths.processed_dir + 'group_level_results.csv')

## Statistics scripts (do not change data, but report the impact of the exclusion on the data)

In [99]:
# Standard Error of the Mean
def get_SEM(pentatonic_only=True):
    temp = studyII_GL
    if(pentatonic_only):
        temp = temp[temp['set']=="0 2 4 7 9"]
    SEM = temp.groupby("set")['rate shifted - rate swapped (NN)'].sem().reset_index()['rate shifted - rate swapped (NN)'].median()
    return SEM

# Standard Deviation
def get_STD(pentatonic_only=True):
    temp = studyII_GL
    if(pentatonic_only):
        temp = temp[temp['set']=="0 2 4 7 9"]
    STD = temp.groupby("set")['rate shifted - rate swapped (NN)'].std().reset_index()['rate shifted - rate swapped (NN)'].median()
    return STD

# Mean
def get_MEAN(pentatonic_only=True):
    temp = studyII_GL
    if(pentatonic_only):
        temp = temp[temp['set']=="0 2 4 7 9"]
    MEAN = temp.groupby("set")['rate shifted - rate swapped (NN)'].mean().reset_index()['rate shifted - rate swapped (NN)'].median()
    return MEAN

## Total number of subjects

In [100]:
tasks_before_removal = studyI_GL.groupby('subject').count().shape[0]
print("There are {} tasks in total prior to exclusion".format(tasks_before_removal))

There are 102 tasks in total prior to exclusion


## Exclude subjects who "didn't understand" the task
### Study I

In [64]:
#Study I: remove subjects that selected "didn't understand the study"
temp = studyI_GL
tasks_before_removal = studyI_GL.groupby('subject').count().shape[0]
subjects_before_removal = studyI_GL.groupby('sona').count().shape[0]
studyI_GL = studyI_GL[studyI_GL['understood task']==True]
tasks_after_removal = studyI_GL.groupby('subject').count().shape[0]
subjects_after_removal = studyI_GL.groupby('sona').count().shape[0]
print("Removed {} tasks".format(tasks_before_removal-tasks_after_removal))
print("Removed {} subjects".format(subjects_before_removal-subjects_after_removal))

Removed 4 tasks
Removed 4 subjects


### Study II

In [44]:
tasks_before_removal = studyII_GL.groupby('sona').count().shape[0]
print("There are {} subjects in total prior to exclusion".format(tasks_before_removal))

There are 742 subjects in total prior to exclusion


In [45]:
#Study II: remove subjects that selected "didn't understand the study"
tasks_before_removal = studyII_GL.groupby('subject').count().shape[0]
subjects_before_removal = studyII_GL.groupby('sona').count().shape[0]
studyII_GL = studyII_GL[studyII_GL['understood task']==True]
tasks_after_removal = studyII_GL.groupby('subject').count().shape[0]
subjects_after_removal = studyII_GL.groupby('sona').count().shape[0]
print("There are {} tasks in total prior to exclusion".format(tasks_before_removal))
print("Removed {} tasks".format(tasks_before_removal-tasks_after_removal))
print("There are {} subjects in total prior to exclusion".format(subjects_before_removal))
print("Removed {} subjects".format(subjects_before_removal-subjects_after_removal))


There are 843 tasks in total prior to exclusion
Removed 51 tasks
There are 742 subjects in total prior to exclusion
Removed 44 subjects


### Likert Experiment

In [86]:
#Likert: remove subjects that selected "didn't understand the study"
subjects_before_removal = study_likert.drop_duplicates(subset=['sona','subject'], keep='last').count()[0]
study_likert = study_likert[study_likert['understood task']==True]
subjects_after_removal = study_likert.drop_duplicates(subset=['sona','subject'], keep='last').count()[0]
print("There are {} subjects in total prior to exclusion".format(subjects_before_removal))
print("Removed {} subjects".format(subjects_before_removal-subjects_after_removal))

There are 157 subjects in total prior to exclusion
Removed 10 subjects


### Uniform Set Experiment

In [101]:
#Uniform: remove subjects that selected "didn't understand the study"
tasks_before_removal = study_uniform.groupby('subject').count().shape[0]
subjects_before_removal = study_uniform.groupby('sona').count().shape[0]
study_uniform = study_uniform[study_uniform['understood task']==True]
tasks_after_removal = study_uniform.groupby('subject').count().shape[0]
subjects_after_removal = study_uniform.groupby('sona').count().shape[0]
print("There are {} tasks in total prior to exclusion".format(tasks_before_removal))
print("There are {} subjects in total prior to exclusion".format(subjects_before_removal))
print("Removed {} tasks".format(tasks_before_removal-tasks_after_removal))
print("Removed {} subjects".format(subjects_before_removal-subjects_after_removal))

There are 51 tasks in total prior to exclusion
There are 51 subjects in total prior to exclusion
Removed 4 tasks
Removed 4 subjects


## Exclude sets with too few responses
### Study I

In [95]:
min_responses = 15

In [8]:
#Study I: remove sets with fewer than 15 no-neither responses
tasks_before_removal = studyI_GL.groupby('subject').count().shape[0]
subjects_before_removal = studyI_GL.groupby('sona').count().shape[0]
studyI_GL = studyI_GL[studyI_GL['# no_neither_trials']>=min_responses].reset_index()
tasks_after_removal = studyI_GL.groupby('subject').count().shape[0]
subjects_after_removal = studyI_GL.groupby('sona').count().shape[0]
print("Removed {} tasks".format(tasks_before_removal-tasks_after_removal))
print("Removed {} subjects".format(subjects_before_removal-subjects_after_removal))

Removed 0 tasks
Removed 0 subjects


### Study II

In [17]:
#Study II: remove sets with fewer than 15 no-neither responses
tasks_before_removal = studyII_GL.groupby('subject').count().shape[0]
subjects_before_removal = studyII_GL.groupby('sona').count().shape[0]
studyII_GL = studyII_GL[studyII_GL['# no_neither_trials']>=min_responses].reset_index()
print("There are {} tasks in total prior to exclusion".format(tasks_before_removal))
print("Removed {} tasks".format(tasks_before_removal-tasks_after_removal))
tasks_after_removal = studyII_GL.groupby('subject').count().shape[0]
subjects_after_removal = studyII_GL.groupby('sona').count().shape[0]
print("There are {} subjects in total prior to exclusion".format(subjects_before_removal))
print("Removed {} subjects".format(subjects_before_removal-subjects_after_removal))

There are 792 tasks in total prior to exclusion
Removed 0 tasks
There are 698 subjects in total prior to exclusion
Removed 130 subjects


### Uniform Set Experiment

In [102]:
#Study Uniform Sets: remove sets with fewer than 15 no-neither responses
tasks_before_removal = study_uniform.groupby('subject').count().shape[0]
subjects_before_removal = study_uniform.groupby('sona').count().shape[0]
study_uniform = study_uniform[study_uniform['# no_neither_trials']>=min_responses].reset_index()
tasks_after_removal = study_uniform.groupby('subject').count().shape[0]
print("There are {} tasks in total prior to exclusion".format(tasks_before_removal))
print("Removed {} tasks".format(tasks_before_removal-tasks_after_removal))
subjects_after_removal = study_uniform.groupby('sona').count().shape[0]
print("There are {} subjects in total prior to exclusion".format(subjects_before_removal))
print("Removed {} subjects".format(subjects_before_removal-subjects_after_removal))

There are 47 tasks in total prior to exclusion
Removed 1 tasks
There are 47 subjects in total prior to exclusion
Removed 1 subjects


In [18]:
study1_subject_counts = studyI_GL.groupby('sona').count()
study1_task_counts = studyI_GL.groupby('subject').count()
study2_subject_counts = studyII_GL.groupby('sona').count()
study2_task_counts = studyII_GL.groupby('subject').count()
study_likert_subject_counts = study_likert.groupby('sona').count()
study_likert_task_counts = study_likert.groupby('subject').count()
study_uniform_subject_counts = study_uniform.groupby('sona').count()
study_uniform_task_counts = study_uniform.groupby('subject').count()

print("Post exclusion counts:")
print("Study I: {} subjects, {} tasks".format(study1_subject_counts.shape[0], study1_task_counts.shape[0]))
print("Study II: {} subjects, {} tasks".format(study2_subject_counts.shape[0], study2_task_counts.shape[0]))
print("Likert: {} subjects, {} tasks".format(study_likert_subject_counts.shape[0], study_likert_task_counts.shape[0]))
print("Uniform: {} subjects, {} tasks".format(study_uniform_subject_counts.shape[0], study_uniform_task_counts.shape[0]))

Post exclusion counts:
Study I: 98 subjects, 98 tasks
Study II: 568 subjects, 630 tasks
Likert: 147 subjects, 145 tasks
Uniform: 46 subjects, 46 tasks


## Save data post-exlusion
### Study I

In [56]:
# Save Study I post-exclusion
studyI_GL.to_pickle(StudyI_paths.processed_dir + StudyI_paths.post_exclusion_data_pickle_filename)

### Study II

In [57]:
# Save Study II post-exclusion
studyII_GL.to_pickle(StudyII_paths.processed_dir + StudyII_paths.post_exclusion_data_pickle_filename)

### Likert Experiment

In [81]:
# Save Likert post-exclusion
study_likert.to_pickle(Study_likert_paths.processed_dir + Study_likert_paths.post_exclusion_data_pickle_filename)

### Uniform Sets Experiment


In [28]:
# Save Uniform post-exclusion
study_uniform.to_pickle(Study_uniform_paths.processed_dir + Study_uniform_paths.post_exclusion_data_pickle_filename)