In [1]:
import pandas as pd
import numpy as np
from os import path
from glob import glob
from os.path import basename, join

basedir = '/home/data/nbc/physics-learning/data/behavioral-data/analyses/physics-learning-behavioral'
rcsv_dir = '/home/data/nbc/physics-learning/data/behavioral-data/rcsv/'
datadir = 'datafiles'
plotdir = 'plots'

# set file names that will be read out
f_gen = path.join(basedir, datadir, 'c123456-demographics.txt')
outfile_phy_pre =  path.join(basedir, datadir, 'retr_physcond_accuracy_by_gender_pre.txt')
outfile_gen_pre =  path.join(basedir, datadir, 'retr_gencond_accuracy_by_gender_pre.txt')
outfile_sym_pre =  path.join(basedir, datadir, 'retr_symcond_accuracy_by_gender_pre.txt')
outfile_phy_post =  path.join(basedir, datadir, 'retr_physcond_accuracy_by_gender_post.txt')
outfile_gen_post =  path.join(basedir, datadir, 'retr_gencond_accuracy_by_gender_post.txt')
outfile_sym_post =  path.join(basedir, datadir, 'retr_symcond_accuracy_by_gender_post.txt')
outfile_phy_full =  path.join(basedir, datadir, 'retr_physcond_accuracy_by_gender.txt')
outfile_gen_full =  path.join(basedir, datadir, 'retr_gencond_accuracy_by_gender.txt')
outfile_sym_full =  path.join(basedir, datadir, 'retr_symcond_accuracy_by_gender.txt')

cols = ['Subject', 'Session', 'QuestionType', 'ExpSlide.ACC', 'ExpSlide2.ACC',  'ExpSlide.RT', 'ExpSlide2.RT']

# read in map from subject ID to gender
#df_acc = pd.read_csv(r_acc, sep=',')
#df_acc2 = pd.read_csv(f_acc, sep='\t')
df_gen = pd.read_csv(f_gen, sep='\t')
df_gen.head()

Unnamed: 0,PID,Gender,Group,Cmp.Stdy
0,103,M,Trad,Y
1,110,M,Trad,Y
2,212,M,Trad,Y
3,217,M,Trad,Y
4,219,M,Trad,Y


In [2]:
def drop_unmatched(df, ids):
    indices_to_drop = []
    for s in ids:
        if df.loc[df['Subject']==s].shape[0] == 1:
            unmatched_index = df.loc[df['Subject']==s].index[0]
            indices_to_drop.append(unmatched_index)
    df_matched = df.drop(df.index[indices_to_drop]).reset_index()
    return df_matched

In [None]:
# note being used currently
def read_in_acc_rt(condition_string):
    # read in phys/gen/base condition acc and rt and separate by correct/incorrect rt
    # valid options fo condition are 'physics', 'general', or 'baseline'
    # the (arbitrary) eprime naming conventions were:
    #'ExpSlide1'= quesitons containing no image within the answer options
    #'ExpSlide2'= questions containing an image within the answer options
    #'ExpSlide3'= new column for data from both image and no image questions together

    subjects = glob(join(rcsv_dir, '*'))
    subjects = [basename(s) for s in subjects]
    subjects = sorted([s for s in subjects if s.isdigit() and len(s)==3])
    sessions = ['session-0', 'session-1']
    all_dfs_cond = []
    for s in subjects:
        for ss in sessions:
            files = sorted(glob(join(rcsv_dir, s, ss, 'RETR/RETR_*.csv')))
            if len(files) > 0:
                dfs = [pd.read_csv(f) for f in files]
                df = pd.concat(dfs, axis=0)#concatonate runs
                df = df.loc[df['QuestionType']==condition_string]
                df['ExpSlide3.ACC'] = df['ExpSlide.ACC'].fillna(df['ExpSlide2.ACC'])
                df['ExpSlide3.RT'] = df['ExpSlide.RT'].fillna(df['ExpSlide2.RT'])
                df2 = pd.DataFrame(df.mean()[['Subject', 'Session', 'ExpSlide3.ACC']]).transpose()            
                mean_df = df.groupby('ExpSlide3.ACC').mean()
                if 1 in mean_df.index:
                    df2.loc[0, 'Mean Correct RT'] = mean_df.loc[1, 'ExpSlide3.RT']
                else:
                    df2.loc[0, 'Mean Correct RT'] = np.nan

                if 0 in mean_df.index:
                    df2.loc[0, 'Mean Incorrect RT'] = mean_df.loc[0, 'ExpSlide3.RT']
                else:
                    df2.loc[0, 'Mean Incorrect RT'] = np.nan

                all_dfs_cond.append(df2)
                return all_dfs_cond

In [3]:
# read physics condition acc and rt data and separate by correct/incorrect rt
# the (arbitrary) eprime naming conventions were:
#'ExpSlide1'= quesitons containing no image within the answer options
#'ExpSlide2'= questions containing an image within the answer options
#'ExpSlide3'= new column for data from both image and no image questions together

subjects = glob(join(rcsv_dir, '*'))
subjects = [basename(s) for s in subjects]
subjects = sorted([s for s in subjects if s.isdigit() and len(s)==3])
sessions = ['session-0', 'session-1']
all_dfs_physcond = []
all_dfs_gencond = []
all_dfs_symcond = []
for s in subjects:
    for ss in sessions:
        files = sorted(glob(join(rcsv_dir, s, ss, 'RETR/RETR_*.csv')))
        if len(files) > 0:
            dfs = [pd.read_csv(f) for f in files]
            df = pd.concat(dfs, axis=0)#concatonate runs
            df = df.loc[df['QuestionType']=='physics']
            df['ExpSlide3.ACC'] = df['ExpSlide.ACC'].fillna(df['ExpSlide2.ACC'])
            df['ExpSlide3.RT'] = df['ExpSlide.RT'].fillna(df['ExpSlide2.RT'])
            df2 = pd.DataFrame(df.mean()[['Subject', 'Session', 'ExpSlide3.ACC']]).transpose()            
            mean_df = df.groupby('ExpSlide3.ACC').mean()
            if 1 in mean_df.index:
                df2.loc[0, 'Mean Correct RT'] = mean_df.loc[1, 'ExpSlide3.RT']
            else:
                df2.loc[0, 'Mean Correct RT'] = np.nan

            if 0 in mean_df.index:
                df2.loc[0, 'Mean Incorrect RT'] = mean_df.loc[0, 'ExpSlide3.RT']
            else:
                df2.loc[0, 'Mean Incorrect RT'] = np.nan

            all_dfs_physcond.append(df2)

#all_dfs_physcond = read_in_acc_rt('physics')

#physics condition
full_df_phy = pd.concat(all_dfs_physcond, axis=0, ignore_index=True)
full_df_phy['Subject'] = full_df_phy['Subject'].astype(int).astype(str)
full_df_phy['Session'] = full_df_phy['Session'].astype(int).astype(str)
full_df_phy['Session'] = full_df_phy['Session'].map({'1': 'pre',
                                             '2': 'post'})
full_df_phy = full_df_phy.rename(columns={'ExpSlide3.ACC': 'Mean Physics Retrieval Accuracy'})
full_df_phy = drop_unmatched(full_df_phy, subjects)
pre_df_phy = full_df_phy.loc[full_df_phy['Session']=='pre']
post_df_phy = full_df_phy.loc[full_df_phy['Session']=='post']
#print full_df_phy.head()
pre_df_phy.shape

(108, 6)

In [4]:
#not working... what was I trying to do here?
# pre_phy_col = df.loc[: , "Mean Correct RT":"Mean Incorrect RT"]
# #Mean Incorrect RT
# #pre_df_phy = 
# pre_df_phy = pre_df_phy.rename(index=str, columns={"ExpSlide3.ACC": "Phy.ExpSlide3.ACC", 
#                                                    "Mean Correct RT": "Phy.Mean.Correct.RT")
# df_acc_pre = pd.merge(pre_df_phy, pre_df_gen, on=['Subject', 'Session'])

In [5]:
# read general condition acc and rt data and separate by correct/incorrect rt
for s in subjects:
    for ss in sessions:
        files = sorted(glob(join(rcsv_dir, s, ss, 'RETR/RETR_*.csv')))
        if len(files) > 0:
            dfs = [pd.read_csv(f) for f in files]
            df = pd.concat(dfs, axis=0)
            df = df.loc[df['QuestionType']=='general']
            df['ExpSlide3.ACC'] = df['ExpSlide.ACC'].fillna(df['ExpSlide2.ACC'])
            df['ExpSlide3.RT'] = df['ExpSlide.RT'].fillna(df['ExpSlide2.RT'])
            df2 = pd.DataFrame(df.mean()[['Subject', 'Session', 'ExpSlide3.ACC']]).transpose()            
            mean_df = df.groupby('ExpSlide3.ACC').mean()
            if 1 in mean_df.index:
                df2.loc[0, 'Mean Correct RT'] = mean_df.loc[1, 'ExpSlide3.RT']
            else:
                df2.loc[0, 'Mean Correct RT'] = np.nan

            if 0 in mean_df.index:
                df2.loc[0, 'Mean Incorrect RT'] = mean_df.loc[0, 'ExpSlide3.RT']
            else:
                df2.loc[0, 'Mean Incorrect RT'] = np.nan

            all_dfs_gencond.append(df2)

#general condition
full_df_gen = pd.concat(all_dfs_gencond, axis=0, ignore_index=True)
#print full_df_gen.head()
full_df_gen['Subject'] = full_df_gen['Subject'].astype(int).astype(str)
full_df_gen['Session'] = full_df_gen['Session'].astype(int).astype(str)
full_df_gen['Session'] = full_df_gen['Session'].map({'1': 'pre',
                                             '2': 'post'})
full_df_gen = full_df_gen.rename(columns={'ExpSlide3.ACC': 'Mean General Retrieval Accuracy'})
full_df_gen = drop_unmatched(full_df_gen, subjects)
pre_df_gen = full_df_gen.loc[full_df_gen['Session']=='pre']
post_df_gen = full_df_gen.loc[full_df_gen['Session']=='post']
#print full_df_gen.head()
#print pre_df_gen['Subject']
pre_df_gen.shape

(108, 6)

In [6]:
# read baseline condition acc and rt data and separate by correct/incorrect rt
for s in subjects:
    for ss in sessions:
        files = sorted(glob(join(rcsv_dir, s, ss, 'RETR/RETR_*.csv')))
        if len(files) > 0:
            dfs = [pd.read_csv(f) for f in files]
            df = pd.concat(dfs, axis=0)
            df = df.loc[df['QuestionType']=='baseline']
            df['ExpSlide3.ACC'] = df['ExpSlide.ACC'].fillna(df['ExpSlide2.ACC'])
            df['ExpSlide3.RT'] = df['ExpSlide.RT'].fillna(df['ExpSlide2.RT'])
#            print df.head()[['Subject','Session','ExpSlide.ACC','ExpSlide2.ACC','ExpSlide3.ACC',
#                             'ExpSlide.RT','ExpSlide2.RT','ExpSlide3.RT']]
            df2 = pd.DataFrame(df.mean()[['Subject', 'Session', 'ExpSlide3.ACC']]).transpose()            
            mean_df = df.groupby('ExpSlide3.ACC').mean()
            if 1 in mean_df.index:
                df2.loc[0, 'Mean Correct RT'] = mean_df.loc[1, 'ExpSlide3.RT']
            else:
                df2.loc[0, 'Mean Correct RT'] = np.nan

            if 0 in mean_df.index:
                df2.loc[0, 'Mean Incorrect RT'] = mean_df.loc[0, 'ExpSlide3.RT']
            else:
                df2.loc[0, 'Mean Incorrect RT'] = np.nan

            all_dfs_symcond.append(df2)

#symbol condition
full_df_sym = pd.concat(all_dfs_symcond, axis=0, ignore_index=True)
#print full_df_sym.head()
full_df_sym['Subject'] = full_df_sym['Subject'].astype(int).astype(str)
full_df_sym['Session'] = full_df_sym['Session'].astype(int).astype(str)
full_df_sym['Session'] = full_df_sym['Session'].map({'1': 'pre',
                                             '2': 'post'})
full_df_sym = full_df_sym.rename(columns={'ExpSlide3.ACC': 'Mean Baseline Retrieval Accuracy'})
full_df_sym = drop_unmatched(full_df_sym, subjects)
pre_df_sym = full_df_sym.loc[full_df_sym['Session']=='pre']
post_df_sym = full_df_sym.loc[full_df_sym['Session']=='post']
full_df_sym.head()
print pre_df_sym.shape

(108, 6)


In [7]:
# set key and drop irrelevant columns
df_gen = df_gen.rename(columns={'PID': 'Subject', 'Group': 'Class'})
df_gen['Subject'] = df_gen['Subject'].astype(str)

# merge data frames on key and sort
out_df1 = pre_df_phy.merge(df_gen[['Subject', 'Gender', 'Class']], on=['Subject'])
out_df2 = pre_df_gen.merge(df_gen[['Subject', 'Gender', 'Class']], on=['Subject'])
out_df3 = pre_df_sym.merge(df_gen[['Subject', 'Gender', 'Class']], on=['Subject'])
out_df4 = post_df_phy.merge(df_gen[['Subject', 'Gender', 'Class']], on=['Subject'])
out_df5 = post_df_gen.merge(df_gen[['Subject', 'Gender', 'Class']], on=['Subject'])
out_df6 = post_df_sym.merge(df_gen[['Subject', 'Gender', 'Class']], on=['Subject'])
out_df7 = full_df_phy.merge(df_gen[['Subject', 'Gender', 'Class']], on=['Subject'])
out_df8 = full_df_gen.merge(df_gen[['Subject', 'Gender', 'Class']], on=['Subject'])
out_df9 = full_df_sym.merge(df_gen[['Subject', 'Gender', 'Class']], on=['Subject'])
#df = df.sort_values(['Session','Subject'], ascending=[False,True])

# update values in class column and add a column for subgroup type
out_df1['Class'] = out_df1['Class'].map({'Trad': 'Lecture', 'Mod': 'Modeling'})
out_df1['Gender And Class'] = out_df1['Class'] + ' ' + out_df1['Gender']
out_df2['Class'] = out_df2['Class'].map({'Trad': 'Lecture', 'Mod': 'Modeling'})
out_df2['Gender And Class'] = out_df2['Class'] + ' ' + out_df2['Gender']
out_df3['Class'] = out_df3['Class'].map({'Trad': 'Lecture', 'Mod': 'Modeling'})
out_df3['Gender And Class'] = out_df3['Class'] + ' ' + out_df3['Gender']
out_df4['Class'] = out_df4['Class'].map({'Trad': 'Lecture', 'Mod': 'Modeling'})
out_df4['Gender And Class'] = out_df4['Class'] + ' ' + out_df4['Gender']
out_df5['Class'] = out_df5['Class'].map({'Trad': 'Lecture', 'Mod': 'Modeling'})
out_df5['Gender And Class'] = out_df5['Class'] + ' ' + out_df5['Gender']
out_df6['Class'] = out_df6['Class'].map({'Trad': 'Lecture', 'Mod': 'Modeling'})
out_df6['Gender And Class'] = out_df6['Class'] + ' ' + out_df6['Gender']
out_df7['Class'] = out_df7['Class'].map({'Trad': 'Lecture', 'Mod': 'Modeling'})
out_df7['Gender And Class'] = out_df7['Class'] + ' ' + out_df7['Gender']
out_df8['Class'] = out_df8['Class'].map({'Trad': 'Lecture', 'Mod': 'Modeling'})
out_df8['Gender And Class'] = out_df8['Class'] + ' ' + out_df8['Gender']
out_df9['Class'] = out_df9['Class'].map({'Trad': 'Lecture', 'Mod': 'Modeling'})
out_df9['Gender And Class'] = out_df9['Class'] + ' ' + out_df9['Gender']

# write new file
out_df1.to_csv(outfile_phy_pre,sep='\t',header=True,index=False)
out_df2.to_csv(outfile_gen_pre,sep='\t',header=True,index=False)
out_df3.to_csv(outfile_sym_pre,sep='\t',header=True,index=False)
out_df4.to_csv(outfile_phy_post,sep='\t',header=True,index=False)
out_df5.to_csv(outfile_gen_post,sep='\t',header=True,index=False)
out_df6.to_csv(outfile_sym_post,sep='\t',header=True,index=False)
out_df7.to_csv(outfile_phy_full,sep='\t',header=True,index=False)
out_df8.to_csv(outfile_gen_full,sep='\t',header=True,index=False)
out_df9.to_csv(outfile_sym_full,sep='\t',header=True,index=False)

In [8]:
### analysis_df_phy_pre = pd.read_csv(path.join(basedir, datadir, 'retr_physcond_accuracy_by_gender_pre.txt'), sep='\t')
analysis_df_gen_pre = pd.read_csv(path.join(basedir, datadir, 'retr_gencond_accuracy_by_gender_pre.txt'), sep='\t')
analysis_df_sym_pre = pd.read_csv(path.join(basedir, datadir, 'retr_symcond_accuracy_by_gender_pre.txt'), sep='\t')
analysis_df_phy_post = pd.read_csv(path.join(basedir, datadir, 'retr_physcond_accuracy_by_gender_post.txt'), sep='\t')
analysis_df_gen_post = pd.read_csv(path.join(basedir, datadir, 'retr_gencond_accuracy_by_gender_post.txt'), sep='\t')
analysis_df_sym_post = pd.read_csv(path.join(basedir, datadir, 'retr_symcond_accuracy_by_gender_post.txt'), sep='\t')
analysis_df_phy_full = pd.read_csv(path.join(basedir, datadir, 'retr_physcond_accuracy_by_gender.txt'), sep='\t')
analysis_df_gen_full = pd.read_csv(path.join(basedir, datadir, 'retr_gencond_accuracy_by_gender.txt'), sep='\t')
analysis_df_sym_full = pd.read_csv(path.join(basedir, datadir, 'retr_symcond_accuracy_by_gender.txt'), sep='\t')

analysis_df_phy_full.head()

Unnamed: 0,index,Subject,Session,Mean Physics Retrieval Accuracy,Mean Correct RT,Mean Incorrect RT,Gender,Class,Gender And Class
0,0,101,pre,0.708333,4597.352941,4495.714286,F,Lecture,Lecture F
1,1,101,post,0.75,4280.277778,2958.5,F,Lecture,Lecture F
2,2,102,pre,0.541667,4241.769231,3737.090909,M,Modeling,Modeling M
3,3,102,post,0.833333,4387.0,1459.75,M,Modeling,Modeling M
4,4,103,pre,0.75,4124.0,3753.833333,M,Lecture,Lecture M


In [None]:
# Pre-instruction accuracy and RT plots by gender
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(palette="muted", color_codes=True)

# physics acc, pre
fig, ax = plt.subplots(figsize=(8, 6))
for sex in analysis_df_phy_pre['Gender'].unique():
    sex_df = analysis_df_phy_pre.loc[analysis_df_phy_pre['Gender']==sex]
    sns.distplot(sex_df['Mean Physics Retrieval Accuracy'], ax=ax, kde=False, 
                 label=sex)

legend = ax.legend(frameon=True)
plt.title('Pre-instruction: Physics Condition')
frame = legend.get_frame()
frame.set_facecolor('white')
frame.set_edgecolor('black')
fig.savefig(path.join(basedir, plotdir, 'mean_retr_physcond_accuracy_pre.png'), dpi=400)

# physics rt, pre
fig, ax = plt.subplots(figsize=(8, 6))
for sex in analysis_df_phy_pre['Gender'].unique():
    sex_df = analysis_df_phy_pre.loc[analysis_df_phy_pre['Gender']==sex]
    sns.distplot(sex_df['Mean Correct RT'], ax=ax, kde=False, 
                 label=sex)

legend = ax.legend(frameon=True)
plt.title('Pre-instruction: Physics Condition')
frame = legend.get_frame()
frame.set_facecolor('white')
frame.set_edgecolor('black')
fig.savefig(path.join(basedir, plotdir, 'mean_retr_physcond_rt_pre.png'), dpi=400)

# general acc, pre
fig, ax = plt.subplots(figsize=(8, 6))
for sex in analysis_df_gen_pre['Gender'].unique():
    sex_df = analysis_df_gen_pre.loc[analysis_df_gen_pre['Gender']==sex]
    sns.distplot(sex_df['Mean General Retrieval Accuracy'], ax=ax, kde=False, 
                 label=sex)

legend = ax.legend(frameon=True)
plt.title('Pre-instruction: General Condition')
frame = legend.get_frame()
frame.set_facecolor('white')
frame.set_edgecolor('black')
fig.savefig(path.join(basedir, plotdir, 'mean_retr_gencond_accuracy_pre.png'), dpi=400)

# general rt, pre
fig, ax = plt.subplots(figsize=(8, 6))
for sex in analysis_df_gen_pre['Gender'].unique():
    sex_df = analysis_df_gen_pre.loc[analysis_df_gen_pre['Gender']==sex]
    sns.distplot(sex_df['Mean Correct RT'], ax=ax, kde=False, 
                 label=sex)

legend = ax.legend(frameon=True)
plt.title('Pre-instruction: General Condition')
frame = legend.get_frame()
frame.set_facecolor('white')
frame.set_edgecolor('black')
fig.savefig(path.join(basedir, plotdir, 'mean_retr_gencond_rt_pre.png'), dpi=400)


# baseline acc, pre
fig, ax = plt.subplots(figsize=(8, 6))
for sex in analysis_df_sym_pre['Gender'].unique():
    sex_df = analysis_df_sym_pre.loc[analysis_df_sym_pre['Gender']==sex]
    sns.distplot(sex_df['Mean Baseline Retrieval Accuracy'], ax=ax, kde=False, 
                 label=sex)

legend = ax.legend(frameon=True)
plt.title('Pre-instruction: Baseline Condition')
frame = legend.get_frame()
frame.set_facecolor('white')
frame.set_edgecolor('black')
fig.savefig(path.join(basedir, plotdir, 'mean_retr_symcond_accuracy_pre.png'), dpi=400)

# baseline rt, pre
fig, ax = plt.subplots(figsize=(8, 6))
for sex in analysis_df_sym_pre['Gender'].unique():
    sex_df = analysis_df_sym_pre.loc[analysis_df_sym_pre['Gender']==sex]
    sns.distplot(sex_df['Mean Correct RT'], ax=ax, kde=False, 
                 label=sex)

legend = ax.legend(frameon=True)
plt.title('Pre-instruction: Baseline Condition')
frame = legend.get_frame()
frame.set_facecolor('white')
frame.set_edgecolor('black')
fig.savefig(path.join(basedir, plotdir, 'mean_retr_symcond_rt_pre.png'), dpi=400)

In [None]:
# post-instruction plots by gender
sns.set(palette="muted", color_codes=True)

# physics acc, post
fig, ax = plt.subplots(figsize=(8, 6))
for sex in analysis_df_phy_post['Gender'].unique():
    sex_df = analysis_df_phy_post.loc[analysis_df_phy_post['Gender']==sex]
    sns.distplot(sex_df['Mean Physics Retrieval Accuracy'], ax=ax, kde=False, 
                 label=sex)

legend = ax.legend(frameon=True)
plt.title('Post-instruction: Physics Condition')
frame = legend.get_frame()
frame.set_facecolor('white')
frame.set_edgecolor('black')
fig.savefig(path.join(basedir, plotdir, 'mean_retr_physcond_accuracy_post.png'), dpi=400)

# physics rt, post
fig, ax = plt.subplots(figsize=(8, 6))
for sex in analysis_df_phy_post['Gender'].unique():
    sex_df = analysis_df_phy_post.loc[analysis_df_phy_post['Gender']==sex]
    sns.distplot(sex_df['Mean Correct RT'], ax=ax, kde=False, 
                 label=sex)

legend = ax.legend(frameon=True)
plt.title('Post-instruction: Physics Condition')
frame = legend.get_frame()
frame.set_facecolor('white')
frame.set_edgecolor('black')
fig.savefig(path.join(basedir, plotdir, 'mean_retr_physcond_rt_post.png'), dpi=400)


# general acc, post
fig, ax = plt.subplots(figsize=(8, 6))
for sex in analysis_df_gen_post['Gender'].unique():
    sex_df = analysis_df_gen_post.loc[analysis_df_gen_post['Gender']==sex]
    sns.distplot(sex_df['Mean General Retrieval Accuracy'], ax=ax, kde=False, 
                 label=sex)

legend = ax.legend(frameon=True)
plt.title('Post-instruction: General Condition')
frame = legend.get_frame()
frame.set_facecolor('white')
frame.set_edgecolor('black')
fig.savefig(path.join(basedir, plotdir, 'mean_retr_gencond_accuracy_post.png'), dpi=400)

# general rt, post
fig, ax = plt.subplots(figsize=(8, 6))
for sex in analysis_df_gen_post['Gender'].unique():
    sex_df = analysis_df_gen_post.loc[analysis_df_gen_post['Gender']==sex]
    sns.distplot(sex_df['Mean Correct RT'], ax=ax, kde=False, 
                 label=sex)

legend = ax.legend(frameon=True)
plt.title('Post-instruction: General Condition')
frame = legend.get_frame()
frame.set_facecolor('white')
frame.set_edgecolor('black')
fig.savefig(path.join(basedir, plotdir, 'mean_retr_gencond_rt_post.png'), dpi=400)


# baseline acc, post
fig, ax = plt.subplots(figsize=(8, 6))
for sex in analysis_df_sym_post['Gender'].unique():
    sex_df = analysis_df_sym_post.loc[analysis_df_sym_post['Gender']==sex]
    sns.distplot(sex_df['Mean Baseline Retrieval Accuracy'], ax=ax, kde=False, 
                 label=sex)

legend = ax.legend(frameon=True)
plt.title('Post-instruction: Baseline Condition')
frame = legend.get_frame()
frame.set_facecolor('white')
frame.set_edgecolor('black')
fig.savefig(path.join(basedir, plotdir, 'mean_retr_symcond_accuracy_post.png'), dpi=400)

# baseline rt, post
fig, ax = plt.subplots(figsize=(8, 6))
for sex in analysis_df_sym_post['Gender'].unique():
    sex_df = analysis_df_sym_post.loc[analysis_df_sym_post['Gender']==sex]
    sns.distplot(sex_df['Mean Correct RT'], ax=ax, kde=False, 
                 label=sex)

legend = ax.legend(frameon=True)
plt.title('Post-instruction: Baseline Condition')
frame = legend.get_frame()
frame.set_facecolor('white')
frame.set_edgecolor('black')
fig.savefig(path.join(basedir, plotdir, 'mean_retr_symcond_rt_post.png'), dpi=400)

In [None]:
# Pre to post instruction plots, not by gender
sns.set(palette="bright", color_codes=True)

# physics
fig, ax = plt.subplots(figsize=(8, 6))
for sess in analysis_df_phy_full['Session'].unique():
    sess_df = analysis_df_phy_full.loc[analysis_df_phy_full['Session']==sess]
    sns.distplot(sess_df['Mean Physics Retrieval Accuracy'], ax=ax, kde=False, 
                 label=sess)

legend = ax.legend(frameon=True)
plt.title('Pre- to Post-instruction: Physics Condition')
frame = legend.get_frame()
frame.set_facecolor('white')
frame.set_edgecolor('black')
fig.savefig(path.join(basedir, plotdir, 'mean_retr_physcond_accuracy_pre2post.png'), dpi=400)


# general
fig, ax = plt.subplots(figsize=(8, 6))
for sess in analysis_df_gen_full['Session'].unique():
    sess_df = analysis_df_gen_full.loc[analysis_df_gen_full['Session']==sess]
    sns.distplot(sess_df['Mean General Retrieval Accuracy'], ax=ax, kde=False, 
                 label=sess)

legend = ax.legend(frameon=True)
plt.title('Pre- to Post-instruction: General Condition')
frame = legend.get_frame()
frame.set_facecolor('white')
frame.set_edgecolor('black')
fig.savefig(path.join(basedir, plotdir, 'mean_retr_gencond_accuracy_pre2post.png'), dpi=400)


# baseline
fig, ax = plt.subplots(figsize=(8, 6))
for sess in analysis_df_sym_full['Session'].unique():
    sess_df = analysis_df_sym_full.loc[analysis_df_sym_full['Session']==sess]
    sns.distplot(sess_df['Mean Baseline Retrieval Accuracy'], ax=ax, kde=False, 
                 label=sess)

legend = ax.legend(frameon=True)
plt.title('Pre- to Post-instruction: Baseline Condition')
frame = legend.get_frame()
frame.set_facecolor('white')
frame.set_edgecolor('black')
fig.savefig(path.join(basedir, plotdir, 'mean_retr_symcond_accuracy_pre2post.png'), dpi=400)

In [None]:
# plot rt pre and post
sns.set(palette="bright", color_codes=True)

# physics
fig, ax = plt.subplots(figsize=(8, 6))
for sess in analysis_df_phy_full['Session'].unique():
    sess_df = analysis_df_phy_full.loc[analysis_df_phy_full['Session']==sess]
    sns.distplot(sess_df['Mean Correct RT'], ax=ax, kde=False, 
                 label=sess)

legend = ax.legend(frameon=True)
plt.title('Pre- to Post-instruction: Physics Condition')
frame = legend.get_frame()
frame.set_facecolor('white')
frame.set_edgecolor('black')
fig.savefig(path.join(basedir, plotdir, 'mean_retr_physcond_RT_pre2post.png'), dpi=400)

# general
fig, ax = plt.subplots(figsize=(8, 6))
for sess in analysis_df_gen_full['Session'].unique():
    sess_df = analysis_df_gen_full.loc[analysis_df_gen_full['Session']==sess]
    sns.distplot(sess_df['Mean Correct RT'], ax=ax, kde=False, 
                 label=sess)

legend = ax.legend(frameon=True)
plt.title('Pre- to Post-instruction: General Condition')
frame = legend.get_frame()
frame.set_facecolor('white')
frame.set_edgecolor('black')
fig.savefig(path.join(basedir, plotdir, 'mean_retr_gencond_RT_pre2post.png'), dpi=400)

# baseline
fig, ax = plt.subplots(figsize=(8, 6))
for sess in analysis_df_sym_full['Session'].unique():
    sess_df = analysis_df_sym_full.loc[analysis_df_sym_full['Session']==sess]
    sns.distplot(sess_df['Mean Correct RT'], ax=ax, kde=False, 
                 label=sess)

legend = ax.legend(frameon=True)
plt.title('Pre- to Post-instruction: Baseline Condition')
frame = legend.get_frame()
frame.set_facecolor('white')
frame.set_edgecolor('black')
fig.savefig(path.join(basedir, plotdir, 'mean_retr_symcond_RT_pre2post.png'), dpi=400)

In [13]:
from scipy.stats import ttest_ind, ttest_rel

## Accuracy By Gender

analysis_df_phy_pre = analysis_df_phy_full[analysis_df_phy_full['Session']=='pre']
analysis_df_phy_post = analysis_df_phy_full[analysis_df_phy_full['Session']=='post']
analysis_df_gen_pre = analysis_df_gen_full[analysis_df_gen_full['Session']=='pre']
analysis_df_gen_post = analysis_df_gen_full[analysis_df_gen_full['Session']=='post']
analysis_df_sym_pre = analysis_df_sym_full[analysis_df_sym_full['Session']=='pre']
analysis_df_sym_post = analysis_df_sym_full[analysis_df_sym_full['Session']=='post']

# Get means for pre and post, all conditions
mean_df_phy_pre = analysis_df_phy_pre.groupby('Gender').mean()
mean_df_gen_pre = analysis_df_gen_pre.groupby('Gender').mean()
mean_df_sym_pre = analysis_df_sym_pre.groupby('Gender').mean()

mean_df_phy_post = analysis_df_phy_post.groupby('Gender').mean()
mean_df_gen_post = analysis_df_gen_post.groupby('Gender').mean()
mean_df_sym_post = analysis_df_sym_post.groupby('Gender').mean()

# Accuracies
acc_m_phy_pre = analysis_df_phy_pre[analysis_df_phy_pre['Gender']=='M']
acc_f_phy_pre = analysis_df_phy_pre[analysis_df_phy_pre['Gender']=='F']

acc_m_gen_pre = analysis_df_gen_pre[analysis_df_gen_pre['Gender']=='M']
acc_f_gen_pre = analysis_df_gen_pre[analysis_df_gen_pre['Gender']=='F']

acc_m_sym_pre = analysis_df_sym_pre[analysis_df_sym_pre['Gender']=='M']
acc_f_sym_pre = analysis_df_sym_pre[analysis_df_sym_pre['Gender']=='F']

acc_m_phy_post = analysis_df_phy_post[analysis_df_phy_post['Gender']=='M']
acc_f_phy_post = analysis_df_phy_post[analysis_df_phy_post['Gender']=='F']

acc_m_gen_post = analysis_df_gen_post[analysis_df_gen_post['Gender']=='M']
acc_f_gen_post = analysis_df_gen_post[analysis_df_gen_post['Gender']=='F']

acc_m_sym_post = analysis_df_sym_post[analysis_df_sym_post['Gender']=='M']
acc_f_sym_post = analysis_df_sym_post[analysis_df_sym_post['Gender']=='F']

# t tests (for normal data)
tstat_phy_pre = ttest_ind(acc_m_phy_pre['Mean Physics Retrieval Accuracy'], acc_f_phy_pre['Mean Physics Retrieval Accuracy'])
tstat_gen_pre = ttest_ind(acc_m_gen_pre['Mean General Retrieval Accuracy'], acc_f_gen_pre['Mean General Retrieval Accuracy'])
tstat_sym_pre = ttest_ind(acc_m_sym_pre['Mean Baseline Retrieval Accuracy'], acc_f_sym_pre['Mean Baseline Retrieval Accuracy'])
tstat_phy_post = ttest_ind(acc_m_phy_post['Mean Physics Retrieval Accuracy'], acc_f_phy_post['Mean Physics Retrieval Accuracy'])
tstat_gen_post = ttest_ind(acc_m_gen_post['Mean General Retrieval Accuracy'], acc_f_gen_post['Mean General Retrieval Accuracy'])
tstat_sym_post = ttest_ind(acc_m_sym_post['Mean Baseline Retrieval Accuracy'], acc_f_sym_post['Mean Baseline Retrieval Accuracy'])

print '\n Pre-instruction: \n Physics Condition Accuracy By', mean_df_phy_pre['Mean Physics Retrieval Accuracy']
print 'P val:', tstat_phy_pre.pvalue
print '\n Pre-instruction: \n General Condition Accuracy By', mean_df_gen_pre['Mean General Retrieval Accuracy']
print 'P val:', tstat_gen_pre.pvalue
print '\n Pre-instruction: \n Baseline Condition Accuracy By', mean_df_sym_pre['Mean Baseline Retrieval Accuracy']
print 'P val:', tstat_sym_pre.pvalue

print '\n Post-instruction: \n Physics Condition Accuracy By', mean_df_phy_post['Mean Physics Retrieval Accuracy']
print 'P val:', tstat_phy_post.pvalue
print '\n Post-instruction: \n General Condition Accuracy By', mean_df_gen_post['Mean General Retrieval Accuracy']
print 'P val:', tstat_gen_post.pvalue
print '\n Post-instruction: \n Baseline Condition Accuracy By', mean_df_sym_post['Mean Baseline Retrieval Accuracy']
print 'P val:', tstat_sym_post.pvalue



 Pre-instruction: 
 Physics Condition Accuracy By Gender
F    0.615451
M    0.717361
Name: Mean Physics Retrieval Accuracy, dtype: float64
P val: 2.3727276244532567e-05

 Pre-instruction: 
 General Condition Accuracy By Gender
F    0.884549
M    0.936111
Name: Mean General Retrieval Accuracy, dtype: float64
P val: 0.0018281842436741248

 Pre-instruction: 
 Baseline Condition Accuracy By Gender
F    0.996528
M    0.997222
Name: Mean Baseline Retrieval Accuracy, dtype: float64
P val: 0.7934899337142314

 Post-instruction: 
 Physics Condition Accuracy By Gender
F    0.717014
M    0.803472
Name: Mean Physics Retrieval Accuracy, dtype: float64
P val: 0.00015738659126521306

 Post-instruction: 
 General Condition Accuracy By Gender
F    0.919271
M    0.942361
Name: Mean General Retrieval Accuracy, dtype: float64
P val: 0.17010528270893863

 Post-instruction: 
 Baseline Condition Accuracy By Gender
F    0.993924
M    0.995833
Name: Mean Baseline Retrieval Accuracy, dtype: float64
P val: 0.47

In [26]:
## RT by Gender
# t tests
tstat_phy_pre = ttest_ind(acc_m_phy_pre['Mean Correct RT'], acc_f_phy_pre['Mean Correct RT'])
tstat_gen_pre = ttest_ind(acc_m_gen_pre['Mean Correct RT'], acc_f_gen_pre['Mean Correct RT'])
tstat_sym_pre = ttest_ind(acc_m_sym_pre['Mean Correct RT'], acc_f_sym_pre['Mean Correct RT'])
tstat_phy_post = ttest_ind(acc_m_phy_post['Mean Correct RT'], acc_f_phy_post['Mean Correct RT'])
tstat_gen_post = ttest_ind(acc_m_gen_post['Mean Correct RT'], acc_f_gen_post['Mean Correct RT'])
tstat_sym_post = ttest_ind(acc_m_sym_post['Mean Correct RT'], acc_f_sym_post['Mean Correct RT'])

print '\n Pre-instruction: \n Physics Condition Correct RT By', mean_df_phy_pre['Mean Correct RT']
print 'P val:', tstat_phy_pre.pvalue
print '\n Pre-instruction: \n General Condition Correct RT By', mean_df_gen_pre['Mean Correct RT']
print 'P val:', tstat_gen_pre.pvalue
print '\n Pre-instruction: \n Baseline Condition Correct RT By', mean_df_sym_pre['Mean Correct RT']
print 'P val:', tstat_sym_pre.pvalue

print '\n Post-instruction: \n Physics Condition Correct RT By', mean_df_phy_post['Mean Correct RT']
print 'P val:', tstat_phy_post.pvalue
print '\n Post-instruction: \n General Condition Correct RT By', mean_df_gen_post['Mean Correct RT']
print 'P val:', tstat_gen_post.pvalue
print '\n Post-instruction: \n Baseline Condition Correct RT By', mean_df_sym_post['Mean Correct RT']
print 'P val:', tstat_sym_post.pvalue




 Pre-instruction: 
 Physics Condition Correct RT By Gender
F    4372.112357
M    4283.139574
Name: Mean Correct RT, dtype: float64
P val: 0.2765259400016505

 Pre-instruction: 
 General Condition Correct RT By Gender
F    3470.814676
M    3256.815987
Name: Mean Correct RT, dtype: float64
P val: 0.016901012389541302

 Pre-instruction: 
 Baseline Condition Correct RT By Gender
F    2015.830341
M    1852.549816
Name: Mean Correct RT, dtype: float64
P val: 0.0440680386662106

 Post-instruction: 
 Physics Condition Correct RT By Gender
F    4321.633837
M    4105.983774
Name: Mean Correct RT, dtype: float64
P val: 0.010714792829146698

 Post-instruction: 
 General Condition Correct RT By Gender
F    3469.286802
M    3271.198726
Name: Mean Correct RT, dtype: float64
P val: 0.03357346670898674

 Post-instruction: 
 Baseline Condition Correct RT By Gender
F    2006.947917
M    1831.416395
Name: Mean Correct RT, dtype: float64
P val: 0.04486970094708733


In [27]:
#Shapiro-Wilk Normality Test
analysis_df_phy_pre = analysis_df_phy_full[analysis_df_phy_full['Session']=='pre']
analysis_df_phy_post = analysis_df_phy_full[analysis_df_phy_full['Session']=='post']
analysis_df_gen_pre = analysis_df_gen_full[analysis_df_gen_full['Session']=='pre']
analysis_df_gen_post = analysis_df_gen_full[analysis_df_gen_full['Session']=='post']
analysis_df_sym_pre = analysis_df_sym_full[analysis_df_sym_full['Session']=='pre']
analysis_df_sym_post = analysis_df_sym_full[analysis_df_sym_full['Session']=='post']

#print(analysis_df_sym_post.head())

cond = 'Physics Retrieval'
meas = 'Accuracy'
data1 = analysis_df_phy_pre['Mean {0} {1}'.format(cond,meas)]
data2 = analysis_df_phy_post['Mean {0} {1}'.format(cond,meas)]

cond = 'General Retrieval'
meas = 'Accuracy'
data3 = analysis_df_gen_pre['Mean {0} {1}'.format(cond,meas)]
data4 = analysis_df_gen_post['Mean {0} {1}'.format(cond,meas)]

cond = 'Baseline Retrieval'
meas = 'Accuracy'
data5 = analysis_df_sym_pre['Mean {0} {1}'.format(cond,meas)]
data6 = analysis_df_sym_post['Mean {0} {1}'.format(cond,meas)]

datalist = [data1, data2, data3, data4, data5, data6]

# normality test
for i, data in enumerate(datalist):
    stat, p = shapiro(data)
    if i == 0:
        print('Phys Retr Accuracy Pre')
    elif i == 1:
        print('Phys Retr Accuracy Post')
    elif i == 2:
        print('Gen Retr Accuracy Pre')
    elif i == 3:
        print('Gen Retr Accuracy Post')
    elif i == 4:
        print('Sym Retr Accuracy Pre')
    elif i == 5:
        print('Sym Retr Accuracy Post')
    else:
        print('something went wrong')
    print('Statistics=%.3f, p=%.3f' % (stat, p))

    # interpret
    alpha = 0.05
    if p > alpha:
        print('Sample looks Gaussian (fail to reject H0)')
    else:
        print('Sample does not look Gaussian (reject H0)')

Phys Retr Accuracy Pre
Statistics=0.971, p=0.017
Sample does not look Gaussian (reject H0)
Phys Retr Accuracy Post
Statistics=0.960, p=0.003
Sample does not look Gaussian (reject H0)
Gen Retr Accuracy Pre
Statistics=0.826, p=0.000
Sample does not look Gaussian (reject H0)
Gen Retr Accuracy Post
Statistics=0.731, p=0.000
Sample does not look Gaussian (reject H0)
Sym Retr Accuracy Pre
Statistics=0.237, p=0.000
Sample does not look Gaussian (reject H0)
Sym Retr Accuracy Post
Statistics=0.380, p=0.000
Sample does not look Gaussian (reject H0)


In [35]:
## Accuracy and RT t-tests post to pre (NOT separated by gender)

from scipy.stats import wilcoxon

# Get means for pre and post
mean_df_phy_full = analysis_df_phy_full.groupby('Session').mean()
mean_df_gen_full = analysis_df_gen_full.groupby('Session').mean()
mean_df_sym_full = analysis_df_sym_full.groupby('Session').mean()
sem_df_phy_full = analysis_df_phy_full.groupby('Session').sem()
sem_df_gen_full = analysis_df_gen_full.groupby('Session').sem()
sem_df_sym_full = analysis_df_sym_full.groupby('Session').sem()

# Accuracies
acc_phy_pre = analysis_df_phy_full[analysis_df_phy_full['Session']=='pre']
acc_gen_pre = analysis_df_gen_full[analysis_df_gen_full['Session']=='pre']
acc_sym_pre = analysis_df_sym_full[analysis_df_sym_full['Session']=='pre']

acc_phy_post = analysis_df_phy_full[analysis_df_phy_full['Session']=='post']
acc_gen_post = analysis_df_gen_full[analysis_df_gen_full['Session']=='post']
acc_sym_post = analysis_df_sym_full[analysis_df_sym_full['Session']=='post']

# Response Times
rt_phy_pre = analysis_df_phy_full[analysis_df_phy_full['Session']=='pre']
rt_gen_pre = analysis_df_gen_full[analysis_df_gen_full['Session']=='pre']
rt_sym_pre = analysis_df_sym_full[analysis_df_sym_full['Session']=='pre']

rt_phy_post = analysis_df_phy_full[analysis_df_phy_full['Session']=='post']
rt_gen_post = analysis_df_gen_full[analysis_df_gen_full['Session']=='post']
rt_sym_post = analysis_df_sym_full[analysis_df_sym_full['Session']=='post']

# t tests (for normal data)
tstat_phy_acc = ttest_rel(acc_phy_pre['Mean Physics Retrieval Accuracy'], acc_phy_post['Mean Physics Retrieval Accuracy'])
tstat_gen_acc = ttest_rel(acc_gen_pre['Mean General Retrieval Accuracy'], acc_gen_post['Mean General Retrieval Accuracy'])
tstat_sym_acc = ttest_rel(acc_sym_pre['Mean Baseline Retrieval Accuracy'], acc_sym_post['Mean Baseline Retrieval Accuracy'])

tstat_phy_rt = ttest_rel(rt_phy_pre['Mean Correct RT'], rt_phy_post['Mean Correct RT'])
tstat_gen_rt = ttest_rel(rt_gen_pre['Mean Correct RT'], rt_gen_post['Mean Correct RT'])
tstat_sym_rt = ttest_rel(rt_sym_pre['Mean Correct RT'], rt_sym_post['Mean Correct RT'])

# Wilcoxon rank-sum test (for not normal data)
data1, data2 = acc_phy_pre['Mean Physics Retrieval Accuracy'], acc_phy_post['Mean Physics Retrieval Accuracy']
wstat_phy_acc, p_phy_acc = wilcoxon(data1, data2)
data3, data4 = acc_gen_pre['Mean General Retrieval Accuracy'], acc_gen_post['Mean General Retrieval Accuracy']
wstat_gen_acc, p_gen_acc = wilcoxon(data3, data4)
data5, data6 = acc_sym_pre['Mean Baseline Retrieval Accuracy'], acc_sym_post['Mean Baseline Retrieval Accuracy']
wstat_sym_acc, p_sym_acc = wilcoxon(data5, data6)

print '\n Pre- to Post-Instruction Accuracy: \n Physics Retrieval Condition By', mean_df_phy_full['Mean Physics Retrieval Accuracy']
print 'T-Test P val:', tstat_phy_acc.pvalue
print 'Wilcoxon P val:', tstat_phy_acc.pvalue
print '\n Pre- to Post-Instruction Accuracy SEM: \n Physics Retrieval Condition By', sem_df_phy_full['Mean Physics Retrieval Accuracy']
#print '\n Pre- to Post-Instruction RT: \n Physics Retrieval Condition By', mean_df_phy_full['Mean Correct RT']
#print 'P val:', tstat_phy_rt.pvalue

print '\n Pre- to Post-Instruction Accuracy: \n General Retrieval Condition By', mean_df_gen_full['Mean General Retrieval Accuracy']
print 'T-Test P val:', tstat_gen_acc.pvalue
print 'Wilcoxon P val:', tstat_gen_acc.pvalue
print '\n Pre- to Post-Instruction Accuracy SEM: \n General Retrieval Condition By', sem_df_gen_full['Mean General Retrieval Accuracy']
#print '\n Pre- to Post-Instruction RT: \n General Retrieval Condition By', mean_df_gen_full['Mean Correct RT']
#print 'P val:', tstat_gen_rt.pvalue

print '\n Pre- to Post-Instruction Accuracy: \n Baseline Retrieval Condition By', mean_df_sym_full['Mean Baseline Retrieval Accuracy']
print 'T-Test P val:', tstat_sym_acc.pvalue
print 'Wilcoxon P val:', tstat_sym_acc.pvalue
print '\n Pre- to Post-Instruction Accuracy SEM: \n Baseline Retrieval Condition By', sem_df_sym_full['Mean Baseline Retrieval Accuracy']
#print '\n Pre- to Post-Instruction RT: \n Baseline Retrieval Condition By', mean_df_sym_full['Mean Correct RT']
#print 'P val:', tstat_sym_rt.pvalue


 Pre- to Post-Instruction Accuracy: 
 Physics Retrieval Condition By Session
post    0.765046
pre     0.672068
Name: Mean Physics Retrieval Accuracy, dtype: float64
T-Test P val: 4.789541503740517e-14
Wilcoxon P val: 4.789541503740517e-14

 Pre- to Post-Instruction Accuracy SEM: 
 Physics Retrieval Condition By Session
post    0.011673
pre     0.012404
Name: Mean Physics Retrieval Accuracy, dtype: float64

 Pre- to Post-Instruction Accuracy: 
 General Retrieval Condition By Session
post    0.932099
pre     0.913194
Name: Mean General Retrieval Accuracy, dtype: float64
T-Test P val: 0.0039943567904733935
Wilcoxon P val: 0.0039943567904733935

 Pre- to Post-Instruction Accuracy SEM: 
 General Retrieval Condition By Session
post    0.008342
pre     0.008351
Name: Mean General Retrieval Accuracy, dtype: float64

 Pre- to Post-Instruction Accuracy: 
 Baseline Retrieval Condition By Session
post    0.994985
pre     0.996914
Name: Mean Baseline Retrieval Accuracy, dtype: float64
T-Test P val

In [36]:
# copied over from fci beahv script need to edit to fit retr df's
#want new df with acc + gender, annotated by pre + post

# rename columns in demographics df, remove unmatched subjects, and drop unneeded cols beofre merge
df_gen_class = df_gen.rename(columns={'PID': 'Subject', 'Group': 'Class'})
df_gen_class.drop('Cmp.Stdy', axis=1, inplace=True)
df_gen_class['Class'] = df_gen_class['Class'].map({'Trad': 'Lecture', 'Mod': 'Modeling'})
df_gen_class['Gender And Class'] = df_gen_class['Class'] + ' ' + df_gen_class['Gender']
matched_id_list = post_df_phy['Subject'].tolist()
df_gen_class = df_gen_class[df_gen_class['Subject'].isin(matched_id_list)]
df_gen_class['Subject'] = df_gen_class['Subject'].astype('str')
#print df_gen_class.head()

# merge phys condition df with class and gender df
df_acc_phy = pd.merge(full_df_phy, df_gen_class, on=['Subject'])
df_acc_pre_phy = df_acc_phy[df_acc_phy['Session'] == 'pre']
df_acc_post_phy = df_acc_phy[df_acc_phy['Session'] == 'post']
df_wide = pd.merge(df_acc_pre_phy,df_acc_post_phy,on=['Subject'],suffixes=('_pre','_post'))
df_wide = df_wide.rename(index=str, columns={"Gender_pre": "Gender", "Class_pre": "Class", 
                                             "Gender And Class_pre": "Gender And Class"})
df_wide = df_wide.drop(['Gender_post', 'Class_post', 'Gender And Class_post', 
                        'index_pre', 'index_post'], axis=1)
#print df_wide.head()

df_retr_phy_ac = df_wide.melt(id_vars='Gender', value_vars=['Mean Physics Retrieval Accuracy_pre', 
                                                            'Mean Physics Retrieval Accuracy_post'],
                 var_name='Physics Retrieval Accuracy')
df_retr_phy_ac.replace(to_replace='Mean Physics Retrieval Accuracy_pre', value='Pre', inplace=True)
df_retr_phy_ac.replace(to_replace='Mean Physics Retrieval Accuracy_post', value='Post', inplace=True)

#the following is still copied over from the fci behav script. will need to edit.
# df_control_ac = df_wide.melt(id_vars='Gender', value_vars=['Mean Control Accuracy_pre', 'Mean Control Accuracy_post'],
#                  var_name='Control Accuracy')
# df_control_ac.replace(to_replace='Mean Control Accuracy_pre', value='Pre', inplace=True)
# df_control_ac.replace(to_replace='Mean Control Accuracy_post', value='Post', inplace=True)

# df_fci_rt = df_wide.melt(id_vars='Gender', value_vars=['Mean FCI RT_pre', 'Mean FCI RT_post'],
#                  var_name='FCI Response Time')
# df_fci_rt.replace(to_replace='Mean FCI RT_pre', value='Pre', inplace=True)
# df_fci_rt.replace(to_replace='Mean FCI RT_post', value='Post', inplace=True)

# df_control_rt = df_wide.melt(id_vars='Gender', value_vars=['Mean Control RT_pre', 'Mean Control RT_post'],
#                  var_name='Control Response Time')
# df_control_rt.replace(to_replace='Mean Control RT_pre', value='Pre', inplace=True)
# df_control_rt.replace(to_replace='Mean Control RT_post', value='Post', inplace=True)

df_retr_phy_ac.tail()

Unnamed: 0,Gender,Physics Retrieval Accuracy,value
211,M,Post,0.708333
212,M,Post,0.791667
213,F,Post,0.625
214,M,Post,0.916667
215,F,Post,0.875


In [37]:
sns.set_style(style='whitegrid')
#print sns.crayons

crayons = sns.crayon_palette(['Fuchsia', 'Fern'])
class_crayons = sns.crayon_palette(['Copper', 'Cornflower'])
dark_crayons = sns.crayon_palette(['Royal Purple', 'Tropical Rain Forest'])
class_dark_crayons = sns.crayon_palette(['Raw Sienna', 'Indigo'])

fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(10, 5), sharex=False, sharey=True)

linewidth = 0.95

sns.swarmplot(x="Physics Retrieval Accuracy", y="value", hue="Gender", data=df_retr_phy_ac, 
              palette=crayons, ax=ax[0], zorder=1)
sns.pointplot(x="Physics Retrieval Accuracy", y="value", hue="Gender", data=df_retr_phy_ac, 
              palette=dark_crayons, ax=ax[0], zorder=100, dodge=False, 
              scale=linewidth, errwidth=linewidth+2,
              markers='_')

# sns.swarmplot(x="Control Accuracy", y="value", hue="Gender", data=df_control_ac, 
#               palette=crayons, ax=ax[1], zorder=1)
# sns.pointplot(x="Control Accuracy", y="value", hue="Gender", data=df_control_ac, 
#               palette=dark_crayons, ax=ax[1], zorder=100, dodge=False, 
#               scale=linewidth, errwidth=linewidth+2,
#               markers='_')

plt.title('Gender Differences Across Instruction')

#fig.savefig(path.join(basedir, plotdir, 'mean_fci+control_acc_gender_swarmplot.png'), dpi=400)

NameError: name 'sns' is not defined

In [None]:
#not working
df_acc_pre_gender = df_acc_pre.melt(id_vars='Gender', value_vars=['Mean FCI Accuracy', 'Mean Control Accuracy'], 
                            var_name='Accuracy_Pre')
df_acc_post_gender = df_acc_post.melt(id_vars='Gender', value_vars=['Mean FCI Accuracy', 'Mean Control Accuracy'], 
                            var_name='Accuracy_Post')
df_rt_pre_gender = df_acc_pre.melt(id_vars='Gender', value_vars=['Mean FCI RT', 'Mean Control RT'], 
                            var_name='RT_Pre')
df_rt_post_gender = df_acc_post.melt(id_vars='Gender', value_vars=['Mean FCI RT', 'Mean Control RT'], 
                            var_name='RT_Post')

df_rt_post_gender.head()