In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
pre = pd.read_csv('anon_pre.csv')
post = pd.read_csv('anon_post.csv')
cis = pd.read_csv('anon_cis.csv')
# matched = pre.set_index('anon_student_id').join(post.set_index('anon_student_id'), rsuffix='_post').dropna(subset=['q01a', 'q01a_post']).reset_index()
# pre.set_index(['anon_student_id', 'ResponseId']).join(post.set_index(['anon_student_id', 'ResponseId']))

# pre['anon_student_id'] = pre.anon_student_id.astype('category')
# pre['ResponseId'] = pre.ResponseId.astype('category')

# post['anon_student_id'] = post.anon_student_id.astype('category')
# post['ResponseId'] = post.ResponseId.astype('category')

matched = pre.merge(post, on=['anon_student_id', 'ResponseId'], suffixes=['_pre', '_post'])

In [7]:
print('----Overview----')
print('Total number of instances of the survey everywhere:', cis.shape[0])
print('Number of unique universities in data set:', cis.anon_university_id.nunique())

print('Number of unique instructors in data set:', cis.anon_instructor_id.nunique())

# this works becuase students were given anonymous IDs sequentially starting from 0
print('Number of unique students in data set (unmatched):', post.anon_student_id.max())

# this is checking if the students answered at least the first question of both pre and post and then calling that matched.
print('Number of unique students in data set (matched):', matched.index.nunique())

print('Number of instructors with repeated surveys:', cis.groupby('anon_instructor_id').count().groupby('Q5').count().Q52.iloc[1:].sum())

print('Number of universities with repeated surveys:', cis.groupby('anon_university_id').count().groupby('Q5').count().Q52.iloc[1:].sum())

print('\n')
pre_completion = pre.groupby('survey_id').count().anon_student_id.reset_index()
pre_completion.columns = ['pre_survey_id', 'pre_cnt']
post_completion = post.groupby('survey_id').count().anon_student_id.reset_index()
post_completion.columns = ['post_survey_id', 'post_cnt']
cisq19 = cis[['Q19', 'pre_survey_id', 'post_survey_id']]
pre19 = cisq19.merge(pre_completion)
post19 = cisq19.merge(post_completion)

matched_cnt = pre.set_index('anon_student_id').join(post.set_index('anon_student_id'), rsuffix='_post').dropna(subset=['q01a', 'q01a_post']).groupby('survey_id').count().duration.reset_index()
matched_cnt.columns = ['pre_survey_id', 'cnt']
matched_cnt = cis.merge(matched_cnt)[['pre_survey_id', 'Q19', 'cnt']]


print('Number of surveys with more student respondents than reported maximum registered students (PRE):', (pre19.pre_cnt/pre19.Q19).apply(lambda x: x if x > 1 else np.nan).dropna().shape[0])
print('Number of surveys with more student respondents than reported maximum registered students (POST):', (post19.post_cnt/post19.Q19).apply(lambda x: x if x > 1 else np.nan).dropna().shape[0])

print('Number of surveys with more student respodnents than repored maximum registered students (MATCHED):', (matched_cnt.cnt/matched_cnt.Q19).apply(lambda x: x if x > 1 else np.nan).dropna().shape[0])

print('Average fraction +\- STDEV of course completing the PRE survey:', round((pre19.pre_cnt/pre19.Q19).apply(lambda x: x if x <= 1 else np.nan).dropna().mean(), 2), '+/-', round((pre19.pre_cnt/pre19.Q19).apply(lambda x: x if x <= 1 else np.nan).dropna().std(), 2)
)
print('Average +\- STDEV fraction of course completing the POST survey:', round((post19.post_cnt/post19.Q19).apply(lambda x: x if x <= 1 else np.nan).dropna().mean(), 2), '+/-', round((post19.post_cnt/post19.Q19).apply(lambda x: x if x <= 1 else np.nan).dropna().std(), 2))
print('Average +\- STDEV fraction of course completing the MATCHED survey:', round((matched_cnt.cnt/matched_cnt.Q19).apply(lambda x: x if x <= 1 else np.nan).dropna().mean(), 2), '+/-', round((matched_cnt.cnt/matched_cnt.Q19).apply(lambda x: x if x <= 1 else np.nan).dropna().std(), 2))

print('\n')

print('----Gender----')
# unmatched_gender = post.groupby('Q54').count().anon_student_id
unmatched_gender = post[['anon_student_id', 'Q54']].drop_duplicates().groupby('Q54').count().anon_student_id
print('Number of females in data set (unmatched):', unmatched_gender[1])
print('Number of males in data set (unmatched):', unmatched_gender[2])
print('Number of other in data set (unmatched):', unmatched_gender[3])
print('\n')

# matched_gender = matched.groupby('Q54').count().anon_student_id
matched_gender = matched[['anon_student_id', 'Q54']].drop_duplicates().groupby('Q54').count().anon_student_id
print('Number of females in data set (matched):', matched_gender[1])
print('Number of males in data set (matched):', matched_gender[2])
print('Number of other in data set (matched):', matched_gender[3])
print('\n')


print('----Race (unmatched)----')
unmatched_race = post[['anon_student_id', 'Q52_1', 'Q52_2', 'Q52_3', 'Q52_4', 'Q52_5', 'Q52_6', 'Q52_7']].drop_duplicates()
print('American Indian or Alaska Native:', unmatched_race.Q52_1.sum())
print('Asian:', unmatched_race.Q52_2.sum())
print('Black or African American:', unmatched_race.Q52_3.sum())
print('Hispanic/Latino:', unmatched_race.Q52_4.sum())
print('Native Hawaiian or other Pacific Islander:', unmatched_race.Q52_5.sum())
print('White:', unmatched_race.Q52_6.sum())
print('Other race/ethnicity:', unmatched_race.Q52_7.sum())
s = unmatched_race[['Q52_1', 'Q52_2', 'Q52_3', 'Q52_4', 'Q52_5', 'Q52_6', 'Q52_7']].sum(axis=1)
print('More than one race:', s.where(s > 1).count())
print('\n')

print('----Race (matched)----')
matched_race = matched[['anon_student_id', 'Q52_1', 'Q52_2', 'Q52_3', 'Q52_4', 'Q52_5', 'Q52_6', 'Q52_7', 'race_unknown']].drop_duplicates()
print('American Indian or Alaska Native:', matched_race.Q52_1.sum())
print('Asian:', matched_race.Q52_2.sum())
print('Black or African American:', matched_race.Q52_3.sum())
print('Hispanic/Latino:', matched_race.Q52_4.sum())
print('Native Hawaiian or other Pacific Islander:', matched_race.Q52_5.sum())
print('White:', matched_race.Q52_6.sum())
print('Other race/ethnicity:', matched_race.Q52_7.sum())
s = matched_race[['Q52_1', 'Q52_2', 'Q52_3', 'Q52_4', 'Q52_5', 'Q52_6', 'Q52_7']].sum(axis=1)
print('More than one race:', s.where(s > 1).count())
print('Race Unknown:', matched_race.race_unknown.sum())
print('\n')

print('----Course information----')
course_type = cis.groupby('Q18')

print('Number of times survey given to students in first year lab:', course_type.count().Q5.loc['First year (introductory) lab'])
print('Number of times survey given to students in BUFFY lab courses:', course_type.count().Q5.loc['Beyond the first year lab'])

## haha, i guess its not a good assumption to think its normally distributed course enrollment numbers
print('Average +/- STDEV students in first year lab:', course_type.mean().Q19.loc['First year (introductory) lab'].round(2), '+/-',course_type.std().Q19.loc['First year (introductory) lab'].round(2))
print('Average +/- STDEV students in BUFFY lab courses:', course_type.mean().Q19.loc['Beyond the first year lab'].round(2), '+/-', course_type.std().Q19.loc['Beyond the first year lab'].round(2))

course_type = cis.groupby('Q27')
print('Algebra based intro courses:', course_type.count().Q5.loc['Algebra-based'])
print('Calculus based intro courses:', course_type.count().Q5.loc['Calculus-based'])

----Overview----
Total number of instances of the survey everywhere: 599
Number of unique universities in data set: 133
Number of unique instructors in data set: 204
Number of unique students in data set (unmatched): 43080
Number of unique students in data set (matched): 23005
Number of instructors with repeated surveys: 100
Number of universities with repeated surveys: 80


Number of surveys with more student respondents than reported maximum registered students (PRE): 32
Number of surveys with more student respondents than reported maximum registered students (POST): 17
Number of surveys with more student respodnents than repored maximum registered students (MATCHED): 51
Average fraction +\- STDEV of course completing the PRE survey: 0.71 +/- 0.25
Average +\- STDEV fraction of course completing the POST survey: 0.58 +/- 0.25
Average +\- STDEV fraction of course completing the MATCHED survey: 0.49 +/- 0.27


----Gender----
Number of females in data set (unmatched): 11620
Number of mal

In [4]:
matched_cnt = pre.set_index('anon_student_id').join(post.set_index('anon_student_id'), rsuffix='_post').dropna(subset=['q01a', 'q01a_post']).groupby('survey_id').count().duration.reset_index()
matched_cnt.columns = ['pre_survey_id', 'cnt']
matched_cnt = cis.merge(matched_cnt)[['pre_survey_id', 'Q19', 'cnt']]
round((matched_cnt.cnt/matched_cnt.Q19).apply(lambda x: x if x <= 1 else np.nan).dropna().mean(), 2)

0.49

In [5]:
pre_completion = pre.groupby('survey_id').count().anon_student_id.reset_index()
pre_completion.columns = ['pre_survey_id', 'pre_cnt']
post_completion = post.groupby('survey_id').count().anon_student_id.reset_index()
post_completion.columns = ['post_survey_id', 'post_cnt']
cisq19 = cis[['Q19', 'pre_survey_id', 'post_survey_id']]
# cisq19.set_index('pre_survey_id').join(pre_completion.set_index('post_survey_id'))
# cisq19.merge(pre_completion).merge(post_completion)
pre19 = cisq19.merge(pre_completion)
# (pre19.pre_cnt/pre19.Q19).apply(lambda x: x if x ==2 else np.nan).dropna()
(pre19.pre_cnt/pre19.Q19).apply(lambda x: x if x <= 1 else np.nan).dropna().std()

0.24913458710907008

In [6]:
matched.groupby('anon_student_id').count().sort_values('survey_id')

KeyError: 'survey_id'

In [None]:
matched[matched.anon_student_id==1950]

In [None]:
cis[cis.pre_survey_id=='3scq3Lugl5QdmER']

In [None]:
pre[pre.survey_id=='2iDeu0O3EPqxT4V'].anon_student_id

In [None]:
cis[cis.pre_survey_id=='2iDeu0O3EPqxT4V']

In [None]:
pre19.loc[268]

In [None]:
cis[cis.pre_survey_id=='6FQgPIcTNGjmkSx']

In [None]:
pre[pre.survey_id=='6FQgPIcTNGjmkSx']

In [None]:
pre.shape

In [None]:
pre.drop_duplicates()

In [None]:
post.shape

In [None]:
post.drop_duplicates()