In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [82]:
pre = pd.read_csv('anon_pre.csv')
post = pd.read_csv('anon_post.csv')
cis = pd.read_csv('anon_cis.csv')
matched = pre.set_index('anon_student_id').join(post.set_index('anon_student_id'), rsuffix='_post').dropna(subset=['q01a', 'q01a_post']).reset_index()

In [159]:
print('----Overview----')
print('Total number of times survey has been offered everywhere:', cis.shape[0])
print('Number of unique universities in data set:', cis.anon_university_id.nunique())

print('Number of unique instructors in data set:', cis.anon_instructor_id.nunique())

# this works becuase students were given anonymous IDs sequentially starting from 0
print('Number of unique students in data set (unmatched):', post.anon_student_id.max())

# this is checking if the students answered at least the first question of both pre and post and then calling that matched.
print('Number of unique students in data set (matched):', matched.index.nunique())

print('Number of instructors with repeated surveys:', cis.groupby('anon_instructor_id').count().groupby('Q5').count().Q52.iloc[1:].sum())

print('Number of universities with repeated surveys:', cis.groupby('anon_university_id').count().groupby('Q5').count().Q52.iloc[1:].sum())

print('\n')

print('----Gender----')
# unmatched_gender = post.groupby('Q54').count().anon_student_id
unmatched_gender = post[['anon_student_id', 'Q54']].drop_duplicates().groupby('Q54').count().anon_student_id
print('Number of females in data set (unmatched):', unmatched_gender[1])
print('Number of males in data set (unmatched):', unmatched_gender[2])
print('Number of other in data set (unmatched):', unmatched_gender[3])
print('\n')

# matched_gender = matched.groupby('Q54').count().anon_student_id
matched_gender = matched[['anon_student_id', 'Q54']].drop_duplicates().groupby('Q54').count().anon_student_id
print('Number of females in data set (matched):', matched_gender[1])
print('Number of males in data set (matched):', matched_gender[2])
print('Number of other in data set (matched):', matched_gender[3])
print('\n')


print('----Race (unmatched)----')
unmatched_race = post[['anon_student_id', 'Q52_1', 'Q52_2', 'Q52_3', 'Q52_4', 'Q52_5', 'Q52_6', 'Q52_7']].drop_duplicates()
print('American Indian or Alaska Native:', unmatched_race.Q52_1.sum())
print('Asian:', unmatched_race.Q52_2.sum())
print('Black or African American:', unmatched_race.Q52_3.sum())
print('Hispanic/Latino:', unmatched_race.Q52_4.sum())
print('Native Hawaiian or other Pacific Islander:', unmatched_race.Q52_5.sum())
print('White:', unmatched_race.Q52_6.sum())
print('Other race/ethnicity:', unmatched_race.Q52_7.sum())
s = unmatched_race[['Q52_1', 'Q52_2', 'Q52_3', 'Q52_4', 'Q52_5', 'Q52_6', 'Q52_7']].sum(axis=1)
print('More than one race:', s.where(s > 1).count())
print('\n')

print('----Race (matched)----')
matched_race = matched[['anon_student_id', 'Q52_1', 'Q52_2', 'Q52_3', 'Q52_4', 'Q52_5', 'Q52_6', 'Q52_7']].drop_duplicates()
print('American Indian or Alaska Native:', matched_race.Q52_1.sum())
print('Asian:', matched_race.Q52_2.sum())
print('Black or African American:', matched_race.Q52_3.sum())
print('Hispanic/Latino:', matched_race.Q52_4.sum())
print('Native Hawaiian or other Pacific Islander:', matched_race.Q52_5.sum())
print('White:', matched_race.Q52_6.sum())
print('Other race/ethnicity:', matched_race.Q52_7.sum())
s = matched_race[['Q52_1', 'Q52_2', 'Q52_3', 'Q52_4', 'Q52_5', 'Q52_6', 'Q52_7']].sum(axis=1)
print('More than one race:', s.where(s > 1).count())
print('\n')

print('----Course information----')
course_type = cis.groupby('Q18')

print('Number of times survey given to students in first year lab:', course_type.count().Q5.loc['First year (introductory) lab'])
print('Number of times survey given to students in BUFFY lab courses:', course_type.count().Q5.loc['Beyond the first year lab'])

## haha, i guess its not a good assumption to think its normally distributed course enrollment numbers
print('Average =/-std students in first year lab:', course_type.mean().Q19.loc['First year (introductory) lab'].round(2), '+/-',course_type.std().Q19.loc['First year (introductory) lab'].round(2))
print('Average =/-std students in BUFFY lab courses:', course_type.mean().Q19.loc['Beyond the first year lab'].round(2), '+/-', course_type.std().Q19.loc['Beyond the first year lab'].round(2))

course_type = cis.groupby('Q27')
print('Algebra based intro courses:', course_type.count().Q5.loc['Algebra-based'])
print('Calculus based intro courses:', course_type.count().Q5.loc['Calculus-based'])

----Overview----
Total number of times survey has been offered everywhere: 599
Number of unique universities in data set: 133
Number of unique instructors in data set: 204
Number of unique students in data set (unmatched): 42424
Number of unique students in data set (matched): 45287
Number of instructors with repeated surveys: 100
Number of universities with repeated surveys: 80


----Gender----
Number of females in data set (unmatched): 11545
Number of males in data set (unmatched): 16054
Number of other in data set (unmatched): 311


Number of females in data set (matched): 8603
Number of males in data set (matched): 12118
Number of other in data set (matched): 249


----Race (unmatched)----
American Indian or Alaska Native: 309.0
Asian: 6559.0
Black or African American: 1987.0
Hispanic/Latino: 2591.0
Native Hawaiian or other Pacific Islander: 275.0
White: 16249.0
Other race/ethnicity: 840.0
More than one race: 2093


----Race (matched)----
American Indian or Alaska Native: 215.0
Asi

In [157]:
course_type = cis.groupby('Q27')
print(course_type.count().Q5)


Q27
Algebra-based     141
Calculus-based    222
Name: Q5, dtype: int64


In [12]:
post.anon_student_id.max()

42424

In [79]:
matched.reset_index().groupby('Q54').count()

Unnamed: 0_level_0,anon_student_id,survey_id,duration,q01a,q01b,q27a,q27b,q03a,q03b,q22a,...,q28c,q29a_post,q29b_post,q30a_post,q30b_post,q30c,q31a_post,q31b_post,q40a_post,q40b_post
Q54,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,17897,17897,17897,17897,17863,17879,17856,17872,17859,17863,...,17841,17890,17880,17870,17853,17866,17896,17873,17897,17897
2.0,25945,25945,25945,25945,25902,25904,25845,25895,25864,25900,...,25843,25900,25865,25905,25868,25880,25933,25894,25945,25945
3.0,515,515,515,515,515,515,515,514,515,514,...,505,512,497,513,505,511,513,503,515,515


In [81]:
matched.reset_index()

Unnamed: 0,anon_student_id,survey_id,duration,q01a,q01b,q27a,q27b,q03a,q03b,q22a,...,q28c,q29a_post,q29b_post,q30a_post,q30b_post,q30c,q31a_post,q31b_post,q40a_post,q40b_post
0,5,0NBvjWEubiZYgcJ,783.0,5.0,5.0,3.0,5.0,5.0,5.0,1.0,...,5.0,5.0,5.0,5.0,5.0,3.0,5.0,5.0,4.0,4.0
1,6,0NBvjWEubiZYgcJ,488.0,5.0,5.0,3.0,3.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0,4.0,4.0
2,7,0NBvjWEubiZYgcJ,393.0,5.0,,5.0,5.0,5.0,5.0,5.0,...,4.0,5.0,5.0,5.0,5.0,1.0,5.0,5.0,4.0,4.0
3,10,0NBvjWEubiZYgcJ,1472.0,5.0,5.0,3.0,5.0,5.0,5.0,1.0,...,4.0,5.0,5.0,3.0,1.0,5.0,5.0,5.0,4.0,4.0
4,11,0NBvjWEubiZYgcJ,364.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,4.0,1.0,5.0,1.0,5.0,5.0,5.0,5.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45282,35385,dpp1pzjQkszbz6Z,196.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,,3.0,3.0,3.0,3.0,,3.0,5.0,4.0,4.0
45283,35386,dpp1pzjQkszbz6Z,515.0,5.0,5.0,5.0,5.0,1.0,5.0,3.0,...,4.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0,4.0,4.0
45284,35387,dpp1pzjQkszbz6Z,499.0,5.0,1.0,1.0,1.0,5.0,1.0,1.0,...,5.0,5.0,5.0,1.0,1.0,4.0,5.0,5.0,4.0,4.0
45285,35390,dpp1pzjQkszbz6Z,534.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,...,5.0,5.0,5.0,1.0,1.0,5.0,5.0,5.0,4.0,4.0
