In [1]:
import pandas as pd

# The file's own header row contains stray spaces (e.g. u' degree'), so
# header=0 marks that row as the header to be replaced and `names` supplies
# clean column labels in its place.
data = pd.read_csv(
    'faculty.csv',
    header=0,
    names=['name', 'degree', 'title', 'email'],
)
data.head()

Unnamed: 0,name,degree,title,email
0,Scarlett L. Bellamy,Sc.D.,Associate Professor of Biostatistics,bellamys@mail.med.upenn.edu
1,Warren B. Bilker,Ph.D.,Professor of Biostatistics,warren@upenn.edu
2,Matthew W Bryan,PhD,Assistant Professor of Biostatistics,bryanma@upenn.edu
3,Jinbo Chen,Ph.D.,Associate Professor of Biostatistics,jinboche@upenn.edu
4,Susan S Ellenberg,Ph.D.,Professor of Biostatistics,sellenbe@upenn.edu


In [2]:
# Summary of the raw degree column: count, number of distinct values, mode.
data['degree'].describe()

count         37
unique        11
top        Ph.D.
freq          15
Name: degree, dtype: object

In [3]:
# List the distinct raw degree strings to see how inconsistently they are
# written before any cleaning is applied.
data['degree'].unique()

array([' Sc.D.', 'Ph.D.', ' PhD', ' Ph.D.', ' Ph.D', ' MD MPH Ph.D',
       ' B.S.Ed. M.S. Ph.D.', ' JD MA MPH MS PhD', ' PhD ScD', '0', ' ScD'], dtype=object)

In [4]:
# Normalise the degree text: drop every period first, then trim surrounding
# whitespace, so spellings such as ' Ph.D.' and ' PhD' both become 'PhD'.
data['degree'] = data['degree'].map(lambda raw: raw.replace('.', '').strip())
data['degree'].unique()

array(['ScD', 'PhD', 'MD MPH PhD', 'BSEd MS PhD', 'JD MA MPH MS PhD',
       'PhD ScD', '0'], dtype=object)

In [5]:
# Preview the cleaned table; head(10) keeps the cell output short instead of
# rendering every row of the frame.
data.head(10)

Unnamed: 0,name,degree,title,email
0,Scarlett L. Bellamy,ScD,Associate Professor of Biostatistics,bellamys@mail.med.upenn.edu
1,Warren B. Bilker,PhD,Professor of Biostatistics,warren@upenn.edu
2,Matthew W Bryan,PhD,Assistant Professor of Biostatistics,bryanma@upenn.edu
3,Jinbo Chen,PhD,Associate Professor of Biostatistics,jinboche@upenn.edu
4,Susan S Ellenberg,PhD,Professor of Biostatistics,sellenbe@upenn.edu
5,Jonas H. Ellenberg,PhD,Professor of Biostatistics,jellenbe@mail.med.upenn.edu
6,Rui Feng,PhD,Assistant Professor of Biostatistics,ruifeng@upenn.edu
7,Benjamin C. French,PhD,Associate Professor of Biostatistics,bcfrench@mail.med.upenn.edu
8,Phyllis A. Gimotty,PhD,Professor of Biostatistics,pgimotty@upenn.edu
9,Wensheng Guo,PhD,Professor of Biostatistics,wguo@mail.med.upenn.edu


In [6]:
from collections import defaultdict
# Tally how often each individual degree occurs across the faculty.  One
# person may hold several degrees in a single space-separated string
# (e.g. 'MD MPH PhD'), so each string is split into tokens and every token
# is counted once.
unique_degrees = defaultdict(int)

for deg_raw in data.degree:
    # split() with no argument splits on any run of whitespace, so repeated
    # or stray spaces (or an empty string) never yield an empty '' token
    # that would be counted as a degree, as split(' ') would.
    for deg in deg_raw.split():
        unique_degrees[deg] += 1

# NOTE(review): the value '0' looks like a placeholder for a missing degree
# and is still counted as a degree here -- confirm whether to exclude it.
unique_degrees

defaultdict(int,
            {'0': 1,
             'BSEd': 1,
             'JD': 1,
             'MA': 1,
             'MD': 1,
             'MPH': 2,
             'MS': 2,
             'PhD': 31,
             'ScD': 6})

In [7]:
# Turn the {degree: count} mapping into a one-column table indexed by degree,
# label that column 'freq', and order the rows from most to least common.
freq_table = pd.DataFrame.from_dict(unique_degrees, orient='index')
freq_table.columns = ['freq']
degree_freqs = freq_table.sort_values(by='freq', ascending=False)
degree_freqs

Unnamed: 0,freq
PhD,31
ScD,6
MPH,2
MS,2
MD,1
MA,1
BSEd,1
0,1
JD,1


In [9]:
# Report the total number of degrees held versus the number of people.
# Written as a parenthesised call so it runs on both Python 2 and Python 3
# (the bare `print "..."` statement is a SyntaxError on Python 3).
print("there are %i degrees total, for %i people" % (degree_freqs.freq.sum(), len(data)))

there are 46 degrees total, for 37 people


In [14]:
# Repair titles that contain ' is ' where ' of ' belongs (e.g. a title written
# 'Professor is Biostatistics'), so all titles share the same wording.
data['title'] = data['title'].map(lambda title: title.replace(' is ', ' of '))

In [17]:
# Frequency of each distinct title, most common first.
data['title'].value_counts()

Professor of Biostatistics              13
Associate Professor of Biostatistics    12
Assistant Professor of Biostatistics    12
Name: title, dtype: int64

In [21]:
# Print every e-mail address, one per line.  print(...) with a single
# argument behaves the same on Python 2 and Python 3, unlike the bare
# `print email` statement, which is a SyntaxError on Python 3.
for email in data.email.values:
    print(email)

bellamys@mail.med.upenn.edu
warren@upenn.edu
bryanma@upenn.edu
jinboche@upenn.edu
sellenbe@upenn.edu
jellenbe@mail.med.upenn.edu
ruifeng@upenn.edu
bcfrench@mail.med.upenn.edu
pgimotty@upenn.edu
wguo@mail.med.upenn.edu
hsu9@mail.med.upenn.edu
rhubb@mail.med.upenn.edu
whwang@mail.med.upenn.edu
mjoffe@mail.med.upenn.edu
jrlandis@mail.med.upenn.edu
liy3@email.chop.edu
mingyao@mail.med.upenn.edu
hongzhe@upenn.edu
rlocalio@upenn.edu
nanditam@mail.med.upenn.edu
knashawn@mail.med.upenn.edu
propert@mail.med.upenn.edu
mputt@mail.med.upenn.edu
sratclif@upenn.edu
michross@upenn.edu
jaroy@mail.med.upenn.edu
msammel@cceb.med.upenn.edu
shawp@upenn.edu
rshi@mail.med.upenn.edu
hshou@mail.med.upenn.edu
jshults@mail.med.upenn.edu
alisaste@mail.med.upenn.edu
atroxel@mail.med.upenn.edu
rxiao@mail.med.upenn.edu
sxie@mail.med.upenn.edu
dxie@upenn.edu
weiyang@mail.med.upenn.edu


In [61]:
import re

# Collect the distinct e-mail domains: the text following the first '@' in
# each address.
unique_email_domains = {re.findall(r'@(.+)', email)[0] for email in data.email.values}

# Markdown-formatted bulleted list.  A set has no defined iteration order, so
# the domains are sorted to give the same output on every run.  print(...) is
# used so the cell runs on both Python 2 and Python 3.
for dom in sorted(unique_email_domains):
    print('* ' + dom)

* email.chop.edu
* upenn.edu
* cceb.med.upenn.edu
* mail.med.upenn.edu
