# using categoricals to represent object data

In [1]:
# Limit DataFrame reprs to 12 rows so large frames display in truncated form.
# The fully-qualified option name is used because the short pattern 'max_rows'
# is resolved by regex and becomes ambiguous when more than one option matches.
pd.set_option('display.max_rows', 12)

# in memory

In [2]:
# Build two 500,000-row frames holding the same five strings repeated:
# one with an object-dtype column, one with a categorical column.
N_REPEATS = 100000  # number of times the five base strings are repeated
base_values = ['a', 'foo', 'bar', 'a really long string', 'baz']

# Repeat the values up front instead of concatenating 100,000 tiny frames;
# the resulting column and 0..N-1 index are the same.
df_object = pd.DataFrame({'B': pd.Series(base_values * N_REPEATS)})

# Convert to category once on the final column, so the categorical dtype does
# not depend on concat preserving it across pieces.
df_cat = df_object.copy()
df_cat['B'] = df_cat['B'].astype('category')
df_object

Unnamed: 0,B
0,a
1,foo
2,bar
3,a really long string
4,baz
5,a
...,...
499994,baz
499995,a
499996,foo


In [3]:
# Print the summary of the object-dtype frame: index range, column dtype and
# pandas' reported memory usage (baseline for the comparisons below).
df_object.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 0 to 499999
Data columns (total 1 columns):
B    500000 non-null object
dtypes: object(1)
memory usage: 7.6+ MB


In [4]:
def as_mb(v):
    """Format a byte count ``v`` as a string in MB (1024*1024 bytes), one decimal place."""
    bytes_per_mb = 1024.0 * 1024
    megabytes = v / bytes_per_mb
    return "%.1f MB" % megabytes

In [5]:
# what Python actually reports: sys.getsizeof summed over every element
# of the object column
import sys
element_sizes = (sys.getsizeof(item) for item in df_object['B'].values)
as_mb(sum(element_sizes))

'20.5 MB'

In [6]:
# Approximate storage if the column were held as fixed-length strings:
# byte size of the str-cast array minus 8 bytes per row.
# NOTE(review): the 8 bytes/row presumably discounts a per-element pointer — confirm.
column = df_object['B']
fixed_len_bytes = column.values.astype(str).nbytes
as_mb(fixed_len_bytes - 8*len(column))

'5.7 MB'

In [7]:
# Print the same summary for the categorical frame, for comparison with the
# object-dtype frame's reported memory usage.
df_cat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 0 to 499999
Data columns (total 1 columns):
B    500000 non-null category
dtypes: category(1)
memory usage: 4.3 MB


# on disk

In [8]:
df_object.to_hdf('data/test_object.h5','df',mode='w',data_columns=True,format='table')
df_cat.to_hdf('data/test_cat.h5','df',mode='w',data_columns=True,format='table')
!ls -ltr data/*.h5


-rw-rw-r--  1 jreback  staff     84472 Jun 17 22:06 data/test_iterator.h5
-rw-rw-r--  1 jreback  staff  14749644 Jun 18 10:14 data/test_object.h5
-rw-rw-r--  1 jreback  staff   5290697 Jun 18 10:14 data/test_cat.h5


In [9]:
# Open the categorical HDF5 file and print the store summary (the nodes it contains).
# print() with a single parenthesized argument produces the same output on
# Python 2 and also runs on Python 3, unlike the bare print statement.
with pd.get_store('data/test_cat.h5') as store:
    print(store)

<class 'pandas.io.pytables.HDFStore'>
File path: data/test_cat.h5
/df                        frame_table  (typ->appendable,nrows->500000,ncols->1,indexers->[index],dc->[B])
/df/meta/B/meta            series_table (typ->appendable,nrows->5,ncols->1,indexers->[index],dc->[values])
