A common problem when reading files into structures like `pandas.DataFrame`, `astropy.table.Table`, or `numpy.recarray`
is that different files use different header names for the same quantity.

For example, consider a `time` variable in these files. Suppose we know that, across different files, the variable we are interested in goes by different aliases, for example any of 
['mjd','expmjd', 'date', 'jd'], possibly capitalized differently. We would like our data structure to use the standard name `time`, and therefore change occurrences of the other aliases to the standard name. This notebook demonstrates utilities that help in this process.

## Setup: obtain a `pandas.DataFrame`, an `astropy.table.Table` and a `numpy.rec.array` instance which have bad column names

In [1]:
import desc.monitor as monitor
import sncosmo
import numpy as np
from astropy.table import Table

In [2]:
# Reference data structures whose column names are already the standard ones
# Astropy Table
lc  = sncosmo.load_example_data()
# pandas DataFrame
lcdf = lc.to_pandas()
# numpy structured array (np.asarray on the Table; its repr below is a plain
# `array` with a structured dtype rather than a `rec.array`)
lcrec = np.asarray(lc)

In [3]:
# First five rows of the reference astropy Table
lc[:5]

time,band,flux,fluxerr,zp,zpsys
float64,str5,float64,float64,float64,str2
55070.0,sdssg,0.36351153597,0.672843847541,25.0,ab
55072.0512821,sdssr,-0.200801295864,0.672843847541,25.0,ab
55074.1025641,sdssi,0.307494232981,0.672843847541,25.0,ab
55076.1538462,sdssz,1.08776103656,0.672843847541,25.0,ab
55078.2051282,sdssg,-0.43667895645,0.672843847541,25.0,ab


In [4]:
# First rows of the reference DataFrame (five rows are shown below)
lcdf.head()

Unnamed: 0,time,band,flux,fluxerr,zp,zpsys
0,55070.0,sdssg,0.363512,0.672844,25,ab
1,55072.051282,sdssr,-0.200801,0.672844,25,ab
2,55074.102564,sdssi,0.307494,0.672844,25,ab
3,55076.153846,sdssz,1.087761,0.672844,25,ab
4,55078.205128,sdssg,-0.436679,0.672844,25,ab


In [5]:
# First five records of the reference numpy array
lcrec[:5]

array([(55070.0, 'sdssg', 0.36351153597, 0.672843847541, 25.0, 'ab'),
       (55072.0512821, 'sdssr', -0.200801295864, 0.672843847541, 25.0, 'ab'),
       (55074.1025641, 'sdssi', 0.307494232981, 0.672843847541, 25.0, 'ab'),
       (55076.1538462, 'sdssz', 1.08776103656, 0.672843847541, 25.0, 'ab'),
       (55078.2051282, 'sdssg', -0.43667895645, 0.672843847541, 25.0, 'ab')], 
      dtype=[('time', '<f8'), ('band', 'S5'), ('flux', '<f8'), ('fluxerr', '<f8'), ('zp', '<f8'), ('zpsys', 'S2')])

##  Generate the bad headers

In [6]:
# Build three containers holding the same data under deliberately
# non-standard, inconsistently capitalized column names.
# pandas DataFrame: rename three of the standard columns.
badHeadersDF = lcdf.rename(columns={'time': 'mJD',
                                    'flux': 'Flux',
                                    'fluxerr': 'Flux_Err'})
# numpy rec.array built from the DataFrame (index=False leaves out the index)
badHeadersRecArr = badHeadersDF.to_records(index=False)
# Astropy Table built from the rec.array
badHeadersTable = Table(badHeadersRecArr)

In [7]:
# Display the astropy Table that carries the non-standard headers
badHeadersTable.show_in_notebook(display_length=5)

mJD,band,Flux,Flux_Err,zp,zpsys
55070.0,sdssg,0.36351153597,0.672843847541,25.0,ab
55072.0512821,sdssr,-0.200801295864,0.672843847541,25.0,ab
55074.1025641,sdssi,0.307494232981,0.672843847541,25.0,ab
55076.1538462,sdssz,1.08776103656,0.672843847541,25.0,ab
55078.2051282,sdssg,-0.43667895645,0.672843847541,25.0,ab
55080.2564103,sdssr,1.09780966779,0.672843847541,25.0,ab
55082.3076923,sdssi,3.7562685627,0.672843847541,25.0,ab
55084.3589744,sdssz,5.34858894966,0.672843847541,25.0,ab
55086.4102564,sdssg,2.82614187269,0.672843847541,25.0,ab
55088.4615385,sdssr,7.56547045054,0.672843847541,25.0,ab


In [8]:
# First five rows of the DataFrame that carries the non-standard headers
badHeadersDF.head(5)

Unnamed: 0,mJD,band,Flux,Flux_Err,zp,zpsys
0,55070.0,sdssg,0.363512,0.672844,25,ab
1,55072.051282,sdssr,-0.200801,0.672844,25,ab
2,55074.102564,sdssi,0.307494,0.672844,25,ab
3,55076.153846,sdssz,1.087761,0.672844,25,ab
4,55078.205128,sdssg,-0.436679,0.672844,25,ab


In [9]:
# First five records of the rec.array that carries the non-standard headers
badHeadersRecArr[:5]

rec.array([(55070.0, 'sdssg', 0.36351153597, 0.672843847541, 25.0, 'ab'),
 (55072.0512821, 'sdssr', -0.200801295864, 0.672843847541, 25.0, 'ab'),
 (55074.1025641, 'sdssi', 0.307494232981, 0.672843847541, 25.0, 'ab'),
 (55076.1538462, 'sdssz', 1.08776103656, 0.672843847541, 25.0, 'ab'),
 (55078.2051282, 'sdssg', -0.43667895645, 0.672843847541, 25.0, 'ab')], 
          dtype=[('mJD', '<f8'), ('band', 'O'), ('Flux', '<f8'), ('Flux_Err', '<f8'), ('zp', '<f8'), ('zpsys', 'O')])

## The possible Aliases

In [10]:
# Standard column name -> list of accepted aliases for that column.
# Every alias listed here should be written in lower case; the differently
# capitalized headers above ('mJD', 'Flux_Err') are still matched below,
# presumably because names are lower-cased before comparison.
aliases = {
    'time': ['mjd', 'expmjd', 'date', 'jd'],
    'flux': ['counts', 'adu'],
    'fluxerr': ['flux_err', 'fluxerror', 'flux_error'],
    'zp': ['zeropoint', 'zero_point'],
    'zpsys': ['zero_point_sys'],
}

In [11]:
# Build a mapping from the DataFrame's column names to their standard names,
# using `aliases` (the resulting dict is displayed in the next cell).
standardNamingDict = monitor.aliasDictionary(badHeadersDF.columns, aliases)

In [12]:
# Inspect the mapping: 'band' has no entry (it is neither a key nor an alias
# in `aliases`), while 'zp' and 'zpsys' map to themselves.
standardNamingDict

{'Flux': 'flux',
 'Flux_Err': 'fluxerr',
 'mJD': 'time',
 'zp': 'zp',
 'zpsys': 'zpsys'}

In [13]:
# Sequence of column names with aliases replaced by their standard names;
# used below to rename the fields of the rec.array.
standardizedNames = monitor.mapSeq2Standard(badHeadersDF.columns, standardNamingDict)

## Fix the DataFrame

In [14]:
# Rename the DataFrame columns using the alias -> standard-name mapping.
# inplace=True modifies badHeadersDF itself instead of returning a new frame.
badHeadersDF.rename(columns=standardNamingDict, inplace=True)
print(badHeadersDF.columns)

Index([u'time', u'band', u'flux', u'fluxerr', u'zp', u'zpsys'], dtype='object')


## Fix the rec.array

In [15]:
# Rename the rec.array's fields by overwriting the names on its dtype;
# standardizedNames supplies one name per field, in field order.
badHeadersRecArr.dtype.names = standardizedNames

In [16]:
# First five records again; the dtype now carries the standard field names
badHeadersRecArr[:5]

rec.array([(55070.0, 'sdssg', 0.36351153597, 0.672843847541, 25.0, 'ab'),
 (55072.0512821, 'sdssr', -0.200801295864, 0.672843847541, 25.0, 'ab'),
 (55074.1025641, 'sdssi', 0.307494232981, 0.672843847541, 25.0, 'ab'),
 (55076.1538462, 'sdssz', 1.08776103656, 0.672843847541, 25.0, 'ab'),
 (55078.2051282, 'sdssg', -0.43667895645, 0.672843847541, 25.0, 'ab')], 
          dtype=[('time', '<f8'), ('band', 'O'), ('flux', '<f8'), ('fluxerr', '<f8'), ('zp', '<f8'), ('zpsys', 'O')])

## Fix the `astropy.table.Table`

In [17]:
# Rename the non-standard columns of the astropy Table in place.
# `.items()` replaces the Python-2-only `.iteritems()` so the cell runs on
# both Python 2 and 3, and a plain loop replaces the list comprehension that
# was evaluated only for its side effects.
for col, stdcol in standardNamingDict.items():
    # Identity entries such as 'zp' -> 'zp' need no rename.
    if col != stdcol:
        badHeadersTable.rename_column(col, stdcol)

In [18]:
# Display the astropy Table again, now with the standard column names
badHeadersTable.show_in_notebook(display_length=5)

time,band,flux,fluxerr,zp,zpsys
55070.0,sdssg,0.36351153597,0.672843847541,25.0,ab
55072.0512821,sdssr,-0.200801295864,0.672843847541,25.0,ab
55074.1025641,sdssi,0.307494232981,0.672843847541,25.0,ab
55076.1538462,sdssz,1.08776103656,0.672843847541,25.0,ab
55078.2051282,sdssg,-0.43667895645,0.672843847541,25.0,ab
55080.2564103,sdssr,1.09780966779,0.672843847541,25.0,ab
55082.3076923,sdssi,3.7562685627,0.672843847541,25.0,ab
55084.3589744,sdssz,5.34858894966,0.672843847541,25.0,ab
55086.4102564,sdssg,2.82614187269,0.672843847541,25.0,ab
55088.4615385,sdssr,7.56547045054,0.672843847541,25.0,ab
