In [None]:
# The following is to know when this notebook has been run and with which python version.
import time, sys
print(time.ctime())
print(sys.version.split('|')[0])

# C: How to read and write files (ASCII and FITS)

This is part of the Python lecture given by Christophe Morisset at IA-UNAM.

Some information is available here: http://www.tutorialspoint.com/python/python_files_io.htm

## Reading a simple ascii file

In [None]:
# numpy is needed in some part of the lecture
import numpy as np

First of all, we need some files on the hard drive to read. The following notebook cell will write a file in the directory where the notebook was started.

In [None]:
%%writefile data1.dat
1   2.3  6   8 star
2   3.5  7   9 galaxy
3  -4.2  5   7 cluster

Now the goal is to read this file. The first way is to open the file, read it completely in a variable and close the file. Then we can play with the content of the file.

In [None]:
datafile = open('data1.dat', 'r') # Open the file to read it

In [None]:
data = datafile.readlines() # The variable data will receive the content of the file.

In [None]:
datafile.close() # The file is not needed anymore.

In [None]:
print(type(data)) # The content of the file is stored as a list, each element of the list corresponding to a row of the file.

In [None]:
print(data) # Each row is a string and terminates with \n, symbol of END OF LINE.

In [None]:
print(len(data)) # number of rows

In [None]:
print(data[0], 'tralala')

In [None]:
for row in data:
    print(row)

In [None]:
# In Python 2, a trailing comma suppressed the newline (`print row,`).
# In Python 3 the comma only builds a throw-away tuple, so the output is
# the same as in the previous cell; see the next cell for the Python 3 way.
for row in data:
    print(row),

In [None]:
# In python 3:
for row in data:
    print(row, end='')

In [None]:
print(type(data[0])) # Each element is a string

Now it is easy to separate each field with the split command: 

In [None]:
for row in data:
    print(row.split())

In [None]:
# When the type of each column is known, the fields can be converted on the fly:
row_template = 'N = {0:2d} f = {1:5.2f} type = {2:>10s}'
for row in data:
    this_data = row.split()
    number = int(this_data[0])    # first column: integer index
    value = float(this_data[1])   # second column: float
    print(row_template.format(number, value, this_data[4]))

In [None]:
# The data can also be gathered column by column, one list per column.
# "type_" is used instead of "type": type is a Python builtin that would be shadowed.
N, f, type_ = [], [], []
for row in data:
    this_data = row.split()
    N.append(int(this_data[0]))
    f.append(float(this_data[1]))
    type_.append(this_data[4])
for column in (N, f, type_):
    print(column)
# A list is easily turned into a numpy array:
N = np.array(N)
print(N)

In [None]:
# If the number of rows in the file is not too big, you can use list comprehensions (and even send the result to a numpy array)
N = np.array([int(row.split()[0]) for row in data])
f = np.array([float(row.split()[1]) for row in data])
# Each of these commands scans (and splits) all the rows: don't use this for huge files
print(N)
print(f)

## How to treat special rows (headers, comments)

In [None]:
%%writefile data2.dat
# The following data are for test purpose
N    f   x   y type
1   2.3  6   8 star
2   3.5  7   9 galaxy
3  -4.2  5   7 cluster
#4  -10.5  5  7 test

In [None]:
!cat data2.dat # Just to check that the # comments are also in the file

The file has to be read row by row, to be sure that special cases are treated.

In [None]:
datafile = open('data2.dat', 'r') # Open the file to read it

row = datafile.readline() # this reads only one line: the first comment
first_comment = row
print(first_comment, end='')

row = datafile.readline() # this reads only one line: the header with the column names
header = row
print(header, end='')

data = []
while True: # loops until exit by break command
    row = datafile.readline()
    if row == '': # readline returns an empty string at the end of the file
        break
    if row[0] != '#' and row[0] != '\n': # comment lines and empty lines are skipped
        data.append(row)
datafile.close()
print(data)

In [None]:
datafile = open('data2.dat', 'r') # Open the file to read it
row = datafile.readline() # this reads only one line: the first comment
first_comment = row
print(first_comment, end='')
row = datafile.readline() # this reads only one line: the header
header = row
print(header, end='')
data = []
row = datafile.readline()
while row != '': # loops until the end of the file (readline then returns an empty string)
    if row[0] != '#': # comment lines are skipped
        data.append(row)
    row = datafile.readline()
datafile.close()
print(data)

In [None]:
# A much shorter way to deal with the file. No need to look for the end of the file:
# iterating over the file object yields the rows one by one.
datafile = open('data2.dat', 'r') # Open the file to read it
data = []
for row in datafile:
    if row[0] != '#': # comment lines are skipped
        data.append(row)  
datafile.close()
print(data)
# This way will include the header in the data... Not what we want

In [None]:
# A much shorter way to deal with the file:
# we know that the header is the first non-comment line in the file.
datafile = open('data2.dat', 'r') # Open the file to read it
data = []
comments = [] # we can keep the comments for some usage
header_read = False # We will turn it to True once the header is read
for row in datafile:
    if row[0] != '#': # this row is not a comment: header or data
        if not header_read:
            header = row
            header_read = True # next time, data will be read
        else:
            data.append(row)
    else:
        comments.append(row)
datafile.close()
print(header, end='')
print(data)
print(comments)

In [None]:
# Alternative way using "with". No need to close the file, done when the "with" block is terminated.
data = []
comments = []
header_read = False
def change_type(row_split):
    """Convert the string fields of one row into their proper types.

    row_split is the sequence of strings obtained by splitting a row.
    The first field is converted to int, the next three to float and
    the fifth is kept as it is. The result is returned as a tuple.
    """
    number = int(row_split[0])
    f_value, x_value, y_value = (float(row_split[i]) for i in (1, 2, 3))
    return (number, f_value, x_value, y_value, row_split[4])
# The file is closed automatically when the "with" block is left.
with open('data2.dat', 'r') as datafile:
    for row in datafile:
        if row[0] != '#' and row[0] != '\n': # neither a comment nor an empty line
            if not header_read:
                # the first such row is the header
                header = row
                header_read = True
            else:
                # every following row is data: split it and convert the fields
                data.append(change_type(row.split()))
        else:
            # comment lines (and empty lines) are kept here
            comments.append(row)
print(header)
print(data)
print(comments)

In [None]:
# We can define the result as a structured array
# We use the header to define the field names.
# data must be a list of tuples.
# 'f8' (64-bit float) is used for the float columns: the 128-bit 'f16' type
# is not available on every platform (e.g. Windows), where it raises a TypeError.
a = np.array(data, dtype={'names':header.split(), 
                          'formats':['i4','f8', 'f8', 'f8', 'U10']})

In [None]:
a

In [None]:
print(data[0])

In [None]:
print(a[0])

In [None]:
# Easy access to the columns, by their name
print(a['N'])

In [None]:
print(a['type'])

In [None]:
# Easy combine the values of columns
print(np.sqrt(a['x']**2 + a['y']**2))

### Using numpy loadtxt

http://docs.scipy.org/doc/numpy/reference/generated/numpy.loadtxt.html

In [None]:
# Fast way for reading the file
# One has to tell loadtxt to skip the first 2 rows (the comment and the header)
# using the skiprows keyword
b = np.loadtxt('data2.dat', skiprows=2, dtype='i4,f, f, f, U10')

In [None]:
print(b)

In [None]:
type(b)

In [None]:
# The names of the columns are f0, f1, f2, etc
b.dtype

### Using numpy genfromtxt

http://docs.scipy.org/doc/numpy/reference/generated/numpy.genfromtxt.html

In [None]:
# Fast and versatile way to read the file
# the names are taken from the file
# The types are defined automatically when reading the columns
c = np.genfromtxt('data2.dat', names=True, dtype=None, skip_header=1)

In [None]:
print(c)

In [None]:
type(c)

In [None]:
c.dtype

In [None]:
c['f']

Now a value of x is missing (not possible with space separator, so we use "," as separator):

In [None]:
%%writefile data3.dat
# The following data are for test purpose
N,    f,   x,   y, type
1,   2.3,  6,   8, star
2,   3000.5,   ,  9, galaxy
3,  -4.2,  5,   7, cluster
#4,  -10.5,  5,  7, test

In [None]:
d = np.genfromtxt('data3.dat', names=True, dtype=None, skip_header=1, 
                  delimiter=',')

In [None]:
# The missing value has been changed to -1
d

In [None]:
# The missing value can be set to whatever you want (but not to NaN here, as the type is integer, and NaN is a float...)
d = np.genfromtxt('data3.dat', names=True, dtype=None, skip_header=1, delimiter=',', 
                  filling_values=0.0)

In [None]:
d['x'][1]

In [None]:
# One can select the columns to be stored
e = np.genfromtxt('data3.dat', names=True, dtype=None, skip_header=1, 
                  delimiter=',',usecols=(0,1,4))

In [None]:
print(e)

In [None]:
# One can select the columns to be stored and unpack them into separate variables
# NOTE(review): no dtype is given here, so the text column (typ) is presumably read as NaN -- confirm
N, f, typ = np.genfromtxt('data3.dat', skip_header=2, 
                  delimiter=',',usecols=(0,1,4), unpack=True)

In [None]:
# The resulting array now contains only the given columns
print(N)
print(f)

### Using recfrom to obtain a record array

In [None]:
# np.recfromtxt was a thin wrapper around genfromtxt (same keywords, dtype=None
# by default) returning a record array. It is no longer available in the main
# numpy namespace since NumPy 2.0, so we view the genfromtxt result as a recarray.
f = np.genfromtxt('data3.dat', names=True, dtype=None, skip_header=1, 
                  delimiter=',',usecols=("N", "f", "type")).view(np.recarray)

In [None]:
f

In [None]:
f.N

## Fixed size ascii files

In [None]:
%%writefile data4.dat
#  Line      Iobs    lambda  relat_error Obs_code
H  1  4861A 1.00000    4861. 0.08000  Anabel                               
H  1  6563A 2.8667     6563. 0.19467  Anabel                               
H  1  4340A 0.4933     4340. 0.03307  Anabel                               
H  1  4102A 0.2907     4102. 0.02229  Anabel                               
H  1  3970A 0.1800     3970. 0.01253  Anabel                               
N  2  6584A 2.1681     6584. 0.08686  Anabel                               
N  2 121.7m 0.0044621217000. 0.20000  Liu                                  
O  1  6300A 0.0147     6300. 0.00325  Anabel                               
TOTL  2326A 0.07900    2326. 0.20000  Adams                                
C  2 157.6m 0.00856 1576000. 0.20000  Liu                                  
O  1 63.17m 0.13647  631700. 0.10000  Liu                                  
O  1 145.5m 0.00446 1455000. 0.200    Liu                                  
TOTL  3727A 0.77609    3727. 0.200    Torres-Peimbert                      
S II  4070A 0.06174    4070. 0.200    Torres-Peimbert                      
S II  4078A 0.06174    4078. 0.200    Torres-Peimbert                      

In [None]:
# Here we cannot use SPACE as a separator, as some strings contain spaces.
# "delimiter" is used to specify the size (in characters in the file) of each variable. 
# The types must be clearly defined too.
obs  = np.genfromtxt('data4.dat', 
                     dtype=["U11","float","float","float","U2"],
                     delimiter=[11,7,10,10,2],
                     names = True
                     )

In [None]:
obs # The same delimiter (fixed sizes) is applied to the names. May not be what you want:

In [None]:
# Defining the names:
obs2  = np.genfromtxt('data4.dat', skip_header=1,
                     dtype=None,
                     delimiter=[11,7,10,10,2],
                     names = ['label', 'i_obs', 'lambda', 'e_obs', 'observer']
                     )

In [None]:
obs2 

In [None]:
%%writefile data5.dat
#  Line      Iobs    lambda  relat_error Obs_code
H  1  4861A 1.00000    4861. 0.08000 x Anabel                               
H  1  6563A 2.8667     6563. 0.19467 x Anabel                               
H  1  4340A 0.4933     4340. 0.03307 x Anabel                               
H  1  4102A 0.2907     4102. 0.02229 x Anabel                               
H  1  3970A 0.1800     3970. 0.01253 t Anabel                               
N  2  6584A 2.1681           0.08686 x Anabel                               
N  2 121.7m 0.00446 1217000. 0.20000 g Liu                                  
O  1  6300A 0.0147     6300. 0.00325 t Anabel                               
TOTL  2326A 0.07900    2326. 0.20000 g Adams                                
C  2 157.6m 0.00856 1576000. 0.20000 t Liu                                  
O  1 63.17m 0.13647  631700. 0.10000 g Liu                                  
O  1 145.5m 0.00446 1455000. 0.200   g Liu                                  
TOTL  3727A 0.77609    3727. 0.200   g Torres-Peimbert                      
S II  4070A 0.06174    4070. 0.200   g Torres-Peimbert                      
S II  4078A 0.06174    4078. 0.200   g Torres-Peimbert   

In [None]:
# Here we want to skip one column:
obs3  = np.genfromtxt('data5.dat', skip_header=1,
                     dtype=None,
                     delimiter=[11, 8, 9, 9, 2, 2],
                     names = ['label', 'i_obs', 'lambda', 'e_obs', 'na', 'observer'],
                     usecols = (0, 1, 2, 3, 5)
                     )

In [None]:
obs3

In [None]:
obs3['lambda']

In [None]:
new_obs3 = obs3.view(np.recarray)

In [None]:
new_obs3.label

In [None]:
# "lambda" is a reserved Python keyword: writing new_obs3.lambda is a SyntaxError.
# getattr (or the item syntax shown in the next cell) gives access to that column.
getattr(new_obs3, 'lambda')

In [None]:
new_obs3['lambda']

Using masks on the structured array.

In [None]:
# Depending on the numpy version, genfromtxt stores the text columns either as
# bytes (b'An') or as str ('An'). Converting to str first makes the comparison
# work in both cases.
mask_observer = (obs3['observer'].astype(str) == 'An') & (np.isfinite(obs3['lambda']))
print(obs3[mask_observer])

In [None]:
# Each selected row is passed as argument 0, so that its fields can be reached
# by name in the format string; the relative error (argument 1) is turned into a percentage.
for o in obs3[mask_observer]:
    print('line {0[label]:4s}, wavelength={0[lambda]}A Intensity={0[i_obs]:5.3f}+/-{1:4.1f}%)'.format(o, o['e_obs']*100))

## Writing files

### Simple "write" method from "open" class

In [None]:
f = open('data10.dat', 'w')

In [None]:
f.write('tralala')
f.write('trololo')

In [None]:
f.close()

In [None]:
!cat 'data10.dat' # the writing method put everything together.

In [None]:
f = open('data11.dat', 'w')
f.write('tralala\n') # \n to indicate end of line
f.write('trololo\n')
f.close()
!cat 'data11.dat'

In [None]:
f = open('data11.dat', 'a') # Append to the end of the file
f.write('trilili\n') # \n to indicate end of line
f.write('trululu\n')
f.close()
!cat 'data11.dat'

In [None]:
a = 'Smith'
b = 3
with open('data12.dat', 'w') as datafile:
    datafile.write("""Hola Sr. {0}
This is a file
with a lot of lines.
It is easy to write it.
The value of your data is {1}.
""".format(a, b))
!cat "data12.dat"

### Using pickle (and cpickle) python specific format

In [None]:
# Let's define some objects we want to keep in a file (data and variable names):
# an integer, a string, a numpy array and a function.
a = 5
b = 'Hola'
c = np.array([1,2,3,4,5])
def d(x):
    """My function: return x squared."""
    return x**2

In [None]:
import pickle # The module we will use for this

In [None]:
# Writing the variables. The "with" block closes the file, which guarantees
# that everything is actually written on the disk.
with open('Demo.pickle','wb') as pickle_file:
    pickle.dump((a,b,c,d), pickle_file)

In [None]:
# Reading the variables back; the "with" block closes the file once it is loaded.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
with open('Demo.pickle', 'rb') as pickle_file:
    res = pickle.load(pickle_file)

In [None]:
type(res)

In [None]:
print(res[0])
print(res[1])
print(res[2])

In [None]:
res[3](5)

In [None]:
# The tuple stored in the file can be unpacked directly into variables.
# The "with" block closes the file once it is loaded.
with open('Demo.pickle', 'rb') as pickle_file:
    a2,b2,c2,d2 = pickle.load(pickle_file)

In [None]:
a2

In [None]:
d2(10)

In [None]:
help(d2)

In [None]:
%timeit res = pickle.load(open('Demo.pickle', 'rb'))

In [None]:
import gzip
# The "with" block closes the gzip file, which guarantees that the compressed
# stream is flushed and properly terminated.
with gzip.open('Demo.pklz','wb') as gz_file:
    pickle.dump((a,b,c,d), gz_file) # Writing the variables

In [None]:
# Reading the compressed pickle; the file is closed when leaving the "with"
# block, even if the loading fails.
with gzip.open('Demo.pklz','rb') as f:
    a, b, c, d = pickle.load(f)

### FITS files

In [None]:
import astropy
print(astropy.__version__)

In [None]:
from astropy.io import fits

In [None]:
# All of the functionality of PyFITS is now available in Astropy
# from astropy.io import fits as pyfits

Manual here: https://pythonhosted.org/pyfits/

We will use one FITS file from the San Pedro Martir echelle spectrograph. The file can be downloaded from: https://github.com/Morisset/Python-lectures-Notebooks/raw/master/Notebooks/n10017o.fits

In [None]:
hdulist = fits.open('n10017o.fits')

In [None]:
# The result hdulist is a list of HDU objects. 
# In the case of a simple file, there is only one primary HDU so the list contains only one element
len(hdulist)

In [None]:
# The information on what the file contains can be obtained by calling the info() method:
hdulist.info()
# The table said that there is only a primary HDU which contains 2154 X 2048 image with data stored in 2 bytes (16 bits) integers.

In [None]:
# As described above, the HDU (header/data unit) contains header and data. The header is a dictionary. 
# To see what keywords were used in the header one can do:
list(hdulist[0].header.keys())

In [None]:
# and to get the value of a given keyword :
hdulist[0].header['OBJECT'] 

In [None]:
hh = hdulist[0].header
hh?

In [None]:
hdulist[0].header

In [None]:
# The header can be printed as it appears in the file by
print(hdulist[0].header.cards)

In [None]:
# The data in the file are accessible with
data = hdulist[0].data

In [None]:
# and can be displayed as an image (matplotlib.pyplot is imported here as plt, it is needed for the plots):
%matplotlib inline
import matplotlib.pyplot as plt
f, ax = plt.subplots(figsize=(15,15))
ax.imshow(data)

In [None]:
# A column from the data can be plotted with 
plt.plot(data[:,1000])
# where I am plotting the column number 1000. 

In [None]:
# In the same way a line from the data is plotted with: 
plt.plot(data[1000,:])

In [None]:
# For this example I'll use a spectrum obtained with the high dispersion camera on board of IUE. 
# The file is opened as usual:
hdulist = fits.open('swp04345.mxhi')

The file is there: https://github.com/Morisset/Python-lectures-Notebooks/raw/master/Notebooks/swp04345.mxhi

In [None]:
#but now hdulist has 2 elements (2 header/data units):
len(hdulist)

In [None]:
# We can see that the primary header has dimension (), so it does not contain any data. 
# The data are in the extension.
hdulist.info()

In [None]:
# The first header contains the minimal information:
print(hdulist[0].header.cards[:5])

In [None]:
# The number of axis is 0 which means there is no data block in the primary HDU. 
# The header of the second HDU begins with the keyword XTENSION and with the specification of the data
print(hdulist[1].header.cards[:5])

In [None]:
# To progress further we need to know what is in the table. 
# As usual, the columns have names and type of the stored data. 
# These information can be obtained using the column attribute of hdulist:
cols = hdulist[1].columns

In [None]:
# cols.info() prints the names of the columns and the information of their format and units.
# info is a method: it must be called, otherwise only the bound method is displayed.
cols.info()

In [None]:
# The data are available using the following (this example is NOT the right way of plotting the data, it's just an example;
# don't forget to import numpy as np to have np.arange working):

data1 = hdulist[1].data
DTs =  data1.ABS_CAL
WLs = data1.WAVELENGTH
DWs = data1.DELTAW
# One curve per table row: the x axis is built from the row's WAVELENGTH value
# plus the pixel index times its DELTAW value.
for WL, DW, DT in zip(WLs, DWs, DTs):
    plt.plot(WL + np.arange(len(DT)) * DW, DT)

### Writing FITS files

In [None]:
# Creation of numpy array with the data. 
x = np.arange(100)

In [None]:
# Creation of the HDU from the data. 
hdu = fits.PrimaryHDU(x)
print(hdu.header.cards)

In [None]:
#Adding additional keywords to the header. 
# The automatically created header contains only the required minimum of keywords. 
# If additional keywords are needed they are added with:
hdu.header['testkey'] = (0.001,'some test value')

In [None]:
print(hdu.header.cards)

In [None]:
hdulist = fits.HDUList([hdu])
hdulist.writeto('new.fits', overwrite=True) 
hdulist.close()

Another way to deal with FITS tables is to use the ATpy library; we'll see this later.