# Bulge data exploration

In Part A, we showed how to ingest and process raw DECam images. A processed data set is available at `/project/stack-club/course_data/DECAM_BULGE`. In this notebook, we will use the Butler to explore the processed data.



In [None]:
# Make plots available to the notebook
%matplotlib inline

In [None]:
import os
import numpy as np
import pandas as pd
from astropy import units as u
from astropy.coordinates import SkyCoord
from astropy.table import Table
from astropy.visualization import hist
import matplotlib.pyplot as plt
import lsst.daf.persistence as dafPersist
import lsst.afw.table as afwTable
import lsst.afw.display as afwDisplay

afwDisplay.setDefaultBackend('matplotlib') 


In [None]:
# Repository root of the processed DECam Bulge data and its rerun directory
REPO_DIR = '/project/stack-club/course_data/DECAM_BULGE/'
RERUN_DIR = os.path.join(REPO_DIR, 'rerun')

# Directory where we save our data, like pandas data frames, tables, light curves ...
# The default is the original author's directory; set the environment variable
# DECAM_BULGE_SAVE_DIR to write somewhere else without editing the notebook.
parquet_save_path = os.environ.get('DECAM_BULGE_SAVE_DIR', '/home/mrabus/DATA/')

## After processing the images, we can use the Butler to access the data and display images:

In [None]:
# Create a Butler on the rerun directory of the processed DECam Bulge data.
# All later metadata queries and dataset reads in this notebook go through it.
butler = dafPersist.Butler(RERUN_DIR)

## We create two pandas data frames and a list of calibrated exposures (calexps) for CCD 1.

We go through the metadata and check for each dataId that a source catalog exists. If it exists, we write the minimum, maximum, and center coordinates, derived from the source catalog, into a pandas data frame. All CCDs of a given pointing share the same observation time, i.e. all CCDs are pointed at the field at once; therefore, we only need one data frame that associates each visit with its time of observation.

In [None]:
# Accumulators filled while walking over every (visit, ccd, filter) entry
coordinate_list = []   # one row of footprint/statistics per data set with a source catalog
calexp_list = []       # calibrated exposures of CCD 1
visit_date_list = []   # [visit, time of observation] pairs, taken from CCD 1
metadata = butler.queryMetadata('calexp', ['visit', 'ccd', 'filter'])

for visit_raw, ccd_raw, filter_name in metadata:
    visit_id, ccd_id = int(visit_raw), int(ccd_raw)
    dataId = {'visit': visit_id, 'ccd': ccd_id, 'filter': filter_name}

    # Entries without a source catalog are skipped entirely
    if not butler.datasetExists('src', dataId=dataId):
        continue

    srcCatalog = butler.get('src', dataId=dataId).asAstropy()
    ra_column = srcCatalog['coord_ra']
    dec_column = srcCatalog['coord_dec']

    # Extent of the catalog on the sky; the values are stored exactly as they
    # come out of the catalog columns (no unit conversion is applied here)
    raMin, raMax = ra_column.min(), ra_column.max()
    decMin, decMax = dec_column.min(), dec_column.max()

    coordinate_list.append([
        visit_id, ccd_id, filter_name,
        0.5*(raMax + raMin), 0.5*(decMax + decMin),   # centre of the extent
        raMin, raMax, decMin, decMax,
        len(srcCatalog),                              # number of detected sources
        np.median(srcCatalog['base_PsfFlux_area']),   # median effective PSF area
    ])

    # Only CCD 1 contributes a calexp and a time of observation
    # (the time should be the same for all other CCDs of the visit)
    if ccd_id != 1:
        continue
    calexp = butler.get('calexp', visit=visit_id, ccd=1)
    calexp_list.append(calexp)
    # The visit info of the exposure carries the date of the observation
    obs_date = calexp.getInfo().getVisitInfo().getDate()
    visit_date_list.append([visit_id, obs_date.toPython()])



In [None]:
# Turn the collected lists into pandas data frames
valid_visit_columns = [
    'visit', 'ccd', 'DECAM_filter',
    'ra_center', 'dec_center',
    'ra_min', 'ra_max', 'dec_min', 'dec_max',
    'nr_detected_sources', 'median_effPSF_area',
]
df_valid_visists = pd.DataFrame(coordinate_list, columns=valid_visit_columns)
# One row per CCD 1 visit: visit ID and its time of observation
df_visit_date = pd.DataFrame(visit_date_list, columns=['visit', 'timestamp'])


In [None]:
# Persist both data frames on disk as gzip-compressed parquet files
valid_visits_file = os.path.join(parquet_save_path, 'df_valid_visits.parquet.gzip')
visit_dates_file = os.path.join(parquet_save_path, 'df_visits_date.parquet.gzip')
df_valid_visists.to_parquet(valid_visits_file, compression='gzip')
df_visit_date.to_parquet(visit_dates_file, compression='gzip')

In [None]:
# Load the parquet files back from disk, using the fastparquet engine
valid_visits_file = os.path.join(parquet_save_path, 'df_valid_visits.parquet.gzip')
visit_dates_file = os.path.join(parquet_save_path, 'df_visits_date.parquet.gzip')
df_valid_visists = pd.read_parquet(valid_visits_file, engine='fastparquet')
df_visit_data = pd.read_parquet(visit_dates_file, engine='fastparquet')

In [None]:
# Show the time of observation of each visit (the frame read back from the parquet file)
df_visit_data

In [None]:
# Show the first rows of the data frame that holds all valid visits and their coordinates.
df_valid_visists.head()

In [None]:
# List the unique visit IDs found in the valid-visits data frame
df_valid_visists.visit.unique()

In [None]:
# Keep only the rows that belong to CCD 1 and preview them
is_ccd1 = df_valid_visists['ccd'] == 1
valid_visit_ccd1 = df_valid_visists.loc[is_ccd1]
valid_visit_ccd1.head()

We see that all pointing centers for CCD 1 are in the same field.

In [None]:
# Print the standard deviation of the CCD1 pointing centers over all visits, in arcsec.
# The centers were computed from the `coord_ra`/`coord_dec` columns of the LSST
# source catalogs, which store angles in radians, so the conversion to arcsec
# has to start from radians (not degrees).
rad_to_arcsec = u.rad.to(u.arcsec)
print(valid_visit_ccd1['ra_center'].std()*rad_to_arcsec,
      valid_visit_ccd1['dec_center'].std()*rad_to_arcsec)

In [None]:
# Plot the first ten images of CCD1 to inspect the pointing visually.
for ii, calexp in enumerate(calexp_list[:10]):
    # One matplotlib figure per exposure; the afw display frame uses the same number
    plt.figure(ii)
    # Named `afw_display` so that IPython's built-in `display()` is not shadowed
    # for the remainder of the notebook session.
    afw_display = afwDisplay.Display(frame=ii, backend='matplotlib')
    afw_display.scale("linear", "zscale")
    # Display only a small region of the calexp.
    afw_display.mtv(calexp[500:1500, 2500:3000])


In [None]:
# Order the CCD 1 visits by their number of detected sources
valid_visit_ccd1 = valid_visit_ccd1.sort_values('nr_detected_sources')

In [None]:
# Show the CCD 1 rows of the valid-visits table
valid_visit_ccd1

In [None]:
# CCD under study and the visit IDs available for it, in the current row order
ccd = 1
unique_visits = valid_visit_ccd1['visit'].unique()

In [None]:
# Source catalogs of the first two visits in `unique_visits`, converted to
# pandas data frames and ordered by source id
dataId = {'visit': int(unique_visits[0]), 'ccd': ccd}
srcCatalog1 = (
    butler.get('src', dataId=dataId)
    .asAstropy()
    .to_pandas()
    .sort_values(by=['id'])
)

dataId = {'visit': int(unique_visits[1]), 'ccd': ccd}
srcCatalog2 = (
    butler.get('src', dataId=dataId)
    .asAstropy()
    .to_pandas()
    .sort_values(by=['id'])
)

In [None]:
# First rows of the source catalog loaded for unique_visits[0]
srcCatalog1.head()

In [None]:
# First rows of the source catalog loaded for unique_visits[1]
srcCatalog2.head()

In [None]:
# Cross-match the source catalog of every visit (for the chosen CCD) against a
# master star list and collect the match distances.
#
# `coord_ra`/`coord_dec` of an LSST source catalog are angles stored in radians,
# so the columns must be given radian units when building the SkyCoord objects.
dist_per_visit = []

# The source catalog of the first visit serves as the master star list
dataId = {'visit': int(unique_visits[0]), 'ccd': ccd}
srcCatalog = butler.get('src', dataId=dataId)
master_coordinates = SkyCoord(srcCatalog['coord_ra']*u.rad, srcCatalog['coord_dec']*u.rad)
master_starID = srcCatalog['id']

# One coordinate column per visit; column names are passed as strings,
# which is the documented type for astropy table column names.
coord_table = Table([master_starID, master_coordinates],
                    names=('id', str(int(unique_visits[0]))))

for visit in unique_visits[1:]:
    dataId = {'visit': int(visit), 'ccd': ccd}
    srcCatalog = butler.get('src', dataId=dataId)
    coordinates = SkyCoord(srcCatalog['coord_ra']*u.rad, srcCatalog['coord_dec']*u.rad)
    # For every master star: index of, and on-sky separation to, its nearest
    # neighbour in the catalog of this visit
    idx, d2d, d3d = master_coordinates.match_to_catalog_sky(coordinates)
    coord_table.add_column(coordinates[idx], name=str(int(visit)))
    coord_table[f'distance {int(visit)}'] = d2d.arcsec*u.arcsec
    dist_per_visit.append(d2d.arcsec)
    print('visit: {}       max. dist. {:.3f} arcsec       std. dist. {:.3f} arcsec'.format(visit, np.max(d2d.arcsec), np.std(d2d.arcsec)))

# All match distances (arcsec) in one flat array; empty if there is only one visit
all_dist = np.concatenate(dist_per_visit) if dist_per_visit else np.array([])


In [None]:
# Normalised histogram of all master-to-visit match distances
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 4))
hist_style = dict(histtype='stepfilled', alpha=0.75, density=True)
hist(all_dist, bins='freedman', ax=ax, **hist_style)
ax.set_xlabel('distance [arcsec]')
ax.set_ylabel('Density(distance)')
# Zoom in on the smallest separations; the window is hard-coded
# NOTE(review): re-check these limits whenever the scale of `all_dist` changes
ax.set_xlim(-0.001, 0.01)

TODO:

- Check astrometry
- Make light curves
- create co-add image
- run ap-pipe

