This notebook examines the approximate SEDs supplied by cosmoDC2 for a sample of galaxies that are 
centrals and are flagged as being on the red sequence (RS). Some sample SEDs are plotted in bins of stellar mass and redshift. Then we apply a PCA algorithm to decompose the SEDs into principal components. 
We plot the first few eigenvectors and we plot the distribution of the first two PCA coefficients in bins of stellar mass and redshift to see if there are any trends that could be exploited for modeling the SEDs of these galaxies.
Eve Kovacs and Andrew Hearin

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np 
import h5py
import os
import re
import sys
sys.path.insert(0, '/global/u1/k/kovacs/gcr-catalogs_cosmoDC2_v1.1')
import GCRCatalogs
import math

In [None]:
# Compiled once; the capture group makes re.split keep the digit runs.
_INTEGER_RUN = re.compile(r'(\d+)')


def stringSplitByIntegers(x):
    """Split a string into text and integer pieces for natural sorting.

    Runs of decimal digits are converted to ``int`` and everything else is
    kept as ``str``, so using this function as a sort key orders
    ``'sed_200_50'`` before ``'sed_1000_246'``.

    Parameters
    ----------
    x : str
        String to split.

    Returns
    -------
    list
        Alternating text and integer pieces in their original order. The
        text pieces may be empty strings, e.g. when ``x`` ends with digits.
    """
    return [int(piece) if piece.isdigit() else piece
            for piece in _INTEGER_RUN.split(x)]

Use a single healpixel (9556) to assemble a data set from cosmoDC2_v1.1.4. 

In [None]:
# Open every catalog named below through the GCR reader.
catalogs = ['cosmoDC2_v1.1.4_9556']
gcs = []
for catalog in catalogs:
    gc = GCRCatalogs.load_catalog(catalog)
    gcs += [gc]

# The red-sequence flags are looked up among the native quantities.
keys = gcs[0].list_all_native_quantities()
rs_keys = [name for name in keys if 'sequence' in name]
print (rs_keys)

# Every other column name is picked from the full quantity list.
keys = gcs[0].list_all_quantities()
mass_keys = [name for name in keys if 'mass' in name]

# SED columns: those containing 'no_host' are kept apart, the rest are
# ordered naturally by the integers embedded in their names.
SED_nodust_keys = [name for name in keys if 'sed' in name and 'no_host' in name]
SED_keys = [name for name in keys if 'sed' in name and 'no_host' not in name]
SED_keys.sort(key=stringSplitByIntegers)
print (mass_keys, SED_keys)

# Observed (mag_*) and rest-frame (Mag_true_*_z0) magnitudes per filter.
filters = ['g', 'r', 'i', 'z']
mag_keys = ['mag_{}_lsst'.format(band) for band in filters]
Mag_keys = ['Mag_true_{}_lsst_z0'.format(band) for band in filters]
print(mag_keys, Mag_keys)

# Full list of columns to read from the catalog.
quantities = ['galaxy_id', 'redshift', 'is_central']
quantities += mag_keys + Mag_keys + rs_keys + mass_keys + SED_keys

In [None]:
# Read the requested columns of each catalog and attach two observed colors.
catdata = {}
for catalog, gc in zip(catalogs, gcs):
    data = catdata[catalog] = gc.get_quantities(quantities)
    data['g-r'] = data['mag_g_lsst'] - data['mag_r_lsst']
    data['r-i'] = data['mag_r_lsst'] - data['mag_i_lsst']

print(list(catdata))

Define functions to get the wavelengths associated with each SED filter and return the fluxes.

In [None]:
def get_lambdas(sed_keys):
    """Return one wavelength per SED key, in natural key order.

    Every key must contain at least two integer fields. The returned value
    for a key is ``first + second / 2`` (presumably the lower edge of the
    band plus half its width, i.e. the band centre -- confirm against the
    catalog's SED naming scheme).

    Parameters
    ----------
    sed_keys : iterable of str
        SED column names, e.g. ``'sed_1000_246'``.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per key, ordered by
        ``stringSplitByIntegers``. Empty if ``sed_keys`` is empty.
    """
    ordered = sorted(sed_keys, key=stringSplitByIntegers)
    if not ordered:
        return np.zeros(0)
    fields = np.asarray([re.findall(r'\d+', k) for k in ordered])
    # The builtin float replaces the removed np.float alias (same float64 dtype).
    return fields[..., 0].astype(float) + fields[..., 1].astype(float)/2.


def get_fluxes(data, sed_keys, mask=None, number=None):
    """Collect the SED columns of the selected galaxies into a 2-D array.

    Parameters
    ----------
    data : mapping of str to numpy.ndarray
        Catalog columns; must contain every entry of ``sed_keys``.
    sed_keys : sequence of str
        SED column names. Columns of the result follow the natural ordering
        given by ``stringSplitByIntegers``.
    mask : array, optional
        Row selection applied to every column. Defaults to all rows.
    number : int, optional
        Maximum number of selected rows to return (the first ones are kept).
        Defaults to all selected rows.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_galaxies, len(sed_keys))``.
    """
    ordered = sorted(sed_keys, key=stringSplitByIntegers)
    reference = data[sed_keys[0]]
    if mask is None:
        # np.ones takes ``dtype``; the previous ``type=bool`` raised TypeError.
        mask = np.ones(len(reference), dtype=bool)
    n_selected = len(reference[mask])
    number = n_selected if number is None else min(number, n_selected)

    fluxes = np.zeros((len(ordered), number))
    for n, k in enumerate(ordered):
        fluxes[n] = data[k][mask][0:number]

    return fluxes.T

In [None]:
# Keep the SED columns that belong to neither the disk nor the bulge and
# derive the wavelength grid from their names.
sed_keys = [k for k in SED_keys if not ('disk' in k or 'bulge' in k)]
sed_keys.sort(key=stringSplitByIntegers)

lambda_c = get_lambdas(sed_keys)
print(lambda_c)

NSED = len(lambda_c)
print(NSED)

Define a function to run the PCA decomposition on the fluxes. Then define a function to loop over the selected galaxy sample, bin it in mass and redshift, and
1) plot some sample SEDs
2) evaluate the PCA eigenvectors and coefficients

In [None]:
from sklearn.decomposition import PCA as sklPCA
def get_PCA_fit(fluxes):
    """Run a PCA decomposition of a flux matrix.

    Parameters
    ----------
    fluxes : numpy.ndarray
        2-D array with one row per galaxy and one column per SED band.

    Returns
    -------
    evecs : numpy.ndarray
        Principal axes (``components_`` of the fitted PCA), one per row.
    coeffs : numpy.ndarray
        Projection of every input row onto the principal axes.

    Notes
    -----
    The number of components is the smaller of the number of rows and the
    number of columns. scikit-learn rejects a request for more components
    than ``min(n_samples, n_features)``, which used to make sparsely
    populated bins fail. With at least as many galaxies as bands the result
    is unchanged.
    """
    n_components = min(fluxes.shape[0], fluxes.shape[1])
    pca = sklPCA(n_components=n_components)
    pca.fit(fluxes)
    evecs = pca.components_
    coeffs = np.array(pca.transform(fluxes))
    return evecs, coeffs

In [None]:
def analyse_SEDs(data, sed_keys, lambda_c, mstar_cuts, z_cuts, mask=None, number=None, colors=None,
                 plot_SED=False, pca=False, catalog='', figdir='./', mask_label='', NPCA=5):
    """Plot sample SEDs or PCA eigenvectors in bins of stellar mass and redshift.

    One figure is made and saved as a PNG in ``figdir`` for every
    (redshift bin, stellar-mass bin) pair. Each figure has one panel per
    entry of the module-level ``components`` list; the SED columns of a
    component are the entries of the module-level ``SED_keys`` whose name
    contains the component string.

    Parameters
    ----------
    data : mapping of str to numpy.ndarray
        Catalog columns; needs 'redshift', 'stellar_mass' and the SED columns.
    sed_keys : sequence of str
        Only ``sed_keys[0]`` is used, to size the default mask.
    lambda_c : array
        x values of the plotted curves (one per SED band).
    mstar_cuts, z_cuts : sequence of float
        Bin edges. Consecutive pairs define bins ``lo < value <= hi``.
    mask : boolean array, optional
        Extra row selection combined with every bin. Defaults to all rows.
    number : int, optional
        Maximum number of SEDs drawn per panel (``plot_SED`` mode only).
    colors : sequence, optional
        Matplotlib colors, one per curve. If shorter than the number of
        curves only ``len(colors)`` curves are drawn. ``None`` draws every
        curve with matplotlib's default color cycle.
    plot_SED : bool
        Plot the SEDs as ``-2.5*log10(flux)``. Takes precedence over ``pca``.
    pca : bool
        Fit a PCA per bin and component and plot the leading eigenvectors.
    catalog : str
        Label appended to the figure file names.
    figdir : str
        Directory that receives the figures.
    mask_label : str
        Text used in the panel titles and the first printed line.
    NPCA : int
        Number of leading eigenvectors drawn in ``pca`` mode.

    Returns
    -------
    dict
        One entry per bin and component, keyed by
        ``'z_{zlo}_{zhi}_{mlo}<M<{mhi}_{component}'``. In ``pca`` mode the
        value holds ``'<component>_evecs'`` and ``'<component>_coeffs'``;
        in ``plot_SED`` mode it is an empty dict.

    Raises
    ------
    ValueError
        If neither ``plot_SED`` nor ``pca`` is set.
    """
    if not (plot_SED or pca):
        # Without a mode there is no figure to fill or save.
        raise ValueError('analyse_SEDs needs plot_SED=True or pca=True')

    results = {}
    if mask is None:
        mask = np.ones(len(data[sed_keys[0]]), dtype=bool)
    print('#'+mask_label, np.count_nonzero(mask))

    fig_id = 'SED' if plot_SED else 'PCAcomponents'
    ncomp = len(components)

    for zcutlo, zcuthi in zip(z_cuts[0:-1], z_cuts[1:]):
        zlabel = '${} < z < {}$'.format(zcutlo, zcuthi)
        zmask = (data['redshift'] > zcutlo) & (data['redshift'] <= zcuthi)
        print('#z=',zcutlo, zcuthi, np.count_nonzero(zmask))
        for mcutlo, mcuthi in zip(mstar_cuts[0:-1], mstar_cuts[1:]):
            mmask = (data['stellar_mass'] > mcutlo) & (data['stellar_mass'] <= mcuthi)
            print('#M*=',np.count_nonzero(mmask))
            mlabel = r'${} < log(M*/M_\odot) < {}$'.format(math.log10(mcutlo), math.log10(mcuthi))
            mask_this = mmask & zmask & mask
            print('#all=',np.count_nonzero(mask_this))

            # One panel per component; squeeze=False keeps a 2-D axes array
            # even when there is a single component.
            fig, axall = plt.subplots(1, ncomp, figsize=(8*ncomp, 5), squeeze=False)
            for i, c in enumerate(components):
                key = 'z_{}_{}_{}<M<{}_{}'.format(zcutlo, zcuthi, mcutlo, mcuthi, c)
                results[key] = {}
                # Local name so that the sed_keys argument is not overwritten.
                comp_keys = sorted([k for k in SED_keys if c in k], key=stringSplitByIntegers)
                ax = axall.flat[i]
                if plot_SED:
                    fluxes = get_fluxes(data, comp_keys, mask=mask_this, number=number)
                    if colors is None:
                        styles = [{} for _ in range(len(fluxes))]
                    else:
                        styles = [{'color': col} for col in colors[0:len(fluxes)]]
                    for flux, style in zip(fluxes, styles):
                        mag = -2.5*np.log10(flux)
                        ax.plot(lambda_c, mag, **style)

                    ax.set_xlabel(r'$\lambda$')
                    ax.set_ylabel('Magnitude')
                    # Magnitudes: brighter is up.
                    ax.set_ylim(ax.get_ylim()[::-1])
                else:
                    fluxes = get_fluxes(data, comp_keys, mask=mask_this)
                    evecs, coeffs = get_PCA_fit(fluxes)
                    results[key][c+'_evecs'] = evecs
                    results[key][c+'_coeffs'] = coeffs
                    shown = evecs[0:NPCA]
                    if colors is None:
                        styles = [{} for _ in range(len(shown))]
                    else:
                        styles = [{'color': col} for col in colors[0:NPCA]]
                    for n, (evec, style) in enumerate(zip(shown, styles)):
                        ax.plot(lambda_c, evec, label=' '.join(['Eigenvector', str(n)]), **style)
                    ax.set_xlabel(r'$\lambda$')
                    ax.set_ylabel('PCA Component')
                    ax.legend(loc='best', fontsize='small')

                label = ', '.join([mlabel, zlabel])
                ax.set_title(', '.join([mask_label, label, c+' component']))

            # Both mass bounds use one decimal (the lower one was unformatted).
            figname = '{}_{:.1f}_lt_logM_le_{:.1f}_{:.2f}_lt_z_le_{:.2f}_{}.png'.format(
                fig_id, math.log10(mcutlo), math.log10(mcuthi), zcutlo, zcuthi, catalog)
            print('Saving {}'.format(figname))
            figfile = os.path.join(figdir, figname)
            fig.savefig(figfile, bbox_inches='tight')

    return results

Now set up the mask to select galaxies that are centrals and RS members and make plots. We keep the disk and bulge components separate.

In [None]:
# Plot settings shared by the cells below.
figdir = './'
number = 10
colors = ['black', 'r', 'y', 'g', 'blue', 'm', 'orange', 'c', 'pink', 'purple']
#components = ['', 'disk', 'bulge']
components = ['disk', 'bulge']

# Select centrals that carry the red-sequence flag for the chosen color code.
data = catdata[catalogs[0]]
code = 'ri'
rsmask = data['is_central'] & data['baseDC2/is_on_red_sequence_' + code]
print('#RS=', np.count_nonzero(rsmask))

# Bin edges: four log-spaced stellar-mass edges and four linear redshift edges.
mstar_cuts = np.logspace(9., 12., num=4)
zcuts = np.linspace(0., 1.5, num=4)

#make plots of some representative SEDs
results = analyse_SEDs(
    data, sed_keys, lambda_c, mstar_cuts, zcuts,
    mask=rsmask, number=number, colors=colors, plot_SED=True,
    catalog=catalogs[0], figdir=figdir, mask_label='RS Centrals',
)

The following cell evaluates the PCA decomposition for the above mass and redshift bins. However, the number of objects in each bin is limited and the decomposition becomes noisy after the first 2 components. We will skip this cell and instead move on to using the whole sample.

In [None]:
#results = analyse_SEDs(data, sed_keys, lambda_c, mstar_cuts, zcuts, mask=rsmask, number=number, pca=True, NPCA=5,
#             colors=colors, catalog=catalogs[0], figdir=figdir, mask_label='RS Centrals')

In [None]:
#get PCA decomposition for entire dataset with mask
def PCA_data(data, sed_keys, mask=None, components=None, NPCA=None):
    """Decompose the SEDs of the masked sample and store the coefficients.

    For every component the SED columns are the entries of the module-level
    ``SED_keys`` whose name contains the component string. The first
    ``NPCA`` PCA coefficients are written into ``data`` as new columns
    ``'PCA_<component>_<n>'`` that hold NaN for rows outside ``mask``.

    Parameters
    ----------
    data : mapping of str to numpy.ndarray
        Catalog columns. Modified in place.
    sed_keys : sequence of str
        Only ``sed_keys[0]`` is used, to size the default mask.
    mask : boolean array, optional
        Rows entering the decomposition. Defaults to all rows.
    components : sequence of str, optional
        Component strings, e.g. ``['disk', 'bulge']``. Defaults to none.
    NPCA : int, optional
        Number of coefficients to store per component. Defaults to the
        number of SED columns of the first component. Never more than the
        number of coefficients the decomposition returns.

    Returns
    -------
    evectors : dict
        Eigenvector array of every component, keyed by component string.
    data : mapping
        The input ``data`` with the coefficient columns added.
    """
    # None replaces the former mutable default argument.
    components = [] if components is None else components
    if mask is None:
        mask = np.ones(len(data[sed_keys[0]]), dtype=bool)
    print('#=', np.count_nonzero(mask))
    if NPCA is None:
        # Count from the same key list that is decomposed below.
        NPCA = len([k for k in SED_keys if components[0] in k]) if len(components) else 0
    print('NPCA=',NPCA)
    evectors = {}

    for c in components:
        # Local name so that the sed_keys argument is not overwritten.
        comp_keys = sorted([k for k in SED_keys if c in k], key=stringSplitByIntegers)
        fluxes = get_fluxes(data, comp_keys, mask=mask)
        evecs, coeffs = get_PCA_fit(fluxes)
        print('shapes:', evecs.shape, coeffs.shape)
        for npca in range(min(NPCA, coeffs.shape[1])):
            newkey = '_'.join(['PCA', c, str(npca)])
            # NaN everywhere, coefficients only on the selected rows.
            column = np.full(len(mask), np.nan)
            column[mask] = coeffs[:, npca]
            data[newkey] = column
        evectors[c] = evecs

    return evectors, data


def plot_evectors(evectors, lambda_c, components=components, colors=None,
                  catalog='', figdir='./', mask_label='', NPCA=5):
    """Plot the leading PCA eigenvectors of every component and save the figure.

    Parameters
    ----------
    evectors : mapping of str to array
        Eigenvector arrays keyed by component string, one eigenvector per row.
    lambda_c : array
        x values of the curves (one per SED band).
    components : sequence of str
        Components to draw, one panel each. The default is the value the
        module-level ``components`` had when this function was defined.
    colors : sequence, optional
        Matplotlib colors, one per eigenvector. If shorter than ``NPCA``
        only ``len(colors)`` eigenvectors are drawn. ``None`` uses
        matplotlib's default color cycle.
    catalog : str
        Label appended to the figure file name.
    figdir : str
        Directory that receives the figure.
    mask_label : str
        Text used in the panel titles.
    NPCA : int
        Number of leading eigenvectors drawn per panel.
    """
    fig_id = 'PCAcomponents'
    ncomp = len(components)
    # squeeze=False keeps a 2-D axes array even for a single component.
    fig, axall = plt.subplots(1, ncomp, figsize=(8*ncomp, 5), squeeze=False)
    for i, c in enumerate(components):
        ax = axall.flat[i]
        shown = evectors[c][0:NPCA]
        if colors is None:
            styles = [{} for _ in range(len(shown))]
        else:
            styles = [{'color': col} for col in colors[0:NPCA]]
        curves = list(zip(shown, styles))
        for n, (evec, style) in enumerate(curves):
            ax.plot(lambda_c, evec, label=' '.join(['Eigenvector', str(n)]), **style)
        # Decorate each panel once rather than once per curve.
        ax.set_xlabel(r'$\lambda$')
        ax.set_ylabel('PCA Component')
        if curves:
            ax.legend(loc='best', fontsize='small')
        ax.set_title(', '.join([mask_label, c+' component']))

    figname = '{}_{}.png'.format(fig_id, catalog)
    print('Saving {}'.format(figname))
    figfile = os.path.join(figdir, figname)
    fig.savefig(figfile, bbox_inches='tight')

Note that we have added the PCA coefficients to the data dict. This is so we can subdivide by mass and redshift later and look for trends. We add some checks to make sure that the code is working as expected.

In [None]:
# Decompose the RS-central sample, requesting NSED coefficients per component.
evectors, data = PCA_data(data, sed_keys, mask=rsmask, components=components, NPCA=NSED)
# Checks: list the coefficient columns added to data and the components fitted.
print([k for k in data.keys() if 'PCA' in k])
print(evectors.keys())

Plot the first 5 eigenvectors. The 0th component captures the basic shape of the SED. Components 3 and 4 are starting to look noisy.

In [None]:
# NPCA is left at its default of 5 eigenvectors per component.
plot_evectors(evectors, lambda_c, components=components, colors=colors, 
              catalog=catalogs[0], figdir=figdir, mask_label='RS Centrals')

Now select some properties to use to search for correlations with the PCA coefficients. Stellar mass, halo mass and colors are good candidates for properties that may be correlated with the coefficients. Define a function to compute the colors from magnitudes and add them to the data dict.

In [None]:
# Column-name templates for observed and rest-frame (z0) LSST magnitudes.
mtemplate = 'mag_{}_lsst'
Mtemplate = 'Mag_true_{}_lsst_z0'
frames = ['rest', 'obs']

# Bands used to build adjacent-band colors.
#bands = ['u', 'g', 'r', 'i', 'z', 'y']
bands = ['g', 'r', 'i', 'z']

# Properties to compare with the PCA coefficients.
properties = [
    'stellar_mass',
    'halo_mass',
    'r-i',
    'g-r',
]
def get_colors(bands, color_dict, key_template):
    """Add the colors of adjacent bands to ``color_dict`` and return it.

    For each consecutive pair ``(b1, b2)`` of ``bands`` the difference of
    the two magnitude columns named by ``key_template`` is stored under
    ``'(b1-b2)_rest'`` when the template contains ``'z0'`` and under
    ``'(b1-b2)_obs'`` otherwise. The dict is modified in place and the new
    key names are printed.
    """
    if 'z0' in key_template:
        frame = 'rest'
    else:
        frame = 'obs'

    added = []
    for blue, red in zip(bands, bands[1:]):
        name = ''.join(['(', blue, '-', red, ')_', frame])
        added.append(name)
        blue_mag = color_dict[key_template.format(blue)]
        red_mag = color_dict[key_template.format(red)]
        color_dict[name] = blue_mag - red_mag

    print('Added keys:', added)

    return color_dict

# Add rest-frame colors (Mtemplate) and observed colors (mtemplate) to data.
data = get_colors(bands, data, Mtemplate)
data = get_colors(bands, data, mtemplate)
# Show the first 20 values of one of the new columns.
print(data['(r-i)_rest'][0:20])

Define a function to plot the distribution of a selected PCA coefficient versus some property for different redshift ranges.

In [None]:
#plot PCA coefficients
def plot_coefficients(data, npca, properties, z_cuts, components=None, mask=None, cmap='cool',
                      Nxbins=50, Nybins=50, catalog='', figdir='./', mask_label=''):
    """Plot 2-D histograms of one PCA coefficient against galaxy properties.

    One figure is made and saved as a PNG in ``figdir`` for every entry of
    ``properties``. Each figure has one row per redshift bin and one column
    per component.

    Parameters
    ----------
    data : mapping of str to numpy.ndarray
        Catalog columns; needs 'redshift', the properties and the
        ``'PCA_<component>_<npca>'`` columns.
    npca : int
        Index of the PCA coefficient to plot.
    properties : sequence of str
        Column names used as x axis. Columns whose name contains 'mass'
        are plotted as log10.
    z_cuts : sequence of float
        Redshift bin edges. Consecutive pairs define bins ``lo < z <= hi``.
    components : sequence of str
        Component strings, e.g. ``['disk', 'bulge']``. Required.
    mask : boolean array, optional
        Extra row selection combined with every bin. Defaults to all rows.
    cmap : str
        Matplotlib colormap of the histograms.
    Nxbins, Nybins : int
        Number of histogram bins along x and y.
    catalog : str
        Label appended to the figure file names.
    figdir : str
        Directory that receives the figures.
    mask_label : str
        Text used in the panel titles and the first printed line.

    Raises
    ------
    ValueError
        If ``components`` is missing or empty.

    Notes
    -----
    The file name carries the full redshift range ``z_cuts[0]`` to
    ``z_cuts[-1]`` because the figure contains every redshift bin. It used
    to carry the edges of the last bin only.
    """
    if components is None or len(components) == 0:
        raise ValueError('plot_coefficients needs at least one entry in components')
    if mask is None:
        # 'redshift' is always required, unlike a particular PCA column.
        mask = np.ones(len(data['redshift']), dtype=bool)
    print('#'+mask_label, np.count_nonzero(mask))

    ncols = len(components)
    nrows = len(z_cuts[0:-1])
    for p in properties:
        # x values and labels depend on the property only, not on the bin.
        if 'mass' in p:
            xdata = np.log10(data[p])
            subscript = '*' if 'stellar' in p else 'halo'
            xlabel = ''.join([r'$\log10(M_{', subscript, r'}/M_{\odot})$'])
        else:
            xlabel = p
            xdata = data[p]
        fig_id = '_'.join(['PCAcoefficient', str(npca), 'vs', p])

        # squeeze=False keeps a 2-D axes array for any grid size.
        fig, axall = plt.subplots(nrows, ncols, figsize=(8*ncols, nrows*5), squeeze=False)
        for nz, (zcutlo, zcuthi) in enumerate(zip(z_cuts[0:-1], z_cuts[1:])):
            zlabel = '${} < z < {}$'.format(zcutlo, zcuthi)
            zmask = (data['redshift'] > zcutlo) & (data['redshift'] <= zcuthi)
            print('#z=',zcutlo, zcuthi, np.count_nonzero(zmask))
            mask_this = zmask & mask
            print('#all=',np.count_nonzero(mask_this))
            for i, c in enumerate(components):
                ax = axall[nz, i]
                ykey = '_'.join(['PCA', c, str(npca)])
                ydata = data[ykey]
                ax.set_xlabel(xlabel)
                ax.set_ylabel('PCA Coefficient '+str(npca))
                ax.set_title(', '.join([mask_label, zlabel, c+' component']))

                # histogram2d cannot derive its range from NaN or inf values,
                # e.g. coefficient rows that were never filled.
                selected = mask_this & np.isfinite(xdata) & np.isfinite(ydata)
                if not np.any(selected):
                    print(c, 'no galaxies selected')
                    continue
                print(c, np.min(ydata[selected]), np.max(ydata[selected]))
                CD2, xedges, yedges = np.histogram2d(xdata[selected], ydata[selected],
                                                     bins=(Nxbins, Nybins))
                # Hide empty cells instead of drawing them in the lowest color.
                CD2masked = np.ma.masked_where(CD2.T==0.0, CD2.T)
                hd2 = ax.pcolormesh(xedges, yedges, CD2masked, cmap=cmap)
                plt.colorbar(hd2, ax=ax)

        figname = '{}_{:.2f}_lt_z_le_{:.2f}_{}.png'.format(fig_id, z_cuts[0],
                                                            z_cuts[-1], catalog)
        print('Saving {}'.format(figname))
        figfile = os.path.join(figdir, figname)
        fig.savefig(figfile, bbox_inches='tight')

Make a further selection on stellar mass > 1e9, since we are interested in BCGs. Plot the behavior of PCA coefficient 0. Note the correlation with stellar mass for the bulge component. No correlations are observed with color, which is expected since colors would characterize changes in shape from the 0th eigenvector.

In [None]:
print(components)

# x-axis properties for the coefficient plots, using the rest-frame colors.
properties = ['stellar_mass', 'halo_mass', '(r-i)_rest', '(g-r)_rest']

# RS centrals above the stellar-mass threshold.
mass_mask = data['stellar_mass'] > 1e9
mask_all = mass_mask & rsmask

plot_coefficients(data, 0, properties, zcuts, components=components, mask=mask_all,
                  mask_label='RS Centrals, M* > 1e9', catalog=catalogs[0], figdir=figdir)

Now look at the behavior of PCA coefficient 1. There is still some correlation with stellar mass, and a very mild correlation with color for the disk component. However, nothing striking is seen.

In [None]:
# Same selection and properties as for coefficient 0, now for coefficient 1.
plot_coefficients(data, 1, properties, zcuts, mask=mask_all, components=components, 
                      catalog=catalogs[0], figdir=figdir, mask_label='RS Centrals, M* > 1e9')

Finally, for completeness, here are the plots for PCA coefficient 2. No significant correlations are observed.

In [None]:
# Same selection and properties as for coefficient 0, now for coefficient 2.
plot_coefficients(data, 2, properties, zcuts, mask=mask_all, components=components, 
                      catalog=catalogs[0], figdir=figdir, mask_label='RS Centrals, M* > 1e9')