# Walkthrough of the whole process of acquiring and cross-matching data
In this notebook, we will go through the stages required to transform our spectra into a useful dataset, complemented by available photometry and information.

## 1. Ensure that the correct data path is known to the system

In [None]:
import os

# Display the data location currently known to the system.
# `.get` with an empty-string default avoids a KeyError when FORS2DATALOC has
# never been exported, so the fallback in the next cell can actually run.
os.environ.get("FORS2DATALOC", "")

In [None]:
# Fall back to the data directory located two levels above the working
# directory (../../src/data) when FORS2DATALOC is unset OR empty.
# Indexing os.environ directly would raise KeyError for an unset variable.
if not os.environ.get("FORS2DATALOC"):
    os.environ["FORS2DATALOC"] = os.path.abspath(os.path.join("..", "..", "src", "data"))
os.environ["FORS2DATALOC"]

It is strongly recommended to add the following to your `.bashrc` or `.bash_aliases` file:
```bash
export FORS2DATALOC="[path to this repository]/src/data"
```
Then log out and log back in, or `source` the file, and the environment variable will be set and should be set automatically each time you start a session.

## 2. Explore available data
FITS tables for FORS2 and GALEX data are queried automatically.
FITS table from 9-band KiDS must be queried externally from the ESO archives website and saved with the appropriate name. It should, however, be part of the data cloned from the GitHub repository.

In [None]:
from process_fors2.fetchData import queryTargetInSimbad

### Simbad query
For illustration purposes - we notice the `MAIN_ID` field that gives us the handle to the target in Simbad. It is already hard-coded in our package.

In [None]:
simbadtable = queryTargetInSimbad()

In [None]:
simbadtable

### Vizier query
This is how we obtain data related to the spectra that come with this package. The function can be used to query other objects but defaults to argument values that are hard-coded in the package.

In [None]:
from process_fors2.fetchData import DEFAULTS_DICT, getFors2FitsTable

DEFAULTS_DICT

In [None]:
os.path.isfile(DEFAULTS_DICT["FITS location"])

In [None]:
fors2table_vizier = getFors2FitsTable()

In [None]:
fors2table_vizier

In [None]:
os.path.isfile(DEFAULTS_DICT["FITS location"])

The table has been queried from Vizier and correctly written to the disk.

### GALEX query

In [None]:
os.path.isfile(DEFAULTS_DICT["GALEX FITS"])

In [None]:
from process_fors2.fetchData import queryGalexMast

In [None]:
galextable_mast = queryGalexMast()

In [None]:
galextable_mast

In [None]:
os.path.isfile(DEFAULTS_DICT["GALEX FITS"])

The table has been queried from MAST and correctly written to the disk.

In [None]:
df_galex = galextable_mast.to_pandas()

In [None]:
df_galex

In [None]:
df_galex.hist("fuv_mag", bins=100)

In [None]:
df_galex.hist("distance_arcmin", bins=100)

In [None]:
# Columns kept from the GALEX table: the two position columns, followed by
# magnitude, magnitude error, flux and flux error for the fuv and nuv bands.
SelectedColumns_galex = ["ra_galex", "dec_galex"] + [
    f"{band}_{quantity}"
    for quantity in ("mag", "magerr", "flux", "fluxerr")
    for band in ("fuv", "nuv")
]
# `filter(items=...)` keeps only the listed columns that actually exist.
df_galex = df_galex.filter(items=SelectedColumns_galex, axis="columns")
df_galex

In [None]:
import numpy as np

df_galex[np.isfinite(df_galex["fuv_mag"])]

### 9-band photometry from KiDS
These data are not as readily available through astroquery and must be downloaded from the ESO Archive website, then saved under an appropriate name, such as the one given in the default parameters (`DEFAULTS_DICT["KiDS FITS"]`).
The existing file was obtained with a query centered on the cluster region, in a $12' \times 12'$ box, keeping only galaxies with a filter on the parameter `SG_FLAG`.

In [None]:
os.path.isfile(DEFAULTS_DICT["KiDS FITS"])

In [None]:
from process_fors2.fetchData import readKids

In [None]:
kidstable_eso = readKids()

In [None]:
kidstable_eso

In [None]:
[col for col in kidstable_eso.columns if "DMAG" in col]

In [None]:
df_kids = kidstable_eso.to_pandas()

In [None]:
# Columns kept from the KiDS table: identification / descriptor columns first,
# then the four GAAP quantities for each of the nine bands, then the
# extinction columns for the first four bands (u, g, r, i).
_kids_bands = ("u", "g", "r", "i", "Z", "Y", "J", "H", "Ks")
SelectedColumns_kids = (
    ["KiDS_ID", "KIDS_TILE", "ra_kids", "dec_kids", "FLUX_RADIUS", "CLASS_STAR", "Z_B", "Z_ML"]
    + [f"{quantity}_GAAP_{band}" for quantity in ("MAG", "MAGERR", "FLUX", "FLUXERR") for band in _kids_bands]
    + [f"EXTINCTION_{band}" for band in _kids_bands[:4]]
)
# `filter(items=...)` keeps only the listed columns that actually exist.
df_kids = df_kids.filter(items=SelectedColumns_kids, axis="columns")

In [None]:
df_kids

In [None]:
df_kids.hist("MAG_GAAP_r", bins=100)

In [None]:
import astropy.coordinates as coord
import astropy.units as u

# Sky coordinates of the GALEX and KiDS sources, built from their RA/Dec
# columns interpreted in degrees.
radec_galex = coord.SkyCoord(ra=df_galex["ra_galex"].to_numpy(), dec=df_galex["dec_galex"].to_numpy(), unit=u.deg)
radec_kids = coord.SkyCoord(ra=df_kids["ra_kids"].to_numpy(), dec=df_kids["dec_kids"].to_numpy(), unit=u.deg)

In [None]:
# Convert the Vizier table to pandas and build sky coordinates from its
# J2000 RA/Dec columns, interpreted in degrees.
df_fors2 = fors2table_vizier.to_pandas()
radec_fors2 = coord.SkyCoord(ra=df_fors2["RAJ2000"].to_numpy(), dec=df_fors2["DEJ2000"].to_numpy(), unit=u.deg)

In [None]:
import matplotlib.pyplot as plt
import matplotlib

%matplotlib inline
# Overlay the positions of the three catalogues on one RA/Dec scatter plot.
fig, ax = plt.subplots()
ax.scatter(radec_fors2.ra.deg, radec_fors2.dec.deg, s=9, label="FORS2")
ax.scatter(radec_kids.ra.deg, radec_kids.dec.deg, s=81, facecolors="none", edgecolors="y", alpha=0.1, label="KiDS")
ax.scatter(radec_galex.ra.deg, radec_galex.dec.deg, s=81, facecolors="none", edgecolors="violet", alpha=0.3, label="GALEX")
ax.grid()
ax.set_xlabel("Right ascension [deg]")
ax.set_ylabel("Declination [deg]")
fig.suptitle("On-sky comparison of catalogues")
# Legend is anchored just outside the right edge of the axes.
ax.legend(loc="lower left", bbox_to_anchor=(1.01, 0.0))

In [None]:
# Locate the catalogue images relative to the configured data directory
# (FORS2DATALOC = "<repository>/src/data") rather than a user-specific
# absolute path, so the notebook runs on any checkout.
# The trailing "" keeps the trailing path separator the previous value had.
CATALOGS = os.path.join(os.environ["FORS2DATALOC"], "catalogs", "")
images = os.path.join(CATALOGS, "SDSS_images_ugri_005403-282358")
os.listdir(images)

In [None]:
img_to_plot = os.path.join(images, "ADP.2019-02-11T13_02_24.807_TARGET_00_54_03_-28_23_58.fits")

In [None]:
from astropy.io import fits
from astropy.stats import sigma_clipped_stats

# Open the selected image and keep the header and data of its first HDU.
img_hdus = fits.open(img_to_plot)
primary_hdu = img_hdus[0]
img_hdr, img_data = primary_hdu.header, primary_hdu.data

In [None]:
img_hdr

In [None]:
# Magnitude-like transform of the pixel values. Zero or negative pixels yield
# inf / NaN here, which is expected; the numpy warnings are silenced for this
# statement only, so the resulting values are unchanged.
with np.errstate(divide="ignore", invalid="ignore"):
    img_magsAB = -2.5 * np.log10(img_data)

# Sigma-clipped (mean, median, std) of the raw image and of its transform.
# NOTE(review): relies on sigma_clipped_stats discarding non-finite values —
# confirm against the installed astropy version.
moy, med, sig = sigma_clipped_stats(img_data)
moyAB, medAB, sigAB = sigma_clipped_stats(img_magsAB)

In [None]:
# Load the mask file. The previous version re-opened `img_to_plot` and read
# `img_hdus`, so `msk_hdr` / `msk_data` were just copies of the image.
# NOTE(review): this file name uses ':' separators while the image file name
# uses '_'; check the directory listing above and adjust the name if the file
# on disk was saved with underscores.
img_mask = os.path.join(images, "ADP.2019-02-11T13:02:24.809_TARGET_00:54:03_-28:23:58.fits")
msk_hdus = fits.open(img_mask)
msk_hdr = msk_hdus[0].header
msk_data = msk_hdus[0].data

In [None]:
msk_hdr

In [None]:
from astropy.wcs import WCS

# Show the image on WCS-aware axes and mark the FORS2 targets on top of it.
wcs = WCS(img_hdr)
ax = plt.subplot(projection=wcs)

# Display range: from 1 sigma below to 5 sigma above the clipped median.
display_min, display_max = med - 1 * sig, med + 5 * sig
ax.imshow(img_data, vmin=display_min, vmax=display_max, origin="lower")

# NOTE(review): the native axes are labelled as Galactic coordinates; verify
# against the CTYPE keywords of `img_hdr` that the image WCS is really Galactic.
first_axis, second_axis = ax.coords[0], ax.coords[1]
ax.coords.grid(True, color="white", ls="solid")
first_axis.set_axislabel("Galactic Longitude")
second_axis.set_axislabel("Galactic Latitude")

# Second coordinate grid, in the FK5 frame.
overlay = ax.get_coords_overlay("fk5")
overlay.grid(True, color="white", ls="dotted")
overlay[0].set_axislabel("Right Ascension (J2000)")
overlay[1].set_axislabel("Declination (J2000)")

fk5_transform = ax.get_transform("fk5")
ax.scatter(radec_fors2.ra.deg, radec_fors2.dec.deg, s=36, label="FORS2", transform=fk5_transform, facecolors="none", edgecolors="pink", alpha=0.4)

Note: the KiDS query is modelled on the GALEX one.

## 3. Check spectra
Spectra from galaxies in the field described above are shipped within this package. Here, we manipulate them to obtain a final file that gathers all available data, cross-matched, thus combining spectroscopy and photometry information for those galaxies.

In [None]:
os.listdir(DEFAULTS_DICT["FORS2 spectra"])

In [None]:
os.listdir(DEFAULTS_DICT["Starlight spectra"])

In [None]:
from process_fors2.fetchData import fors2ToH5

In [None]:
# if os.path.isfile(DEFAULTS_DICT["FORS2 HDF5"]):
#    os.remove(DEFAULTS_DICT["FORS2 HDF5"])
os.path.isfile(DEFAULTS_DICT["FORS2 HDF5"])

In [None]:
# fors2ToH5()

In [None]:
import numpy as np

uniques, counts = np.unique(fors2table_vizier["ID"], return_counts=True)
uniques[counts > 1]

In [None]:
_sel = fors2table_vizier["ID"] == 72
fors2table_vizier[_sel]

In [None]:
os.path.isfile(DEFAULTS_DICT["FORS2 HDF5"])

In [None]:
from process_fors2.fetchData import starlightToH5

In [None]:
# if os.path.isfile(DEFAULTS_DICT["Starlight HDF5"]):
#    os.remove(DEFAULTS_DICT["Starlight HDF5"])
os.path.isfile(DEFAULTS_DICT["Starlight HDF5"])

In [None]:
# starlightToH5()

In [None]:
os.path.isfile(DEFAULTS_DICT["Starlight HDF5"])

We have now generated HDF5 files containing the catalog data and the available spectra; we have also noticed one caveat of the script and checked that no data would conflict. Let's decode the files that were created!

In [None]:
from process_fors2.fetchData import readH5FileAttributes

In [None]:
sl_df = readH5FileAttributes(DEFAULTS_DICT["Starlight HDF5"])

In [None]:
sl_df

In [None]:
sl_df[sl_df["num"] == 72]

In [None]:
import h5py

In [None]:
# Inspect the first group of the Starlight file: list its members, its
# attribute names, and print its tag next to its "num" attribute.
with h5py.File(DEFAULTS_DICT["Starlight HDF5"], "r") as sl_in:
    first_tags = list(sl_in.keys())[:1]
    for tag in first_tags:
        group = sl_in.get(tag)

        print("DATA IN GROUP \n============")
        for member in group:
            print(member)

        print("ATTRIBUTES IN GROUP \n===================")
        for attr in group.attrs:
            print(attr)

        print("CHECK CONSISTENCY \n=================")
        num_attribute = group.attrs.get("num")
        print(tag, num_attribute)

In [None]:
# Report every group whose tag does not contain its own "num" attribute.
# This is a substring test, so it is a loose sanity check rather than a
# strict equality check.
with h5py.File(DEFAULTS_DICT["Starlight HDF5"], "r") as sl_in:
    for tag in sl_in:
        group = sl_in.get(tag)
        num = group.attrs.get("num")
        if f"{num}" not in tag:
            # The message was previously a plain string (missing `f` prefix)
            # and printed the placeholders literally.
            print(f"Inconsistent spectrum : tag {tag}, num {num}")

In [None]:
import matplotlib.pyplot as plt

# Plot the first four Starlight spectra, one log-log figure per spectrum,
# showing both flux datasets stored in the group.
with h5py.File(DEFAULTS_DICT["Starlight HDF5"], "r") as sl_in:
    for tag in list(sl_in.keys())[:4]:
        group = sl_in.get(tag)
        wl, fl, fl_ext = (np.array(group.get(key)) for key in ("wl", "fl", "fl_ext"))

        fig, ax = plt.subplots()
        ax.plot(wl, fl, label="Flux corrected for dust extinction")
        ax.plot(wl, fl_ext, label="Flux not corrected for dust extinction")
        ax.set_xscale("log")
        ax.set_yscale("log")
        ax.set_xlabel("Wavelength [Ang.]")
        ax.set_ylabel("Flux [arbitrary units]")
        fig.suptitle(f"tag {tag}, num {group.attrs.get('num')}")
        ax.legend()
        plt.show()

In [None]:
# Plot the first four FORS2 spectra, one log-log figure per spectrum, and
# overplot the samples whose mask value is strictly positive.
with h5py.File(DEFAULTS_DICT["FORS2 HDF5"], "r") as f2_in:
    for tag in list(f2_in.keys())[:4]:
        group = f2_in.get(tag)
        wl, fl = (np.array(group.get(key)) for key in ("wl", "fl"))
        # Boolean selector: True wherever the stored mask is > 0.
        msk = np.array(group.get("mask")) > 0

        fig, ax = plt.subplots()
        ax.plot(wl, fl, label="Observed flux")
        ax.plot(wl[msk], fl[msk], lw=0.5, label="Masked portions of the flux")
        ax.set_xscale("log")
        ax.set_yscale("log")
        ax.set_xlabel("Wavelength [Ang.]")
        ax.set_ylabel("Flux [arbitrary units]")
        fig.suptitle(f"tag {tag}, num {group.attrs.get('num')}")
        ax.legend()
        plt.show()

We have shown that our `hdf5` files contain all the information from the initial table, plus all available spectra from observations (FORS2) or SPS extrapolation (Starlight), along with mask information and fluxes with and without dust extinction.

## 4. Perform cross-match

In [None]:
import pandas as pd
from process_fors2.fetchData import readH5FileAttributes

df_for2_forfit = readH5FileAttributes(DEFAULTS_DICT["FORS2 HDF5"])

In [None]:
df_for2_forfit

In [None]:
# Cross-match every FORS2 spectrum with its nearest KiDS and GALEX source.
#
# All spectra are matched in one vectorised call per catalogue instead of one
# call per row, and the matched rows are gathered with a single positional
# lookup. Unlike the previous row-by-row fill of an empty (object-dtype)
# frame, this keeps the numeric dtypes of the photometry columns.
#
# Produces:
#   all_idx_k / all_idx_g : positional index of the match in df_kids / df_galex
#   all_d2d_k / all_d2d_g : on-sky separation to the match, in arcsec
#   df_photometry         : matched photometry, indexed like df_for2_forfit
#   df_concatenated       : df_for2_forfit and df_photometry side by side
radec_spectra = coord.SkyCoord(
    df_for2_forfit["ra"].to_numpy(dtype=float) * u.degree,
    df_for2_forfit["dec"].to_numpy(dtype=float) * u.degree,
)

# Nearest neighbour in each catalogue (the 3D distance output is unused).
all_idx_k, sep_kids, _ = radec_spectra.match_to_catalog_sky(radec_kids)
all_idx_g, sep_galex, _ = radec_spectra.match_to_catalog_sky(radec_galex)
all_idx_k = np.asarray(all_idx_k, dtype=int)
all_idx_g = np.asarray(all_idx_g, dtype=int)
all_d2d_k = np.asarray(sep_kids.arcsec)
all_d2d_g = np.asarray(sep_galex.arcsec)

# `reindex` guarantees every selected column is present (NaN-filled if the
# catalogue lacks it), matching the column layout of the previous version.
matched_kids = df_kids.iloc[all_idx_k].reindex(columns=SelectedColumns_kids)
matched_galex = df_galex.iloc[all_idx_g].reindex(columns=SelectedColumns_galex)
# Re-label the matched rows with the spectra index so they align on concat.
matched_kids.index = df_for2_forfit.index
matched_galex.index = df_for2_forfit.index

df_photometry = pd.concat((matched_kids, matched_galex), axis=1)
df_photometry["id_galex"] = all_idx_g
df_photometry["id_kids"] = all_idx_k
df_photometry["asep_galex"] = all_d2d_g
df_photometry["asep_kids"] = all_d2d_k

df_concatenated = pd.concat((df_for2_forfit, df_photometry), axis=1)

In [None]:
df_concatenated

In [None]:
# Report rows whose "name" does not contain their "num" (substring test).
for index, row in df_concatenated.iterrows():
    if f"{row['num']}" not in f"{row['name']}":
        # The message was previously a plain string (missing `f` prefix) and
        # printed the placeholders literally.
        print(f"Inconsistent spectrum at row {index}: name {row['name']}, num {row['num']}")

In [None]:
(df_concatenated["asep_galex"].values[0] * u.arcsec).to(u.deg).value

In [None]:
df_concatenated.hist("asep_kids", bins=100)

In [None]:
df_concatenated.hist("asep_galex", bins=100)

In [None]:
df_concatenated.hist("nuv_flux", bins=100)

In [None]:
# Convert every column that can be parsed as numbers to a numeric dtype, in
# place. Columns that cannot be converted are deliberately left as they are.
for col in df_concatenated.columns:
    try:
        df_concatenated[col] = pd.to_numeric(df_concatenated[col])
    except (ValueError, TypeError):
        # ValueError: unparsable strings; TypeError: unsupported object types.
        continue

In [None]:
df_concatenated.hist("nuv_flux", bins=100)

In [None]:
df_concatenated.hist("fuv_flux", bins=100)

In [None]:
df_concatenated["num"]

In [None]:
from astropy.visualization.wcsaxes import SphericalCircle
from matplotlib.patches import Circle  # no longer used here; kept because a later cell may still rely on it

# Image with the FORS2 targets (red) and their nearest KiDS match (yellow).
ax = plt.subplot(projection=wcs)
# Display range: from 1 sigma below to 5 sigma above the clipped median.
ax.imshow(img_data, vmin=med - 1 * sig, vmax=med + 5 * sig, origin="lower")
# NOTE(review): the native axes are labelled as Galactic coordinates; verify
# against the image header that the WCS is really Galactic.
ax.coords.grid(False, color="white", ls="solid")
ax.coords[0].set_axislabel("Galactic Longitude")
ax.coords[1].set_axislabel("Galactic Latitude")

overlay = ax.get_coords_overlay("fk5")
overlay.grid(True, color="white", ls="dotted")
overlay[0].set_axislabel("Right Ascension (J2000)")
overlay[1].set_axislabel("Declination (J2000)")

fk5_transform = ax.get_transform("fk5")
ax.scatter(df_concatenated["ra"], df_concatenated["dec"], s=9, label="FORS2", transform=fk5_transform, c="r", alpha=0.3)
ax.scatter(df_concatenated["ra_kids"], df_concatenated["dec_kids"], s=4, label="KiDS", transform=fk5_transform, c="yellow", alpha=0.3)

# One circle per match, centred on the KiDS source with the match separation
# as radius. SphericalCircle draws a true on-sky circle; a plain matplotlib
# Circle is defined in (RA, Dec) degree space and is stretched in RA.
for ra, dec, asep in zip(df_concatenated["ra_kids"].values, df_concatenated["dec_kids"].values, df_concatenated["asep_kids"].values):
    cir = SphericalCircle((ra * u.deg, dec * u.deg), asep * u.arcsec, edgecolor="yellow", facecolor="none", transform=fk5_transform, alpha=0.5)
    ax.add_patch(cir)

In [None]:
# Imported here so this cell does not depend on an import made in another cell.
from astropy.visualization.wcsaxes import SphericalCircle

# Image with the FORS2 targets (red) and their nearest GALEX match (yellow).
ax = plt.subplot(projection=wcs)
# Display range: from 1 sigma below to 5 sigma above the clipped median.
ax.imshow(img_data, vmin=med - 1 * sig, vmax=med + 5 * sig, origin="lower")
# NOTE(review): the native axes are labelled as Galactic coordinates; verify
# against the image header that the WCS is really Galactic.
ax.coords.grid(False, color="white", ls="solid")
ax.coords[0].set_axislabel("Galactic Longitude")
ax.coords[1].set_axislabel("Galactic Latitude")

overlay = ax.get_coords_overlay("fk5")
overlay.grid(True, color="white", ls="dotted")
overlay[0].set_axislabel("Right Ascension (J2000)")
overlay[1].set_axislabel("Declination (J2000)")

fk5_transform = ax.get_transform("fk5")
ax.scatter(df_concatenated["ra"], df_concatenated["dec"], s=9, label="FORS2", transform=fk5_transform, c="r", alpha=0.3)
ax.scatter(df_concatenated["ra_galex"], df_concatenated["dec_galex"], s=4, label="GALEX", transform=fk5_transform, c="yellow", alpha=0.3)

# One circle per match, centred on the GALEX source with the match separation
# as radius. SphericalCircle draws a true on-sky circle; a plain matplotlib
# Circle is defined in (RA, Dec) degree space and is stretched in RA.
for ra, dec, asep in zip(df_concatenated["ra_galex"].values, df_concatenated["dec_galex"].values, df_concatenated["asep_galex"].values):
    cir = SphericalCircle((ra * u.deg, dec * u.deg), asep * u.arcsec, edgecolor="yellow", facecolor="none", transform=fk5_transform, alpha=0.5)
    ax.add_patch(cir)

## 5. Merge catalogs
We will now generate a single `hdf5` file that gathers all appropriate data from the tables above and the spectra. This will be used as inputs for various studies.

In [None]:
from process_fors2.fetchData import crossmatchFors2KidsGalex

In [None]:
# Absolute path of the merged output file, resolved against the current
# working directory.
filename = "resulting_merge_from_walkthrough.h5"
outfile = os.path.abspath(filename)

In [None]:
# crossmatchFors2KidsGalex(outfile)