# Verify and validate merged and matched truth catalogs

**Author**: Yao-Yuan Mao (@yymao)
    
This notebook is used to verify and validate the merged and matched truth catalogs, which are the data products from
`scripts/repartition_into_tracts.py` and `scripts/merge_truth_per_tract.py`.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from astropy.table import Table
import astropy.units as u
import pandas as pd
from easyquery import *

In [None]:
# Tract to validate; every input path below is derived from this number.
tract = 3259

# Merged and matched truth catalog for this tract.
truth_path = (
    "/global/cscratch1/sd/yymao/desc/truth_run2.2_merged/"
    f"truth_tract{tract}.parquet"
)

# Object catalog for the same tract.
object_path = (
    "/global/cfs/cdirs/lsst/shared/DC2-prod/Run2.2i/dpdd/Run2.2i-dr6-wfd-v1/"
    "dc2_object_run2.2i_dr6_wfd_v1/"
    f"dc2_object_run2.2i_dr6_wfd_v1_tract{tract}.parquet"
)

# Javi's matched-ID catalog for the same tract.
javi_path = (
    "/global/cfs/cdirs/lsst/shared/DC2-prod/Run2.2i/addons/matched/dr6/"
    f"matched_ids_dc2_object_run2.2i_dr6_wfd_with_metacal_{tract}.fits.gz"
)

### Basic validation

In [None]:
# Read the whole truth catalog for the selected tract into a DataFrame.
truth_cat = pd.read_parquet(path=truth_path)

In [None]:
# Histogram of `match_sep` for truth entries with match_sep > -1, using 101 bins.
has_match_sep = Query("match_sep > -1")
match_sep_values = has_match_sep.filter(truth_cat, "match_sep").values

ax = plt.gca()
ax.hist(match_sep_values, 101)
ax.set_yscale("log")
ax.set_xlabel("nearest neighbor [arcsec]");

In [None]:
# Row selections on the truth catalog, expressed as easyquery Query objects.
is_matched = Query("match_objectId > -1")
is_gal = Query("cosmodc2_id > -1")
is_star = Query(~is_gal, "is_sn == 0")
is_sn = Query(~is_gal, "is_sn")
is_unique = Query("is_unique_truth_entry")

n_matched = is_matched.count(truth_cat)
n_unmatched = len(truth_cat) - n_matched

# Breakdown of the matched entries, as percentages of all matched entries.
print()
print("Among matched truth entries")
for label, selection in (
    ("% of duplicated truth", Query(is_matched, ~is_unique)),
    ("% of unique galaxy", Query(is_matched, is_unique, is_gal)),
    ("% of unique stars", Query(is_matched, is_unique, is_star)),
    ("% of unique SNe", Query(is_matched, is_unique, is_sn)),
):
    print(label, selection.count(truth_cat) / n_matched * 100)

# Breakdown of the unmatched entries, as percentages of all unmatched entries.
print()
print("Among unmatched truth entries")
for label, selection in (
    ("% of galaxy", Query(~is_matched, is_gal)),
    ("% of stars", Query(~is_matched, is_star)),
    ("% of SNe", Query(~is_matched, is_sn)),
):
    print(label, selection.count(truth_cat) / n_unmatched * 100)

# Matched fraction within each population, counting unique truth entries only.
print("")
for label, numerator, denominator in (
    ("% matched among all truth entries", Query(is_matched, is_unique), is_unique),
    ("% matched among galaxies", Query(is_matched, is_unique, is_gal), Query(is_unique, is_gal)),
    ("% matched among stars", Query(is_matched, is_unique, is_star), Query(is_unique, is_star)),
    ("% matched among SNe", Query(is_matched, is_unique, is_sn), Query(is_unique, is_sn)),
):
    print(label, numerator.count(truth_cat) / denominator.count(truth_cat) * 100)

In [None]:
def flux_to_mag(flux):
    """Convert flux values, interpreted as nJy, to AB magnitudes using astropy units."""
    flux_in_njy = flux * u.nJy
    return flux_in_njy.to_value(u.ABmag)

def add_mag_columns(df, bands="ugrizy"):
    """Add a ``mag_<band>`` column to *df* in place for every band in *bands*.

    Each magnitude column is computed from the matching ``flux_<band>`` column
    with ``flux_to_mag``. Nothing is returned.
    """
    for band in bands:
        flux_column, mag_column = f"flux_{band}", f"mag_{band}"
        df[mag_column] = flux_to_mag(df[flux_column].values)

In [None]:
# Unique galaxy truth entries, keeping only the match ID and the r-band flux.
unique_galaxies = Query(is_gal, is_unique)
galaxies = unique_galaxies.filter(truth_cat, ["match_objectId", "flux_r"]).copy()
add_mag_columns(galaxies, bands="r")

In [None]:
# r-band magnitude distributions of matched vs. unmatched unique galaxies.
mag_bins = np.linspace(14, 29.5, 51)
matched_mag = is_matched.filter(galaxies, "mag_r").values
unmatched_mag = (~is_matched).filter(galaxies, "mag_r").values

# Compute the legend percentages from the data so they stay correct when the
# tract (or the input catalog) changes, instead of hard-coding one tract's values.
n_galaxies = len(matched_mag) + len(unmatched_mag)
matched_frac = len(matched_mag) / n_galaxies if n_galaxies else float("nan")

plt.hist(matched_mag, mag_bins, alpha=0.4, label=f"matched ({matched_frac:.1%})");
plt.hist(unmatched_mag, mag_bins, alpha=0.4, label=f"not matched ({1 - matched_frac:.1%})");
plt.yscale("log")
plt.xlabel("galaxy $r$-band magnitude");
plt.legend();

### Check with object catalog

In [None]:
# Load only the object-catalog columns used by the checks in this notebook.
object_cat = pd.read_parquet(object_path, columns=["objectId", "mag_r_cModel", "mag_i_cModel", "extendness"])
# Attach the leading len(object_cat) truth rows column-wise. `axis` is given by
# keyword because pandas 2.0 no longer accepts it positionally in `pd.concat`.
object_cat = pd.concat([object_cat, truth_cat[:len(object_cat)]], axis=1)
# Every object row must sit next to the truth row whose match ID points back at it.
assert object_cat.eval("objectId == match_objectId").all()

In [None]:
# Derive mag_r and mag_i from the flux_r and flux_i columns of the joined table.
add_mag_columns(object_cat, bands="ri")

In [None]:
# Histogram of the r-band cModel magnitude minus the magnitude derived from truth flux.
mag_offset = object_cat.eval("mag_r_cModel - mag_r").values
offset_bins = np.linspace(-10, 10, 101)

ax = plt.gca()
ax.hist(mag_offset, offset_bins)
ax.set_yscale("log")
ax.set_xlabel("magnitude difference (obs - true)");

### Check with Javi's matched catalog

In [None]:
# Read Javi's matched catalog via astropy and convert it to a DataFrame.
javi_cat = Table.read(javi_path).to_pandas()

# Attach the leading len(javi_cat) truth rows column-wise, renaming the truth
# coordinates to ra_truth/dec_truth. `axis` is given by keyword because
# pandas 2.0 no longer accepts it positionally in `pd.concat`.
javi_cat = pd.concat([javi_cat, truth_cat[:len(javi_cat)].rename(columns={"ra": "ra_truth", "dec": "dec_truth"})], axis=1)

# The joined table must have no duplicated column names ...
assert len(javi_cat.columns) == len(set(javi_cat.columns))
# ... and every row must pair an object with the truth row that matched it.
assert javi_cat.eval("objectId == match_objectId").all()

In [None]:
# String copy of Javi's truthId, compared against the truth `id` column in later cells
# (presumably `id` holds strings -- confirm against the truth catalog schema).
javi_cat["truthId_str"] = javi_cat["truthId"].astype(str)

In [None]:
# Rows with a truthy `is_matched` column, copied so new columns can be added.
flagged_as_matched = Query("is_matched")
matched = flagged_as_matched.filter(javi_cat).copy()
add_mag_columns(matched, bands="ugrizy")

In [None]:
# Fraction of rows where Javi's truth ID equals the truth `id` column.
agrees_with_javi = matched["truthId_str"] == matched["id"]
np.count_nonzero(agrees_with_javi) / len(matched)

In [None]:
# Split the matched rows by whether the two truth IDs are equal.
ids_equal = matched["truthId_str"] == matched["id"]
ids_differ = matched["truthId_str"] != matched["id"]
same_matches = matched[ids_equal]
diff_matches = matched[ids_differ]

In [None]:
# r-band magnitude distributions of rows where the two truth IDs agree vs. disagree.
mag_bins = np.linspace(14, 29.5, 51)

# Compute the legend percentages from the data so they stay correct when the
# tract (or the input catalogs) change, instead of hard-coding one tract's values.
n_compared = len(same_matches) + len(diff_matches)
same_frac = len(same_matches) / n_compared if n_compared else float("nan")

plt.hist(same_matches["mag_r"].values, mag_bins, alpha=0.4, label=f"Same matches as Javi ({same_frac:.1%})");
plt.hist(diff_matches["mag_r"].values, mag_bins, alpha=0.4, label=f"Different matches ({1 - same_frac:.1%})");
plt.yscale("log");
plt.xlabel("$r$-band magnitude");
plt.legend();

In [None]:
# Truth positions of the differing matches whose match_sep exceeds 1.
large_sep = diff_matches["match_sep"] > 1
ra_far = diff_matches["ra_truth"][large_sep]
dec_far = diff_matches["dec_truth"][large_sep]
plt.scatter(ra_far, dec_far, c="C1", s=0.1, rasterized=True)

In [None]:
# Histogram of the `dist` column for the differing matches, on a log count axis.
ax = plt.gca()
ax.hist(diff_matches["dist"])
ax.set_yscale("log")
ax.set_xlabel("sep [arcsec]");

In [None]:
# Histogram of the `match_sep` column for the differing matches, on a log count axis.
ax = plt.gca()
ax.hist(diff_matches["match_sep"])
ax.set_yscale("log")
ax.set_xlabel("sep [arcsec]");