# Verify and validate merged and matched truth catalogs

**Author**: Yao-Yuan Mao (@yymao)
    
This notebook is used to verify and validate the merged and matched truth catalogs, which are the data products from
`scripts/repartition_into_tracts.py` and `scripts/merge_truth_per_tract.py`.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from astropy.table import Table
import astropy.units as u
from astropy.coordinates import SkyCoord

import pandas as pd
from easyquery import Query, QueryMaker

In [None]:
# Tract number substituted into each of the catalog paths below.
tract = 3259

# Merged truth catalog for this tract (parquet).
truth_path = f"/global/cscratch1/sd/yymao/desc/truth_run2.2_merged/truth_tract{tract}.parquet"

# Object catalog for the same tract (parquet).
object_path = f"/global/cfs/cdirs/lsst/shared/DC2-prod/Run2.2i/dpdd/Run2.2i-dr6-wfd-v1/dc2_object_run2.2i_dr6_wfd_v1/dc2_object_run2.2i_dr6_wfd_v1_tract{tract}.parquet"

# Javi's matched-ID catalog for the same tract (gzipped FITS).
javi_path = f"/global/cfs/cdirs/lsst/shared/DC2-prod/Run2.2i/addons/matched/dr6/matched_ids_dc2_object_run2.2i_dr6_wfd_with_metacal_{tract}.fits.gz"

### Basic validation

In [None]:
# Load the per-tract merged truth catalog into a pandas DataFrame.
truth_cat = pd.read_parquet(truth_path)

In [None]:
# Show the column names available in the truth catalog.
truth_cat.columns

In [None]:
# Histogram of the match separation, keeping only rows with match_sep > -1.
valid_match_sep = Query("match_sep > -1").filter(truth_cat, "match_sep").values
plt.hist(valid_match_sep, bins=101);
plt.yscale("log");
plt.xlabel("nearest neighbor [arcsec]");

In [None]:
# Reusable row selections on the truth catalog (also used by later cells).
is_matched = Query("match_objectId > -1")
is_gal = Query("truth_type == 1")
is_star = Query("truth_type == 2")
is_sn = Query("truth_type == 3")
is_unique = Query("is_unique_truth_entry")

n_matched = is_matched.count(truth_cat)
n_unmatched = len(truth_cat) - n_matched


def percent(count, total):
    """Return ``count / total`` as a percentage, or NaN when ``total`` is zero.

    Guarding the zero case keeps the report below from aborting with a
    ZeroDivisionError when a category is empty in this tract.
    """
    return count / total * 100 if total else float("nan")


print()
print("Among matched truth entries")
print("% of duplicated truth", percent(Query(is_matched, ~is_unique).count(truth_cat), n_matched))
print("% of unique galaxy", percent(Query(is_matched, is_unique, is_gal).count(truth_cat), n_matched))
print("% of unique stars", percent(Query(is_matched, is_unique, is_star).count(truth_cat), n_matched))
print("% of unique SNe", percent(Query(is_matched, is_unique, is_sn).count(truth_cat), n_matched))

print()
print("Among unmatched truth entries")
print("% of galaxy", percent(Query(~is_matched, is_gal).count(truth_cat), n_unmatched))
print("% of stars", percent(Query(~is_matched, is_star).count(truth_cat), n_unmatched))
print("% of SNe", percent(Query(~is_matched, is_sn).count(truth_cat), n_unmatched))

# Matched fraction per type, counting only unique truth entries in both
# the numerator and the denominator.
print()
print("% matched among all truth entries", percent(Query(is_matched, is_unique).count(truth_cat), is_unique.count(truth_cat)))
print("% matched among galaxies", percent(Query(is_matched, is_unique, is_gal).count(truth_cat), Query(is_unique, is_gal).count(truth_cat)))
print("% matched among stars", percent(Query(is_matched, is_unique, is_star).count(truth_cat), Query(is_unique, is_star).count(truth_cat)))
print("% matched among SNe", percent(Query(is_matched, is_unique, is_sn).count(truth_cat), Query(is_unique, is_sn).count(truth_cat)))

In [None]:
def flux_to_mag(flux):
    """Convert flux values given in nJy to AB magnitudes.

    Divide-by-zero floating-point warnings are silenced during the conversion.
    """
    with np.errstate(divide="ignore"):
        flux_njy = flux * u.nJy
        magnitude = flux_njy.to_value(u.ABmag)
    return magnitude

def add_mag_columns(df, bands="ugrizy"):
    """Add a ``mag_<band>`` column to ``df`` for every band in ``bands``.

    Each magnitude column is derived from the matching ``flux_<band>`` column
    via ``flux_to_mag``. ``df`` is modified in place; nothing is returned.
    """
    for band_name in bands:
        flux_values = df["flux_" + band_name].values
        df["mag_" + band_name] = flux_to_mag(flux_values)

In [None]:
# Add the r-band magnitude column ("mag_r") to the truth catalog in place.
add_mag_columns(truth_cat, "r")

In [None]:
# Unique truth galaxies, keeping only the two columns needed for this plot.
galaxies = Query(is_gal, is_unique).filter(truth_cat, ["match_objectId", "mag_r"])
matched_frac = is_matched.count(galaxies) / len(galaxies)

# Both histograms share the same magnitude binning.
mag_bins = np.linspace(14, 29.5, 51)
matched_mag_r = is_matched.filter(galaxies, "mag_r").values
unmatched_mag_r = (~is_matched).filter(galaxies, "mag_r").values

plt.hist(matched_mag_r, mag_bins, alpha=0.4, label=f"matched ({matched_frac*100:.1f}%)");
plt.hist(unmatched_mag_r, mag_bins, alpha=0.4, label=f"not matched ({100-matched_frac*100:.1f}%)");
plt.yscale("log")
plt.xlabel("truth galaxy $r$-band magnitude");
plt.legend();

In [None]:
# Truth galaxies with mag_r < 20 that have no matched object.
unmatched_bright_truth = Query(~is_matched, "mag_r < 20", is_gal).filter(truth_cat)
# Bare expression so the notebook renders the selected rows.
unmatched_bright_truth

### Check with object catalog

In [None]:
# Load only the object-catalog columns used in this notebook.
object_cat = pd.read_parquet(object_path, columns=["objectId", "ra", "dec", "mag_r_cModel", "extendedness"])

# Place the first len(object_cat) truth rows side by side with the object rows.
# Truth ra/dec are renamed so they do not clash with the object ra/dec.
# `axis` must be passed by keyword: the positional form is deprecated and
# raises a TypeError in pandas 2.x.
object_cat = pd.concat(
    [object_cat, truth_cat[:len(object_cat)].rename(columns={"ra": "ra_truth", "dec": "dec_truth"})],
    axis=1,
)

# Every object row must sit next to the truth row that was matched to it.
assert object_cat.eval("objectId == match_objectId").all()

#### Magnitude difference between matches

In [None]:
# Compute "mag_r" from "flux_r" on the combined object/truth table (in place).
add_mag_columns(object_cat, "r")

In [None]:
# Per-row difference between the object cModel r magnitude and the truth r magnitude.
mag_r_diff = object_cat.eval("mag_r_cModel - mag_r").values
plt.hist(mag_r_diff, bins=np.linspace(-10, 10, 101));
plt.yscale("log");
plt.xlabel("magnitude difference (obs - true)");

#### How about those bright unmatched objects?

In [None]:
def show_neighbors(catalog, catalog_sc, target_sc, columns, max_sep_arcsec=2):
    """Display the rows of ``catalog`` that lie close to ``target_sc``, nearest first.

    Parameters
    ----------
    catalog : pandas.DataFrame
        Table to select rows from.
    catalog_sc : SkyCoord
        Sky positions of the rows of ``catalog``, in the same order.
    target_sc : SkyCoord
        Position around which to search.
    columns : list of str
        Columns of ``catalog`` to show.
    max_sep_arcsec : float, optional
        Rows with a separation strictly below this value (in arcsec) are shown.
    """
    sep = catalog_sc.separation(target_sc).arcsec
    nearby = sep < max_sep_arcsec
    # .loc + .assign builds a fresh frame, so adding the "sep" column does not
    # write into a slice of `catalog` (avoids SettingWithCopyWarning).
    display(catalog.loc[nearby, columns].assign(sep=sep[nearby]).sort_values("sep"))


sc = SkyCoord(unmatched_bright_truth["ra"].to_numpy(), unmatched_bright_truth["dec"].to_numpy(), unit="deg")
obj_sc = SkyCoord(object_cat["ra"].to_numpy(), object_cat["dec"].to_numpy(), unit="deg")
truth_unique = is_unique.filter(truth_cat)
truth_sc = SkyCoord(truth_unique["ra"].to_numpy(), truth_unique["dec"].to_numpy(), unit="deg")

# For every bright unmatched truth galaxy, list the nearby detected objects
# and then the nearby unique truth entries.
for sc_this, unmatch_id in zip(sc, unmatched_bright_truth["id"]):
    print(unmatch_id)

    show_neighbors(
        object_cat,
        obj_sc,
        sc_this,
        ["objectId", "mag_r_cModel", "extendedness", "id", "match_objectId", "match_sep", "mag_r"],
    )
    show_neighbors(
        truth_unique,
        truth_sc,
        sc_this,
        ["id", "mag_r", "cosmodc2_id", "cosmodc2_hp", "redshift", "match_objectId", "match_sep"],
    )

### Check with Javi's matched catalog

In [None]:
# Read Javi's matched catalog and convert it to a pandas DataFrame.
javi_cat = Table.read(javi_path).to_pandas()

# Place the first len(javi_cat) truth rows side by side with Javi's rows.
# Truth ra/dec are renamed to ra_truth/dec_truth.
# `axis` must be passed by keyword: the positional form is deprecated and
# raises a TypeError in pandas 2.x.
javi_cat = pd.concat(
    [javi_cat, truth_cat[:len(javi_cat)].rename(columns={"ra": "ra_truth", "dec": "dec_truth"})],
    axis=1,
)

# The join must not have produced duplicated column names ...
assert len(javi_cat.columns) == len(set(javi_cat.columns))
# ... and every row must pair an object with the truth row matched to it.
assert javi_cat.eval("objectId == match_objectId").all()

In [None]:
# String version of Javi's truth ID, so it can be compared element-wise with "id".
javi_cat["truthId_str"] = javi_cat["truthId"].astype(str)

# Split Javi's catalog on its own is_matched flag.
javi_matched = Query("is_matched").filter(javi_cat)
javi_not_matched = Query("is_matched == 0").filter(javi_cat)

# Among Javi's matched rows: those where both catalogs name the same truth
# entry, and those where they differ.
same_matches, diff_matches = (
    Query((compare, "truthId_str", "id")).filter(javi_matched)
    for compare in (np.equal, np.not_equal)
)

In [None]:
# Fraction of Javi's matched rows for which both catalogs agree on the truth
# entry. The denominator is all of Javi's matched rows (same + different), so
# the two legend percentages below add up to 100%.
same_matched_frac = len(same_matches) / len(javi_matched)

plt.hist(same_matches["mag_r"].values, np.linspace(14, 29.5, 51), alpha=0.4, label=f"Same matches as Javi ({same_matched_frac*100:.1f}%)");
plt.hist(diff_matches["mag_r"].values, np.linspace(14, 29.5, 51), alpha=0.4, label=f"Different matches ({100-same_matched_frac*100:.1f}%)");
plt.yscale("log");
plt.xlabel("$r$-band magnitude");
plt.legend();

#### How are those with different matches distributed spatially?

In [None]:
# Rows where the new match is farther away than Javi's match.
# match_sep is plotted in this notebook with an arcsec axis label, so Javi's
# `dist` is converted all the way to arcsec (degrees * 3600) before comparing.
# NOTE(review): `dist` is treated as radians, as in the original rad2deg call;
# confirm against the matched catalog's documentation.
larger_sep = diff_matches["match_sep"] > np.rad2deg(diff_matches["dist"]) * 3600

print(np.count_nonzero(larger_sep) / len(diff_matches) * 100, "%")

# All differing matches as small dots, with the larger-separation subset drawn on top.
plt.scatter(diff_matches["ra_truth"], diff_matches["dec_truth"], c="C1", s=0.01, rasterized=True);
plt.scatter(diff_matches["ra_truth"][larger_sep], diff_matches["dec_truth"][larger_sep], c="C3", s=2, rasterized=True);

#### Are the truth IDs of those with different matches all present in the new truth catalog?

In [None]:
# IDs of the unique truth entries: cosmodc2_id for galaxies, and the
# integer-cast "id" for stars. SNe are not included.
unique_galaxy_ids = Query(is_gal, is_unique).filter(truth_cat, "cosmodc2_id")
unique_star_ids = Query(is_star, is_unique).filter(truth_cat, "id").astype(int)
truth_ids = np.concatenate([unique_galaxy_ids, unique_star_ids])

In [None]:
# Rows whose Javi truth ID does not appear among the new catalog's truth IDs.
id_not_in_new_truth = QueryMaker.in1d("truthId", truth_ids, invert=True)
# The same rows, further restricted to mag_r_lsst < 29.
in_different_tract = Query(id_not_in_new_truth, "mag_r_lsst < 29")


def report_and_plot(selection, color, size):
    """Print the share of rows picked by ``selection`` and scatter-plot their truth positions.

    The selection is applied to ``diff_matches``. Two percentages are printed:
    relative to all of ``javi_cat``, then relative to ``diff_matches``.
    """
    n_selected = selection.count(diff_matches)
    print(n_selected / len(javi_cat) * 100, "%")
    print(n_selected / len(diff_matches) * 100, "%")
    plt.scatter(
        selection.filter(diff_matches, "ra_truth"),
        selection.filter(diff_matches, "dec_truth"),
        c=color,
        s=size,
        rasterized=True,
    )


report_and_plot(id_not_in_new_truth, "C1", 1)
report_and_plot(in_different_tract, "C3", 4)