Visualization of per-object light-curve features. SpecType is not visualized in this notebook.

In [1]:
import os
import numpy as np
import pandas as pd

from pathlib import Path

import plotly.express as px
import plotly.graph_objects as go

from sklearn.preprocessing import StandardScaler

In [6]:
# Paths are resolved relative to the directory the kernel was started in:
# the parent of the current working directory is treated as the project root.
ROOT = Path.cwd().parents[0]
DATA_DIR = ROOT.joinpath("data")

# File name of the light-curve table.
split_file_name = "train_full_lightcurves.csv"

# Light-curve table columns, and the order in which filters are listed in plots.
columns = [
    "object_id",
    "Time (MJD)",
    "Flux",
    "Flux_err",
    "Filter",
]
filter_order = list("ugrizy")

In [7]:
# Load the object log, drop the translation column and normalize dtypes.
# `target_str` is a human-readable label derived from the integer `target`.
df_log = (
    pd.read_csv(DATA_DIR / "train_log.csv")
    .drop(columns=["English Translation"])
    .assign(
        target=lambda frame: frame["target"].astype(int),
        target_str=lambda frame: frame["target"].map({0: "non-TDE", 1: "TDE"}),
        object_id=lambda frame: frame["object_id"].astype(str),
        split=lambda frame: frame["split"].astype(str),
    )
)

df_log.head()

Unnamed: 0,object_id,Z,Z_err,EBV,SpecType,split,target,target_str
0,Dornhoth_fervain_onodrim,3.049,,0.11,AGN,split_01,0,non-TDE
1,Dornhoth_galadh_ylf,0.4324,,0.058,SN II,split_01,0,non-TDE
2,Elrim_melethril_thul,0.4673,,0.577,AGN,split_01,0,non-TDE
3,Ithil_tobas_rodwen,0.6946,,0.012,AGN,split_01,0,non-TDE
4,Mirion_adar_Druadan,0.4161,,0.058,AGN,split_01,0,non-TDE


In [8]:
def load_split_lightcurves(split_name, file_name=None):
    """Read one split's light-curve CSV and normalize its column dtypes.

    Parameters
    ----------
    split_name : str
        Name of the sub-directory of ``DATA_DIR`` that holds the split.
    file_name : str, optional
        CSV file name inside the split directory. When omitted, the
        notebook-level ``split_file_name`` constant is used.

    Returns
    -------
    pandas.DataFrame
        The table with ``object_id`` as ``str``, ``Filter`` as ``category``,
        ``Time (MJD)`` as ``float64`` and ``Flux`` / ``Flux_err`` as
        ``float32``. Values that cannot be parsed as numbers become NaN.
    """
    if file_name is None:
        # Reuse the configured constant instead of repeating the literal here.
        file_name = split_file_name

    lc = pd.read_csv(Path(DATA_DIR) / split_name / file_name)

    lc["object_id"] = lc["object_id"].astype(str)
    lc["Filter"] = lc["Filter"].astype("category")

    # Coerce first (bad cells -> NaN), then pin the storage dtype: time keeps
    # double precision, flux measurements are stored as float32.
    numeric_dtypes = {
        "Time (MJD)": "float64",
        "Flux": "float32",
        "Flux_err": "float32",
    }
    for column, dtype in numeric_dtypes.items():
        lc[column] = pd.to_numeric(lc[column], errors="coerce").astype(dtype)

    return lc

def preload_all_splits_from_log(df_log):
    """Load the light curves of every split named in ``df_log``.

    Returns a pair of dicts keyed by split name (in sorted order): the first
    maps to the loaded light-curve frame, the second to that frame grouped by
    ``object_id`` (group order follows first appearance, ``sort=False``).
    """
    split_names = sorted(df_log["split"].unique())

    split_to_lc = {name: load_split_lightcurves(name) for name in split_names}
    split_to_groups = {
        name: frame.groupby("object_id", sort=False)
        for name, frame in split_to_lc.items()
    }

    return split_to_lc, split_to_groups

# Read every split once up front; later cells only look objects up in these dicts.
split_to_lc, split_to_groups = preload_all_splits_from_log(df_log=df_log)


In [None]:
def plot_one_object_lightcurve(split_to_groups, df_log, object_id):
    """Show a plotly figure of one object's flux over time, one trace per filter.

    Parameters
    ----------
    split_to_groups : dict
        Maps a split name to its light curves grouped by ``object_id``.
    df_log : pandas.DataFrame
        Object log; the ``object_id``, ``split`` and ``target_str`` columns
        are read.
    object_id : str or any
        Identifier of the object to plot; it is compared as ``str(object_id)``.

    Raises
    ------
    IndexError
        If ``object_id`` has no row in ``df_log``.
    """
    oid = str(object_id)
    matches = df_log[df_log["object_id"] == oid]
    if matches.empty:
        # Same exception type as the bare ``iloc[0]`` lookup would raise,
        # but with a message that names the missing object.
        raise IndexError(f"object_id {oid!r} not found in df_log")

    meta = matches.iloc[0]
    split = meta["split"]
    tgt = meta["target_str"]

    # sort_values returns a new frame, so the grouped data is never mutated.
    lc = split_to_groups[split].get_group(oid).sort_values("Time (MJD)")

    # Cast the filter labels to str once instead of on every loop iteration.
    filter_labels = lc["Filter"].astype(str)
    present = set(filter_labels.unique())

    # Known filters first in their canonical order, then any unexpected ones
    # alphabetically.
    known = [name for name in filter_order if name in present]
    extra = sorted(present - set(filter_order))

    fig = go.Figure()
    for name in known + extra:
        band = lc[filter_labels == name]
        fig.add_trace(go.Scatter(
            x=band["Time (MJD)"],
            y=band["Flux"],
            mode="lines+markers",
            name=name,
            error_y=dict(type="data", array=band["Flux_err"], visible=True)
        ))

    fig.update_layout(
        title=f"Lightcurve: object_id={object_id} (target={tgt}, split={split})",
        xaxis_title="Time (MJD)",
        yaxis_title="Flux (μJy)",
    )
    fig.show()
    
# Take the first logged object of each class as a plotting example.
tde_ids = df_log.loc[df_log["target"] == 1, "object_id"].iloc[:1].tolist()
non_ids = df_log.loc[df_log["target"] == 0, "object_id"].iloc[:1].tolist()

for oid in [*tde_ids, *non_ids]:
    plot_one_object_lightcurve(split_to_groups, df_log, oid)

![amon imloth luin](figures/amon_imloth_luin.png)
![dornhoth fervain onodrim](figures/dornhoth_fervain_onodrim.png)

In [26]:
def build_per_object_summary_features(split_to_groups, df_log, seed=0):
    """Build one row of summary light-curve features per entry of ``df_log``.

    Parameters
    ----------
    split_to_groups : dict
        Maps a split name to its light curves grouped by ``object_id``.
    df_log : pandas.DataFrame
        Object log with at least ``object_id``, ``split``, ``target`` and
        ``target_str``; ``Z`` and ``EBV`` are copied when present.
    seed : int, optional
        Unused; kept so existing calls that pass it keep working.

    Returns
    -------
    pandas.DataFrame
        One row per ``object_id`` value in ``df_log`` (in log order) with the
        label, ``Z``/``EBV`` (NaN when missing), observation and filter
        counts, time span, peak/min flux, flux amplitude, and the median over
        filters of the per-filter median flux error.
    """
    # Build the metadata lookup once. Filtering df_log with a boolean mask for
    # every object made the loop quadratic in the number of objects.
    # keep="first" mirrors the previous ``.iloc[0]`` choice for repeated ids.
    meta_by_id = (
        df_log.drop_duplicates(subset="object_id", keep="first")
        .set_index("object_id")
        .to_dict(orient="index")
    )

    def _float_or_nan(value):
        # Missing columns arrive as None, missing cells as NaN/NA.
        return float(value) if pd.notna(value) else np.nan

    rows = []
    for obj_id in df_log["object_id"].to_numpy():
        meta = meta_by_id[obj_id]
        lc = split_to_groups[meta["split"]].get_group(obj_id)

        # observed=True: only filters that actually occur for this object.
        per_filter = lc.groupby("Filter", observed=True).agg(
            t_min=("Time (MJD)", "min"),
            t_max=("Time (MJD)", "max"),
            f_max=("Flux", "max"),
            f_min=("Flux", "min"),
            f_med=("Flux", "median"),
            ferr_med=("Flux_err", "median"),
            n_obs=("Flux", "size"),
        ).reset_index()

        peak = per_filter["f_max"].max()
        trough = per_filter["f_min"].min()

        rows.append({
            "object_id": obj_id,
            "target": int(meta["target"]),
            "target_str": meta["target_str"],
            "Z": _float_or_nan(meta.get("Z")),
            "EBV": _float_or_nan(meta.get("EBV")),

            "n_obs_total": int(per_filter["n_obs"].sum()),
            "n_filters": int(per_filter["Filter"].nunique()),
            "time_span": float(per_filter["t_max"].max() - per_filter["t_min"].min()),

            "peak_flux": float(peak),
            "min_flux": float(trough),
            "amp_flux": float(peak - trough),
            "median_ferr": float(per_filter["ferr_med"].median()),
        })

    return pd.DataFrame(rows)

# Per-object feature table used by all plots below.
df_obj = build_per_object_summary_features(
    split_to_groups=split_to_groups,
    df_log=df_log,
    seed=0,
)
df_obj

Unnamed: 0,object_id,target,target_str,Z,EBV,n_obs_total,n_filters,time_span,peak_flux,min_flux,amp_flux,median_ferr
0,Dornhoth_fervain_onodrim,0,non-TDE,3.0490,0.110,65,6,1254.2719,25.047344,-2.756285,27.803629,0.336164
1,Dornhoth_galadh_ylf,0,non-TDE,0.4324,0.058,167,6,2362.1560,11.375500,-1.747082,13.122581,0.319893
2,Elrim_melethril_thul,0,non-TDE,0.4673,0.577,35,6,1206.0218,6.617915,-6.400816,13.018732,0.329105
3,Ithil_tobas_rodwen,0,non-TDE,0.6946,0.012,798,6,2858.4129,5.353821,-7.641818,12.995640,0.251920
4,Mirion_adar_Druadan,0,non-TDE,0.4161,0.058,129,6,2202.3065,5.384463,-3.060399,8.444862,0.323182
...,...,...,...,...,...,...,...,...,...,...,...,...
3038,tinnu_gellui_tathar,0,non-TDE,0.8898,0.042,148,6,2582.0800,6.428462,-1.419681,7.848143,0.333995
3039,uir_heleg_corf,0,non-TDE,0.9598,0.042,138,6,2916.1706,7.369065,-5.330229,12.699294,0.341419
3040,uir_rhosc_law,0,non-TDE,0.1543,0.024,172,6,1936.1637,5.085714,-2.773028,7.858742,0.290256
3041,uruk_in_pess,0,non-TDE,1.1520,0.019,161,6,2699.8022,2.871105,-2.895248,5.766354,0.294903


In [None]:
# One point per object: peak flux against flux amplitude, colored by class label.
fig = px.scatter(
    data_frame=df_obj,
    x="peak_flux",
    y="amp_flux",
    color="target_str",
    title="peak_flux vs amp_flux (colored by target)",
    hover_data=["object_id", "n_obs_total", "n_filters", "time_span", "Z", "EBV"],
)
fig.show()

![peak vs amp](figures/peak_vs_amp.png)

In [None]:
# One point per object: covered time span against number of observations.
fig = px.scatter(
    data_frame=df_obj,
    x="time_span",
    y="n_obs_total",
    color="target_str",
    title="time_span vs total observations (colored by target)",
    hover_data=["object_id", "n_filters", "Z", "EBV"],
)
fig.show()


![time vs observations](figures/time_vs_observations.png)

In [None]:
# Candidate columns for the correlation matrix; any that are absent from
# df_obj are skipped.
cols = [
    name
    for name in (
        "target",
        "Z",
        "EBV",
        "n_obs_total",
        "n_filters",
        "time_span",
        "peak_flux",
        "amp_flux",
        "median_ferr",
    )
    if name in df_obj.columns
]

# Pairwise Pearson correlation between the selected columns.
corr = df_obj[cols].corr(method="pearson")

# Heatmap with a color range pinned to [-1, 1] and the value printed in each cell.
fig = px.imshow(
    corr,
    zmin=-1,
    zmax=1,
    color_continuous_scale="RdBu",
    text_auto=".2f",
    title="Correlation matrix",
).update_layout(
    xaxis_title="Feature",
    yaxis_title="Feature",
)

fig.show()


![correlation matrix](figures/correlation.png)

In [None]:
# Features drawn as parallel axes; any that are absent from df_obj are skipped.
use_cols = [
    name
    for name in ("Z", "EBV", "n_obs_total", "n_filters", "time_span", "peak_flux", "amp_flux")
    if name in df_obj.columns
]

# Keep only objects with a value for every plotted feature and the label.
df_pc = df_obj.dropna(subset=[*use_cols, "target"]).copy()

# Cap the number of plotted objects with a fixed-seed random subsample.
max_objects = 3000
if len(df_pc) > max_objects:
    df_pc = df_pc.sample(max_objects, random_state=0)

# Standardize each feature so all axes share a comparable scale.
scaler = StandardScaler()
scaled = scaler.fit_transform(df_pc[use_cols])

df_scaled = pd.DataFrame(scaled, columns=use_cols).assign(
    target=df_pc["target"].values
)

fig = px.parallel_coordinates(
    data_frame=df_scaled,
    dimensions=use_cols,
    color="target",
    color_continuous_scale=px.colors.diverging.Tealrose,
    title="Parallel coordinates",
)
fig.show()


![parallel coordinates](figures/parallel_coordinates.png)