In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import urllib.request

def download(url, output):
    """Download ``url`` to the local path ``output`` unless it already exists.

    The payload is first written to ``<output>.part`` and only renamed to
    ``output`` once the transfer has finished. An interrupted or failed
    download therefore never leaves a truncated ``output`` behind that a
    later run would mistake for a complete file and skip.

    Args:
        url: Address handed to ``urllib.request.urlretrieve``.
        output: Destination file path.

    Raises:
        OSError: If the transfer or the final rename fails (this includes
            ``urllib.error.URLError``). Any partial file is removed before
            the exception propagates.
    """
    if os.path.exists(output):
        print(f"{output} already exists. Skip download.")
        return

    print(f"{output} not found. Downloading...")
    partial = f"{output}.part"
    try:
        urllib.request.urlretrieve(url, partial)
        # Atomic on the same filesystem: ``output`` appears only when complete.
        os.replace(partial, output)
    finally:
        # After a successful rename the partial file is gone; this only
        # fires when the download or the rename failed part-way.
        if os.path.exists(partial):
            os.remove(partial)

# Fetch both CSV files; download() skips any file that is already on disk.
for _url, _target in (
    ("https://drive.google.com/uc?id=1JPOVfYXJNXeBgG_V0auiuK5W2CvP4_td", "training.csv"),
    ("https://drive.google.com/uc?id=11W8QiL98fEqV7Xfbw-xgXCC7G-SM1yMb", "test.csv"),
):
    download(_url, _target)

# ==============================
# Settings
# ==============================
input_csv = "training.csv"
output_dir = "plots_kinematics"

# Make sure the plot directory exists before anything is written to it.
os.makedirs(output_dir, exist_ok=True)

# ==============================
# Load the data
# ==============================
df = pd.read_csv(input_csv)

# Boolean row masks selecting rows labelled "s" (signal) / "b" (background)
mask_sig = df["Label"] == "s"
mask_bg = df["Label"] == "b"

# Feature columns: every column except EventId, Weight and Label
non_feature_cols = {"EventId", "Weight", "Label"}
feature_cols = [name for name in df.columns if name not in non_feature_cols]

print(f"Number of kinematic/feature variables: {len(feature_cols)}")

# ==============================
# One histogram per variable
# ==============================
bins = 50

for col in feature_cols:
    # -999 is treated as a missing value: turn it into NaN, then drop it
    cleaned = df[col].replace(-999.0, np.nan)
    sig_vals = cleaned[mask_sig].dropna().values
    bkg_vals = cleaned[mask_bg].dropna().values

    if sig_vals.size == 0 or bkg_vals.size == 0:
        print(f"Skip {col}: no valid entries after cleaning")
        continue

    # Use one shared range so both samples are binned identically
    lo = min(sig_vals.min(), bkg_vals.min())
    hi = max(sig_vals.max(), bkg_vals.max())
    # A zero-width range cannot be histogrammed, so skip such columns
    if lo == hi:
        print(f"Skip {col}: all values are the same ({lo})")
        continue

    fig, ax = plt.subplots()
    for sample, legend_text in (
        (sig_vals, "Signal (s)"),
        (bkg_vals, "Background (b)"),
    ):
        ax.hist(
            sample,
            bins=bins,
            range=(lo, hi),
            histtype="step",
            density=True,
            label=legend_text,
        )

    ax.set_xlabel(col)
    ax.set_ylabel("Normalized entries")
    ax.set_title(f"Distribution of {col}")
    ax.legend()
    ax.grid(True)
    fig.tight_layout()

    out_path = os.path.join(output_dir, f"{col}.png")
    fig.savefig(out_path, dpi=150)
    plt.close(fig)

    print(f"Saved: {out_path}")

print("All done.")

training.csv already exists. Skip download.
test.csv already exists. Skip download.
Number of kinematic/feature variables: 30
Saved: plots_kinematics/DER_mass_MMC.png
Saved: plots_kinematics/DER_mass_transverse_met_lep.png
Saved: plots_kinematics/DER_mass_vis.png
Saved: plots_kinematics/DER_pt_h.png
Saved: plots_kinematics/DER_deltaeta_jet_jet.png
Saved: plots_kinematics/DER_mass_jet_jet.png
Saved: plots_kinematics/DER_prodeta_jet_jet.png
Saved: plots_kinematics/DER_deltar_tau_lep.png
Saved: plots_kinematics/DER_pt_tot.png
Saved: plots_kinematics/DER_sum_pt.png
Saved: plots_kinematics/DER_pt_ratio_lep_tau.png
Saved: plots_kinematics/DER_met_phi_centrality.png
Saved: plots_kinematics/DER_lep_eta_centrality.png
Saved: plots_kinematics/PRI_tau_pt.png
Saved: plots_kinematics/PRI_tau_eta.png
Saved: plots_kinematics/PRI_tau_phi.png
Saved: plots_kinematics/PRI_lep_pt.png
Saved: plots_kinematics/PRI_lep_eta.png
Saved: plots_kinematics/PRI_lep_phi.png
Saved: plots_kinematics/PRI_met.png
Saved: 