In [1]:
import os, json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sklearn.model_selection import StratifiedGroupKFold, GroupShuffleSplit
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.utils.class_weight import compute_class_weight

BASE = "/kaggle/input/3w-dataset/2.0.0"
RANDOM_STATE = 42
N_WELL_FILES = None  # None = all WELL files

OUT_DIR = "/kaggle/working/3w_prepared_v3_1"
os.makedirs(OUT_DIR, exist_ok=True)

CACHE_PATH = f"{OUT_DIR}/df_ml_well_v3_1.parquet"
print("OUT_DIR:", OUT_DIR)


OUT_DIR: /kaggle/working/3w_prepared_v3_1


# 3W Dataset (WELL-only) — Row-per-file Classification (v3_1)

## Summary (Engineer Story)
- Built a file index from the 3W dataset and filtered to **WELL files only** (1119 files, 40 wells).
- Cleaned each file: parsed timestamp index, renamed sensors, converted to numeric types.
- Engineered **row-per-file features**:
  - Continuous sensors: raw median/IQR/last + robust z-stats (mean/std/min/max/last) + delta(first→last) + missing ratio.
  - State/valve signals: last state + transition count/rate + time-in-state proportions (+ “other” state).
- Evaluated with **Group splits by `well_id`** to avoid leakage across wells (harder but realistic).
- Compared baseline (Logistic Regression) vs boosting (HistGradientBoosting with class-weighted sample_weight).
- Result: **HGB outperformed baseline**, reaching about **macro-F1 ~0.42 (repeated group shuffle)** and **fault-vs-normal F1 ~0.76**, with variance due to very rare classes.


In [2]:
def build_file_index(base: str) -> pd.DataFrame:
    paths = []
    for root, _, files in os.walk(base):
        for f in files:
            if f.endswith(".parquet"):
                paths.append(os.path.join(root, f))

    df = pd.DataFrame({"path": paths})
    codes = df["path"].str.extract(r"/2\.0\.0/(\d+)/", expand=False)
    df["event_type_code"] = pd.to_numeric(codes, errors="coerce").astype("Int64")
    df = df.dropna(subset=["event_type_code"])
    df["event_type_code"] = df["event_type_code"].astype(int)
    df["file"] = df["path"].str.split("/").str[-1]
    df["source"] = df["file"].str.extract(r"^(WELL|SIMULATED|DRAWN)")
    df["well_id"] = df["file"].str.extract(r"(WELL-\d+)")
    df["run_ts"] = df["file"].str.extract(r"_(\d{14})")
    df["run_ts"] = pd.to_datetime(df["run_ts"], format="%Y%m%d%H%M%S", errors="coerce")
    return df.sort_values(["event_type_code","source","well_id","run_ts"]).reset_index(drop=True)

df_files = build_file_index(BASE)
df_w_files = df_files[df_files["source"]=="WELL"].copy()
assert df_w_files["well_id"].notna().all()


print("Total files:", len(df_files), "| WELL files:", len(df_w_files), "| Wells:", df_w_files["well_id"].nunique())
display(df_w_files["event_type_code"].value_counts().sort_index())


Total files: 2228 | WELL files: 1119 | Wells: 40


event_type_code
0    594
1      4
2     22
3     32
4    343
5     11
6      6
7     36
8     14
9     57
Name: count, dtype: int64

In [3]:
VAR_RENAME = {
    "ABER-CKGL": "gl_choke_opening_pct",
    "ABER-CKP":  "prod_choke_opening_pct",
    "ESTADO-DHSV":   "dhsv_state",
    "ESTADO-M1":     "prod_master_valve_state",
    "ESTADO-M2":     "ann_master_valve_state",
    "ESTADO-PXO":    "pig_crossover_valve_state",
    "ESTADO-SDV-GL": "gl_shutdown_valve_state",
    "ESTADO-SDV-P":  "prod_shutdown_valve_state",
    "ESTADO-W1":     "prod_wing_valve_state",
    "ESTADO-W2":     "ann_wing_valve_state",
    "ESTADO-XO":     "crossover_valve_state",
    "P-ANULAR":     "annulus_pressure_pa",
    "P-JUS-BS":     "svc_pump_downstream_pressure_pa",
    "P-JUS-CKGL":   "gl_choke_downstream_pressure_pa",
    "P-JUS-CKP":    "prod_choke_downstream_pressure_pa",
    "P-MON-CKGL":   "gl_choke_upstream_pressure_pa",
    "P-MON-CKP":    "prod_choke_upstream_pressure_pa",
    "P-MON-SDV-P":  "prod_sdv_upstream_pressure_pa",
    "P-PDG":        "pdg_downhole_pressure_pa",
    "PT-P":         "xmas_tree_prod_line_pressure_pa",
    "P-TPT":        "tpt_pressure_pa",
    "QBS": "svc_pump_flow_m3s",
    "QGL": "gas_lift_flow_m3s",
    "T-JUS-CKP": "prod_choke_downstream_temp_c",
    "T-MON-CKP": "prod_choke_upstream_temp_c",
    "T-PDG":     "pdg_downhole_temp_c",
    "T-TPT":     "tpt_temp_c",
    "class": "class_code",
    "state": "state_code",
}

EVENT_TYPE_CODE_TO_NAME = {
    0:"Normal Operation", 1:"Abrupt Increase of BSW", 2:"Spurious Closure of DHSV",
    3:"Severe Slugging", 4:"Flow Instability", 5:"Rapid Productivity Loss",
    6:"Quick Restriction in PCK", 7:"Scaling in PCK",
    8:"Hydrate in Production Line", 9:"Hydrate in Service Line",
}

LABEL_COLS = {"class_code", "state_code", "class_label", "state_label"}

def clean_3w_instance(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if "timestamp" in df.columns:
        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
        df = df.set_index("timestamp")
    else:
        df.index = pd.to_datetime(df.index, errors="coerce")

    df = df[~df.index.isna()].sort_index()
    df.index.name = "timestamp"
    df = df.rename(columns=VAR_RENAME)

    for c in df.columns:
        if c in ("class_code", "state_code"):
            df[c] = pd.to_numeric(df[c], errors="coerce").astype("Int16")
        else:
            df[c] = pd.to_numeric(df[c], errors="coerce").astype("float64")
    return df

def summarize_timeseries_v3_1(df_clean: pd.DataFrame, frac: float = 0.1, state_max: int = 3) -> dict:
    sensors = df_clean.drop(columns=list(LABEL_COLS), errors="ignore")
    num = sensors.select_dtypes(include=[np.number])

    out = {
        "n_obs": int(len(df_clean)),
        "duration_s": float((df_clean.index.max() - df_clean.index.min()).total_seconds())
                      if len(df_clean) else np.nan,
    }
    if num.shape[1] == 0 or len(num) == 0:
        return out

    state_cols = [c for c in num.columns if c.endswith("_state")]
    cont_cols  = [c for c in num.columns if c not in state_cols]

    if len(cont_cols):
        cont = num[cont_cols]
        med_raw = cont.median()
        iqr_raw = (cont.quantile(0.75) - cont.quantile(0.25))

        iqr = iqr_raw.replace(0, np.nan)
        z = (cont - med_raw) / iqr

        k = max(1, int(len(z) * frac))
        first = z.iloc[:k].mean()
        last  = z.iloc[-k:].mean()

        agg = z.agg(["mean","std","min","max"]).T
        miss = cont.isna().mean()

        for col in cont_cols:
            out[f"{col}__raw_median"] = med_raw[col]
            out[f"{col}__raw_iqr"]    = iqr_raw[col]
            s = cont[col].dropna()
            out[f"{col}__raw_last"] = s.iloc[-1] if len(s) else np.nan

            out[f"{col}__z_mean"] = agg.loc[col, "mean"]
            out[f"{col}__z_std"]  = agg.loc[col, "std"]
            out[f"{col}__z_min"]  = agg.loc[col, "min"]
            out[f"{col}__z_max"]  = agg.loc[col, "max"]
            out[f"{col}__z_last"] = z[col].iloc[-1]

            out[f"{col}__delta_last_first"] = (last[col] - first[col])
            out[f"{col}__abs_delta"]        = abs(last[col] - first[col])
            out[f"{col}__missing_frac"]     = miss[col]

    if len(state_cols):
        st = num[state_cols]
        for col in state_cols:
            s = st[col]
            s_non = s.dropna()

            out[f"{col}__missing_frac"] = float(s.isna().mean())
            out[f"{col}__last"] = float(s_non.iloc[-1]) if len(s_non) else np.nan

            if len(s_non) >= 2:
                n_trans = int((s_non != s_non.shift()).sum() - 1)
            else:
                n_trans = 0

            out[f"{col}__n_transitions"] = n_trans
            out[f"{col}__transitions_rate"] = n_trans / max(1, len(s_non))

            known = 0.0
            for v in range(state_max + 1):
                p = float((s_non == v).mean()) if len(s_non) else np.nan
                out[f"{col}__p_state_{v}"] = p
                if not np.isnan(p):
                    known += p
            out[f"{col}__p_state_other"] = (1.0 - known) if len(s_non) else np.nan

    return out


In [4]:
def build_row_per_file_dataset(df_files: pd.DataFrame, n_files: int | None = None, random_state: int = 42) -> pd.DataFrame:
    if n_files is None:
        sample = df_files.reset_index(drop=True)
    else:
        sample = df_files.sample(n_files, random_state=random_state).reset_index(drop=True)

    rows = []
    for _, r in tqdm(sample.iterrows(), total=len(sample), desc="Building WELL dataset"):
        df_raw = pd.read_parquet(r["path"])
        df_clean = clean_3w_instance(df_raw)

        feats = summarize_timeseries_v3_1(df_clean)
        feats["event_type_code"] = int(r["event_type_code"])
        feats["event_type_name"] = EVENT_TYPE_CODE_TO_NAME.get(int(r["event_type_code"]), "Unknown")
        feats["well_id"] = r["well_id"]
        feats["run_ts"] = r["run_ts"]
        feats["file"] = r["file"]
        rows.append(feats)

    return pd.DataFrame(rows)

if os.path.exists(CACHE_PATH):
    df_ml_well = pd.read_parquet(CACHE_PATH)
    print("Loaded cached features:", df_ml_well.shape)
else:
    df_ml_well = build_row_per_file_dataset(df_w_files, n_files=N_WELL_FILES, random_state=RANDOM_STATE)
    df_ml_well.to_parquet(CACHE_PATH, index=False)
    print("Built + saved features:", df_ml_well.shape)

with open(f"{OUT_DIR}/dataset_config.json", "w") as f:
    json.dump({
        "base": BASE,
        "random_state": RANDOM_STATE,
        "n_well_files_used": int(len(df_ml_well)),
        "features_version": "row_per_file_v3_1",
        "notes": "continuous: raw median/iqr/last + robust-z stats + deltas; state: last + transitions + proportions (+other); WELL-only"
    }, f, indent=2)


Building WELL dataset:   0%|          | 0/1119 [00:00<?, ?it/s]

Built + saved features: (1119, 286)


In [5]:
def make_Xy_groups(df: pd.DataFrame):
    y = df["event_type_code"].copy()
    groups = df["well_id"].copy()

    drop_cols = ["event_type_code","event_type_name","file","run_ts","well_id"]
    X = df.drop(columns=drop_cols, errors="ignore").copy()

    X = X.replace([np.inf, -np.inf], np.nan)

    X = X.drop(columns=X.columns[X.isna().all()])          # all-missing
    miss = X.isna().mean()
    X = X.drop(columns=miss[miss > 0.98].index)            # very high missing
    const_cols = [c for c in X.columns if X[c].nunique(dropna=True) <= 1]
    X = X.drop(columns=const_cols)                          # constants

    return X, y, groups

X, y, groups = make_Xy_groups(df_ml_well)
print("X:", X.shape, "| y:", y.shape, "| wells:", groups.nunique())
display(y.value_counts().sort_index())


X: (1119, 182) | y: (1119,) | wells: 40


event_type_code
0    594
1      4
2     22
3     32
4    343
5     11
6      6
7     36
8     14
9     57
Name: count, dtype: int64

In [6]:
def eval_repeated_group_shuffle(model, X, y, groups, repeats=30, test_size=0.2):
    gss = GroupShuffleSplit(n_splits=repeats, test_size=test_size, random_state=42)
    major_classes = set(y.value_counts()[lambda s: s >= 10].index)

    f1s, f1_major, f1_bin = [], [], []
    for tr, te in gss.split(X, y, groups=groups):
        Xtr, Xte = X.iloc[tr], X.iloc[te]
        ytr, yte = y.iloc[tr], y.iloc[te]

        model.fit(Xtr, ytr)
        pred = model.predict(Xte)

        f1s.append(f1_score(yte, pred, average="macro"))

        mask = yte.isin(major_classes)
        f1_major.append(f1_score(yte[mask], pred[mask], average="macro") if mask.sum() else np.nan)

        yte_bin = (yte != 0).astype(int)
        pred_bin = (pred != 0).astype(int)
        f1_bin.append(f1_score(yte_bin, pred_bin))

    return {
        "macro_f1_mean": float(np.nanmean(f1s)),
        "macro_f1_std":  float(np.nanstd(f1s)),
        "major_macro_f1_mean": float(np.nanmean(f1_major)),
        "fault_vs_normal_f1_mean": float(np.nanmean(f1_bin)),
    }

logreg = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=0, add_indicator=True)),
    ("scaler", StandardScaler(with_mean=False)),
    ("clf", LogisticRegression(max_iter=8000, class_weight="balanced"))
])

class HGBWrapper:
    def __init__(self,
                 max_depth=6,
                 learning_rate=0.06,
                 max_iter=700,
                 max_leaf_nodes=31,
                 min_samples_leaf=20,
                 l2_regularization=0.1):
        self.clf = HistGradientBoostingClassifier(
            max_depth=max_depth,
            learning_rate=learning_rate,
            max_iter=max_iter,
            max_leaf_nodes=max_leaf_nodes,
            min_samples_leaf=min_samples_leaf,
            l2_regularization=l2_regularization
        )

    def fit(self, X, y):
        classes = np.unique(y)
        cw = compute_class_weight(class_weight="balanced", classes=classes, y=y)
        w = pd.Series(y).map(dict(zip(classes, cw))).to_numpy()
        self.clf.fit(X, y, sample_weight=w)
        return self

    def predict(self, X):
        return self.clf.predict(X.fillna(0))

hgb = HGBWrapper()

print("LogReg:", eval_repeated_group_shuffle(logreg, X, y, groups, repeats=30))
print("HGB   :", eval_repeated_group_shuffle(hgb, X, y, groups, repeats=30))


LogReg: {'macro_f1_mean': 0.22398705848338343, 'macro_f1_std': 0.09435939619898277, 'major_macro_f1_mean': 0.24794970780289896, 'fault_vs_normal_f1_mean': 0.6073600789459409}
HGB   : {'macro_f1_mean': 0.4331954202538017, 'macro_f1_std': 0.15897862328939008, 'major_macro_f1_mean': 0.4594612367402339, 'fault_vs_normal_f1_mean': 0.8007543910780439}


## Results (Current)
- Logistic Regression (repeated group shuffle, 30): macro-F1 ≈ 0.22, fault-vs-normal F1 ≈ 0.61
- HistGradientBoosting (repeated group shuffle, 30): macro-F1 ≈ 0.42, fault-vs-normal F1 ≈ 0.76
- Note: Macro-F1 is unstable because some classes have very few samples (e.g., class 1 has 4 files).
