In [1]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import os, re, json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sklearn.model_selection import StratifiedGroupKFold, GroupShuffleSplit
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.utils.class_weight import compute_class_weight

BASE = "/kaggle/input/3w-dataset/2.0.0"
RANDOM_STATE = 42

# خليه None عشان يبني كل WELL files (أفضل)
# لو عايزة تجربة سريعة: حطي مثلا 400
N_WELL_FILES = None

OUT_DIR = "/kaggle/working/3w_prepared_v3_1"
os.makedirs(OUT_DIR, exist_ok=True)

print("OUT_DIR:", OUT_DIR)


OUT_DIR: /kaggle/working/3w_prepared_v3_1


In [3]:
def build_file_index(base: str) -> pd.DataFrame:
    paths = []
    for root, _, files in os.walk(base):
        for f in files:
            if f.endswith(".parquet"):
                paths.append(os.path.join(root, f))

    df = pd.DataFrame({"path": paths})

    df["event_type_code"] = df["path"].str.extract(r"/2\.0\.0/(\d+)/").astype(int)
    df["file"] = df["path"].str.split("/").str[-1]
    df["source"] = df["file"].str.extract(r"^(WELL|SIMULATED|DRAWN)")
    df["well_id"] = df["file"].str.extract(r"(WELL-\d+)")
    df["run_ts"] = df["file"].str.extract(r"_(\d{14})")
    df["run_ts"] = pd.to_datetime(df["run_ts"], format="%Y%m%d%H%M%S", errors="coerce")

    df = df.sort_values(["event_type_code","source","well_id","run_ts"]).reset_index(drop=True)
    return df

df_files = build_file_index(BASE)

print("Num files:", len(df_files))
display(df_files["source"].value_counts(dropna=False))
display(df_files["event_type_code"].value_counts().sort_index())

df_w_files = df_files[df_files["source"]=="WELL"].reset_index(drop=True)
print("WELL files:", len(df_w_files))


Num files: 2228


source
WELL         1119
SIMULATED    1089
DRAWN          20
Name: count, dtype: int64

event_type_code
0    594
1    128
2     38
3    106
4    343
5    450
6    221
7     46
8     95
9    207
Name: count, dtype: int64

WELL files: 1119


In [4]:
VAR_RENAME = {
    "ABER-CKGL": "gl_choke_opening_pct",
    "ABER-CKP":  "prod_choke_opening_pct",
    "ESTADO-DHSV":   "dhsv_state",
    "ESTADO-M1":     "prod_master_valve_state",
    "ESTADO-M2":     "ann_master_valve_state",
    "ESTADO-PXO":    "pig_crossover_valve_state",
    "ESTADO-SDV-GL": "gl_shutdown_valve_state",
    "ESTADO-SDV-P":  "prod_shutdown_valve_state",
    "ESTADO-W1":     "prod_wing_valve_state",
    "ESTADO-W2":     "ann_wing_valve_state",
    "ESTADO-XO":     "crossover_valve_state",
    "P-ANULAR":     "annulus_pressure_pa",
    "P-JUS-BS":     "svc_pump_downstream_pressure_pa",
    "P-JUS-CKGL":   "gl_choke_downstream_pressure_pa",
    "P-JUS-CKP":    "prod_choke_downstream_pressure_pa",
    "P-MON-CKGL":   "gl_choke_upstream_pressure_pa",
    "P-MON-CKP":    "prod_choke_upstream_pressure_pa",
    "P-MON-SDV-P":  "prod_sdv_upstream_pressure_pa",
    "P-PDG":        "pdg_downhole_pressure_pa",
    "PT-P":         "xmas_tree_prod_line_pressure_pa",
    "P-TPT":        "tpt_pressure_pa",
    "QBS": "svc_pump_flow_m3s",
    "QGL": "gas_lift_flow_m3s",
    "T-JUS-CKP": "prod_choke_downstream_temp_c",
    "T-MON-CKP": "prod_choke_upstream_temp_c",
    "T-PDG":     "pdg_downhole_temp_c",
    "T-TPT":     "tpt_temp_c",
    "class": "class_code",
    "state": "state_code",
}

EVENT_TYPE_CODE_TO_NAME = {
    0:"Normal Operation", 1:"Abrupt Increase of BSW", 2:"Spurious Closure of DHSV",
    3:"Severe Slugging", 4:"Flow Instability", 5:"Rapid Productivity Loss",
    6:"Quick Restriction in PCK", 7:"Scaling in PCK",
    8:"Hydrate in Production Line", 9:"Hydrate in Service Line",
}

LABEL_COLS = {"class_code", "state_code", "class_label", "state_label"}

def clean_3w_instance(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    if "timestamp" in df.columns:
        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
        df = df.set_index("timestamp")
    else:
        df.index = pd.to_datetime(df.index, errors="coerce")

    df = df[~df.index.isna()].sort_index()
    df.index.name = "timestamp"

    df = df.rename(columns=VAR_RENAME)

    for c in df.columns:
        if c in ("class_code", "state_code"):
            df[c] = pd.to_numeric(df[c], errors="coerce").astype("Int16")
        else:
            df[c] = pd.to_numeric(df[c], errors="coerce").astype("float64")

    return df


In [5]:
def summarize_timeseries_v3_1(df_clean: pd.DataFrame, frac: float = 0.1, state_max: int = 3) -> dict:
    sensors = df_clean.drop(columns=list(LABEL_COLS), errors="ignore")
    num = sensors.select_dtypes(include=[np.number])

    out = {
        "n_obs": int(len(df_clean)),
        "duration_s": float((df_clean.index.max() - df_clean.index.min()).total_seconds())
                      if len(df_clean) else np.nan,
    }
    if num.shape[1] == 0 or len(num) == 0:
        return out

    # ✅ الصح: state columns حسب الاسم
    state_cols = [c for c in num.columns if c.endswith("_state")]
    cont_cols  = [c for c in num.columns if c not in state_cols]

    # ---------- continuous features ----------
    if len(cont_cols):
        cont = num[cont_cols]

        # raw features (absolute level across wells)
        med_raw = cont.median()
        iqr_raw = (cont.quantile(0.75) - cont.quantile(0.25))

        # robust-z within file
        iqr = iqr_raw.replace(0, np.nan)
        z = (cont - med_raw) / iqr

        k = max(1, int(len(z) * frac))
        first = z.iloc[:k].mean()
        last  = z.iloc[-k:].mean()

        agg = z.agg(["mean","std","min","max"]).T
        miss = cont.isna().mean()

        for col in cont_cols:
            out[f"{col}__raw_median"] = med_raw[col]
            out[f"{col}__raw_iqr"]    = iqr_raw[col]
            out[f"{col}__raw_last"]   = cont[col].iloc[-1]

            out[f"{col}__z_mean"] = agg.loc[col, "mean"]
            out[f"{col}__z_std"]  = agg.loc[col, "std"]
            out[f"{col}__z_min"]  = agg.loc[col, "min"]
            out[f"{col}__z_max"]  = agg.loc[col, "max"]
            out[f"{col}__z_last"] = z[col].iloc[-1]

            out[f"{col}__delta_last_first"] = (last[col] - first[col])
            out[f"{col}__abs_delta"]        = abs(last[col] - first[col])
            out[f"{col}__missing_frac"]     = miss[col]

    # ---------- state features ----------
    if len(state_cols):
        st = num[state_cols]
        for col in state_cols:
            s = st[col]
            s_non = s.dropna()

            out[f"{col}__missing_frac"] = float(s.isna().mean())
            out[f"{col}__last"] = float(s_non.iloc[-1]) if len(s_non) else np.nan

            if len(s_non) >= 2:
                n_trans = int((s_non != s_non.shift()).sum() - 1)
            else:
                n_trans = 0

            out[f"{col}__n_transitions"] = n_trans
            out[f"{col}__transitions_rate"] = n_trans / max(1, len(s_non))

            # proportions for known states
            known = 0.0
            for v in range(state_max + 1):
                p = float((s_non == v).mean()) if len(s_non) else np.nan
                out[f"{col}__p_state_{v}"] = p
                if not np.isnan(p):
                    known += p

            # ✅ any other state goes here
            out[f"{col}__p_state_other"] = (1.0 - known) if len(s_non) else np.nan


    return out


In [6]:
def build_row_per_file_dataset(df_files: pd.DataFrame, n_files: int | None = None, random_state: int = 42) -> pd.DataFrame:
    if n_files is None:
        sample = df_files.reset_index(drop=True)
    else:
        sample = df_files.sample(n_files, random_state=random_state).reset_index(drop=True)

    rows = []
    for _, r in tqdm(sample.iterrows(), total=len(sample), desc="Building WELL dataset"):
        df_raw = pd.read_parquet(r["path"])
        df_clean = clean_3w_instance(df_raw)

        feats = summarize_timeseries_v3_1(df_clean)

        feats["event_type_code"] = int(r["event_type_code"])
        feats["event_type_name"] = EVENT_TYPE_CODE_TO_NAME.get(int(r["event_type_code"]), "Unknown")
        feats["source"] = r["source"]
        feats["well_id"] = r["well_id"]
        feats["run_ts"] = r["run_ts"]
        feats["file"] = r["file"]
        rows.append(feats)

    return pd.DataFrame(rows)

df_ml_well = build_row_per_file_dataset(df_w_files, n_files=N_WELL_FILES, random_state=RANDOM_STATE)

print("df_ml_well shape:", df_ml_well.shape)
display(df_ml_well["event_type_code"].value_counts().sort_index())

# Save
df_w_files.to_parquet(f"{OUT_DIR}/df_w_files.parquet", index=False)
df_ml_well.to_parquet(f"{OUT_DIR}/df_ml_well_v3_1.parquet", index=False)

config = {
    "base": BASE,
    "random_state": RANDOM_STATE,
    "n_well_files_used": int(len(df_ml_well)),
    "features_version": "row_per_file_v3_1",
    "notes": "continuous: raw median/iqr/last + robust-z stats + deltas; state: last + transitions + proportions; WELL-only"
}
with open(f"{OUT_DIR}/dataset_config.json", "w") as f:
    json.dump(config, f, indent=2)

print("Saved to:", OUT_DIR)


Building WELL dataset:   0%|          | 0/1119 [00:00<?, ?it/s]

df_ml_well shape: (1119, 287)


event_type_code
0    594
1      4
2     22
3     32
4    343
5     11
6      6
7     36
8     14
9     57
Name: count, dtype: int64

Saved to: /kaggle/working/3w_prepared_v3_1


In [7]:
def make_Xy_groups(df: pd.DataFrame):
    y = df["event_type_code"].copy()
    groups = df["well_id"].copy()

    drop_cols = ["event_type_code","event_type_name","file","run_ts","well_id","source"]
    X = df.drop(columns=drop_cols, errors="ignore").copy()

    X = X.replace([np.inf, -np.inf], np.nan)

    # drop global all-missing
    X = X.drop(columns=X.columns[X.isna().all()])

    # drop very high missing
    miss = X.isna().mean()
    X = X.drop(columns=miss[miss > 0.98].index)

    # drop constants
    const_cols = [c for c in X.columns if X[c].nunique(dropna=True) <= 1]
    X = X.drop(columns=const_cols)

    return X, y, groups

X, y, groups = make_Xy_groups(df_ml_well)
print("X shape:", X.shape, "y:", y.shape, "num wells:", groups.nunique())


X shape: (1119, 182) y: (1119,) num wells: 40


In [8]:
def eval_sgkf(model, X, y, groups, n_splits=3):
    sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)

    major_classes = set(y.value_counts()[lambda s: s >= 10].index)

    f1s, f1_major, f1_bin = [], [], []
    for tr, te in sgkf.split(X, y, groups=groups):
        Xtr, Xte = X.iloc[tr], X.iloc[te]
        ytr, yte = y.iloc[tr], y.iloc[te]

        model.fit(Xtr, ytr)
        pred = model.predict(Xte)

        f1s.append(f1_score(yte, pred, average="macro"))

        mask = yte.isin(major_classes)
        f1_major.append(f1_score(yte[mask], pred[mask], average="macro") if mask.sum() else np.nan)

        yte_bin = (yte != 0).astype(int)
        pred_bin = (pred != 0).astype(int)
        f1_bin.append(f1_score(yte_bin, pred_bin))

    return {
        "macro_f1_mean": float(np.nanmean(f1s)),
        "macro_f1_std":  float(np.nanstd(f1s)),
        "major_macro_f1_mean": float(np.nanmean(f1_major)),
        "fault_vs_normal_f1_mean": float(np.nanmean(f1_bin)),
    }

def eval_repeated_group_shuffle(model, X, y, groups, repeats=30, test_size=0.2):
    gss = GroupShuffleSplit(n_splits=repeats, test_size=test_size, random_state=42)

    major_classes = set(y.value_counts()[lambda s: s >= 10].index)

    f1s, f1_major, f1_bin = [], [], []
    for tr, te in gss.split(X, y, groups=groups):
        Xtr, Xte = X.iloc[tr], X.iloc[te]
        ytr, yte = y.iloc[tr], y.iloc[te]

        model.fit(Xtr, ytr)
        pred = model.predict(Xte)

        f1s.append(f1_score(yte, pred, average="macro"))

        mask = yte.isin(major_classes)
        f1_major.append(f1_score(yte[mask], pred[mask], average="macro") if mask.sum() else np.nan)

        yte_bin = (yte != 0).astype(int)
        pred_bin = (pred != 0).astype(int)
        f1_bin.append(f1_score(yte_bin, pred_bin))

    return {
        "macro_f1_mean": float(np.nanmean(f1s)),
        "macro_f1_std":  float(np.nanstd(f1s)),
        "major_macro_f1_mean": float(np.nanmean(f1_major)),
        "fault_vs_normal_f1_mean": float(np.nanmean(f1_bin)),
    }


In [9]:
# Logistic baseline (robust to missing)
logreg = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=0, add_indicator=True)),
    ("scaler", StandardScaler(with_mean=False)),
    ("clf", LogisticRegression(max_iter=8000, class_weight="balanced"))
])

# HGB with class-weights via sample_weight
class HGBWrapper:
    def __init__(self, max_depth=6, learning_rate=0.08, max_iter=400):
        self.clf = HistGradientBoostingClassifier(
    max_depth=6,
    learning_rate=0.06,
    max_iter=700,
    max_leaf_nodes=31,
    min_samples_leaf=20,
    l2_regularization=0.1
)

    def fit(self, X, y):
        classes = np.unique(y)
        cw = compute_class_weight(class_weight="balanced", classes=classes, y=y)
        w = pd.Series(y).map(dict(zip(classes, cw))).to_numpy()
        # HGB doesn't accept NaN in older setups sometimes -> fillna(0) safe
        self.clf.fit(X.fillna(0), y, sample_weight=w)
        return self
    def predict(self, X):
        return self.clf.predict(X.fillna(0))

hgb = HGBWrapper(max_depth=6, learning_rate=0.08, max_iter=500)

print("=== SGKF(3) ===")
print("LogReg:", eval_sgkf(logreg, X, y, groups, n_splits=3))
print("HGB   :", eval_sgkf(hgb, X, y, groups, n_splits=3))

print("\n=== Repeated GroupShuffle (30) ===")
print("LogReg:", eval_repeated_group_shuffle(logreg, X, y, groups, repeats=30))
print("HGB   :", eval_repeated_group_shuffle(hgb, X, y, groups, repeats=30))


=== SGKF(3) ===
LogReg: {'macro_f1_mean': 0.2132743351703038, 'macro_f1_std': 0.020872956678484964, 'major_macro_f1_mean': 0.24202320934891675, 'fault_vs_normal_f1_mean': 0.5400034564961553}
HGB   : {'macro_f1_mean': 0.3897749108071133, 'macro_f1_std': 0.13416882763152274, 'major_macro_f1_mean': 0.43790063841762167, 'fault_vs_normal_f1_mean': 0.7405608715894342}

=== Repeated GroupShuffle (30) ===
LogReg: {'macro_f1_mean': 0.2220844412708708, 'macro_f1_std': 0.09474106156164713, 'major_macro_f1_mean': 0.24584133361326016, 'fault_vs_normal_f1_mean': 0.6082033788273894}
HGB   : {'macro_f1_mean': 0.41743879159789393, 'macro_f1_std': 0.153304009843824, 'major_macro_f1_mean': 0.4371282389527981, 'fault_vs_normal_f1_mean': 0.7646731967383822}


In [10]:
def debug_sgkf_folds(model, X, y, groups, n_splits=3):
    sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_scores = []
    for i, (tr, te) in enumerate(sgkf.split(X, y, groups=groups), 1):
        model.fit(X.iloc[tr], y.iloc[tr])
        pred = model.predict(X.iloc[te])
        f1 = f1_score(y.iloc[te], pred, average="macro")
        fold_scores.append(f1)
        print(f"Fold {i} macro-F1 = {f1:.3f} | test size={len(te)} | unique classes in test={y.iloc[te].nunique()}")
    print("Mean:", np.mean(fold_scores), "Std:", np.std(fold_scores))

debug_sgkf_folds(hgb, X, y, groups, n_splits=3)


Fold 1 macro-F1 = 0.521 | test size=180 | unique classes in test=6
Fold 2 macro-F1 = 0.443 | test size=422 | unique classes in test=9
Fold 3 macro-F1 = 0.206 | test size=517 | unique classes in test=10
Mean: 0.3897749108071133 Std: 0.13416882763152274
