In [3]:
import pandas as pd
import numpy as np
from pathlib import Path

DATA_DIR = Path(".")
IN_CSV   = DATA_DIR / "CTG_clean.csv"    # change if needed
OUT_CSV  = DATA_DIR / "CTG_features.csv" # engineered features

df = pd.read_csv(IN_CSV)
print(df.shape)
print(df.columns.tolist()[:30])
print(df["NSP"].value_counts().sort_index())  # class distribution


(2118, 38)
['filename', 'date', 'segfile', 'b', 'e', 'lbe', 'lb', 'ac', 'fm', 'uc', 'astv', 'mstv', 'altv', 'mltv', 'dl', 'ds', 'dp', 'dr', 'width', 'min', 'max', 'nmax', 'nzeros', 'mode', 'mean', 'median', 'variance', 'tendency', 'a', 'c']
NSP
1.0    1648
2.0     293
3.0     175
Name: count, dtype: int64


In [4]:
def pick(df, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    return None

colmap = {
    "baseline": pick(df, ["lb", "baseline", "mean_fhr", "fhr_baseline", "b"]),
    "ac":       pick(df, ["ac", "accelerations"]),
    "dl":       pick(df, ["dl", "decelerations_late", "late_decelerations"]),
    "ds":       pick(df, ["ds", "decelerations_short", "short_decelerations"]),
    "dp":       pick(df, ["dp", "decelerations_prolonged", "prolonged_decelerations"]),
    "uc":       pick(df, ["uc", "uterine_contractions"]),
    "mstv":     pick(df, ["mstv"]),
    "altv":     pick(df, ["altv"]),
    "mltv":     pick(df, ["mltv"]),
    "width":    pick(df, ["width"]),
    "min":      pick(df, ["min"]),
    "max":      pick(df, ["max"]),
}

colmap

{'baseline': 'lb',
 'ac': 'ac',
 'dl': 'dl',
 'ds': 'ds',
 'dp': 'dp',
 'uc': 'uc',
 'mstv': 'mstv',
 'altv': 'altv',
 'mltv': 'mltv',
 'width': 'width',
 'min': 'min',
 'max': 'max'}

In [5]:
df_feat = df.copy()
eps = 1e-6

if colmap["baseline"] is not None:
    base = colmap["baseline"]
    df_feat["feat_tachycardia"] = (df_feat[base] > 160).astype(int)
    df_feat["feat_bradycardia"] = (df_feat[base] < 110).astype(int)

if colmap["altv"] is not None and colmap["mstv"] is not None:
    df_feat["feat_variability_ratio"] = df_feat[colmap["altv"]] / (df_feat[colmap["mstv"]] + eps)

if colmap["ac"] is not None:
    decel_parts = [k for k in ["dl", "ds", "dp"] if colmap[k] is not None]
    if decel_parts:
        df_feat["feat_accel_decel_balance"] = (df_feat[colmap["ac"]] + 1.0) / (
            df_feat[[colmap[k] for k in decel_parts]].sum(axis=1) + 1.0
        )

if colmap["uc"] is not None:
    decel_parts = [k for k in ["dl", "ds", "dp"] if colmap[k] is not None]
    if decel_parts:
        df_feat["feat_decels_per_contraction"] = (
            df_feat[[colmap[k] for k in decel_parts]].sum(axis=1) / (df_feat[colmap["uc"]] + 1.0)
        )

if colmap["max"] is not None and colmap["min"] is not None:
    df_feat["feat_range"] = df_feat[colmap["max"]] - df_feat[colmap["min"]]

if colmap["width"] is not None and colmap["mstv"] is not None:
    df_feat["feat_instability_proxy"] = df_feat[colmap["width"]] / (df_feat[colmap["mstv"]] + 1.0)

num_cols = df_feat.select_dtypes(include=[np.number]).columns
df_feat[num_cols] = df_feat[num_cols].replace([np.inf, -np.inf], np.nan)

for c in ["feat_variability_ratio","feat_accel_decel_balance","feat_decels_per_contraction",
          "feat_range","feat_instability_proxy"]:
    if c in df_feat.columns and df_feat[c].isna().any():
        df_feat[c] = df_feat[c].fillna(df_feat[c].median())

df_feat.shape, [c for c in df_feat.columns if c.startswith("feat_")]


((2118, 45),
 ['feat_tachycardia',
  'feat_bradycardia',
  'feat_variability_ratio',
  'feat_accel_decel_balance',
  'feat_decels_per_contraction',
  'feat_range',
  'feat_instability_proxy'])

In [6]:
df_feat[[c for c in df_feat.columns if c.startswith("feat_")]].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
feat_tachycardia,2118.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
feat_bradycardia,2118.0,0.003305,0.057408,0.0,0.0,0.0,0.0,1.0
feat_variability_ratio,2118.0,23.923173,60.324052,0.0,0.0,0.0,13.333317,454.997725
feat_accel_decel_balance,2118.0,2.542107,3.080302,0.052632,0.8,1.0,3.0,22.0
feat_decels_per_contraction,2118.0,0.378225,0.712147,0.0,0.0,0.0,0.543706,11.0
feat_range,2118.0,70.486308,38.984821,3.0,37.0,68.0,100.0,180.0
feat_instability_proxy,2118.0,29.634461,13.686222,2.5,18.947368,28.0,38.367117,78.75


In [7]:
df_feat.to_csv(OUT_CSV, index=False)
print("Saved features ->", OUT_CSV.resolve())

df_feat.head(10)

Saved features -> C:\Users\bang kai\Desktop\Datathon\CTG_features.csv


Unnamed: 0,filename,date,segfile,b,e,lbe,lb,ac,fm,uc,...,susp,class,NSP,feat_tachycardia,feat_bradycardia,feat_variability_ratio,feat_accel_decel_balance,feat_decels_per_contraction,feat_range,feat_instability_proxy
0,,849398400000000000,,240.0,357.0,120.0,120.0,0.0,0.0,0.0,...,0.0,9.0,2.0,0,0,85.999828,1.0,0.0,64.0,42.666667
1,,831081600000000000,,5.0,632.0,132.0,132.0,4.0,0.0,4.0,...,0.0,6.0,1.0,0,0,0.0,1.666667,0.4,130.0,41.935484
2,,831081600000000000,,177.0,779.0,133.0,133.0,2.0,0.0,5.0,...,0.0,6.0,1.0,0,0,0.0,1.0,0.333333,130.0,41.935484
3,,831081600000000000,,411.0,1192.0,134.0,134.0,2.0,0.0,6.0,...,0.0,6.0,1.0,0,0,0.0,1.0,0.285714,117.0,34.411765
4,,831081600000000000,,533.0,1147.0,132.0,132.0,4.0,0.0,5.0,...,0.0,2.0,1.0,0,0,0.0,5.0,0.0,117.0,34.411765
5,,831081600000000000,,0.0,953.0,134.0,134.0,1.0,0.0,10.0,...,0.0,8.0,3.0,0,0,0.0,0.166667,1.0,150.0,21.73913
6,,831081600000000000,,240.0,953.0,134.0,134.0,1.0,0.0,9.0,...,0.0,8.0,3.0,0,0,0.0,0.222222,0.8,150.0,20.547945
7,,793411200000000000,,62.0,679.0,122.0,122.0,0.0,0.0,0.0,...,0.0,9.0,3.0,0,0,11.999976,1.0,0.0,68.0,45.333333
8,,793411200000000000,,120.0,779.0,122.0,122.0,0.0,0.0,1.0,...,0.0,9.0,3.0,0,0,9.99998,1.0,0.0,68.0,45.333333
9,,793411200000000000,,181.0,1192.0,122.0,122.0,0.0,0.0,3.0,...,0.0,9.0,3.0,0,0,19.999933,1.0,0.0,68.0,52.307692
