In [23]:
import os, json                           # filesystem + metadata
import numpy as np                        # numerics
import pandas as pd                       # dataframes
from sklearn.model_selection import train_test_split   # split before transforms (anti-leakage)
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from joblib import dump                   # persist fitted preprocessor


In [24]:
# Identifier columns: preserved so outputs can be merged later, but EXCLUDED from model features.
POSSIBLE_ID_COLS = [
    "Area Code",
    "Area Name",
    "PatientID",
    "StudyID",
    "Indicator ID",
    "Parent Code",
]

# Aggregated numeric fields that must be non-negative (only checked when present).
NONNEGATIVE_COLS = [
    "Value",
    "Count",
    "Denominator",
    "Lower CI 95.0 limit",
    "Upper CI 95.0 limit",
    "Lower CI 99.8 limit",
    "Upper CI 99.8 limit",
]

# Numeric columns that behave like categories/flags: one-hot encode, do not scale.
LIKELY_NUMERIC_CATEGORICAL = [
    "New data",
    "Compared to goal",
    "Time period Sortable",
]

# Row filtering and split settings.
ROW_MISSING_THRESHOLD = 0.80  # drop rows whose missing fraction across NON-ID columns exceeds this
TEST_SIZE = 0.20              # 80/20 train/test split
RANDOM_STATE = 42             # fixed seed for reproducibility
SHUFFLE = True                # shuffle rows before splitting

# Output folder (created up front so every later write succeeds).
OUT_DIR = "clinical_preml_outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# Output artefacts, all located inside OUT_DIR.
TRAIN_CSV = os.path.join(OUT_DIR, "clinical_train_processed.csv")
TEST_CSV = os.path.join(OUT_DIR, "clinical_test_processed.csv")
TRAIN_IDS_CSV = os.path.join(OUT_DIR, "clinical_train_ids.csv")
TEST_IDS_CSV = os.path.join(OUT_DIR, "clinical_test_ids.csv")
METADATA_JSON = os.path.join(OUT_DIR, "clinical_processing_metadata.json")
PREPROCESSOR_JOBLIB = os.path.join(OUT_DIR, "preprocessor.joblib")


In [25]:
# 1) LOAD DATA
# Path of the merged clinical extract, resolved relative to the working directory.
merged_c_file_path = "merged_clinical_data_V1.csv"
df_raw = pd.read_csv(merged_c_file_path)  # the CSV is read exactly once

# Start the audit-trail dictionary with the untouched dataset's [rows, columns].
meta = {}
meta["Initial shape"] = [*df_raw.shape]

In [26]:
# 2) DROP empty & constant columns (Rule 1)
# "Empty" = every value in the column is NaN.
all_nan_flags = df_raw.isna().all()
empty_cols = df_raw.columns[all_nan_flags].tolist()
df = df_raw.drop(columns=empty_cols)

# "Constant" = at most one distinct value across all rows (NaN counts as a value).
distinct_counts = df.nunique(dropna=False)
constant_cols = distinct_counts.index[distinct_counts <= 1].tolist()
df = df.drop(columns=constant_cols)

# Record what was removed for the audit trail.
meta["dropped_empty_columns"] = empty_cols
meta["dropped_constant_columns"] = constant_cols

In [27]:
# 3) REMOVE exact duplicates (Rule 4)
before = len(df)              # row count prior to de-duplication
is_repeat = df.duplicated()   # True for each repeat of an earlier, fully identical row
df = df[~is_repeat]           # the first occurrence of every row is kept
meta["removed_duplicate_rows"] = before - len(df)

In [28]:
# 4) Identify ID columns (Rules 1,3,12)
# Known identifier columns that are still present after the column drops above.
present_id_cols = [c for c in POSSIBLE_ID_COLS if c in df.columns]

# Heuristic: a column whose values are almost all distinct is probably an
# identifier/timestamp and would leak row identity if used as a feature.
# Continuous measurements are ALSO almost all distinct, so without the two
# exemptions below they would be classified as IDs and silently removed from
# the model features:
#   * columns listed in NONNEGATIVE_COLS are known measurements, never IDs;
#   * float columns containing fractional values are measurements, not codes.
# Float columns holding only whole numbers are still checked, because pandas
# stores integer IDs as float when the column has missing values.
n = len(df)
for c in df.columns:
    if c in present_id_cols or c in NONNEGATIVE_COLS:
        continue
    col = df[c]
    if pd.api.types.is_float_dtype(col):
        non_null = col.dropna()
        if not non_null.mod(1).eq(0).all():
            continue                     # has fractional values → continuous measurement
    uniq = col.nunique(dropna=True)
    if uniq > 0.8 * n:                   # ~unique per row → likely an identifier/timestamp
        present_id_cols.append(c)

present_id_cols = sorted(set(present_id_cols))  # de-dup and sort for stability
meta["id_columns_kept"] = present_id_cols

In [29]:
# 5) Logical range checks (Rule 5)
# Negative entries in the must-be-non-negative fields are blanked to NaN and counted.
for col in (name for name in NONNEGATIVE_COLS if name in df.columns):
    is_negative = df[col] < 0
    neg_n = int(is_negative.sum())
    if neg_n:
        df.loc[is_negative, col] = np.nan
    meta[f"set_negative_to_nan__{col}"] = neg_n

# Ensure CI lower ≤ upper; an inverted pair has BOTH bounds set to NaN (conservative).
# Each entry: (lower-bound column, upper-bound column, metadata key for the count).
ci_checks = (
    ("Lower CI 95.0 limit", "Upper CI 95.0 limit", "ci95_inverted_pairs_set_nan"),
    ("Lower CI 99.8 limit", "Upper CI 99.8 limit", "ci998_inverted_pairs_set_nan"),
)
for lower_col, upper_col, meta_key in ci_checks:
    # A pair is only checked when both of its columns exist.
    if lower_col in df.columns and upper_col in df.columns:
        mask = df[lower_col] > df[upper_col]
        meta[meta_key] = int(mask.sum())
        df.loc[mask, [lower_col, upper_col]] = np.nan

# Enforces logical range constraints on the aggregated numeric fields listed in NONNEGATIVE_COLS:
# - Negative values (e.g., in Value, Count, Denominator, or the CI limits) are set to NaN (to be imputed later).
# - If a confidence-interval lower bound is greater than its upper bound, both are set to NaN (treated as a data entry error).
# - The number of fixes applied is logged in `meta`.

In [30]:
# 6) Minimal categorical tidy (Rule 6)
# Keep regional codes as-is (e.g., Sex); only trim whitespace and turn empty strings into NaN.
# Missing values must stay missing: a plain astype(str) would turn NaN into the
# literal text "nan", hiding missingness from the row filter and the imputer and
# creating a bogus "nan" category when one-hot encoding.
for c in df.select_dtypes(include=["object"]).columns:
    was_present = df[c].notna()                  # remember which cells held real values
    trimmed = df[c].astype(str).str.strip()      # stringify + trim every cell
    # Keep the trimmed text only for cells that were present and are not blank.
    df[c] = trimmed.where(was_present & (trimmed != ""), np.nan)

#Cleans string columns:
#Strips extra spaces.
#Converts empty strings to NaN.
#No remapping of regional categorical codes (keeps “Sex” as local format).

In [31]:
# 7) Drop rows with >80% missing across NON-ID columns (Rule 2)
# ID columns are ignored here: only potential features count towards missingness.
id_lookup = set(present_id_cols)
feature_candidates = [c for c in df.columns if c not in id_lookup]

# Per-row fraction of missing feature values, then flag rows strictly above the threshold.
row_missing_frac = df[feature_candidates].isna().mean(axis="columns")
drop_mask = row_missing_frac.gt(ROW_MISSING_THRESHOLD)
meta["dropped_rows_gt80pct_missing"] = int(drop_mask.sum())

# Keep the remaining rows and renumber the index from 0.
df = df[~drop_mask].reset_index(drop=True)
meta["shape_after_prefilter"] = list(df.shape)

#If a patient record is mostly empty (>80% missing in non-ID columns), it’s dropped (too incomplete to be useful).
#Logs how many rows were removed and the new shape.

In [32]:
# 8) Train/Test split BEFORE any fit (Rule 8)
# No target is defined here, so only features are split. If a target y is added
# later, split X and y together and consider stratify=y.
X_all = df.copy()  # full cleaned frame; ID columns are peeled off after the split

X_train, X_test = train_test_split(
    X_all,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    shuffle=SHUFFLE,
)
meta["train_shape_before_processing"] = list(X_train.shape)
meta["test_shape_before_processing"] = list(X_test.shape)

# Keep the ID columns aside (for later merging); they are not model features.
if present_id_cols:
    train_ids = X_train[present_id_cols].copy()
    test_ids = X_test[present_id_cols].copy()
else:
    # No ID columns: column-less frames that still carry each split's row index.
    train_ids = pd.DataFrame(index=X_train.index)
    test_ids = pd.DataFrame(index=X_test.index)

# Strip the ID columns from both feature frames (avoid leakage).
X_train = X_train.drop(columns=present_id_cols, errors="ignore")
X_test = X_test.drop(columns=present_id_cols, errors="ignore")

# Split the dataset into train/test before imputing, scaling, or encoding to prevent data leakage.
# ID columns are saved separately (so they are not used in training) but kept for later merging with other datasets.

In [33]:
# 9) Feature typing (Rule 7)
# Column types are decided from the TRAIN split only, so the test split is never inspected.
num_all = X_train.select_dtypes(include=["number"]).columns.tolist()
cat_text = X_train.select_dtypes(exclude=["number"]).columns.tolist()

# A numeric column is treated as categorical when it is explicitly listed in
# LIKELY_NUMERIC_CATEGORICAL, or when it has few distinct non-null values:
# at most 20, or at most 1% of the number of training rows.
row_share_cutoff = 0.01 * len(X_train)
numeric_categorical, numeric_continuous = [], []
for c in num_all:
    distinct = X_train[c].nunique(dropna=True)
    is_code_like = (
        c in LIKELY_NUMERIC_CATEGORICAL
        or distinct <= 20
        or distinct <= row_share_cutoff
    )
    # code-like → one-hot encode; otherwise → impute + MinMax scale
    (numeric_categorical if is_code_like else numeric_continuous).append(c)

meta["numeric_continuous_cols"] = numeric_continuous
meta["numeric_categorical_cols"] = numeric_categorical
meta["categorical_text_cols"]    = cat_text

#Split numeric columns into:
#1 Continuous numeric (real-valued measurements → scale)
#2 Numeric categorical (code-like → one-hot encode)
#All string columns are categorical text.
#Logs the classification.

In [34]:
# 10) Build preprocessors (Rules 9, 10, 11)
def _mode_impute_then_onehot():
    """Return a fresh pipeline: most-frequent imputation, then dense one-hot encoding.

    Categories not seen during fit are ignored at transform time
    (handle_unknown="ignore").
    """
    return Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe",     OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])


# Numeric continuous: median impute → MinMax scale (default range [0, 1])
num_cont_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler",  MinMaxScaler()),
])

# Numeric categorical (codes/flags) and text categorical share the same recipe,
# but each gets its own independent pipeline instance.
num_cat_pipe = _mode_impute_then_onehot()
txt_cat_pipe = _mode_impute_then_onehot()

# Route each column group to its pipeline; columns not listed are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_cont", num_cont_pipe, numeric_continuous),
        ("num_cat",  num_cat_pipe,  numeric_categorical),
        ("txt_cat",  txt_cat_pipe,  cat_text),
    ],
    remainder="drop",
)

# Fit on the training split ONLY, then apply the fitted transform to the test split.
Xtr = preprocessor.fit_transform(X_train)   # learns medians, mins/maxes, categories
Xte = preprocessor.transform(X_test)        # reuses the parameters learned on train

#Continuous numeric: Median imputation → Min–Max scaling (0–1).
#Numeric categorical & text categorical: Mode imputation → One-hot encoding.
#ColumnTransformer applies the right pipeline to each feature type.
#Fits only on training data, applies learned transformations to test.

In [35]:
# 11) Recover feature names for OHE blocks
def get_feature_names(prep, num_cont, num_cat, txt_cat):
    """Build the output column names of the fitted preprocessor, in output order.

    Args:
        prep: fitted transformer exposing ``named_transformers_`` with the
            entries "num_cat" and "txt_cat", each having an "ohe" step in
            ``named_steps``.
        num_cont: continuous numeric column names (passed through unchanged).
        num_cat: numeric-categorical column names (one-hot expanded).
        txt_cat: text-categorical column names (one-hot expanded).

    Returns:
        list: continuous names first, then the one-hot names of the
        numeric-categorical block, then those of the text-categorical block.
    """
    # Scaling does not rename columns, so continuous names carry over as-is.
    names = list(num_cont)
    # Each non-empty one-hot block contributes one name per (column, category).
    for branch, cols in (("num_cat", num_cat), ("txt_cat", txt_cat)):
        if not cols:
            continue  # an empty group has no entry to query
        encoder = prep.named_transformers_[branch].named_steps["ohe"]
        names.extend(encoder.get_feature_names_out(cols).tolist())
    return names

# Column names of the transformed matrices, in the preprocessor's output order.
feature_names = get_feature_names(
    preprocessor, numeric_continuous, numeric_categorical, cat_text
)

# Wrap the transformed arrays as DataFrames, reusing each split's original row index.
Xtr_df, Xte_df = (
    pd.DataFrame(matrix, columns=feature_names, index=source.index)
    for matrix, source in ((Xtr, X_train), (Xte, X_test))
)

#One-hot encoding creates new columns (one per category).
#This step recovers human-readable column names and puts the arrays back into DataFrames with original row indexing.

In [36]:
# 12) SAVE outputs + metadata (Rules 12–14)
# Processed feature matrices (row index not written).
for frame, target in ((Xtr_df, TRAIN_CSV), (Xte_df, TEST_CSV)):
    frame.to_csv(target, index=False)

# ID maps (for future merging with genetics/environment); written only when non-empty.
for id_frame, target in ((train_ids, TRAIN_IDS_CSV), (test_ids, TEST_IDS_CSV)):
    if not id_frame.empty:
        id_frame.to_csv(target, index=False)

# Final shapes plus per-column missing counts of the train features before imputation.
meta["final_train_shape"] = list(Xtr_df.shape)
meta["final_test_shape"] = list(Xte_df.shape)
meta["initial_missing_train_counts"] = X_train.isna().sum().to_dict()

# Persist the audit trail and the fitted preprocessor for reproducibility / inference.
with open(METADATA_JSON, "w") as f:
    json.dump(meta, f, indent=2)
dump(preprocessor, PREPROCESSOR_JOBLIB)

# Summary of where the outputs were written.
print("✅ Processing complete.")
print(f"Train features → {TRAIN_CSV}")
print(f"Test  features → {TEST_CSV}")
if present_id_cols:
    print(f"Train IDs     → {TRAIN_IDS_CSV}")
    print(f"Test  IDs     → {TEST_IDS_CSV}")
print(f"Metadata       → {METADATA_JSON}")
print(f"Preprocessor   → {PREPROCESSOR_JOBLIB}")

#Saves processed training and test features.
#Saves IDs separately for merging later.
#Saves metadata JSON (audit trail of transformations, dropped rows/cols, etc.).
#Saves the fitted preprocessor (so you can apply the exact same transformations to new incoming data at inference time).

✅ Processing complete.
Train features → clinical_preml_outputs\clinical_train_processed.csv
Test  features → clinical_preml_outputs\clinical_test_processed.csv
Train IDs     → clinical_preml_outputs\clinical_train_ids.csv
Test  IDs     → clinical_preml_outputs\clinical_test_ids.csv
Metadata       → clinical_preml_outputs\clinical_processing_metadata.json
Preprocessor   → clinical_preml_outputs\preprocessor.joblib
