In [1]:
import pandas as pd
import numpy as np

In [3]:
# 1) LOAD DATA
# Location of the merged clinical CSV; edit this if the file lives elsewhere.
merged_c_file_path = "merged_clinical_data_V1.csv"  # change if needed
# Read the whole file into a DataFrame and report its (rows, columns) size
# so later cleaning steps can be compared against this baseline.
df = pd.read_csv(filepath_or_buffer=merged_c_file_path)
print(f"Initial shape: {df.shape}")

Initial shape: (8562, 28)


In [None]:
# CONFIG — ADJUST AS NEEDED
# Fraction of missing non-ID values per row used as the row-drop cut-off in step 3.
MISSING_THRESHOLD = 0.8  # 80%
# Inclusive (low, high) bounds per measurement column; step 6 removes rows whose
# non-missing value falls outside its bounds.
BIOLOGICAL_RANGES = {
    'systolic_bp': (40, 300),  # example range
    'diastolic_bp': (20, 200),
    'heart_rate': (30, 250),
    'bmi': (10, 80)
}
# Identifier columns: ignored when measuring row sparsity and checked for gaps in step 4.
ID_COLUMNS = ['patient_id', 'study_id']  # change to your ID columns
# Per-column lookup tables; keys are matched after trimming and lower-casing (step 7).
CATEGORICAL_STANDARDISATION = {
    'sex': {'male': 'M', 'm': 'M', 'female': 'F', 'f': 'F'},
    'smoking_status': {'yes': 'Yes', 'y': 'Yes', 'no': 'No', 'n': 'No'}
}
CATEGORICAL_COLUMNS = ['sex', 'smoking_status']  # columns to standardise
# Columns to scale in step 9. Two kinds of numeric column are deliberately left out:
#   * ID columns — identifiers are labels, not measurements, and must not be z-scored;
#   * entirely empty columns — they load as float NaN and are dropped in step 2,
#     so listing them here would make the later column selection raise KeyError.
NUMERIC_COLUMNS = [
    col
    for col in df.select_dtypes(include=[np.number]).columns
    if col not in ID_COLUMNS and df[col].notna().any()
]


In [None]:
# 2) DROP COLUMNS THAT ARE COMPLETELY EMPTY
# A column is removed only when every one of its values is missing.
df = df.dropna(axis="columns", how="all")

In [None]:
# 3) DROP ROWS WITH >80% MISSING CLINICAL MEASUREMENTS
# ID columns are left out of the calculation so that a populated identifier
# cannot make an otherwise empty record look more complete than it is.
non_id_cols = [col for col in df.columns if col not in ID_COLUMNS]
if non_id_cols:  # with no measurement columns the row mean is NaN and every row would be dropped
    row_missing_fraction = df[non_id_cols].isnull().mean(axis=1)
    # "<=" keeps rows sitting exactly on the threshold: only rows with strictly
    # MORE than MISSING_THRESHOLD missing are discarded, as the heading states.
    # .copy() detaches the result from the unfiltered frame so later column
    # assignments do not raise SettingWithCopyWarning.
    df = df[row_missing_fraction <= MISSING_THRESHOLD].copy()

In [None]:
# 4) FILL MISSING IDs IF POSSIBLE (here: leave as NaN, but log them)
# Nothing is filled in this step; it only reports how many identifiers are absent.
for id_col in ID_COLUMNS:
    if id_col not in df.columns:
        # The configured ID column may not exist in this dataset (or may have been
        # dropped in step 2 for being entirely empty); skip it instead of raising KeyError.
        continue
    missing_ids = df[id_col].isnull().sum()
    if missing_ids > 0:
        print(f"⚠️ {missing_ids} missing in {id_col}")

In [None]:
# 5) REMOVE EXACT DUPLICATES
# Rows are compared across all columns; the first occurrence of each is kept.
# Reassigning (rather than inplace=True) avoids SettingWithCopyWarning, because
# `df` at this point is the result of a boolean-mask filter.
df = df.drop_duplicates()

In [None]:
# 6) VALIDATE BIOLOGICAL RANGES
# For each configured measurement, keep rows whose value is missing or lies
# within the inclusive [low, high] bounds, and report how many rows were removed.
for col, bounds in BIOLOGICAL_RANGES.items():
    if col not in df.columns:
        continue
    low, high = bounds
    before = len(df)
    within_bounds = df[col].between(low, high)  # inclusive on both ends; NaN -> False
    df = df[within_bounds | df[col].isna()]
    after = len(df)
    print(f"{col}: removed {before - after} out-of-range rows")

In [None]:
# 7) STANDARDISE CATEGORICAL VARIABLES
# Work on an independent copy: `df` is the product of earlier row filters, and
# assigning columns on such a slice raises SettingWithCopyWarning.
df = df.copy()
for col, mapping in CATEGORICAL_STANDARDISATION.items():
    if col not in df.columns:
        continue
    original = df[col]
    # Trim surrounding whitespace. `.str` yields NaN for non-string entries,
    # so those are restored from the original values.
    trimmed = original.str.strip().fillna(original)
    # Look up the lower-cased value in the mapping. Values without a mapping fall
    # back to the TRIMMED text (not the raw text), so "Other " and "Other" do not
    # end up as two distinct categories when the column is one-hot encoded.
    df[col] = trimmed.str.lower().map(mapping).fillna(trimmed)

In [None]:
# 8) IMPUTE MISSING VALUES (TRAINING SET ONLY IN PRACTICE)
# Continuous columns are filled with their median, everything else with the most
# frequent value. The fill values are collected first and applied in one
# DataFrame.fillna call: the chained `df[col].fillna(..., inplace=True)` form is
# deprecated and has no effect when pandas Copy-on-Write is enabled.
fill_values = {}
for col in df.columns:
    if col in ID_COLUMNS:
        # Step 4 leaves missing identifiers as NaN; an invented median/mode ID
        # would attribute the record to the wrong patient or study.
        continue
    column = df[col]
    if not column.isnull().any():
        continue
    # Any numeric dtype counts as continuous (not only float64/int64); booleans
    # are treated as categorical.
    is_continuous = (
        pd.api.types.is_numeric_dtype(column)
        and not pd.api.types.is_bool_dtype(column)
    )
    if is_continuous:
        fill_value = column.median()
    else:
        modes = column.mode(dropna=True)
        # mode() is empty when the column has no non-missing values.
        fill_value = modes.iloc[0] if not modes.empty else np.nan
    if pd.notna(fill_value):  # nothing sensible to fill with for an all-NaN column
        fill_values[col] = fill_value

if fill_values:
    df = df.fillna(value=fill_values)

In [None]:
# 9) SCALE CONTINUOUS FEATURES
from sklearn.preprocessing import StandardScaler

# NUMERIC_COLUMNS is captured in the config cell before any cleaning, so it can
# name columns that have since been dropped (e.g. entirely empty ones removed in
# step 2) and numeric ID columns. Selecting a dropped column raises KeyError, and
# identifiers must not be z-scored, so both are filtered out here.
scale_columns = [
    col for col in NUMERIC_COLUMNS
    if col in df.columns and col not in ID_COLUMNS
]

scaler = StandardScaler()
if scale_columns and not df.empty:  # fit_transform rejects an empty selection
    # Copy first: `df` results from earlier row filters, and assigning columns on
    # such a slice raises SettingWithCopyWarning.
    df = df.copy()
    # Standardise each selected column to zero mean and unit variance.
    df[scale_columns] = scaler.fit_transform(df[scale_columns])

In [None]:
# 10) ONE-HOT ENCODE NOMINAL VARIABLES
# Encode only the configured categorical columns that actually exist in the
# frame; asking get_dummies for an absent column raises KeyError. This matches
# the `if col in df.columns` guards used in steps 6 and 7.
encode_columns = [col for col in CATEGORICAL_COLUMNS if col in df.columns]
if encode_columns:
    # drop_first=True omits the first level of each variable, leaving k-1
    # indicator columns for a variable with k levels.
    df = pd.get_dummies(df, columns=encode_columns, drop_first=True)

In [None]:
# 11) SAVE CLEANED DATA
# Destination for the cleaned table, written relative to the working directory.
output_file = "merged_clinical_data_cleaned.csv"
# index=False keeps the DataFrame's row index out of the CSV.
df.to_csv(path_or_buf=output_file, index=False)
# Confirm where the file went and report the final (rows, columns) size.
print(f"✅ Cleaned dataset saved to {output_file}")
print(f"Final shape: {df.shape}")