In [6]:
import pandas as pd
import numpy as np
import os

# --- CONFIGURATION ---
# All locations are relative paths, so they resolve against the directory the
# notebook kernel was started in.
_DATA_ROOT = os.path.join("..", "data")
RAW_DATA_PATH = os.path.join(_DATA_ROOT, "raw", "Dengue-Dataset.csv")
PROCESSED_DATA_DIR = os.path.join(_DATA_ROOT, "processed")

# Cut-off applied to TotalPlateletCountcumm when deriving the binary
# Low_Platelet feature (strictly-below comparison).
PLATELET_THRESHOLD = 100_000

# Make sure the export directory exists; a no-op when it is already there.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

print("Configuration complete.")

Configuration complete.


In [7]:
# Load the raw dataset
try:
    raw_df = pd.read_csv(RAW_DATA_PATH)
except FileNotFoundError as exc:
    print(f"ERROR: Raw data not found at {RAW_DATA_PATH}")
    # Raise explicitly instead of `assert False`: assertions are stripped when
    # Python runs with -O, which would let the cell continue and fail later
    # with a confusing NameError on raw_df.
    raise FileNotFoundError(
        "Please place the raw dataset in the data/raw/ directory."
    ) from exc
print(f"Successfully loaded raw data. Shape: {raw_df.shape}")

# --- DATA CLEANING ---

# 1. Clean column names: strip surrounding whitespace, then remove every
#    character that is not a letter, digit or underscore (e.g. "Unnamed: 0"
#    becomes "Unnamed0"). Work on a copy so raw_df stays untouched.
clean_df = raw_df.copy()
clean_df.columns = clean_df.columns.str.strip().str.replace(
    "[^A-Za-z0-9_]+", "", regex=True
)

# 2. Drop the original, unnamed index column if it exists from a previous save.
#    This must happen BEFORE deduplication: a per-row index value makes every
#    row unique, so duplicates would otherwise never be detected.
if "Unnamed0" in clean_df.columns:
    clean_df = clean_df.drop(columns=["Unnamed0"])

# 3. Remove duplicate rows (the original row labels are kept, so the index may
#    contain gaps afterwards).
clean_df = clean_df.drop_duplicates()
print(f"Shape after dropping duplicates: {clean_df.shape}")

print("\n--- Cleaned Base Data ---")
display(clean_df.head())
clean_df.info()

Successfully loaded raw data. Shape: (1523, 19)
Shape after dropping duplicates: (1511, 19)

--- Cleaned Base Data ---


Unnamed: 0,Gender,Age,Hemoglobingdl,Neutrophils,Lymphocytes,Monocytes,Eosinophils,RBC,HCT,MCVfl,MCHpg,MCHCgdl,RDWCV,TotalPlateletCountcumm,MPVfl,PDW,PCT,TotalWBCcountcumm,Result
0,Male,21,14.8,48,47,3,2,5,48.0,96.0,29.6,30.8,11.6,112000,10.7,15.4,0.12,5100,positive
1,Male,30,15.0,47,49,6,3,5,49.8,96.1,28.4,29.5,11.8,96000,10.6,15.8,0.121,4500,positive
2,Male,51,16.3,41,48,4,5,5,50.1,93.5,31.3,32.7,13.5,184000,10.4,16.4,0.13,6000,negative
3,Female,26,12.3,46,49,7,5,5,44.0,90.0,30.5,30.5,14.7,167000,8.1,17.1,0.11,5000,negative
4,Male,35,16.1,45,46,4,4,5,50.53,91.0,29.12,29.2,15.2,155000,10.52,12.34,0.15,4600,negative


<class 'pandas.core.frame.DataFrame'>
Index: 1511 entries, 0 to 1522
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Gender                  1511 non-null   object 
 1   Age                     1511 non-null   int64  
 2   Hemoglobingdl           1511 non-null   float64
 3   Neutrophils             1511 non-null   int64  
 4   Lymphocytes             1511 non-null   int64  
 5   Monocytes               1511 non-null   int64  
 6   Eosinophils             1511 non-null   int64  
 7   RBC                     1511 non-null   int64  
 8   HCT                     1511 non-null   float64
 9   MCVfl                   1511 non-null   float64
 10  MCHpg                   1511 non-null   float64
 11  MCHCgdl                 1511 non-null   float64
 12  RDWCV                   1511 non-null   float64
 13  TotalPlateletCountcumm  1511 non-null   int64  
 14  MPVfl                   1511 non-null   float

In [8]:
# Fan the cleaned frame out into three separate copies, one per feature-set
# variant, so that engineering one variant cannot affect the others.
baseline_df, ratio_df, threshold_df = (clean_df.copy() for _ in range(3))

print("Created three independent dataframes: baseline_df, ratio_df, threshold_df")

Created three independent dataframes: baseline_df, ratio_df, threshold_df


In [9]:
# --- 1. Engineer Features for the 'ratio' DataFrame ---
print("\n--- Engineering 'ratio' dataset ---")
# A lymphocyte count of 0 would make the division produce inf; mask it to NaN
# so the undefined ratios can be handled explicitly below.
lymphocytes_nonzero = clean_df["Lymphocytes"].replace(0, np.nan)
# Build the frame from clean_df with .assign (no in-place mutation) so this
# cell gives the same result no matter how many times it is re-run.
# Undefined ratios are stored as 0. The fill is applied ONLY to the two new
# columns: a frame-wide fillna(0) would also silently overwrite genuine
# missing values in every other column.
ratio_df = clean_df.assign(
    NLR=(clean_df["Neutrophils"] / lymphocytes_nonzero).fillna(0),
    PLR=(clean_df["TotalPlateletCountcumm"] / lymphocytes_nonzero).fillna(0),
)

# --- 2. Engineer Features for the 'threshold' DataFrame ---
print("--- Engineering 'threshold' dataset ---")
# Create the binary 'Low_Platelet' feature, then REMOVE the original continuous
# feature. Deriving from clean_df keeps the cell re-runnable: dropping the
# column from threshold_df itself would raise a KeyError on a second run.
threshold_df = clean_df.assign(
    Low_Platelet=clean_df["TotalPlateletCountcumm"] < PLATELET_THRESHOLD
).drop(columns=["TotalPlateletCountcumm"])

print("\nFeature engineering complete.")


--- Engineering 'ratio' dataset ---
--- Engineering 'threshold' dataset ---

Feature engineering complete.


In [10]:
# Show the column layout of each variant so it can be eyeballed before export
print("\n--- Final Schemas ---")
print(f"Baseline columns: {baseline_df.columns.tolist()}")
print(f"Ratio columns:    {ratio_df.columns.tolist()}")
print(f"Threshold columns: {threshold_df.columns.tolist()}")

# Destination files, all inside the processed-data directory
baseline_out, ratio_out, threshold_out = (
    os.path.join(PROCESSED_DATA_DIR, file_name)
    for file_name in ("baseline.csv", "ratio.csv", "threshold.csv")
)

# Write each variant to disk without the row index
for frame, destination in (
    (baseline_df, baseline_out),
    (ratio_df, ratio_out),
    (threshold_df, threshold_out),
):
    frame.to_csv(destination, index=False)

print("\n--- Export Complete ---")
print(f"Baseline saved to: {baseline_out} (shape={baseline_df.shape})")
print(f"Ratio saved to:    {ratio_out} (shape={ratio_df.shape})")
print(f"Threshold saved to: {threshold_out} (shape={threshold_df.shape})")


--- Final Schemas ---
Baseline columns: ['Gender', 'Age', 'Hemoglobingdl', 'Neutrophils', 'Lymphocytes', 'Monocytes', 'Eosinophils', 'RBC', 'HCT', 'MCVfl', 'MCHpg', 'MCHCgdl', 'RDWCV', 'TotalPlateletCountcumm', 'MPVfl', 'PDW', 'PCT', 'TotalWBCcountcumm', 'Result']
Ratio columns:    ['Gender', 'Age', 'Hemoglobingdl', 'Neutrophils', 'Lymphocytes', 'Monocytes', 'Eosinophils', 'RBC', 'HCT', 'MCVfl', 'MCHpg', 'MCHCgdl', 'RDWCV', 'TotalPlateletCountcumm', 'MPVfl', 'PDW', 'PCT', 'TotalWBCcountcumm', 'Result', 'NLR', 'PLR']
Threshold columns: ['Gender', 'Age', 'Hemoglobingdl', 'Neutrophils', 'Lymphocytes', 'Monocytes', 'Eosinophils', 'RBC', 'HCT', 'MCVfl', 'MCHpg', 'MCHCgdl', 'RDWCV', 'MPVfl', 'PDW', 'PCT', 'TotalWBCcountcumm', 'Result', 'Low_Platelet']

--- Export Complete ---
Baseline saved to: ..\data\processed\baseline.csv (shape=(1511, 19))
Ratio saved to:    ..\data\processed\ratio.csv (shape=(1511, 21))
Threshold saved to: ..\data\processed\threshold.csv (shape=(1511, 19))
