In [13]:
#connects google drive to google colab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
###importing libraries necessary for visualizations, and data imputation
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [15]:
###importing initial csv file containing the data and taking small sample of the data

# Original pathway /content/drive/MyDrive/Colab Notebooks/capo_data.csv
df=pd.read_csv('/content/drive/MyDrive/DS Capstone/Mini Projects/capo_data.csv')
df.sample(5)

Unnamed: 0,dem_age,dem_sex,dem_pregnant,dem_trimester,exam_height,exam_weight,exam_hr,exam_rr,exam_sbp,exam_dbp,...,lab_crp,lab_pct,lab_vitamind,lab_abg,lab_abgph,lab_abgpaco2,lab_abgpao2,lab_abgbicarb,lab_abgfio2,TCS
2430,40.0,0.0,0.026528,2.157895,17.0,53.0,90.0,23.671101,130.0,80.0,...,22.57,6.48213,34.64,1.0,7.434445,35.962724,67.068697,23.99043,24.278867,4
1141,81.0,0.0,0.026528,2.157895,160.0,50.0,85.0,26.0,130.0,80.0,...,58.0,6.48213,34.64,0.0,7.5,34.3,48.5,26.0,21.0,5
2679,50.0,1.0,0.026528,2.157895,175.0,75.0,80.0,16.0,130.0,70.0,...,286.0,0.2529,34.64,0.0,7.484,34.1,52.7,25.4,21.0,2
2058,60.0,1.0,0.026528,2.157895,184.0,80.0,140.0,23.0,130.0,80.0,...,88.5,6.48213,34.64,0.0,7.45,42.0,53.0,28.9,21.0,4
471,87.0,0.0,0.0,2.157895,158.603831,72.54643,104.0,22.0,100.0,60.0,...,7.4,6.48213,34.64,1.0,7.434445,36.4,102.0,16.9,24.278867,7


In [21]:
# Selects relevant columns for later use in regression
#The demographic and examination data provide patient-specific details useful for predictive modeling.
#The chest x-ray (cx) data is included as it might be correlated with disease severity and recovery time.
#The TCS criteria columns are essential to calculate the Time to Clinical Stability.
demographic_cols = [col for col in df.columns if col.startswith("dem_")]
exam_cols = [col for col in df.columns if col.startswith("exam_")]
cx_cols = [col for col in df.columns if col.startswith("cx_")]
tcs_criteria_cols = [f"day{day}test___{crit}" for day in range(1, 8) for crit in ["cough", "afebrile", "wbc", "oral"]]
lab_columns = ['lab_hematocrit', 'lab_hemoglobin', 'lab_wbc',
       'lab_bands', 'lab_platelets', 'lab_inr', 'lab_na', 'lab_k', 'lab_bun',
       'lab_creatinine', 'lab_bicarb', 'lab_glucose', 'lab_albumin', 'lab_ast',
       'lab_alt', 'lab_bilirubin', 'lab_trop1', 'lab_trop2', 'lab_trop3',
       'lab_ckmb1', 'lab_ckmb2', 'lab_ckmb3', 'lab_ldl', 'lab_hdl',
       'lab_cholesterol', 'lab_triglycerides', 'lab_lactate', 'lab_hga1c',
       'lab_ldh', 'lab_bnp', 'lab_crp', 'lab_pct', 'lab_vitamind', 'lab_abg',
       'lab_abgph', 'lab_abgpaco2', 'lab_abgpao2', 'lab_abgbicarb',
       'lab_abgfio2']

# Combines necessary columns into a concise data frame
selected_cols = demographic_cols + exam_cols + cx_cols + tcs_criteria_cols + lab_columns
df_selected = df[selected_cols]

# Handles missing values
# Drops rows if they have too many missing values (less than 80% of data in selected_cols)
# Dropped rows where more than 20% of values were missing to ensure data quality.
#If a patient record is missing too much data, it might reduce model accuracy and introduce bias.
#Threshold = 80% ensures we retain enough data while maintaining quality.

df_selected = df_selected.dropna(thresh=len(df_selected.columns) * 0.6)  # Gives good amount of regular and lab data

# Imputes missing values using the mean as imputation as all relevant columns are numerical
#Mean imputation preserves the distribution of numerical data.
#It prevents data loss compared to dropping rows entirely
num_cols = df_selected.select_dtypes(include=["number"]).columns
num_imputer = SimpleImputer(strategy="mean")
df_selected[num_cols] = num_imputer.fit_transform(df_selected[num_cols])

# Calculate Time to Clinical Stability (TCS)
#If a patient does not stabilize within 7 days, we cannot determine their exact TCS beyond this timeframe due to data limitations.
#Right-censoring (TCS = 8) ensures these cases are correctly handled in survival analysis.
def calculate_tcs(row):
    for day in range(1, 8):  # Days 1 to 7
        if all(row.get(f"day{day}test___{crit}", 0) == 1 for crit in ["cough", "afebrile", "wbc", "oral"]):
            return day  # Return the first day when all four criteria are met
    return 8  # Right-censored (TCS = 8 if stability not reached)

# Apply the function to each row
df_selected["TCS"] = df_selected.apply(calculate_tcs, axis=1)

# Save cleaned dataset
# original file path /content/drive/MyDrive/Colab Notebooks/cleaned_capo_data.csv

file_path = '/content/drive/MyDrive/DS Capstone/Mini Projects/capo_data.csv'  # Specify the desired file path
df_selected.to_csv(file_path, index=False)

print(f"Data preprocessing complete. Cleaned dataset saved as '{file_path}'.")

Data preprocessing complete. Cleaned dataset saved as '/content/drive/MyDrive/DS Capstone/Mini Projects/capo_data.csv'.
