In [6]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# later cells can read and write files stored under MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
### Import libraries for data handling (pandas, numpy) and missing-value imputation (scikit-learn)
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [14]:
### Load the raw CSV from Google Drive and preview a small random sample of rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the CSV
# was written with its index included; consider `index_col=0` if nothing
# downstream relies on that column or on the default RangeIndex.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/capo_data.csv')
# Fixed seed so the preview is identical on every Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,case_id,dem_age,dem_sex,dem_pregnant,dem_trimester,exam_height,exam_weight,exam_hr,exam_rr,exam_sbp,...,day6test___wbc,day6test___oral,day7test___cough,day7test___afebrile,day7test___wbc,day7test___oral,over7daytest___cough,over7daytest___afebrile,over7daytest___wbc,over7daytest___oral
3680,4257,76.0,0.0,,,,,88.0,,,...,1,1,1,1,1,1,1,1,1,1
7739,8909,36.0,1.0,,,,,106.0,,140.0,...,0,0,0,0,0,0,0,0,0,0
5092,5888,60.0,0.0,0.0,,160.0,55.0,140.0,24.0,120.0,...,0,0,0,0,0,0,0,0,0,0
2627,2969,80.0,1.0,,,,,,,,...,1,0,1,1,1,0,1,1,1,1
1867,2109,19.0,1.0,,,,,,,,...,1,0,1,0,1,1,1,1,1,1


In [23]:
# Select the columns used later in the regression:
#   - dem_*  : patient demographics
#   - exam_* : physical-examination measurements
#   - cx_*   : chest x-ray findings (may correlate with severity and recovery time)
#   - day{1..7}test___{cough,afebrile,wbc,oral} : daily stability criteria,
#     required to calculate the Time to Clinical Stability (TCS)
demographic_cols = [col for col in df.columns if col.startswith("dem_")]
exam_cols = [col for col in df.columns if col.startswith("exam_")]
cx_cols = [col for col in df.columns if col.startswith("cx_")]
tcs_criteria_cols = [f"day{day}test___{crit}" for day in range(1, 8) for crit in ["cough", "afebrile", "wbc", "oral"]]

# Combine the necessary columns into a concise data frame.
# Take an explicit copy: the frame is assigned into below, and writing into a
# slice of `df` is a chained-assignment hazard (SettingWithCopyWarning).
selected_cols = demographic_cols + exam_cols + cx_cols + tcs_criteria_cols
df_selected = df[selected_cols].copy()

# Handle missing values.
# Keep only rows in which at least 80% of the selected columns are observed;
# records missing more than 20% of their values would be mostly imputed and
# could bias the model. `thresh` is the minimum number of non-null values per
# row and must be an integer, so the 80% cut-off is rounded up (this keeps
# exactly the same rows as comparing against the unrounded product).
min_non_null = int(np.ceil(len(df_selected.columns) * 0.8))  # Can adjust threshold as needed
df_selected = df_selected.dropna(thresh=min_non_null)

# Impute the remaining gaps in numeric columns with the column mean.
# Mean imputation avoids discarding further rows and leaves each column's mean
# unchanged, but it does shrink the column's variance.
# NOTE(review): 0/1 indicator columns (e.g. the day*test___* criteria) receive
# fractional values under mean imputation -- confirm this is acceptable for the
# downstream model, or impute those columns with the most frequent value.
num_cols = df_selected.select_dtypes(include=["number"]).columns
# SimpleImputer discards columns that have no observed values at fit time,
# which would make the assignment below fail with a shape mismatch; such
# columns are therefore left untouched (all-NaN).
num_cols = num_cols[df_selected[num_cols].notna().any().to_numpy()]
num_imputer = SimpleImputer(strategy="mean")
if len(num_cols) > 0:
    df_selected[num_cols] = num_imputer.fit_transform(df_selected[num_cols])

# Calculate Time to Clinical Stability (TCS)
#If a patient does not stabilize within 7 days, we cannot determine their exact TCS beyond this timeframe due to data limitations.
#Right-censoring (TCS = 8) ensures these cases are correctly handled in survival analysis.
def calculate_tcs(row, n_days=7, criteria=("cough", "afebrile", "wbc", "oral")):
    """Return the Time to Clinical Stability (TCS) for one patient record.

    TCS is the first day on which every stability criterion equals 1.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must support ``row.get(key, default)``. Criterion values are looked up
        under the keys ``"day{d}test___{criterion}"``.
    n_days : int, default 7
        Number of follow-up days to inspect (days 1..n_days).
    criteria : sequence of str, default ("cough", "afebrile", "wbc", "oral")
        Criterion suffixes that must all be met on the same day.

    Returns
    -------
    int
        The first day (1..n_days) on which all criteria are met, or
        ``n_days + 1`` as a right-censoring marker when stability is never
        observed within the follow-up window (8 with the defaults).

    Raises
    ------
    ValueError
        If ``criteria`` is empty; otherwise every record would trivially be
        reported as stable on day 1.
    """
    if len(criteria) == 0:
        raise ValueError("criteria must contain at least one criterion")

    for day in range(1, n_days + 1):
        # A missing key counts as "not met" (default 0); NaN also fails the
        # comparison because NaN == 1 is False.
        if all(row.get(f"day{day}test___{crit}", 0) == 1 for crit in criteria):
            return day  # First day when all criteria are met
    return n_days + 1  # Right-censored: stability not reached within n_days

# Apply the function to each row of the raw data. TCS is derived from the
# original (un-imputed) indicators so that mean-imputed values can never be
# mistaken for a criterion that was actually met.
df["TCS"] = df.apply(calculate_tcs, axis=1)

# Carry the outcome over to the cleaned frame. Without this step the saved
# file would not contain TCS at all, because it was only ever added to `df`.
# `df_selected` keeps the row labels of `df` (dropna preserves the index), so
# the values are aligned on index; `assign` returns a new frame rather than
# writing into the existing one.
df_selected = df_selected.assign(TCS=df.loc[df_selected.index, "TCS"])

# Save cleaned dataset
file_path = '/content/drive/MyDrive/Colab Notebooks/cleaned_capo_data.csv'  # Specify the desired file path
df_selected.to_csv(file_path, index=False)

print(f"Data preprocessing complete. Cleaned dataset saved as '{file_path}'.")

Data preprocessing complete. Cleaned dataset saved as '/content/drive/MyDrive/Colab Notebooks/cleaned_capo_data.csv'.
