In [6]:
import pandas as pd
import numpy as np
from datetime import datetime
from xlsxwriter import Workbook
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the source Excel workbook into the working DataFrame
file_path = '/Omicron_pre_processed2Vacc.xlsx'
dataset = pd.read_excel(io=file_path)

In [8]:
# Force WBC to a numeric dtype; any entry pd.to_numeric cannot parse
# (the stray datetime values mentioned for this column) is coerced to NaN
dataset['WBC'] = dataset['WBC'].pipe(pd.to_numeric, errors='coerce')

# Sanity check: gather the distinct values of every column still stored as object dtype,
# so any remaining non-numeric content is visible in the cell output
object_columns = dataset.select_dtypes(include='object').columns
unique_values = {name: dataset[name].unique() for name in object_columns}
unique_values

{}

# Missing Value Imputation


In [9]:
# Share of missing entries in each column, expressed as a percentage
missing_values_percentage = dataset.isna().mean().mul(100)

# Rank the columns from most to least missing and show the top 30
missing_values_percentage_sorted = missing_values_percentage.sort_values(ascending=False)
missing_values_percentage_sorted.head(30)

GENDER                   24.770642
D-DIMERS_cleaned         16.972477
TnI                      16.055046
FIBRINOGEN               15.596330
DIRECT BIL               12.385321
BIL                      10.550459
TEMP                     10.091743
FERRITIN                 10.091743
INR                       8.256881
APTT                      8.256881
URTIx                     7.798165
HCO3                      7.798165
DYSPNEAx                  6.422018
DIARRHEASx                5.963303
Diastolic_BP              5.963303
PULSE RATE                5.963303
PCO2                      5.963303
Systolic_BP               5.963303
Outcome_numerical         4.128440
VACCINATED                4.128440
PO2/FIO2                  3.669725
FATIGUEx                  3.669725
PH                        3.669725
LDH                       3.211009
CCI                       3.211009
COUGHx                    3.211009
FIO2 eisagwgh_cleaned     2.752294
SGOT                      2.293578
K                   

In [10]:
# Columns handled as discrete variables; every other column is treated as continuous.
# NOTE(review): columns that are not listed here but look categorical in the displayed data
# (e.g. VACCINATED, the Medication_* flags) are imputed by the regressor below — confirm this is intended.
discrete_columns = [
    'GENDER', 'INTUBATION', 'CPAP', 'HIGH FLOW', 'WHO score', "qSOFA", "Outcome_numerical",
    "FEVERx", "COUGHx", "FATIGUEx", "DIARRHEASx", "DYSPNEAx", "URTIx",
]
continuous_columns = dataset.columns[~dataset.columns.isin(discrete_columns)].tolist()

# Slice of the frame that goes through the regression-based imputer
continuous_data = dataset[continuous_columns]

# IterativeImputer driven by a small random-forest regressor; seeds are fixed for reproducibility
imputer = IterativeImputer(
    estimator=RandomForestRegressor(n_estimators=5, max_depth=None, random_state=42),
    max_iter=10,
    random_state=42,
)

# Fit on the continuous slice and fill its missing entries
data_imputed = imputer.fit_transform(continuous_data)

# Re-wrap the imputed values with the original column labels and row index
data_imputed_df = pd.DataFrame(data_imputed, columns=continuous_columns, index=dataset.index)

# Overwrite each continuous column of the working dataset with its imputed version
for column in data_imputed_df.columns:
    dataset[column] = data_imputed_df[column]

# Define the function for imputing discrete columns using RandomForestClassifier
def impute_numerical_discrete_rf(data, column, other_columns):
    """Fill missing values of an integer-coded column with RandomForestClassifier predictions.

    A classifier is trained on the rows where ``column`` is present, using
    ``other_columns`` as features, and its predictions are written into the
    rows where ``column`` is missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is modified in place.
    column : str
        Column whose missing entries are filled. Observed values are cast to
        int before being used as the training target.
    other_columns : list of str
        Predictor columns. ``column`` itself is ignored if it appears here,
        so the target can never be used to predict itself.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned so callers can reassign it.

    Raises
    ------
    ValueError
        If every value of ``column`` is missing, leaving nothing to train on.
    """
    # Compute the missing-value mask once and reuse it for the split and the write-back
    missing_mask = data[column].isnull()

    # Nothing to impute (also covers an empty frame): leave the data untouched
    if not missing_mask.any():
        return data

    # Fail with a clear message instead of an opaque error from fitting on zero samples
    if missing_mask.all():
        raise ValueError(
            f"Cannot impute column {column!r}: it has no observed values to train on."
        )

    # Guard against the target leaking into its own feature set
    predictors = [name for name in other_columns if name != column]

    # NOTE(review): predictor columns are assumed to be free of missing values — TODO confirm
    observed = data.loc[~missing_mask]
    X_train = observed[predictors]
    y_train = observed[column].astype('int')  # Ensure the target is integer
    X_test = data.loc[missing_mask, predictors]

    # Fixed seed keeps the imputed labels reproducible across runs
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # Write the predicted classes into the rows that were missing
    data.loc[missing_mask, column] = clf.predict(X_test)

    return data

# Predictor set for the classifiers: every column that is NOT in discrete_columns,
# so discrete columns are never used to predict one another
other_columns = dataset.columns[~dataset.columns.isin(discrete_columns)].tolist()

# Fill each discrete column in turn, carrying the updated frame forward
for col in discrete_columns:
    dataset = impute_numerical_discrete_rf(data=dataset, column=col, other_columns=other_columns)





In [11]:
# Re-check after imputation: share of missing entries in each column, as a percentage
missing_values_percentage = dataset.isna().mean().mul(100)

# Rank the columns from most to least missing and show the top 30
missing_values_percentage_sorted = missing_values_percentage.sort_values(ascending=False)
missing_values_percentage_sorted.head(30)

Subject_ID                         0.0
DIZZINESS-INSTABILITY-CONFUSION    0.0
lymphedema                         0.0
prediabetes                        0.0
Medication_meronem                 0.0
Medication_XARELTO                 0.0
Medication_avelox                  0.0
Medication_AMERONEM                0.0
Medication_CANCIDAS                0.0
Medication_APIXABAN_2.5            0.0
Medication_TAQZOCIN                0.0
Medication_zyvoxid                 0.0
Medication_briklin                 0.0
Medication_fungustatin             0.0
Medication_bactrimel               0.0
Medication_1X2                     0.0
Medication_dalacin                 0.0
Medication_garamycin               0.0
Medication_rocephin                0.0
Medication_vibramycin              0.0
Medication_tazocin                 0.0
esrd                               0.0
Pulmonary_Embolism                 0.0
pfo                                0.0
Oncological_Conditions             0.0
VOMITING                 

In [12]:
# Display the imputed dataset for a visual check (the notebook renders a truncated preview)
dataset

Unnamed: 0,Subject_ID,AGE,GENDER,LOS,DAYS OF SYMPTOMS,FEVERx,COUGHx,FATIGUEx,DIARRHEASx,DYSPNEAx,...,Medication_DAKTARIN,Medication_AMBISONE,Medication_enoxaparin.1,Medication_PLAVIX,Medication_CLEXANE,Medication_fondaparinux,Medication_SINTROM,Medication_tinzaparin,Medication_cubicin,Medication_zyvoxid.1
0,1.0,71.0,2.0,6.0,3.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,103.0,2.0,6.0,2.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,45.0,1.0,6.0,4.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,79.0,1.0,5.0,5.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.0,76.0,2.0,6.0,9.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,217.0,84.0,1.0,7.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214,218.0,91.0,1.0,6.0,2.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
215,221.0,65.0,1.0,2.0,5.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
216,222.0,67.0,2.0,6.4,10.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Save the imputed dataset to a new workbook, leaving the row index out of the file
file_path_dataset12 = '/Omicron_pre_processed3Vacc.xlsx'
dataset.to_excel(excel_writer=file_path_dataset12, index=False)