# Omicron pre-processing

In [86]:
import pandas as pd
import numpy as np
from datetime import datetime
from xlsxwriter import Workbook
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the dataset
file_path = '/Covid19_omicron.xlsx'
dataset = pd.read_excel(file_path)

In [88]:
dataset.shape

(226, 151)

## INTUBATION

In [89]:
# Function to clean 'tINTUBATION' based on specific rules
def clean_tintubation(row):
    if row['INTUBATION'] == 0:
        return 0
    elif row['tINTUBATION'] == '4-?':
        return 4
    elif row['tINTUBATION'] == '++':
        return np.nan
    elif isinstance(row['tINTUBATION'], pd.Timestamp):
        if isinstance(row['INTUBATION DATE'], pd.Timestamp):
            return (row['tINTUBATION'] - row['INTUBATION DATE']).days
        else:
            return row['tINTUBATION']
    else:
        return row['tINTUBATION']

# Apply the cleaning function to the 'tINTUBATION' column
dataset['tINTUBATION_cleaned'] = dataset.apply(clean_tintubation, axis=1)

# Update specific 'tINTUBATION_cleaned' values based on manual instructions
dataset.loc[dataset['tINTUBATION_cleaned'] == pd.Timestamp('2021-09-06'), 'tINTUBATION_cleaned'] = 2
dataset.loc[dataset['tINTUBATION_cleaned'] == pd.Timestamp('2021-11-08'), 'tINTUBATION_cleaned'] = 1

dataset = dataset.drop(columns="tINTUBATION")

## CPAP

In [None]:
# Define the transformation function for 'tCPAP'
def transform_tcpap(value):
    if value == '3+15':
        return 18
    elif value == pd.Timestamp('2021-09-06'):
        return 6
    elif value == pd.Timestamp('2021-09-09'):
        return 1
    elif value == pd.Timestamp('2021-09-20'):
        return 1
    elif value == pd.Timestamp('2021-09-02'): 
        return 3
    return value 

# Apply the transformation to 'tCPAP'
dataset['tCPAP_numerical'] = dataset['tCPAP'].apply(transform_tcpap)

# Adjusting manually for the second condition of '2021-09-02 00:00:00' being 2
dataset.loc[dataset['tCPAP'] == pd.Timestamp('2021-09-02'), 'tCPAP_numerical'] = 2
dataset = dataset.drop(columns=['tCPAP', 'DATE CPAP'])


In [91]:
# Import necessary library for handling datetime and apply the transformations
import datetime

# Apply the specified transformations to the 'HIGH FLOW ' column
dataset['HIGH FLOW '] = dataset['HIGH FLOW '].replace({np.nan: 0, datetime.time(0, 0): 1})
# Apply additional transformation for datetime.datetime(1900, 1, 1, 0, 0) in 'HIGH FLOW ' column
dataset['HIGH FLOW '] = dataset['HIGH FLOW '].replace({datetime.datetime(1900, 1, 1, 0, 0): 1})

# Rename the 'HIGH FLOW ' column to 'HIGH FLOW' (removing the extra space)
dataset.rename(columns={'HIGH FLOW ': 'HIGH FLOW'}, inplace=True)

# Recheck the unique values in 'HIGH FLOW ' after this additional transformation
new_updated_unique_high_flow_values = dataset['HIGH FLOW'].unique()
new_updated_unique_high_flow_values



array([0, 1])

In [92]:
# Apply transformation: set 't HIGH FLOW' to 0 where corresponding 'HIGH FLOW' is 0 and 't HIGH FLOW' is NaN
dataset.loc[(dataset['HIGH FLOW'] == 0) & (dataset['t HIGH FLOW'].isna()), 't HIGH FLOW'] = 0

# Verify the changes by checking the updated unique values in 't HIGH FLOW'
updated_unique_t_high_flow_values = dataset['t HIGH FLOW'].unique()
updated_unique_t_high_flow_values


array([ 0.,  3.,  1.,  7.,  5., 10., nan,  4., 17.,  2.])

## Commorbilities

In [93]:
# Identifying unique values in the 'CM/other' column
unique_cm_other_values = dataset['CM/other'].dropna().unique()
unique_cm_other_values


array([0, '1(GLAUCOMA/BL BLINDNESS)', 'HETEROZYGOTE B-TH',
       'BLUNT HEAD TRAUMA', 'RENAL TRANSPLANT (2008)',
       'CAROTID STENOSIS, HYPERPARATHYROIDISM', 'BPH', 'TACHYCARDIA',
       'HBV CHRONIC/DEMENTIA EARLY ONSET', 'MDS 3xMONTH RX', '1/OBESITY',
       'ANEMIA/PREGNANT 34TH WEEK', 'ARRYTHMIA',
       'CATARACTS/REDUCED CRclearance', 'MDS/MM/HBV',
       'VERTIGO/DEMENTIA/PRESSURE ULCER', 'DEMENTIA', 'PSYCHOSIS',
       'PANIC ATTACK', 'ALLERGIC RHINITIS', 'MENINGIOMA/DEMENTIA',
       'AORTIC STENOSIS', 'ANEMIA', 'PFO', 'THYROID',
       'AFIB DURING HOSPITALISATION',
       'BCx ACINETOBACTER Baumani/ΥΠΕΡΟΥΡΙΧΑΙΜΙΑ/ΠΑΛΙΟ ΑΕΕ/ΡΑ',
       'RECENT 7-DAY HOSPITILIZATION FOR COVID (DISCHARGED 10 DAYS AGO)',
       'VIT D DEF/ANEMIA REQ TRANFUSION', 'MS', 'G6PD DEF',
       'CAROTID STENOSIS', 'UROSTOMY-2013(BLADDER CA)',
       'POLYMYALGIA RHEUMATIC/OSTEOPOROSIS', 'EPILEPSY',
       'RENAL TRANSPLANT RECIPIENT (2019)',
       'URETHRAL STENOSIS/MULTIPLE UTIS/GLAUCOMA/POLYP SUR

## Drop date, irrelevant, and high missing value columns 

In [94]:
# Columns to be dropped
columns_to_drop_by_name = ['DAY OF ADMISSION', 'DATE OF DISCHARGE OR EVENT', 'FA ΟΙΚΕΙ','ΑΝΤΙΒ ΑΓΩΓΗ ΝΟΣΟΚ', 'HF DATE', "Vax Date", "DATE TOCILIZUMAB", 'ΑΜ ΡΙΟΥ', 'PRL',	'TSH',	'T3 ', 'T4', 'PH7',	'PO22',	'PCO29',	'FIO2 eisagwgh11',	'P/F DAY 2',	'PH73'	'PO284',	'PCO295',	'FIO2 eisagwgh117',	'P/F RATIO DAY 3',	'P/F RATIO DAY 4',	'P/F RATIO DAY 5',	'P/F RATIO DAY 6',	'P/F RATIO DAY 7',	'P/F RATIO DAY 10', 'APACHE', 'PH73',	'PO284', 'INTUBATION DATE', 'DIABETES TENTO']
'DATE CPAP'
# Drop specified columns from the dataset
dataset = dataset.drop(columns=columns_to_drop_by_name, errors='ignore')  # 'errors='ignore'' to avoid error if column doesn't exist


In [95]:
# Extra columns to drop
columns_to_drop_extra = ['smoking', 'VACCINATED', 'Vax', 'DM YEARS', 'HBA1C', 'BARICITINIB DAYS']
dataset = dataset.drop(columns=columns_to_drop_extra, errors='ignore')

## Gender

In [96]:
# Correcting the gender column
#dataset['GENDER'] = dataset['GENDER'].replace({'Μ': 'M'})

# Map the 'GENDER' column to numerical values: 'F' to 1 and 'M' to 2
dataset['GENDER'] = dataset['GENDER'].map({'F': 1, 'M': 2})

## LOS

In [97]:
# FIxing some values
dataset['LOS'] = dataset['LOS'].apply(lambda x: None if x < 0 else x)


## DOS

In [98]:
# Replace 'ΝΟ' with NaN in the 'DAYS OF SYMPTOMS' column
dataset['DAYS OF SYMPTOMS'] = dataset['DAYS OF SYMPTOMS'].replace('ΝΟ', 0)

## Outcome

In [99]:
# Standardize the unique values for better consistency

# Function to standardize the outcomes
def standardize_outcome(outcome):
    if pd.isna(outcome):
        return 'UNKNOWN'  # Replace NaN with 'UNKNOWN'
    outcome = outcome.upper()  # Convert to uppercase
    outcome = outcome.strip()  # Remove leading and trailing whitespaces
    if outcome == 'ΑΜΑ':  # Handle special case
        outcome = 'AMA'
    if 'DISCHARGE' in outcome:  # Consolidate different forms of 'DISCHARGE'
        outcome = 'DISCHARGE'
    if 'TRANSFER' in outcome and 'ICU' in outcome:  # Consolidate ICU transfer cases
        outcome = 'TRANSFER TO ICU'
    elif 'TRANSFER' in outcome:  # Handle general transfer cases
        outcome = 'TRANSFER'
    return outcome

# Apply standardization to the 'outcome' column
dataset['outcome_standardized'] = dataset['outcome'].apply(standardize_outcome)
unique_outcomes_standardized = dataset['outcome_standardized'].unique()
dataset = dataset.drop(columns='outcome')
# Define the mapping from outcomes to integers
outcome_to_int = {
    'DISCHARGE': 1,
    'DEATH': 2,
    'TRANSFER TO ICU': 3
}

# Apply the mapping to create a new column 'outcome_int'
dataset['Outcome_numerical'] = dataset['outcome_standardized'].map(outcome_to_int)
dataset.drop(columns=["outcome_standardized"], inplace=True)


In [100]:
dataset['Outcome_numerical'].isnull().mean() * 100

7.52212389380531

## Infiltrate

In [101]:
dataset['INFILTRATE'].values

array(['L', nan, nan, 'BL', 'L>R', nan, nan, nan, 'BL', 'R>L', 'R', nan,
       'BL', nan, nan, 'L', 'BL', nan, nan, 'R>L', nan, nan, nan, 'L',
       nan, 'L>R', nan, nan, nan, nan, 'BL', nan, 'BL', nan, nan, 'BL',
       nan, nan, nan, nan, nan, 'BL', nan, nan, nan, 'R>L', 'ΒL', nan,
       'BL', 'BL', 'PNEUMOTHORAX R', nan, nan, 'ΟΚ', nan, 'BL', 'L>R',
       nan, 'R', 'R>L', 'BL', nan, nan, nan, nan, 'BL', nan, nan, nan,
       'BL', nan, nan, nan, 'ΒL', nan, 0, 'BL', 'BL', nan, nan, nan, 'ΟΚ',
       'BL', nan, nan, 'OK', 'OK', nan, 'OK', 'BL', 0, 'ΒL', 'BL', nan,
       'BL', nan, nan, 'BL', nan, 0, nan, nan, 'BL', 'BL', 'BL', nan,
       'OK', nan, nan, nan, nan, nan, 'OK', nan, nan, nan, nan, 0, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       'R', nan, 0, 0, 0, 0, nan, nan, 0, nan, 0, 0, nan, nan, nan, nan,
       0, nan, 'BL', 0, nan, 0, nan, 'BL', nan, nan, nan, nan, na

In [None]:
# Standardize the unique values for 'INFILTRATE' based on common interpretations and prepare for mapping

# Function to standardize the INFILTRATE values
def standardize_infiltrate(value):
    if pd.isna(value) or value == 0:
        return 'ok'  # Treat NaN and 0 as 'ok'
    value = str(value).upper().strip()
    if value in ['BL', 'BILATERAL', 'ΒL']:
        return 'bilateral'
    elif value in ['L>R', 'FLUID L']: 
        return 'l>r'
    elif value == 'L':
        return 'l'
    elif value in ['R>L', 'PNEUMOTHORAX R']: 
        return 'r>l'
    elif value == 'R':
        return 'r'
    elif value in ['OK', 'ΟΚ']:
        return 'ok'
    return 'unknown'

# Apply standardization to the 'INFILTRATE' column
dataset['infiltrate_standardized'] = dataset['INFILTRATE'].apply(standardize_infiltrate)

# Map standardized values to integers based on provided mapping
infiltrate_mapping = {
    'bilateral': 1,
    'r>l': 2,
    'l': 3,
    'ok': 4,
    'r': 5,
    'l>r': 6,
    'fluid l': 7
}
dataset['infiltrate_int'] = dataset['infiltrate_standardized'].map(infiltrate_mapping)
dataset[['INFILTRATE', 'infiltrate_standardized', 'infiltrate_int']].head()  
dataset = dataset.drop(columns='INFILTRATE')


## Others

In [None]:
# Function to standardize the 'OTHER' values based on specific categories of interest
def standardize_other_specific(value):
    if pd.isna(value) or value == 0:
        return 'None'
    value = str(value).upper().strip()  
    # Direct mapping based on specified interest categories
    if 'HEADACHE' in value:
        return 'HEADACHE'
    elif 'ANOSMIA' in value or 'ΑΝΟΣΜΙΑ' in value:
        return 'ANOSMIA'
    elif 'NAUSEA' in value:
        return 'NAUSEA'
    elif any(term in value for term in ['DIZZINESS', 'INSTABILITY', 'CONFUSION', 'SYNCOPE', 'ΛΙΠΟΘΥΜΙΑ', 'DESAT']):
        return 'DIZZINESS-INSTABILITY-CONFUSION'
    elif 'PLEURAL EFFUSION' in value:
        return 'PLEURAL EFFUSION'
    elif 'VOMIT' in value or 'N/V' in value:
        return 'VOMITING'
    else:
        return 'Other' 

# Apply the specific standardization to the 'OTHER' column
dataset['other_specific'] = dataset['OTHER'].apply(standardize_other_specific)

# Initialize columns for each category of interest
categories_of_interest = ['HEADACHE', 'ANOSMIA', 'NAUSEA', 'DIZZINESS-INSTABILITY-CONFUSION', 'PLEURAL EFFUSION', 'VOMITING']
for category in categories_of_interest:
    dataset[category] = dataset['other_specific'].apply(lambda x: 1 if x == category else 0)

# Drop the original 'OTHER' and 'other_specific' columns
dataset.drop(['OTHER', 'other_specific'], axis=1, inplace=True)

## Days of symptoms

In [None]:
# Standardize 'DAYS OF SYMPTOMS' column based on the instructions

# Function to clean and standardize 'DAYS OF SYMPTOMS'
def clean_days_of_symptoms(value):
    if isinstance(value, str): 
        if '15+' in value: 
            return 15
        else: 
            return pd.NA
    else: 
        return value

# Apply the cleaning function to the 'DAYS OF SYMPTOMS' column
dataset['DAYS OF SYMPTOMS'] = dataset['DAYS OF SYMPTOMS'].apply(clean_days_of_symptoms)

# Convert the column to integers, NA will be converted to NaN
dataset['DAYS OF SYMPTOMS'] = pd.to_numeric(dataset['DAYS OF SYMPTOMS'], downcast='integer')

# Check the unique values after standardization
unique_days_of_symptoms_cleaned = dataset['DAYS OF SYMPTOMS'].unique()
unique_days_of_symptoms_cleaned


array([ 3.,  2.,  4.,  5.,  9.,  8.,  7., 11.,  6., 10., 15., 13., 16.,
        1., 14., 22., nan, 12., 20.])

In [105]:
# Check the data types of the specified columns and if they contain only integers
columns_of_interest = ['FEVER', 'COUGH', 'FATIGUE', 'DIARRHEAS', 'DYSPNEA', 'URTI']
column_data_types = {column: dataset[column].dtype for column in columns_of_interest}
column_integrity = {column: dataset[column].apply(lambda x: isinstance(x, int) or pd.isna(x)).all() for column in columns_of_interest}

for column in columns_of_interest:
    # Convert the column to a nullable integer type
    dataset[column] = dataset[column].astype('Int64')



## BP

In [106]:
# Finding the position of the original "BP" column
bp_column_index = dataset.columns.get_loc('BP')

# Splitting the "BP" column into two separate columns for systolic and diastolic values
bp_split = dataset['BP'].str.split('/', expand=True)
dataset['Systolic_BP'] = pd.to_numeric(bp_split[0], errors='coerce')
dataset['Diastolic_BP'] = pd.to_numeric(bp_split[1], errors='coerce')

# Dropping the original "BP" column
dataset.drop('BP', axis=1, inplace=True)

# Inserting the new systolic and diastolic BP columns in the position where the original BP column was
dataset.insert(bp_column_index, 'Diastolic_BP', dataset.pop('Diastolic_BP'))
dataset.insert(bp_column_index, 'Systolic_BP', dataset.pop('Systolic_BP'))

## Med other

In [None]:
# Ensure NaN values remain unchanged and only convert valid non-zero values to 1
dataset['MED/other'] = dataset['MED/other'].apply(lambda x: 1 if x != 0 and not pd.isna(x) else x)

# Verify the changes
dataset['MED/other'].head(10) 


0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
7    1.0
8    0.0
9    0.0
Name: MED/other, dtype: float64

## CM/other

In [108]:
#Identifying unique values in the 'CM/other' column
unique_cm_other_values = dataset['CM/other'].dropna().unique()

# Filtering out combinatory values and creating a list of individual terms
individual_cm_other_values = [val for val in unique_cm_other_values]

individual_cm_other_values

[0,
 '1(GLAUCOMA/BL BLINDNESS)',
 'HETEROZYGOTE B-TH',
 'BLUNT HEAD TRAUMA',
 'RENAL TRANSPLANT (2008)',
 'CAROTID STENOSIS, HYPERPARATHYROIDISM',
 'BPH',
 'TACHYCARDIA',
 'HBV CHRONIC/DEMENTIA EARLY ONSET',
 'MDS 3xMONTH RX',
 '1/OBESITY',
 'ANEMIA/PREGNANT 34TH WEEK',
 'ARRYTHMIA',
 'CATARACTS/REDUCED CRclearance',
 'MDS/MM/HBV',
 'VERTIGO/DEMENTIA/PRESSURE ULCER',
 'DEMENTIA',
 'PSYCHOSIS',
 'PANIC ATTACK',
 'ALLERGIC RHINITIS',
 'MENINGIOMA/DEMENTIA',
 'AORTIC STENOSIS',
 'ANEMIA',
 'PFO',
 'THYROID',
 'AFIB DURING HOSPITALISATION',
 'BCx ACINETOBACTER Baumani/ΥΠΕΡΟΥΡΙΧΑΙΜΙΑ/ΠΑΛΙΟ ΑΕΕ/ΡΑ',
 'RECENT 7-DAY HOSPITILIZATION FOR COVID (DISCHARGED 10 DAYS AGO)',
 'VIT D DEF/ANEMIA REQ TRANFUSION',
 'MS',
 'G6PD DEF',
 'CAROTID STENOSIS',
 'UROSTOMY-2013(BLADDER CA)',
 'POLYMYALGIA RHEUMATIC/OSTEOPOROSIS',
 'EPILEPSY',
 'RENAL TRANSPLANT RECIPIENT (2019)',
 'URETHRAL STENOSIS/MULTIPLE UTIS/GLAUCOMA/POLYP SURGERY',
 'CA LUNG/PROSTATE/URIC ACID',
 'OBESITY',
 'NAFLD',
 'PAH',
 "PARKINSON'S"

In [109]:
# List of unique values provided
unique_values_list = [
    0,
    '1(GLAUCOMA/BL BLINDNESS)',
    'HETEROZYGOTE B-TH',
    'BLUNT HEAD TRAUMA',
    'RENAL TRANSPLANT (2008)',
    'CAROTID STENOSIS, HYPERPARATHYROIDISM',
    'BPH',
    'TACHYCARDIA',
    'HBV CHRONIC/DEMENTIA EARLY ONSET',
    'MDS 3xMONTH RX',
    '1/OBESITY',
    'ANEMIA/PREGNANT 34TH WEEK',
    'ARRYTHMIA',
    'CATARACTS/REDUCED CRclearance',
    'MDS/MM/HBV',
    'VERTIGO/DEMENTIA/PRESSURE ULCER',
    'DEMENTIA',
    'PSYCHOSIS',
    'PANIC ATTACK',
    'ALLERGIC RHINITIS',
    'MENINGIOMA/DEMENTIA',
    'AORTIC STENOSIS',
    'ANEMIA',
    'PFO',
    'THYROID',
    'AFIB DURING HOSPITALISATION',
    'BCx ACINETOBACTER Baumani/ΥΠΕΡΟΥΡΙΧΑΙΜΙΑ/ΠΑΛΙΟ ΑΕΕ/ΡΑ',
    'RECENT 7-DAY HOSPITILIZATION FOR COVID (DISCHARGED 10 DAYS AGO)',
    'VIT D DEF/ANEMIA REQ TRANFUSION',
    'MS',
    'G6PD DEF',
    'CAROTID STENOSIS',
    'UROSTOMY-2013(BLADDER CA)',
    'POLYMYALGIA RHEUMATIC/OSTEOPOROSIS',
    'EPILEPSY',
    'RENAL TRANSPLANT RECIPIENT (2019)',
    'URETHRAL STENOSIS/MULTIPLE UTIS/GLAUCOMA/POLYP SURGERY',
    'CA LUNG/PROSTATE/URIC ACID',
    'OBESITY',
    'NAFLD',
    'PAH',
    "PARKINSON'S",
    'KYPHOSIS',
    'obesity',
    'ULCERATIVE COLITIS',
    'RECENT 15DAY HOSPITALIZATION FOR PNA/AKI/SHOULDER INFECTIVOUS ARTHRITIS',
    'RA',
    'C-SECTION 24/11/21 (39+2 WEEEKS)',
    'MENTAL RETARDATION/EPILEPSY',
    'BMI',
    'THROMBOPHILIA/ΣΠΛΗΝΕΚΤΟΜΗ',
    'OSTEOPOROSIS/PERIPHERAL NEURITIS/EPILEPSY',
    'RA/UA',
    'CA+META',
    'OSTEOPOROSIS/PERIPHERAL ARTERY OCCLUSION/ COPD WITH HOME O2 ADMIN',
    'METALLIC AOV',
    'URIC ACID',
    'HASHIMOTO',
    'CABG/BPH',
    'GERD',
    'HBV POSITIVE',
    'RENAL TRANSPLANT RECIPIENT 2019',
    'PE 6YEARS AGO',
    'ESRD/PHTN/RECENT HOSPITALISATION/C.DIF?/LYMPHEDEMA',
    'ARRYTHMIA/BPH/DEMENTIA',
    'BPH/LOBECTOM RIGHT',
    'PREDIABETES',
    'GLAUCOMA/VERTIGO',
    'SLE/MONONEURITIS/HYPOTHYROIDISM',
    'COLON  CA+METS PULM/LIVER',
    'HETEROZYGOTE THALAS',
    'TRIPLE BYPASS',
    'HYPERURECEMIA',
    'ARRYTHMIA/HYPERURECEMIA',
    'echinoccocus hepatic 15 years',
    'ANKYL SPOND',
    'HIV',
    'BPH/VERTIGO',
    'URETEROSTOMY',
    'PCOS',
    'METALLIC MITRAL VALVE/ICD',
    'BIPOLAR',
    1,
    'SLEEP APNEA',
    'MORBIDLY OBESE',
    'OHS/OBESE'
]

# Function to standardize the unique values
def standardize_unique_values(val):
    if isinstance(val, int) or val.isdigit():  # Check if the value is numerical
        return val  # Keep numerical values as they are
    val = val.lower()  # Convert text to lowercase
    val = val.replace('1(', '').replace(')', '')  # Remove '1(' and ')' from conditions
    val = val.replace('bph', 'benign prostatic hyperplasia')
    val = val.replace('hbv', 'hepatitis b virus')
    val = val.replace('hiv', 'human immunodeficiency virus')
    val = val.replace('ms', 'multiple sclerosis')
    val = val.replace('pah', 'pulmonary arterial hypertension')
    val = val.replace('ra', 'rheumatoid arthritis')
    val = val.replace('b-th', 'beta-thalassemia')
    val = val.replace("parkinson's", 'parkinson disease')
    val = val.replace('g6pd def', 'g6pd deficiency')
    val = val.replace('nafld', 'non-alcoholic fatty liver disease')
    val = val.replace('obesity', 'obese').replace('morbidly obese', 'obese')  # Normalize obesity terms
    val = val.replace('arrythmia', 'arrhythmia').replace('hyperurecemia', 'hyperuricemia')  # Correct common misspellings
    val = val.replace('thyroid', 'thyroid disorder')
    val = val.replace('anemia', 'anemia')  # Redundant but for consistency with other conditions
    val = val.strip()
    return val

# Standardize each value in the list
standardized_unique_values = [standardize_unique_values(val) for val in unique_values_list]
standardized_unique_values


[0,
 'glaucoma/bl blindness',
 'heterozygote beta-thalassemia',
 'blunt head trheumatoid arthritisuma',
 'renal trheumatoid arthritisnsplant (2008',
 'carotid stenosis, hyperparheumatoid arthritisthyroid disorderism',
 'benign prostatic hyperplasia',
 'tachycardia',
 'hepatitis b virus chronic/dementia early onset',
 'mds 3xmonth rx',
 '1/obese',
 'anemia/pregnant 34th week',
 'arrhythmia',
 'catarheumatoid arthritiscts/reduced crclearheumatoid arthritisnce',
 'mds/mm/hepatitis b virus',
 'vertigo/dementia/pressure ulcer',
 'dementia',
 'psychosis',
 'panic attack',
 'allergic rhinitis',
 'meningioma/dementia',
 'aortic stenosis',
 'anemia',
 'pfo',
 'thyroid disorder',
 'afib during hospitalisation',
 'bcx acinetobacter baumani/υπερουριχαιμια/παλιο αεε/ρα',
 'recent 7-day hospitilization for covid (discharged 10 days ago',
 'vit d def/anemia req trheumatoid arthritisnfusion',
 'multiple sclerosis',
 'g6pd deficiency',
 'carotid stenosis',
 'urostomy-2013(bladder ca',
 'polymyalgia rhe

In [110]:
# Define categories for standardization
categories = {
    "Cardiovascular Conditions": ["tachycardia", "aortic stenosis", "arrhythmia", "pulmonary arterial hypertension", 
                                  "afib during hospitalisation", "triple bypass", "arrythmia/hyperuricemia"],
    "Neurological Conditions": ["vertigo", "epilepsy", "mental retardation", "multiple sclerosis", "parkinson disease",
                                "sle/mononeuritis/hypothyroid disorderism"],
    "Psychological Conditions": ["dementia", "psychosis", "panic attack", "bipolar", "sleep apnea"],
    "Renal and Urological Conditions": ["renal transplant", "urethral stenosis", "benign prostatic hyperplasia",
                                        "renal transplant recipient", "ureterostomy"],
    "Endocrine and Metabolic Conditions": ["thyroid disorder", "g6pd deficiency", "diabetes", "hashimoto", 
                                           "non-alcoholic fatty liver disease", "hyperuricemia"],
    "Gastrointestinal Conditions": ["ulcerative colitis", "gerd"],
    "Musculoskeletal Conditions": ["polymyalgia rheumatic", "osteoporosis", "ankylosing spondylitis", "kyphosis"],
    "Infectious Diseases": ["hepatitis b virus", "human immunodeficiency virus", "echinoccocus hepatic"],
    "Respiratory Conditions": ["recent hospitalisation for covid", "sleep apnea"],
    "Ophthalmological Conditions": ["glaucoma", "cataracts"],
    "Dermatological Conditions": [],  # No specific examples in this list
    "Hematological Conditions": ["anemia", "thalassemia", "sickle cell"],
    "Oncological Conditions": ["ca lung", "metastasis"],
    "Miscellaneous": ["allergic rhinitis", "pregnant", "obesity", "blunt head trauma", "carotid stenosis", 
                      "vitamin d deficiency", "pfo", "pe years ago", "esrd", "lymphedema", "prediabetes"]
}

# Function to categorize each condition
def categorize_conditions(condition):
    for category, conditions in categories.items():
        if any(cond in condition for cond in conditions):  # Check if the condition is part of the category
            return category
    return "Miscellaneous"  # Default category if no match is found

# Categorize each unique value
categorized_conditions = {cond: categorize_conditions(cond) for cond in standardized_unique_values if isinstance(cond, str)}

# Display the categorized conditions
categorized_conditions


{'glaucoma/bl blindness': 'Ophthalmological Conditions',
 'heterozygote beta-thalassemia': 'Hematological Conditions',
 'blunt head trheumatoid arthritisuma': 'Miscellaneous',
 'renal trheumatoid arthritisnsplant (2008': 'Miscellaneous',
 'carotid stenosis, hyperparheumatoid arthritisthyroid disorderism': 'Endocrine and Metabolic Conditions',
 'benign prostatic hyperplasia': 'Renal and Urological Conditions',
 'tachycardia': 'Cardiovascular Conditions',
 'hepatitis b virus chronic/dementia early onset': 'Psychological Conditions',
 'mds 3xmonth rx': 'Miscellaneous',
 '1/obese': 'Miscellaneous',
 'anemia/pregnant 34th week': 'Hematological Conditions',
 'arrhythmia': 'Cardiovascular Conditions',
 'catarheumatoid arthritiscts/reduced crclearheumatoid arthritisnce': 'Miscellaneous',
 'mds/mm/hepatitis b virus': 'Infectious Diseases',
 'vertigo/dementia/pressure ulcer': 'Neurological Conditions',
 'dementia': 'Psychological Conditions',
 'psychosis': 'Psychological Conditions',
 'panic att

In [None]:
# For conditions within each category, mark presence (1) or absence (0) for each patient
for index, row in dataset.iterrows():
    # Extract the condition from the 'CM/other' column for the current patient
    cm_other_condition = str(row['CM/other']).lower()  # Ensure matching is case-insensitive
    
    # Update category columns based on presence of any condition within the category for the patient
    for category, conditions in categories.items():
        column_name = category.replace(" ", "_").replace("/", "_").replace("-", "_")
        # Check if any of the conditions listed under the current category are present for the patient
        if any(cond in cm_other_condition for cond in conditions):
            dataset[column_name] = dataset['CM/other'].apply(lambda x: 1 if x == category else 0)

# Additionally, update for Miscellaneous conditions as individual columns
for misc_condition in categories['Miscellaneous']:
    column_name = misc_condition.replace(" ", "_").replace("/", "_").replace("-", "_")
    dataset[column_name] = dataset['CM/other'].apply(lambda x: 1 if misc_condition in str(x).lower() else 0)
    
dataset.drop(columns=["Miscellaneous", "CM/other"], inplace=True)

In [None]:
# 'CM/metabolic/other' exists in the dataset?
if 'CM/metabolic/other' in dataset.columns:
    # Convert all unique non-zero, non-NaN values to 1
    dataset['CM/metabolic/other'] = dataset['CM/metabolic/other'].apply(lambda x: 1 if x != 0 and not pd.isna(x) else x)

## Medications

In [113]:
# Re-defining the medication_columns list as provided
medication_columns = [
    'ANTIBIOTIC 1', 'ANTIBIOTIC 2', 'ANTIBIOTIC 3', 'ANTIBIOTIC 4', 'ANTIBIOTIC 5', 
    'ANTIBIOTIC 6', 'ANTIBIOTIC 7', 'ANTIBIOTIC 8', 'ANTIBIOTIC 9', 'ANTICOAGULANT', 
    'REMDESIVIR', 'TAMIFLU', 'KALETRA', 'CORTICOSTEROIDS', 'BARITICINIB', 'ANAKINRA', 
    'TOCILIZUMAB', 'PLAQUENIL', 'ANTIFUNGAL 1', 'ANTIFUNGAL 2', 'OTHER2'
]

# Identifying unique values in medication columns in the dataset
unique_medication_values = set()
for col in medication_columns:
    if col in dataset.columns:  # Check if the column exists
        unique_medication_values.update(dataset[col].dropna().unique())

# Dictionary for mapping non-standard names to standard names
name_mapping = {
    'zirhromax': 'zithromax',
    'collisitn': 'colistin',
    'collisitin': 'colistin',
    'colisitn': 'colistin',
    'noradren/solumedrol': 'solumedrol',
    'nivestin/zovirax/solucortef': 'zovirax/solucortef',
    'vonvon': 'voncon',
    'DEXATON': 'dexamethasone',  # Assuming Dexaton is dexamethasone
    'collistin': 'colistin',
    'ΑΛΛΕΡΓΙΑ': 'allergia'  # Translating from Greek
    # Add more mappings as needed
}

# Standardizing unique medication values
standardized_unique_medication_values = set([name_mapping.get(med, med) for med in unique_medication_values])

# Creating new columns for each standardized medication value in dataset
for med in standardized_unique_medication_values:
    column_name = "Medication_" + str(med).replace(" ", "_").replace("/", "_").replace("-", "_")
    dataset[column_name] = dataset[medication_columns].apply(lambda row: 1 if med in row.values else 0, axis=1)

# Dropping the original medication columns from dataset
dataset.drop(columns=medication_columns, inplace=True)

# Displaying the first few rows to verify the new columns
dataset[[col for col in dataset.columns if col.startswith("Medication_")]].head()


Unnamed: 0,Medication_0,Medication_1.0,Medication_CLEXANE_0.4,Medication_ROCEPHIN,"Medication_CLEXANE_0,4",Medication_CLEXANE_0.8,Medication_avelox,Medication_FLAGYL,Medication_CYMEVENE,Medication_ZOVIRAX,...,Medication_FONDAPARINUX_7.5,Medication_BRESEC,Medication_TAVANIC_,Medication_CUBICIN,Medication_AVELOX,Medication_XARELTO,Medication_enoxaparin_0.6,Medication_NISTAMYCINE,Medication_DAKTARIN_GEL,Medication_FONDAPARINUX_2.6
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [114]:
import pandas as pd
import re
from collections import defaultdict

# Redefine the refined_group_medication_columns function
def refined_group_medication_columns(columns):
    base_name_pattern = re.compile(r'(Medication_[a-zA-Z]+)')
    grouped_columns = defaultdict(list)
    
    for col in columns:
        if col.startswith('Medication_'):
            match = base_name_pattern.match(col)
            if match:
                base_name = match.group(1)  # Extract the base name of the medication
                grouped_columns[base_name].append(col)
    
    # Filter out groups that have only one variant
    grouped_columns = {base: variants for base, variants in grouped_columns.items() if len(variants) > 1}
    
    return grouped_columns

# Redefine the merge_existing_medication_columns function
def merge_existing_medication_columns(data, grouped_columns):
    for base_name, variants in grouped_columns.items():
        existing_variants = [col for col in variants if col in data.columns]
        if existing_variants:
            combined_column = data[existing_variants].max(axis=1)
            data = data.drop(columns=existing_variants, errors='ignore')
            clean_base_name = base_name.replace('Medication_', '')  # Remove prefix for readability
            data[f'Medication_{clean_base_name}'] = combined_column
    return data

# Apply the changes to the 'dataset'
grouped_medication_columns = refined_group_medication_columns(list(dataset.columns))
dataset = merge_existing_medication_columns(dataset.copy(), grouped_medication_columns)

# Display the first few rows of the merged dataset
print(dataset.head())


     AGE  GENDER  LOS  DAYS OF SYMPTOMS  FEVER  COUGH  FATIGUE  DIARRHEAS  \
0   71.0     2.0  6.0               3.0      1      0        0          0   
1  103.0     2.0  6.0               2.0      1      1        0          0   
2   45.0     1.0  6.0               4.0      1      0        1          0   
3   79.0     1.0  5.0               5.0      1      0        1          0   
4   76.0     2.0  6.0               9.0      1      0        1          0   

   DYSPNEA  URTI  ...  Medication_ZYVOXID  Medication_AMBISONE  \
0        0     0  ...                   0                    0   
1        0     0  ...                   0                    0   
2        0     0  ...                   0                    0   
3        0     0  ...                   0                    0   
4        0     0  ...                   1                    0   

   Medication_ENOXAPARIN  Medication_enoxaparin  Medication_PLAVIX  \
0                      1                      0                  0   


## Dropping Rows and columns

In [None]:
# Calculate the number of missing values for each row and sort them in descending order
missing_values_per_row_sorted = dataset.isnull().sum(axis=1).sort_values(ascending=False)

# Display the rows with the highest number of missing values
missing_values_per_row_sorted.head(50) 


225    88
224    88
223    88
219    87
218    87
152    85
203    76
62     62
140    47
211    43
119    40
185    34
48     34
159    34
142    33
67     30
193    30
72     30
208    29
209    29
213    29
180    28
210    27
34     26
222    26
207    26
194    25
206    25
220    25
221    25
24     24
27     24
187    20
133    20
104    19
184    19
196    19
85     19
178    18
115    18
189    18
135    18
105    17
183    17
21     17
6      17
110    16
52     16
113    16
195    15
dtype: int64

In [116]:
# Drop the top 8 rows with the most missing values based on the sorted index
dataset = dataset.drop(index=missing_values_per_row_sorted.head(8).index)

# Display the shape of the new dataset after dropping the rows
dataset


Unnamed: 0,AGE,GENDER,LOS,DAYS OF SYMPTOMS,FEVER,COUGH,FATIGUE,DIARRHEAS,DYSPNEA,URTI,...,Medication_ZYVOXID,Medication_AMBISONE,Medication_ENOXAPARIN,Medication_enoxaparin,Medication_PLAVIX,Medication_FONDAPARINUX,Medication_SINTROM,Medication_DAKTARIN,Medication_TAVANIC,Medication_CUBICIN
0,71.0,2.0,6.0,3.0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,103.0,2.0,6.0,2.0,1,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,45.0,1.0,6.0,4.0,1,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,79.0,1.0,5.0,5.0,1,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,76.0,2.0,6.0,9.0,1,0,1,0,0,0,...,1,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216,84.0,,7.0,2.0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
217,91.0,1.0,6.0,2.0,0,0,1,0,,,...,0,0,0,0,0,0,0,0,0,0
220,65.0,,2.0,5.0,1,1,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
221,67.0,,,10.0,1,0,0,0,1,,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Calculate the percentage of missing values for the specified columns
missing_values_percentage_specified = dataset.isnull().mean() * 100
missing_values_percentage_specified.sort_values(ascending=False).head(60)
# Identify columns with more than 31% missing values
columns_to_drop = missing_values_percentage_specified[missing_values_percentage_specified > 31].index.tolist()
#dataset = dataset.drop(columns= columns_to_drop)
# Display columns to be dropped
columns_to_drop

['SpO2%', 'rr', 'tCPAP_numerical']

In [118]:
dataset.reset_index(inplace=True)
dataset.rename(columns={'index': 'Index'}, inplace=True)
dataset['Index'] = dataset['Index'] + 1
# Rename the 'Index' column to 'Subject_ID' 
dataset = dataset.rename(columns={'Index': 'Subject_ID'})

In [119]:
# Count the number of missing values in the 'outcome' column
missing_outcome_count = dataset['Outcome_numerical'].isnull().sum()

# Identify the rows with missing values in the 'outcome' column
missing_outcome_rows = dataset[dataset['Outcome_numerical'].isnull()]

print("Number of missing values in 'Outcome_numerical':", missing_outcome_count)
print("Rows with missing 'outcome' values:\n", missing_outcome_rows)


Number of missing values in 'Outcome_numerical': 9
Rows with missing 'outcome' values:
      Subject_ID   AGE  GENDER  LOS  DAYS OF SYMPTOMS  FEVER  COUGH  FATIGUE  \
73           75  37.0     2.0  4.0               3.0      1      0        0   
112         114  27.0     1.0  3.0               5.0      1      0        1   
122         124  45.0     NaN  1.0               5.0      1      1        1   
167         170  69.0     1.0  5.0               1.0      0      1        0   
172         175  47.0     NaN  4.0               7.0      1      0        1   
175         178  53.0     1.0  9.0               5.0      1      1        0   
188         191  71.0     NaN  6.0               2.0      0      0        0   
216         222  67.0     NaN  NaN              10.0      1      0        0   
217         223  55.0     NaN  NaN               4.0      1      0        0   

     DIARRHEAS  DYSPNEA  ...  Medication_ZYVOXID  Medication_AMBISONE  \
73           0        0  ...                   0

In [120]:
# Adjusting the process to only replace missing values for columns that exist in the dataset
columns_to_replace_missing = ['tINTUBATION_cleaned', 'CPAP', 'tCPAP_numerical', 'HIGH FLOW', 't HIGH FLOW', 'CM/GI', 'CM/renal ', 'cm/autoimm', 'CM/neoplasm', 'CM/hepatobiliary', 'CM/CNS', 'CM/lipid', 'CM/asthma', 'CM/metabolic/other', 'CM/COPD', 'CM/renal', 'MED/bones', 'CM/CAD', 'CM/AF', 'CM/CHF', 'CM/DM', 'MED/CNS', 'MED/GI', 'MED/resp', 'MED/antineo', 'MED/other', 'MED/immuno', 'MED/anticoag', 'MED/endocr', 'CM/HBP', 'MED/DM', 'MED/antiPLTs', 'MED/statins', 'MED/cardio' ]
existing_columns = dataset.columns

# Replacing missing values with 0 in the existing specified columns
for col in columns_to_replace_missing:
    if col in existing_columns:
        dataset[col] = dataset[col].fillna(0)

In [121]:
# List of specified columns
specified_columns = [
    "AGE", "GENDER", "LOS", "DAYS OF SYMPTOMS", "INTUBATION", "tINTUBATION_cleaned", "CPAP", "tCPAP_numerical", 
    "HIGH FLOW", "t HIGH FLOW", "GSC", 'Systolic_BP', 'Diastolic_BP', "PULSE RATE", "rr", "TEMP", "PH", "PO2", "PCO2", 
    "HCO3", "FIO2 eisagwgh", "WHO score", "WBC", "LDH", "CPK", "CRP", "FERRITIN", "PERNEUTROPHILS", 
    "PERLYMPHOCYTES", "ABSLYMPHOCYTES", "Hb", "PLT", "INR", "APTT", "FIBRINOGEN", "D-DIMERS", 
    "K", "Na", "Glu", "UREA", "Cr", "BIL", "DIRECT BIL", "SGOT", "SGPT", "TnI", "CHOL", "TGL"
]

# Calculate the percentage of missing values for the specified columns
missing_values_percentage_specified = dataset[specified_columns].isnull().mean() * 100
missing_values_percentage_specified.sort_values(ascending=False).head(30)

rr                91.284404
GENDER            24.770642
D-DIMERS          16.513761
TnI               16.055046
FIBRINOGEN        15.596330
TGL               12.844037
CHOL              12.385321
DIRECT BIL        12.385321
BIL               10.550459
TEMP              10.091743
FERRITIN          10.091743
APTT               8.256881
INR                7.798165
HCO3               7.798165
Diastolic_BP       5.963303
PCO2               5.963303
Systolic_BP        5.963303
PULSE RATE         5.504587
PH                 3.669725
LDH                3.211009
FIO2 eisagwgh      2.752294
SGOT               2.293578
K                  2.293578
CPK                2.293578
PO2                2.293578
SGPT               1.834862
Cr                 1.834862
UREA               1.834862
Glu                1.834862
ABSLYMPHOCYTES     1.834862
dtype: float64

In [122]:
# Dropping the columns 'CHOL', 'TGL', and 'rr'
dataset = dataset.drop(columns=['CHOL', 'TGL', 'rr', 'SpO2%', 'P/F ratio', 'PSI', "infiltrate_standardized", "RDW", "P/F ratio", "Medication_0", "Medication_1.0", "Medication_Q", "SOFA", "Medication_1(1_DAY)", "Medication_1(2DAYS)"])

In [123]:
# Define the non-numerical values to be replaced
non_numerical_values_details = {
    'GSC': ['TRANSFER FROM SURGERY'],
    'PULSE RATE': ['92`'],
    'WBC': [pd.to_datetime('1900-01-04 09:36')],
    'INR': ['ΑΠΡΟΣΔΙΟΡΙΣΤΟ'],
    'D-DIMERS': ['-'],
    'tINTUBATION_cleaned': ['?']
}

# Function to replace specific non-numerical values with NaN
def replace_non_numerical_with_nan(df, non_numerical_details):
    for column, values in non_numerical_details.items():
        for value in values:
            df[column] = df[column].replace(value, pd.NA)
    return df

# Replace non-numerical values with NaN in the dataframe
dataset = replace_non_numerical_with_nan(dataset, non_numerical_values_details)


In [124]:
# Replace the datetime values with NaN in the WBC column
dataset['WBC'] = pd.to_numeric(dataset['WBC'], errors='coerce')

In [125]:
missing_values_percentage_specified = dataset.isnull().mean() * 100
missing_values_percentage_specified.sort_values(ascending=False).head(30)

GENDER               24.770642
D-DIMERS             16.972477
TnI                  16.055046
FIBRINOGEN           15.596330
DIRECT BIL           12.385321
BIL                  10.550459
FERRITIN             10.091743
TEMP                 10.091743
INR                   8.256881
APTT                  8.256881
HCO3                  7.798165
URTI                  7.798165
DYSPNEA               6.422018
Systolic_BP           5.963303
PCO2                  5.963303
DIARRHEAS             5.963303
PULSE RATE            5.963303
Diastolic_BP          5.963303
Outcome_numerical     4.128440
PH                    3.669725
PO2/FIO2              3.669725
FATIGUE               3.669725
COUGH                 3.211009
CCI                   3.211009
LDH                   3.211009
FIO2 eisagwgh         2.752294
SGOT                  2.293578
FEVER                 2.293578
PO2                   2.293578
K                     2.293578
dtype: float64

In [126]:
dataset.columns.values

array(['Subject_ID', 'AGE', 'GENDER', 'LOS', 'DAYS OF SYMPTOMS', 'FEVER',
       'COUGH', 'FATIGUE', 'DIARRHEAS', 'DYSPNEA', 'URTI', 'INTUBATION',
       'CPAP', 'HIGH FLOW', 't HIGH FLOW', 'MED/cardio', 'MED/statins',
       'MED/antiPLTs', 'MED/anticoag', 'MED/resp', 'MED/endocr', 'MED/DM',
       'MED/CNS', 'MED/GI', 'MED/bones', 'MED/immuno', 'MED/antineo',
       'MED/other', 'CM/CHF', 'CM/CAD', 'CM/AF', 'CM/HBP', 'CM/asthma',
       'CM/COPD', 'CM/DM', 'CM/neoplasm', 'CM/CNS', 'CM/hepatobiliary',
       'CM/GI', 'CM/renal', 'cm/autoimm', 'CM/lipid',
       'CM/metabolic/other', 'GSC', 'Systolic_BP', 'Diastolic_BP',
       'PULSE RATE', 'TEMP', 'PH', 'PO2', 'PCO2', 'HCO3', 'FIO2 eisagwgh',
       'WHO score', 'WBC', 'PERNEUTROPHILS', 'PERLYMPHOCYTES',
       'ABSLYMPHOCYTES', 'Hb', 'PLT', 'INR', 'APTT', 'FIBRINOGEN',
       'D-DIMERS', 'K', 'Na', 'Glu', 'UREA', 'Cr', 'BIL', 'DIRECT BIL',
       'SGOT', 'SGPT', 'LDH', 'CPK', 'CRP', 'FERRITIN', 'TnI', 'PO2/FIO2',
       'qSOFA', 'CC