### This is the first experiment where we will be implementing the RNN

In [29]:
import pandas as pd
import numpy as np

In [30]:
# Open the workbook once, then read the two sheets used in this experiment.
xls = pd.ExcelFile('Datasets/PhyAdRsnComorbMed.xlsx')
df1 = pd.read_excel(xls, sheet_name='Hospital-length-of-stay')
df2 = pd.read_excel(xls, sheet_name='Days-breakdown')

### Removing the patients that expired

In [31]:
# Boolean mask of rows where the patient did not expire in hospital.
survived = df1['did_the_patient_expire_in_hospital'] == 'No'
# Keep only those rows, and only the join key plus the length of stay.
df1 = df1.loc[survived, ['parent_id', 'hospital_length_of_stay']]
df1

Unnamed: 0,parent_id,hospital_length_of_stay
1,2,5
2,3,7
5,6,32
6,7,4
7,8,10
...,...,...
502,516,13
504,519,9
505,521,7
506,522,5


### Merging on the basis of parent_id

In [32]:
# Inner join: only parent_id values present in both frames survive, and every
# df2 row gains the hospital_length_of_stay of its parent_id.
df = pd.merge(df2, df1, on='parent_id', how='inner')
df.head(n=20)

Unnamed: 0,id,parent_id,day,systolic_blood_pressure,diastolic_blood_pressure,heart_rate,respiratory_rate,oxygen_saturation,temperature,highest_mean_arterial_pressure,...,chest_x_ray,cxr_findings,chest_ct,chest_ct_findings,head_ct,head_ct_findings,antimicrobial,anticoagulation,steroid,hospital_length_of_stay
0,9,2,1,122.0,80.0,72.0,24.0,92.0,36.3,,...,No,"""[\""Bilateral Ground Glass\"",\""Bilateral Conso...",No,"""[]""",No,"""[]""",Yes,Yes,Yes,5
1,10,2,2,114.0,72.0,60.0,18.0,92.0,35.9,,...,No,"""[]""",No,"""[]""",No,"""[]""",Yes,Yes,Yes,5
2,11,2,3,117.0,73.0,62.0,16.0,95.0,36.0,,...,No,"""[]""",No,"""[]""",No,"""[]""",Yes,Yes,Yes,5
3,12,2,4,125.0,76.0,76.0,18.0,97.0,36.3,,...,No,"""[]""",No,"""[]""",No,"""[]""",Yes,Yes,Yes,5
4,13,2,5,133.0,85.0,73.0,20.0,94.0,36.7,,...,No,"""[]""",No,"""[]""",No,"""[]""",No,No,No,5
5,14,2,6,,,,,,,,...,No,"""[]""",No,"""[]""",No,"""[]""",No,No,No,5
6,15,2,7,,,,,,,,...,No,"""[]""",No,"""[]""",No,"""[]""",No,No,No,5
7,16,2,14,,,,,,,,...,No,"""[]""",No,"""[]""",No,"""[]""",No,No,No,5
8,17,3,1,112.0,69.0,66.0,18.0,96.0,36.5,,...,No,"""[]""",No,"""[]""",No,"""[]""",No,Yes,Yes,7
9,18,3,2,150.0,93.0,67.0,19.0,93.0,36.1,,...,No,"""[]""",No,"""[]""",No,"""[]""",No,Yes,Yes,7


### Handling null values

### Initially we will handle the null values for the columns that cannot have null as a value: systolic_blood_pressure, diastolic_blood_pressure, heart_rate, respiratory_rate, oxygen_saturation and temperature
We will fill each missing value with the mean of that column over the rows sharing the same parent_id

In [33]:
# Parse d_dimer as numbers; entries that cannot be parsed become missing
# (errors='coerce'). The result is stored with the nullable Float64 dtype.
d_dimer_numeric = pd.to_numeric(df['d_dimer'], errors='coerce')
df['d_dimer'] = d_dimer_numeric.astype('Float64')

In [34]:
def enter_mean_values(df, column_name):
    """Fill missing values of ``column_name`` with the per-patient mean, in place.

    For every ``parent_id`` that has at least one missing entry in
    ``column_name``, those entries are replaced by the mean of that
    ``parent_id``'s observed values. If a ``parent_id`` has no observed value
    at all, its entries are left missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'parent_id'`` column and ``column_name``.
        It is modified in place.
    column_name : str
        Name of the column to fill.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        Propagated from ``Series.mean`` when the column holds values that
        cannot be averaged (for example free text).
    """
    # Positional boolean masks: rows are selected by position, so the result
    # does not depend on the index labels being unique or 0..n-1.
    missing = df[column_name].isna().to_numpy()
    if not missing.any():
        return

    # The mean is computed once per affected parent_id rather than once per
    # missing row.
    for patient_id in df.loc[missing, 'parent_id'].dropna().unique():
        patient_rows = (df['parent_id'] == patient_id).to_numpy()
        # Series.mean skips missing entries by default.
        patient_mean = df.loc[patient_rows, column_name].mean()
        if pd.isna(patient_mean):
            # Nothing observed for this parent_id; leave the gaps as they are.
            continue
        df.loc[patient_rows & missing, column_name] = patient_mean

In [None]:
# Columns whose missing values are filled with the per-parent_id mean.
variables = [
    'systolic_blood_pressure',
    'diastolic_blood_pressure',
    'heart_rate',
    'respiratory_rate',
    'oxygen_saturation',
    'temperature',
    'highest_mean_arterial_pressure',
    'lowest_mean_arterial_pressure',
    'highest_heart_rate',
    'lowest_heart_rate',
    'highest_creatinine',
    'lowest_urine_output',
    'highest_gcs',
    'highest_ph',
    'lowest_ph',
    'fluid_balance',
    'wbc',
    'rbc',
    'hemoglobin',
    'hematocrit',
    'platelet_count',
    'aptt_aptr',
    'pt',
    'alt',
    'ast',
    'mch',
    'mcv',
    'mchc',
    'rdw',
    'serum_creatinine',
    'sodium',
    'potassium',
    'total_serum_bilirubin',
    'lactate',
    'pao2',
    'pao2_fio2',
    'ph',
    'high_sensitivity_cardiac_troponin',
    'esr',
    'inr',
    'ferritin',
    'd_dimer',
    'crp',
    'hs_crp',
]

# Fill every listed column. A failure in one column is reported and the loop
# moves on to the next one instead of stopping the cell.
for column in variables:
    try:
        enter_mean_values(df, column)
    except Exception as e:
        if isinstance(e, TypeError):
            # Report which column could not be averaged
            print(f"Error with column '{column}': {e}")
        else:
            # Any other failure is reported with a different message
            print(f"An unexpected error occurred with column '{column}': {e}")

### Now we will apply binary encoding to the columns having yes and no values
- We applied binary encoding as 0 means no and 1 means yes
- We did this because Yes and No have no numerical order; Yes is not "greater than" No.
- If there were an ordinal relationship between the two, we could have used label encoding (for example, for ranks).

In [None]:
## function to apply binary encoding

def apply_binary_encoding(df, column):
    """Encode a Yes/No column as 1/0, in place.

    ``'Yes'`` becomes 1 and ``'No'`` becomes 0. Values that already equal 1 or
    0 are kept, so running the encoding a second time (for example when the
    cell is re-executed) leaves the column unchanged instead of turning every
    entry into NaN. Any other value, including a missing one, becomes NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``column``; it is modified in place.
    column : str
        Name of the column to encode.

    Returns
    -------
    None
    """
    yes_no_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
    # A callable is used so unmatched values fall back to NaN explicitly.
    df[column] = df[column].map(
        lambda value: yes_no_codes.get(value, float('nan'))
    )

In [None]:
## applying binary encoding to the columns

# Columns that are passed through apply_binary_encoding.
# NOTE(review): 'arrested_time' sounds like it may hold a time rather than
# Yes/No values - confirm against the data before relying on its encoding.
columns_binary = [
    'intubated',
    'cardiac_arrest',
    'arrested_time',
    'major_cardiac_events',
    'clinically_diagnosed_infections',
    'mechanical_ventilation',
    'antiarrhythmic_therapies',
    'renal_replacement_therapy_dialysis',
    'cardiovascular_mechanical_support',
    'echocardiogram',
    'chest_x_ray',
    'chest_ct',
    'head_ct',
    'antimicrobial',
    'anticoagulation',
    'steroid',
]

# Encode each listed column in place on df.
for columns in columns_binary:
    apply_binary_encoding(df, columns)

In [None]:
# Preview the first rows of df; as the cell's last expression it is rendered as the output.
df.head()

### Data preprocessing the texts and applying one hot encoding
- There are columns such as cxr_findings which have values such as "[\"Unilateral Consolidation\"]". For these we will first separate the list of texts inside the brackets and then apply one-hot encoding.

In [None]:
## We will now address the columns 'cxr_findings', 'chest_ct_findings' and 'head_ct_findings'

## we will create different datasets for the above columns

# One small frame per findings column, keyed by id.
# .copy() makes each an independent frame: later column assignments on them
# no longer write into a slice of df (no SettingWithCopyWarning).
df_cxr_findings = df[['id', 'cxr_findings']].copy()

df_chest_ct_findings = df[['id', 'chest_ct_findings']].copy()

df_head_ct_findings = df[['id', 'head_ct_findings']].copy()

## We will use the same function like we used in the previous notebook

## defining a function to remove every character except letters, digits, commas and whitespace

def filter_characters(value):
    """Return ``str(value)`` with only letters, digits, commas and whitespace kept.

    The input is converted with ``str`` first, so non-string inputs are
    filtered through their string form (a float NaN, for instance, yields
    ``'nan'``). Brackets, quotes, backslashes and all other punctuation are
    removed.
    """
    text = str(value)
    kept = [ch for ch in text if ch == ',' or ch.isalnum() or ch.isspace()]
    return ''.join(kept)


# Apply the function to the columns
# Clean the text of each findings column.
# .assign builds a new frame instead of writing into a slice of df, which
# avoids SettingWithCopyWarning; each frame is cleaned from its own column.
df_cxr_findings = df_cxr_findings.assign(
    cxr_findings=df_cxr_findings['cxr_findings'].apply(filter_characters)
)

df_chest_ct_findings = df_chest_ct_findings.assign(
    chest_ct_findings=df_chest_ct_findings['chest_ct_findings'].apply(filter_characters)
)

df_head_ct_findings = df_head_ct_findings.assign(
    head_ct_findings=df_head_ct_findings['head_ct_findings'].apply(filter_characters)
)

# Now we will apply one hot encoding to the columns

## applying the function to split into lists

def one_hot_encode_comorbidities(df, column_name):
    """One-hot encode a column of comma-separated labels.

    Each distinct label found in ``column_name`` becomes a 0/1 indicator
    column (labels in sorted order). An empty entry produces no column of its
    own; such rows are simply 0 in every indicator. ``column_name`` itself is
    removed from the result; all other columns are kept.

    Differences from the previous implementation:

    * ``df`` is no longer modified (the column used to be overwritten with
      lists).
    * The indicator columns carry ``df``'s own index, so rows stay aligned
      even when the index is not 0..n-1.
    * Only the empty label is left out. Previously the second column of the
      result was dropped unconditionally, which removed a real label whenever
      no entry was empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name`` with string entries such as
        ``'A,B'`` or ``''``.
    column_name : str
        Column to expand.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``column_name``, followed by one indicator column per
        label.
    """
    # str.get_dummies splits on the separator, skips empty labels, sorts the
    # labels and keeps the original index.
    indicators = df[column_name].str.get_dummies(sep=',')

    remaining = df.drop(columns=column_name)
    return pd.concat([remaining, indicators], axis=1)

# Apply the one-hot encoding function
# Expand each findings frame into indicator columns (one call per source column).
df_cxr_findings = one_hot_encode_comorbidities(df_cxr_findings, 'cxr_findings')
df_chest_ct_findings = one_hot_encode_comorbidities(df_chest_ct_findings, 'chest_ct_findings')
df_head_ct_findings = one_hot_encode_comorbidities(df_head_ct_findings, 'head_ct_findings')

# Join the three encoded frames back onto df through the shared id column.
# NOTE(review): if two frames share a label name, merge will suffix the
# clashing columns with _x/_y - check the resulting column names.
df = (
    df
    .merge(df_cxr_findings, on='id')
    .merge(df_chest_ct_findings, on='id')
    .merge(df_head_ct_findings, on='id')
)

# Remove the join key and the raw findings text columns.
columns_to_drop = ['id', 'cxr_findings', 'chest_ct_findings', 'head_ct_findings']
df = df.drop(columns_to_drop, axis=1)

# Preview the resulting frame.
df.head()


### Dropping the columns ejection_fraction and wall_motion_abnormality as they have no values in the whole dataset

In [None]:
# Remove the two columns named in the heading above.
df = df.drop(columns=['ejection_fraction', 'wall_motion_abnormality'])

In [None]:
# Bare expression: the notebook renders df as this cell's output.
df

### Now we will replace the remaining NaN values with 0

In [None]:
# Every value that is still missing, in any column, becomes 0.
df = df.fillna(value=0)

In [None]:
# Bare expression: the notebook renders df as this cell's output.
df

In [None]:
# df.to_csv('pre.csv')

### Segregating the data into two dataframes: hospital length of stay up to 14, and more than 14

In [None]:
# Split on a single threshold (14) so the two subsets never overlap.
# The previous pair of conditions (< 15 and > 14) put a non-integer value
# such as 14.5 into both subsets; for whole numbers the result is unchanged.
df_14 = df[df['hospital_length_of_stay'] <= 14]
df_over_14 = df[df['hospital_length_of_stay'] > 14]

In [None]:
# Bare expression: the notebook renders the shorter-stay subset as this cell's output.
df_14

In [None]:
# Bare expression: the notebook renders the longer-stay subset as this cell's output.
df_over_14

In [28]:
# Write both subsets to CSV; the paths are relative, so the files land in the
# notebook's current working directory.
df_14.to_csv(path_or_buf="df_under_14.csv")
df_over_14.to_csv(path_or_buf="df_over_14.csv")