### This is the first experiment where we will be implementing the RNN

In [3]:
import pandas as pd
import numpy as np

In [9]:
xls = pd.ExcelFile('../Datasets/PhyAdRsnComorbMed.xlsx')
df1 = pd.read_excel(xls, 'Hospital-length-of-stay')
df = pd.read_excel(xls, 'Days-breakdown')

### Removing the patients that expired

In [10]:
df1 = df1[df1['did_the_patient_expire_in_hospital']=='No'][['parent_id','hospital_length_of_stay']]
df1

Unnamed: 0,parent_id,hospital_length_of_stay
1,2,5
2,3,7
5,6,32
6,7,4
7,8,10
...,...,...
502,516,13
504,519,9
505,521,7
506,522,5


### Merging on the basis of parent_id

### Handling null values

### Initially we will handle the null values for the columns which cant have null as the value for their respctive columns systolic_blood_pressure	,diastolic_blood_pressure, heart_rate, respiratory_rate, oxygen_saturation	and temperature
We will enter the mean for their respective ids

In [11]:
df['d_dimer'] = pd.to_numeric(df['d_dimer'], errors='coerce').astype('Float64')

In [12]:
def enter_mean_values(df,column_name):
    for index,value in df[['parent_id',column_name]].iterrows():
        if pd.isna(value[1]):
            mean = df[df['parent_id']==value[0]][column_name].mean()
            df.loc[index, column_name] = mean

In [13]:
# List of columns you want to apply the function to
variables = [
    'systolic_blood_pressure', 'diastolic_blood_pressure', 'heart_rate', 'respiratory_rate', 
    'oxygen_saturation', 'temperature', 'highest_mean_arterial_pressure', 'lowest_mean_arterial_pressure', 
    'highest_heart_rate', 'lowest_heart_rate', 'highest_creatinine', 'lowest_urine_output', 
    'highest_gcs', 'highest_ph', 'lowest_ph', 'fluid_balance', 'wbc', 'rbc', 'hemoglobin', 
    'hematocrit', 'platelet_count', 'aptt_aptr', 'pt', 'alt', 'ast', 'mch', 'mcv', 'mchc', 'rdw', 
    'serum_creatinine', 'sodium', 'potassium', 'total_serum_bilirubin', 'lactate', 'pao2', 'pao2_fio2', 
    'ph', 'high_sensitivity_cardiac_troponin', 'esr', 'inr', 'ferritin', 'd_dimer', 'crp', 'hs_crp'
]



# Loop through the columns in the variables list and check for the error
for column in variables:
    try:
        # Try to apply the mean filling logic for each column
        enter_mean_values(df, column)
    except TypeError as e:
        # Catch the TypeError and print which column caused the error
        print(f"Error with column '{column}': {e}")
    except Exception as e:
        # Catch any other exceptions and print them
        print(f"An unexpected error occurred with column '{column}': {e}")

### Now we will apply binary encoding to the columns having yes and no values
- We applied binary encoding as 0 means no and 1 means yes
- We did this because Yes and no does not have any numerical values where yes being greater than no.
- If there was ordinal relationship between the two we could have used label encoding. Example ranks.

In [14]:
## function to apply binary encoding

def apply_binary_encoding(df,column):
    df[column]=df[column].map({'Yes':1,'No':0})

In [15]:
## applying binary encoding to the columns

columns_binary = [
    'intubated', 'cardiac_arrest', 'arrested_time', 'major_cardiac_events', 
    'clinically_diagnosed_infections', 'mechanical_ventilation', 'antiarrhythmic_therapies', 
    'renal_replacement_therapy_dialysis', 'cardiovascular_mechanical_support', 'echocardiogram', 
    'chest_x_ray', 'chest_ct', 'head_ct', 'antimicrobial', 'anticoagulation', 'steroid'
]
for columns in columns_binary:
    apply_binary_encoding(df,columns)

In [16]:
df.head()

Unnamed: 0,id,parent_id,day,systolic_blood_pressure,diastolic_blood_pressure,heart_rate,respiratory_rate,oxygen_saturation,temperature,highest_mean_arterial_pressure,...,wall_motion_abnormality,chest_x_ray,cxr_findings,chest_ct,chest_ct_findings,head_ct,head_ct_findings,antimicrobial,anticoagulation,steroid
0,1,1,1,119.0,54.0,79.0,18.0,94.0,37.2,85.0,...,,0,"""[\""Unilateral Consolidation\""]""",0,"""[\""Bilateral consolidation/infiltration\""]""",0,"""[]""",1,1,0
1,2,1,2,133.0,64.0,73.0,18.0,98.0,37.1,85.0,...,,0,"""[]""",0,"""[]""",0,"""[]""",1,1,1
2,3,1,3,140.0,74.0,70.0,20.0,95.0,37.5,85.0,...,,0,"""[]""",0,"""[]""",0,"""[]""",1,1,1
3,4,1,4,154.0,78.0,77.0,18.0,95.0,37.1,85.0,...,,1,"""[\""Unilateral Consolidation\""]""",0,"""[]""",0,"""[]""",1,1,1
4,5,1,5,155.0,61.0,64.0,16.0,92.0,36.7,85.0,...,,0,"""[]""",0,"""[]""",0,"""[]""",1,1,1


### Data preprocessing the texts and applying one hot encoding
- There are columns such as cxr_findings which have values such as "[\"Unilateral Consolidation\"] for which we will first seperate the list of texts in the brackets and then apply one hot encoding.

In [17]:
## We will now address the columns 'cxr_findings', 'chest_ct_findings' and 'head_ct_findings'

## we will create different datasets for the above columns

df_cxr_findings = df[['id','cxr_findings']]

df_chest_ct_findings = df[['id','chest_ct_findings']]

df_head_ct_findings = df[['id','head_ct_findings']]

## We will use the same function like we used in the previous notebook

## defining a function to remove all the strings except a number, letter or a comma

def filter_characters(value):
    # Convert to string
    value = str(value)
    
    # Initialize an empty result string
    result = ''
    
    # Iterate through each character in the string
    for char in value:
        # Append if character is a letter, number, comma, or space
        if char.isalnum() or char == ',' or char.isspace():
            result += char
    
    return result


# Apply the function to the columns
df_cxr_findings['cxr_findings'] = df_cxr_findings['cxr_findings'].apply(filter_characters)

df_chest_ct_findings['chest_ct_findings'] = df_chest_ct_findings['chest_ct_findings'].apply(filter_characters)

df_head_ct_findings['head_ct_findings'] = df['head_ct_findings'].apply(filter_characters)

# Now we will apply one hot encoding to the the columns

## applying the function to split into lists

def one_hot_encode_comorbidities(df, column_name):
    # Split the specified column into lists of comorbidities
    df[column_name] = df[column_name].str.split(',')
    
     # Use MultiLabelBinarizer to one-hot encode the comorbidities
    from sklearn.preprocessing import MultiLabelBinarizer
    
    # Use MultiLabelBinarizer to one-hot encode the comorbidities
    mlb = MultiLabelBinarizer()
    one_hot_encoded = mlb.fit_transform(df[column_name])
    
    # Create a DataFrame from the one-hot encoded data
    one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=mlb.classes_)
    
    # Concatenate the original DataFrame with the one-hot encoded DataFrame
    result_df = pd.concat([df.drop(column_name, axis=1), one_hot_encoded_df], axis=1)
    
    result_df = result_df.drop(result_df.columns[1],axis=1)
    
    return result_df

# Apply the one-hot encoding function
df_cxr_findings = one_hot_encode_comorbidities(df_cxr_findings,'cxr_findings')
df_chest_ct_findings = one_hot_encode_comorbidities(df_chest_ct_findings,'chest_ct_findings')
df_head_ct_findings = one_hot_encode_comorbidities(df_head_ct_findings,'head_ct_findings')

# we will now merge all the dataframes based on id

df = df.merge(df_cxr_findings, on='id').merge(df_chest_ct_findings, on='id').merge(df_head_ct_findings, on='id')

# we will now drop the column named id, cxr_findings, chest_ct_findings and head_ct_findings

# Drop the specified columns
columns_to_drop = ['id', 'cxr_findings', 'chest_ct_findings', 'head_ct_findings']
df = df.drop(columns=columns_to_drop)

# final dataframe

df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cxr_findings['cxr_findings'] = df_cxr_findings['cxr_findings'].apply(filter_characters)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_chest_ct_findings['chest_ct_findings'] = df_chest_ct_findings['chest_ct_findings'].apply(filter_characters)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_

Unnamed: 0,parent_id,day,systolic_blood_pressure,diastolic_blood_pressure,heart_rate,respiratory_rate,oxygen_saturation,temperature,highest_mean_arterial_pressure,lowest_mean_arterial_pressure,...,Bilateral Ground Glass Opacities,Bilateral consolidationinfiltration,Emphysematous or Bronchiectasis changes,Emphysematous or Bronchiectatic changes,Pulmonary Embolism,Scarring or Fibrosis,Unilateral Ground Glass Opacities,Unilateral consolidationinfiltration,Subarachnoid Hemorrhage,Subdural Hemorrhage
0,1,1,119.0,54.0,79.0,18.0,94.0,37.2,85.0,85.0,...,0,1,0,0,0,0,0,0,0,0
1,1,2,133.0,64.0,73.0,18.0,98.0,37.1,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
2,1,3,140.0,74.0,70.0,20.0,95.0,37.5,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
3,1,4,154.0,78.0,77.0,18.0,95.0,37.1,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
4,1,5,155.0,61.0,64.0,16.0,92.0,36.7,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0


### Droping the columns ejection_fraction and wall_motion_abnormality as they have no values in the whole dataset

In [18]:
df = df.drop(['ejection_fraction','wall_motion_abnormality'],axis=1)

In [19]:
df

Unnamed: 0,parent_id,day,systolic_blood_pressure,diastolic_blood_pressure,heart_rate,respiratory_rate,oxygen_saturation,temperature,highest_mean_arterial_pressure,lowest_mean_arterial_pressure,...,Bilateral Ground Glass Opacities,Bilateral consolidationinfiltration,Emphysematous or Bronchiectasis changes,Emphysematous or Bronchiectatic changes,Pulmonary Embolism,Scarring or Fibrosis,Unilateral Ground Glass Opacities,Unilateral consolidationinfiltration,Subarachnoid Hemorrhage,Subdural Hemorrhage
0,1,1,119.0,54.0,79.0,18.0,94.0,37.2,85.0,85.0,...,0,1,0,0,0,0,0,0,0,0
1,1,2,133.0,64.0,73.0,18.0,98.0,37.1,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
2,1,3,140.0,74.0,70.0,20.0,95.0,37.5,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
3,1,4,154.0,78.0,77.0,18.0,95.0,37.1,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
4,1,5,155.0,61.0,64.0,16.0,92.0,36.7,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4059,522,4,98.0,73.0,82.0,24.0,95.0,36.1,,,...,0,0,0,0,0,0,0,0,0,0
4060,522,5,110.0,64.0,68.0,18.0,95.0,36.5,,,...,0,0,0,0,0,0,0,0,0,0
4061,522,6,112.2,68.0,74.4,20.0,95.2,36.4,,,...,0,0,0,0,0,0,0,0,0,0
4062,522,7,112.2,68.0,74.4,20.0,95.2,36.4,,,...,0,0,0,0,0,0,0,0,0,0


### Now we will add 0 values to the rows that have Nan

In [20]:
df = df.fillna(0)

In [21]:
df

Unnamed: 0,parent_id,day,systolic_blood_pressure,diastolic_blood_pressure,heart_rate,respiratory_rate,oxygen_saturation,temperature,highest_mean_arterial_pressure,lowest_mean_arterial_pressure,...,Bilateral Ground Glass Opacities,Bilateral consolidationinfiltration,Emphysematous or Bronchiectasis changes,Emphysematous or Bronchiectatic changes,Pulmonary Embolism,Scarring or Fibrosis,Unilateral Ground Glass Opacities,Unilateral consolidationinfiltration,Subarachnoid Hemorrhage,Subdural Hemorrhage
0,1,1,119.0,54.0,79.0,18.0,94.0,37.2,85.0,85.0,...,0,1,0,0,0,0,0,0,0,0
1,1,2,133.0,64.0,73.0,18.0,98.0,37.1,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
2,1,3,140.0,74.0,70.0,20.0,95.0,37.5,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
3,1,4,154.0,78.0,77.0,18.0,95.0,37.1,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
4,1,5,155.0,61.0,64.0,16.0,92.0,36.7,85.0,85.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4059,522,4,98.0,73.0,82.0,24.0,95.0,36.1,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4060,522,5,110.0,64.0,68.0,18.0,95.0,36.5,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4061,522,6,112.2,68.0,74.4,20.0,95.2,36.4,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4062,522,7,112.2,68.0,74.4,20.0,95.2,36.4,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


### Merging With 

In [22]:
# df.to_csv('pre.csv')

### Seggregating two dataframes having hospital lengths of stay until 14 and more than 14

In [23]:
df_14 = df[df['hospital_length_of_stay']<15]
df_over_14 = df[df['hospital_length_of_stay']>14]

KeyError: 'hospital_length_of_stay'

In [None]:
df_14

In [None]:
df_over_14

In [None]:
df_14.to_csv("df_under_14.csv")
df_over_14.to_csv("df_over_14.csv")

### Merging With HLOS