In [1]:
#libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [2]:

# Load the preprocessed csv file as the dataframe
df = pd.read_csv("preprocessed.csv") 


In [3]:

# 1. Handle Missing Values
# Fill missing values for numeric columns with the mean
df['age'].fillna(df['age'].mean(), inplace=True)
df['labresult'].fillna(df['labresult'].mean(), inplace=True)

# Fill missing values for categorical columns with the mode
categorical_cols = ['ethnicity', 'hospitaladmitsource', 'hospitaldischargelocation', 
                    'hospitaldischargestatus', 'icd9code', 'treatmentstring']
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['labresult'].fillna(df['labresult'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we ar

In [4]:

# 2. Create New Features
# (a) Age Groups:Groups patients into categories based on their age
df['age_group'] = pd.cut(df['age'], bins=[0, 18, 40, 65, 100], labels=['Child', 'Young Adult', 'Adult', 'Senior'])

# (b) Length of Diagnosis Description
df['admitdxtext_length'] = df['admitdxtext'].apply(len)

# (c) Binary Gender Encoding : Groups patients into categories based on their age
df['is_female'] = (df['gender'] == 'Female').astype(int)

# (d) Combine Diagnostic Information
df['diagnosis_combined'] = df['admitdxname'] + " | " + df['diagnosisstring']

# (e) Flag for Critical Discharge Status : Converts the hospitaldischargestatus column into a binary feature, where 1 represents a critical discharge (e.g., deceased, hospice, or severely ill).
#Why it’s useful: It simplifies discharge outcomes, making it easier to model predictions related to patient severity at discharge.
df['critical_discharge'] = df['hospitaldischargestatus'].apply(lambda x: 1 if x in ['Expired', 'Critical'] else 0)


In [5]:

# 3. Encode Categorical Features
# Use Label Encoding for categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Save the encoder for inverse transformations if needed


#Converts categorical features into numerical representations (either via one-hot encoding or integer encoding).

In [6]:

# 4. Drop Irrelevant Columns
# Drop columns that might not be useful for analysis or modeling
df.drop(['gender', 'admitdxtext'], axis=1, inplace=True)


In [7]:

# Display the final DataFrame
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   patientunitstayid          1000 non-null   int64   
 1   admitdxname                1000 non-null   object  
 2   age                        1000 non-null   float64 
 3   ethnicity                  1000 non-null   int32   
 4   hospitaladmitsource        1000 non-null   int32   
 5   hospitaldischargelocation  1000 non-null   int32   
 6   hospitaldischargestatus    1000 non-null   int32   
 7   labname                    1000 non-null   object  
 8   labresult                  1000 non-null   float64 
 9   diagnosisstring            998 non-null    object  
 10  icd9code                   1000 non-null   int32   
 11  treatmentstring            1000 non-null   int32   
 12  age_group                  1000 non-null   category
 13  admitdxtext_length         1000 no

In [8]:
#fill Na with UNKNOWN
df['diagnosisstring'].fillna('Unknown', inplace=True)
df['diagnosis_combined'].fillna('Unknown', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['diagnosisstring'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['diagnosis_combined'].fillna('Unknown', inplace=True)


In [9]:
df['age_group'] = df['age_group'].cat.codes


In [10]:
#dropping the original cols and combined to avoid adding unnecessary noise to the model
df.drop(['admitdxname', 'labname', 'diagnosis_combined'], axis=1, inplace=True)


In [11]:
print(df['critical_discharge'].value_counts(normalize=True))


critical_discharge
0    0.783
1    0.217
Name: proportion, dtype: float64


variable is imbalanced, with 78.3% of the data in class 0 and only 21.7% in class 1.
Some models have parameters to handle class imbalance by assigning higher weights to the minority class.

In [12]:
df.to_csv("feature_engineered.csv",index=False)