In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder

Importing Data

imported the "healthcare_datasets.csv" file into a dataframe using the pd.read_csv() method

In [13]:
df = pd.read_csv('healthcare_datasets.csv')
df

Unnamed: 0,Patient_ID,Age,Gender,Blood_Pressure,Cholesterol,Condition
0,0f5343ac-b858-4726-b8e6-d0e5a7cd73ff,84,Male,95,157,Hypertension
1,ff6cbae7-14d6-486d-a03b-c174dce2b9c0,40,Male,163,285,
2,cc32f20a-c701-4398-85cc-30cfe5e7a62a,72,Male,NAN,168,
3,93c316b3-c2ff-42a9-9895-7ce3e3d6cd8a,39,Male,118,288,Diabetes
4,044838d9-29a8-46da-8d74-18705c8df5d7,28,Female,128,229,Diabetes
...,...,...,...,...,...,...
3016,6516343d-4f84-42cc-9805-307da385d003,73,Female,96,273,Hypertension
3017,7ef7b2c4-4cbd-44de-887d-aa07548e7e18,30,Female,98,168,Hypertension
3018,e060b7f8-a5ff-44ab-8caf-cd909f08c724,52,Male,94,247,Hypertension
3019,ed118b6f-bf81-4af9-99ee-98eafb3c52e1,21,Male,169,166,Diabetes


Removal of Duplicates

Used the drop_duplicates() method on to the dataframe which removes all the duplicate values in the dataset

In [14]:
df = df.drop_duplicates()
df

Unnamed: 0,Patient_ID,Age,Gender,Blood_Pressure,Cholesterol,Condition
0,0f5343ac-b858-4726-b8e6-d0e5a7cd73ff,84,Male,95,157,Hypertension
1,ff6cbae7-14d6-486d-a03b-c174dce2b9c0,40,Male,163,285,
2,cc32f20a-c701-4398-85cc-30cfe5e7a62a,72,Male,NAN,168,
3,93c316b3-c2ff-42a9-9895-7ce3e3d6cd8a,39,Male,118,288,Diabetes
4,044838d9-29a8-46da-8d74-18705c8df5d7,28,Female,128,229,Diabetes
...,...,...,...,...,...,...
3016,6516343d-4f84-42cc-9805-307da385d003,73,Female,96,273,Hypertension
3017,7ef7b2c4-4cbd-44de-887d-aa07548e7e18,30,Female,98,168,Hypertension
3018,e060b7f8-a5ff-44ab-8caf-cd909f08c724,52,Male,94,247,Hypertension
3019,ed118b6f-bf81-4af9-99ee-98eafb3c52e1,21,Male,169,166,Diabetes


Handling Missing values

1. Column Removal : set the threshold to 0.9, which means that if that column has a 90 percentage of missing values it is removed 

In [15]:
threshold = 0.9
df = df.loc[:,df.isna().mean() < threshold]
df

Unnamed: 0,Patient_ID,Age,Gender,Blood_Pressure,Cholesterol,Condition
0,0f5343ac-b858-4726-b8e6-d0e5a7cd73ff,84,Male,95,157,Hypertension
1,ff6cbae7-14d6-486d-a03b-c174dce2b9c0,40,Male,163,285,
2,cc32f20a-c701-4398-85cc-30cfe5e7a62a,72,Male,NAN,168,
3,93c316b3-c2ff-42a9-9895-7ce3e3d6cd8a,39,Male,118,288,Diabetes
4,044838d9-29a8-46da-8d74-18705c8df5d7,28,Female,128,229,Diabetes
...,...,...,...,...,...,...
3016,6516343d-4f84-42cc-9805-307da385d003,73,Female,96,273,Hypertension
3017,7ef7b2c4-4cbd-44de-887d-aa07548e7e18,30,Female,98,168,Hypertension
3018,e060b7f8-a5ff-44ab-8caf-cd909f08c724,52,Male,94,247,Hypertension
3019,ed118b6f-bf81-4af9-99ee-98eafb3c52e1,21,Male,169,166,Diabetes


2. Imputation : 
   
   For Categorical Values: changed all the missing values to Unknown

   For Numerical Values  : changed all the missing values to the mean of that respective column

In [16]:
num_cols = ['Age','Blood_Pressure','Cholesterol']
cat_cols = ['Gender','Condition']

for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')   
    df[col] = df[col].fillna(df[col].mean())


df[cat_cols] = df[cat_cols].fillna('Unknown')
df

Unnamed: 0,Patient_ID,Age,Gender,Blood_Pressure,Cholesterol,Condition
0,0f5343ac-b858-4726-b8e6-d0e5a7cd73ff,84.0,Male,95.000000,157.0,Hypertension
1,ff6cbae7-14d6-486d-a03b-c174dce2b9c0,40.0,Male,163.000000,285.0,Unknown
2,cc32f20a-c701-4398-85cc-30cfe5e7a62a,72.0,Male,131.023015,168.0,Unknown
3,93c316b3-c2ff-42a9-9895-7ce3e3d6cd8a,39.0,Male,118.000000,288.0,Diabetes
4,044838d9-29a8-46da-8d74-18705c8df5d7,28.0,Female,128.000000,229.0,Diabetes
...,...,...,...,...,...,...
3016,6516343d-4f84-42cc-9805-307da385d003,73.0,Female,96.000000,273.0,Hypertension
3017,7ef7b2c4-4cbd-44de-887d-aa07548e7e18,30.0,Female,98.000000,168.0,Hypertension
3018,e060b7f8-a5ff-44ab-8caf-cd909f08c724,52.0,Male,94.000000,247.0,Hypertension
3019,ed118b6f-bf81-4af9-99ee-98eafb3c52e1,21.0,Male,169.000000,166.0,Diabetes


Outlier handling

Outliers in this case are the values that are present in the lower most and the highermost percentile in that column. So I removed the lower and upper percentile values in that column which are outliers.

Used the clip() method to clip or remove the upper and lower bound values in that column

In [17]:
lower_percentile = 0.01
upper_percentile = 0.98

for col in df[num_cols]:
        lb = df[col].quantile(lower_percentile)
        ub = df[col].quantile(upper_percentile)
        
        df[col] = df[col].clip(lower=lb, upper=ub)

df

Unnamed: 0,Patient_ID,Age,Gender,Blood_Pressure,Cholesterol,Condition
0,0f5343ac-b858-4726-b8e6-d0e5a7cd73ff,84.0,Male,95.000000,157.0,Hypertension
1,ff6cbae7-14d6-486d-a03b-c174dce2b9c0,40.0,Male,163.000000,285.0,Unknown
2,cc32f20a-c701-4398-85cc-30cfe5e7a62a,72.0,Male,131.023015,168.0,Unknown
3,93c316b3-c2ff-42a9-9895-7ce3e3d6cd8a,39.0,Male,118.000000,288.0,Diabetes
4,044838d9-29a8-46da-8d74-18705c8df5d7,28.0,Female,128.000000,229.0,Diabetes
...,...,...,...,...,...,...
3016,6516343d-4f84-42cc-9805-307da385d003,73.0,Female,96.000000,273.0,Hypertension
3017,7ef7b2c4-4cbd-44de-887d-aa07548e7e18,30.0,Female,98.000000,168.0,Hypertension
3018,e060b7f8-a5ff-44ab-8caf-cd909f08c724,52.0,Male,94.000000,247.0,Hypertension
3019,ed118b6f-bf81-4af9-99ee-98eafb3c52e1,21.0,Male,169.000000,166.0,Diabetes


Transformation

Used a StandardScaler and OneHotEncoder for numerical and categorical vlaues respectively

In [18]:
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

df

Unnamed: 0,Patient_ID,Age,Gender,Blood_Pressure,Cholesterol,Condition
0,0f5343ac-b858-4726-b8e6-d0e5a7cd73ff,1.429498,Male,-1.226222,-0.700961,Hypertension
1,ff6cbae7-14d6-486d-a03b-c174dce2b9c0,-0.669900,Male,1.088860,1.514393,Unknown
2,cc32f20a-c701-4398-85cc-30cfe5e7a62a,0.856935,Male,0.000193,-0.510579,Unknown
3,93c316b3-c2ff-42a9-9895-7ce3e3d6cd8a,-0.717614,Male,-0.443180,1.566315,Diabetes
4,044838d9-29a8-46da-8d74-18705c8df5d7,-1.242463,Female,-0.102727,0.545176,Diabetes
...,...,...,...,...,...,...
3016,6516343d-4f84-42cc-9805-307da385d003,0.904649,Female,-1.192177,1.306703,Hypertension
3017,7ef7b2c4-4cbd-44de-887d-aa07548e7e18,-1.147036,Female,-1.124086,-0.510579,Hypertension
3018,e060b7f8-a5ff-44ab-8caf-cd909f08c724,-0.097337,Male,-1.260268,0.856710,Hypertension
3019,ed118b6f-bf81-4af9-99ee-98eafb3c52e1,-1.576458,Male,1.293132,-0.545194,Diabetes


In [22]:
encoder = OneHotEncoder(sparse_output=False)

one_hot_encoded = encoder.fit_transform(df[cat_cols])
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(cat_cols))

df_encoded = pd.concat([df, one_hot_df], axis=1)

df = df_encoded.drop(cat_cols, axis=1)

df

Unnamed: 0,Patient_ID,Age,Blood_Pressure,Cholesterol,Gender_Female,Gender_Male,Gender_NAN,Condition_Diabetes,Condition_Hypertension,Condition_Unknown
0,0f5343ac-b858-4726-b8e6-d0e5a7cd73ff,1.429498,-1.226222,-0.700961,0.0,1.0,0.0,0.0,1.0,0.0
1,ff6cbae7-14d6-486d-a03b-c174dce2b9c0,-0.669900,1.088860,1.514393,0.0,1.0,0.0,0.0,0.0,1.0
2,cc32f20a-c701-4398-85cc-30cfe5e7a62a,0.856935,0.000193,-0.510579,0.0,1.0,0.0,0.0,0.0,1.0
3,93c316b3-c2ff-42a9-9895-7ce3e3d6cd8a,-0.717614,-0.443180,1.566315,0.0,1.0,0.0,1.0,0.0,0.0
4,044838d9-29a8-46da-8d74-18705c8df5d7,-1.242463,-0.102727,0.545176,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
124,,,,,1.0,0.0,0.0,1.0,0.0,0.0
151,,,,,0.0,1.0,0.0,1.0,0.0,0.0
152,,,,,0.0,1.0,0.0,1.0,0.0,0.0
153,,,,,0.0,1.0,0.0,0.0,0.0,1.0
