In [29]:
import numpy as np
import pandas as pd
from scipy import sparse

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer

In [30]:
data1 = pd.read_csv('C:/Users/DELL/Downloads/ML_project/healthcare_dataset.csv')
data1.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


In [31]:
data1.describe()

Unnamed: 0,Age,Billing Amount,Room Number
count,55500.0,55500.0,55500.0
mean,51.539459,25539.316097,301.134829
std,19.602454,14211.454431,115.243069
min,13.0,-2008.49214,101.0
25%,35.0,13241.224652,202.0
50%,52.0,25538.069376,302.0
75%,68.0,37820.508436,401.0
max,89.0,52764.276736,500.0


In [32]:
data1.describe(include='object')

Unnamed: 0,Name,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Admission Type,Discharge Date,Medication,Test Results
count,55500,55500,55500,55500,55500,55500,55500,55500,55500,55500,55500,55500
unique,49992,2,8,6,1827,40341,39876,5,3,1856,5,3
top,DAvId muNoZ,Male,A-,Arthritis,2024-03-16,Michael Smith,LLC Smith,Cigna,Elective,2020-03-15,Lipitor,Abnormal
freq,3,27774,6969,9308,50,27,44,11249,18655,53,11140,18627


In [33]:
data1.isnull().sum()

Name                  0
Age                   0
Gender                0
Blood Type            0
Medical Condition     0
Date of Admission     0
Doctor                0
Hospital              0
Insurance Provider    0
Billing Amount        0
Room Number           0
Admission Type        0
Discharge Date        0
Medication            0
Test Results          0
dtype: int64

In [34]:
#Billing Amount 
BA_mean = 25538.069376
BA_q1 = 13241.224652
BA_q3 = 37820.508436 
IQR = BA_q3 - BA_q1
lim1 = BA_q1 - (1.5 * IQR)
lim2 = BA_q3 + (1.5 * IQR)

data1[data1["Billing Amount"] < lim1].sum()

Name                    0
Age                     0
Gender                  0
Blood Type              0
Medical Condition       0
Date of Admission       0
Doctor                  0
Hospital                0
Insurance Provider      0
Billing Amount        0.0
Room Number             0
Admission Type          0
Discharge Date          0
Medication              0
Test Results            0
dtype: object

In [35]:
data1["Date of Admission"] = pd.to_datetime(data1["Date of Admission"])
data1["Discharge Date"] = pd.to_datetime(data1["Discharge Date"])
data1["Days Admissioned"] = (data1["Discharge Date"] - data1["Date of Admission"]).dt.days
data1["Days Admissioned"]

0         2
1         6
2        15
3        30
4        20
         ..
55495    30
55496     9
55497    28
55498     6
55499    27
Name: Days Admissioned, Length: 55500, dtype: int64

In [36]:
#dropping noise columns 
data1.drop(["Name", "Discharge Date","Room Number", "Date of Admission", "Doctor", "Hospital", "Insurance Provider", "Billing Amount"], axis=1, inplace=True)
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Age                55500 non-null  int64 
 1   Gender             55500 non-null  object
 2   Blood Type         55500 non-null  object
 3   Medical Condition  55500 non-null  object
 4   Admission Type     55500 non-null  object
 5   Medication         55500 non-null  object
 6   Test Results       55500 non-null  object
 7   Days Admissioned   55500 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 3.4+ MB


In [38]:
Y = data1["Test Results"]
X = data1.drop("Test Results", axis=1)


In [39]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Age                55500 non-null  int64 
 1   Gender             55500 non-null  object
 2   Blood Type         55500 non-null  object
 3   Medical Condition  55500 non-null  object
 4   Admission Type     55500 non-null  object
 5   Medication         55500 non-null  object
 6   Days Admissioned   55500 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 3.0+ MB


In [40]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=101)

In [41]:
#Encoding using one_hot since all data is nominal
cat_col = ["Gender", "Blood Type", "Medical Condition", "Admission Type", "Medication"]
num_col = ["Age", "Days Admissioned"]

preprocessor = ColumnTransformer(
    transformers=[
        ("t1", OneHotEncoder(handle_unknown='ignore'), cat_col),
        ("t2", MinMaxScaler(), num_col)
    ],
    remainder='passthrough'
)

In [42]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.fit_transform(X_test)

In [43]:
le = LabelEncoder()

Y_train = le.fit_transform(Y_train)
Y_test = le.fit_transform(Y_test)

In [44]:
sparse.save_npz("X_train.npz",X_train)
sparse.save_npz("X_test.npz",X_test)

np.save("Y_train.npy", Y_train)
np.save("Y_test.npy", Y_test)

In [45]:
np.shape(Y_test)

(11100,)