In [1]:
%matplotlib inline

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.preprocessing import LabelEncoder

# Health Care Dataset

## Understanding the data

In [3]:
# Load the healthcare records from the project-relative CSV file.
CSV_PATH = 'data/healthcare_dataset.csv'
health_care = pd.read_csv(CSV_PATH)

In [4]:
# Preview the first rows of the freshly loaded table.
health_care.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Tiffany Ramirez,81,Female,O-,Diabetes,2022-11-17,Patrick Parker,Wallace-Hamilton,Medicare,37490.983364,146,Elective,2022-12-01,Aspirin,Inconclusive
1,Ruben Burns,35,Male,O+,Asthma,2023-06-01,Diane Jackson,"Burke, Griffin and Cooper",UnitedHealthcare,47304.064845,404,Emergency,2023-06-15,Lipitor,Normal
2,Chad Byrd,61,Male,B-,Obesity,2019-01-09,Paul Baker,Walton LLC,Medicare,36874.896997,292,Emergency,2019-02-08,Lipitor,Normal
3,Antonio Frederick,49,Male,B-,Asthma,2020-05-02,Brian Chandler,Garcia Ltd,Medicare,23303.322092,480,Urgent,2020-05-03,Penicillin,Abnormal
4,Mrs. Brandy Flowers,51,Male,O-,Arthritis,2021-07-09,Dustin Griffin,"Jones, Brown and Murray",UnitedHealthcare,18086.344184,477,Urgent,2021-08-02,Paracetamol,Normal


In [5]:
# Summary statistics of the numeric columns, one row per column.
health_care.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,10000.0,51.4522,19.588974,18.0,35.0,52.0,68.0,85.0
Billing Amount,10000.0,25516.806778,14067.292709,1000.180837,13506.523967,25258.112566,37733.913727,49995.902283
Room Number,10000.0,300.082,115.806027,101.0,199.0,299.0,400.0,500.0


In [6]:
# Column dtypes and non-null counts, used here to check for missing values.
health_care.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                10000 non-null  object 
 1   Age                 10000 non-null  int64  
 2   Gender              10000 non-null  object 
 3   Blood Type          10000 non-null  object 
 4   Medical Condition   10000 non-null  object 
 5   Date of Admission   10000 non-null  object 
 6   Doctor              10000 non-null  object 
 7   Hospital            10000 non-null  object 
 8   Insurance Provider  10000 non-null  object 
 9   Billing Amount      10000 non-null  float64
 10  Room Number         10000 non-null  int64  
 11  Admission Type      10000 non-null  object 
 12  Discharge Date      10000 non-null  object 
 13  Medication          10000 non-null  object 
 14  Test Results        10000 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 1.1+

In [7]:
# Inspect the distinct values of the candidate categorical columns,
# starting with blood type.
health_care['Blood Type'].unique()

array(['O-', 'O+', 'B-', 'AB+', 'A+', 'AB-', 'A-', 'B+'], dtype=object)

In [8]:
# Distinct medical conditions present in the data.
health_care['Medical Condition'].unique()

array(['Diabetes', 'Asthma', 'Obesity', 'Arthritis', 'Hypertension',
       'Cancer'], dtype=object)

In [9]:
# Distinct doctors and how many there are (unique() evaluated once).
doctor_values = health_care['Doctor'].unique()
doctor_values, len(doctor_values)

(array(['Patrick Parker', 'Diane Jackson', 'Paul Baker', ...,
        'Robert Nicholson', 'Jamie Lewis', 'Tasha Avila'], dtype=object),
 9416)

In [10]:
# Distinct hospitals and how many there are (unique() evaluated once).
hospital_values = health_care['Hospital'].unique()
hospital_values, len(hospital_values)

(array(['Wallace-Hamilton', 'Burke, Griffin and Cooper', 'Walton LLC', ...,
        'Nash-Krueger', 'Wilson-Lyons', 'Torres, Young and Stewart'],
       dtype=object),
 8639)

In [11]:
# Distinct insurance providers and their count (unique() evaluated once).
insurance_values = health_care['Insurance Provider'].unique()
insurance_values, len(insurance_values)

(array(['Medicare', 'UnitedHealthcare', 'Aetna', 'Cigna', 'Blue Cross'],
       dtype=object),
 5)

In [12]:
# Distinct admission types and their count (unique() evaluated once).
admission_values = health_care['Admission Type'].unique()
admission_values, len(admission_values)

(array(['Elective', 'Emergency', 'Urgent'], dtype=object), 3)

In [13]:
# Distinct medications and their count (unique() evaluated once).
medication_values = health_care['Medication'].unique()
medication_values, len(medication_values)

(array(['Aspirin', 'Lipitor', 'Penicillin', 'Paracetamol', 'Ibuprofen'],
       dtype=object),
 5)

In [14]:
# Distinct room numbers and their count (unique() evaluated once).
room_values = health_care['Room Number'].unique()
room_values, len(room_values)

(array([146, 404, 292, 480, 477, 180, 161, 384, 215, 310, 306, 126, 444,
        492, 120, 315, 475, 125, 366, 238, 364, 130, 293, 379, 298, 392,
        162, 456, 197, 247, 228, 137, 192, 258, 219, 414, 110, 465, 469,
        182, 119, 388, 412, 359, 186, 437, 132, 271, 361, 303, 317, 439,
        153, 438, 380, 194, 199, 301, 223, 410, 205, 134, 407, 188, 213,
        405, 358, 147, 115, 436, 263, 493, 460, 356, 142, 139, 482, 141,
        397, 347, 245, 143, 108, 268, 176, 462, 484, 329, 335, 201, 309,
        389, 217, 299, 275, 181, 401, 214, 267, 211, 184, 140, 416, 179,
        289, 350, 104, 220, 464, 419, 445, 398, 336, 413, 145, 148, 432,
        430, 406, 295, 175, 409, 424, 168, 136, 459, 261, 257, 170, 451,
        372, 202, 394, 264, 279, 260, 499, 452, 365, 340, 360, 290, 103,
        187, 378, 334, 470, 252, 450, 106, 259, 344, 489, 276, 155, 455,
        425, 400, 127, 333, 443, 129, 164, 486, 440, 265, 193, 222, 488,
        472, 391, 230, 322, 272, 154, 236, 375, 221

In [15]:
# Distinct test-result labels and their count (unique() evaluated once).
result_values = health_care['Test Results'].unique()
result_values, len(result_values)

(array(['Inconclusive', 'Normal', 'Abnormal'], dtype=object), 3)

In [16]:
# List the column labels before deriving new features.
health_care.columns

Index(['Name', 'Age', 'Gender', 'Blood Type', 'Medical Condition',
       'Date of Admission', 'Doctor', 'Hospital', 'Insurance Provider',
       'Billing Amount', 'Room Number', 'Admission Type', 'Discharge Date',
       'Medication', 'Test Results'],
      dtype='object')

## Feature Engineering

In [17]:
# Derive the treatment period as the number of whole days between
# admission and discharge.
admitted_on = pd.to_datetime(health_care['Date of Admission'])
discharged_on = pd.to_datetime(health_care['Discharge Date'])
health_care['Treatment Period'] = (discharged_on - admitted_on).dt.days

In [18]:
# Confirm the new 'Treatment Period' column was added.
health_care.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results,Treatment Period
0,Tiffany Ramirez,81,Female,O-,Diabetes,2022-11-17,Patrick Parker,Wallace-Hamilton,Medicare,37490.983364,146,Elective,2022-12-01,Aspirin,Inconclusive,14
1,Ruben Burns,35,Male,O+,Asthma,2023-06-01,Diane Jackson,"Burke, Griffin and Cooper",UnitedHealthcare,47304.064845,404,Emergency,2023-06-15,Lipitor,Normal,14
2,Chad Byrd,61,Male,B-,Obesity,2019-01-09,Paul Baker,Walton LLC,Medicare,36874.896997,292,Emergency,2019-02-08,Lipitor,Normal,30
3,Antonio Frederick,49,Male,B-,Asthma,2020-05-02,Brian Chandler,Garcia Ltd,Medicare,23303.322092,480,Urgent,2020-05-03,Penicillin,Abnormal,1
4,Mrs. Brandy Flowers,51,Male,O-,Arthritis,2021-07-09,Dustin Griffin,"Jones, Brown and Murray",UnitedHealthcare,18086.344184,477,Urgent,2021-08-02,Paracetamol,Normal,24


In [19]:
# Keep only the useful columns.
# The explicit .copy() makes the result an independent frame, so writing to
# its columns later does not raise SettingWithCopyWarning.
USEFUL_COLUMNS = [
    'Age',
    'Gender',
    'Blood Type',
    'Medical Condition',
    'Admission Type',
    'Medication',
    'Treatment Period',
    'Test Results',
]
health_care = health_care[USEFUL_COLUMNS].copy()

In [20]:
# Preview the reduced table after column selection.
health_care.head()

Unnamed: 0,Age,Gender,Blood Type,Medical Condition,Admission Type,Medication,Treatment Period,Test Results
0,81,Female,O-,Diabetes,Elective,Aspirin,14,Inconclusive
1,35,Male,O+,Asthma,Emergency,Lipitor,14,Normal
2,61,Male,B-,Obesity,Emergency,Lipitor,30,Normal
3,49,Male,B-,Asthma,Urgent,Penicillin,1,Abnormal
4,51,Male,O-,Arthritis,Urgent,Paracetamol,24,Normal


## Label Encoding

In [21]:
# Integer-encode every column except the two that are already numeric.
NUMERIC_COLUMNS = ('Age', 'Treatment Period')

# One fitted encoder per column, so each integer code can be mapped back to
# its original label later via label_encoders[col].inverse_transform(...).
label_encoders = {}
for col in health_care.columns:
    if col in NUMERIC_COLUMNS:
        continue
    lc = LabelEncoder()
    # Assign the whole column instead of using .loc[:, col]: on pandas >= 2.0
    # the .loc form writes in place and leaves the column with object dtype,
    # whereas direct assignment replaces it with the integer array.
    health_care[col] = lc.fit_transform(health_care[col])
    label_encoders[col] = lc
# After the loop `lc` is the encoder fitted on the last encoded column.
health_care.head()

Unnamed: 0,Age,Gender,Blood Type,Medical Condition,Admission Type,Medication,Treatment Period,Test Results
0,81,0,7,3,0,0,14,1
1,35,1,6,1,1,2,14,2
2,61,1,5,5,1,2,30,2
3,49,1,5,1,2,4,1,0
4,51,1,7,0,2,3,24,2


In [22]:
# Check the column dtypes after label encoding.
health_care.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Age                10000 non-null  int64
 1   Gender             10000 non-null  int32
 2   Blood Type         10000 non-null  int32
 3   Medical Condition  10000 non-null  int32
 4   Admission Type     10000 non-null  int32
 5   Medication         10000 non-null  int32
 6   Treatment Period   10000 non-null  int64
 7   Test Results       10000 non-null  int32
dtypes: int32(6), int64(2)
memory usage: 390.8 KB


In [28]:
# Summary statistics of the encoded table, one row per column.
health_care.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,10000.0,51.4522,19.588974,18.0,35.0,52.0,68.0,85.0
Gender,10000.0,0.4925,0.499969,0.0,0.0,0.0,1.0,1.0
Blood Type,10000.0,3.5011,2.286071,0.0,2.0,3.0,5.0,7.0
Medical Condition,10000.0,2.4875,1.701803,0.0,1.0,2.0,4.0,5.0
Admission Type,10000.0,1.0149,0.814337,0.0,0.0,1.0,2.0,2.0
Medication,10000.0,2.0208,1.41858,0.0,1.0,2.0,3.0,4.0
Treatment Period,10000.0,15.5618,8.612038,1.0,8.0,16.0,23.0,30.0
Test Results,10000.0,0.9811,0.819762,0.0,0.0,1.0,2.0,2.0
