In [333]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [334]:
raw_data = pd.read_csv("../../raw/rmssd/healthcare_dataset.csv")
raw_data.head(5)

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


In [335]:
raw_data.shape

(55500, 15)

In [336]:
raw_data.dropna().shape

(55500, 15)

In [337]:
raw_data.columns

Index(['Name', 'Age', 'Gender', 'Blood Type', 'Medical Condition',
       'Date of Admission', 'Doctor', 'Hospital', 'Insurance Provider',
       'Billing Amount', 'Room Number', 'Admission Type', 'Discharge Date',
       'Medication', 'Test Results'],
      dtype='object')

In [338]:
raw_data.dtypes

Name                   object
Age                     int64
Gender                 object
Blood Type             object
Medical Condition      object
Date of Admission      object
Doctor                 object
Hospital               object
Insurance Provider     object
Billing Amount        float64
Room Number             int64
Admission Type         object
Discharge Date         object
Medication             object
Test Results           object
dtype: object

In [339]:
filter_data_values = raw_data[['Age', 'Gender', 'Blood Type', 'Medical Condition', 'Admission Type', 'Medication', 'Test Results']]

Convertir los datos tipo objeto a tipo numerico (ordinal (no usar), presonal, one hot encoder, frequency)

In [340]:
filter_data_values["Test Results"].value_counts()

Test Results
Abnormal        18627
Normal          18517
Inconclusive    18356
Name: count, dtype: int64

In [341]:
filter_data_values.dropna().shape

(55500, 7)

In [342]:
filter_data_values.describe()

Unnamed: 0,Age
count,55500.0
mean,51.539459
std,19.602454
min,13.0
25%,35.0
50%,52.0
75%,68.0
max,89.0


In [343]:
filter_data_values.dtypes

Age                   int64
Gender               object
Blood Type           object
Medical Condition    object
Admission Type       object
Medication           object
Test Results         object
dtype: object

In [344]:
filter_data_values['Test Results'].value_counts()

Test Results
Abnormal        18627
Normal          18517
Inconclusive    18356
Name: count, dtype: int64

filter_data_values['Gender']= filter_data_values['Gender'].replace({"Female": 0, "Male": 1})
filter_data_values['Blood Type']= filter_data_values['Blood Type'].replace({"A+": 0, "A-": 1, "AB+": 2, "AB-": 3, "B+": 4, "B-": 5, "O+": 6, "O-": 7})
filter_data_values['Medical Condition']= filter_data_values['Medical Condition'].replace({"Diabetes": 0, "Cancer": 1, "Arthritis": 2, "Hypertension": 3, "Asthma": 4, "Obesity": 5})
filter_data_values['Admission Type']= filter_data_values['Admission Type'].replace({"Urgent": 0, "Elective": 1, "Emergency": 2})
filter_data_values['Medication']= filter_data_values['Medication'].replace({"Ibuprofen": 0, "Aspirin": 1, "Lipitor": 2, "Penicillin": 3, "Paracetamol": 4})
filter_data_values['Test Results']= filter_data_values['Test Results'].replace({"Abnormal": 0, "Normal": 1, "Inconclusive": 2})

In [345]:
filter_data_values["Test Results"].value_counts()

Test Results
Abnormal        18627
Normal          18517
Inconclusive    18356
Name: count, dtype: int64

In [346]:
response = filter_data_values["Test Results"]
data = filter_data_values.drop(columns=["Test Results"])

In [347]:
data

Unnamed: 0,Age,Gender,Blood Type,Medical Condition,Admission Type,Medication
0,30,Male,B-,Cancer,Urgent,Paracetamol
1,62,Male,A+,Obesity,Emergency,Ibuprofen
2,76,Female,A-,Obesity,Emergency,Aspirin
3,28,Female,O+,Diabetes,Elective,Ibuprofen
4,43,Female,AB+,Cancer,Urgent,Penicillin
...,...,...,...,...,...,...
55495,42,Female,O+,Asthma,Elective,Penicillin
55496,61,Female,AB-,Obesity,Elective,Aspirin
55497,38,Female,B+,Hypertension,Urgent,Ibuprofen
55498,43,Male,O-,Arthritis,Elective,Ibuprofen


In [348]:
data_num = data.select_dtypes(exclude='object')
data_object = data.select_dtypes(include='object')

encoder = OneHotEncoder(sparse_output= False).set_output(transform="pandas")

In [349]:
for col_name in data_object.columns:
    encoded_array = encoder.fit_transform(data_object[[col_name]])
    data_object = pd.concat([data_object, encoded_array], axis=1).drop(col_name, axis=1)

In [350]:
data= pd.concat([data_num, data_object], axis=1)

In [351]:
X_train, X_test, y_train, y_test = train_test_split(data, response, test_size=0.1, random_state=10)

In [352]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=10)

In [353]:
X_train.to_csv("../../process_dataset/rmssd/train_data.csv", index=False)
X_val.to_csv("../../process_dataset/rmssd/val_data.csv", index=False)
X_test.to_csv("../../process_dataset/rmssd/test_data.csv", index=False)

In [354]:
with open("../../process_dataset/rmssd/y_train.npy", 'wb') as doc_export:
    np.save(doc_export, y_train)

In [355]:
with open("../../process_dataset/rmssd/y_val.npy", 'wb') as doc_export:
    np.save(doc_export, y_val)

In [356]:
with open("../../process_dataset/rmssd/y_test.npy", 'wb') as doc_export:
    np.save(doc_export, y_test)

In [357]:
df_counts = pd.DataFrame()
df_counts["y_values"] = y_train

In [358]:
df_counts["y_values"].value_counts()

y_values
Abnormal        13413
Normal          13326
Inconclusive    13221
Name: count, dtype: int64