In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

In [2]:
!jupyter --version

jupyter core     : 4.7.1
jupyter-notebook : 6.4.3
qtconsole        : 5.1.0
ipython          : 7.26.0
ipykernel        : 6.2.0
jupyter client   : 6.1.12
jupyter lab      : not installed
nbconvert        : 6.1.0
ipywidgets       : 7.6.3
nbformat         : 5.1.3
traitlets        : 5.0.5


In [3]:
import pkg_resources
import sys
print("Pandas: " + pkg_resources.get_distribution("pandas").version)
print("NumPy: " + pkg_resources.get_distribution("numpy").version)
print("Scikit-Learn: " + pkg_resources.get_distribution("scikit-learn").version)
print("Python: " + sys.version)

Pandas: 1.3.2
NumPy: 1.20.3
Scikit-Learn: 0.24.2
Python: 3.8.11 (default, Aug  6 2021, 09:57:55) [MSC v.1916 64 bit (AMD64)]


In [4]:
data = pd.read_csv('medical_clean.csv')
#Divide the data depending on the variable type
data_num = data.iloc[:, np.r_[14:17, 20]].copy() #Numeric variables
data_enc = data.iloc[:, np.r_[17, 18]].copy() #Non-binary Categorical Variables
data_yn = data.iloc[:, np.r_[24, 26, 27, 29:38]].copy() #Binary Categorical Variables
data_ord = data.iloc[:, [11, 25, 28]].copy() #Ordinal Categorical Variables
data_tar = data['Initial_days'] #Target Variable

In [5]:
def meddata_preprocessing(data_num, data_enc, data_yn, data_ord, data_tar): #This is saved as a function for reusability in later tasks
    for i in range(len(data_num.columns)):
        #Compute the mean and standard deviation of each column
        mean, std = np.mean(data_num.iloc[:,i]), np.std(data_num.iloc[:,i])
        #Set the upper and lower bounds at three standard deviations from the mean
        upper, lower = mean + 3 * std, mean - 3 * std
        #Record the index for each row that contains a value outside the previously set boundaries
        drop = [inx for inx, x in enumerate(data_num.iloc[:, i]) if x < lower or x > upper]
        #Compare the list of indices to be dropped with those within the dataframe and drop those that still remain
        #within the data frame while ignoring those that were already dropped
        for d in drop:
            if d in data_num.index:
                data_num = data_num.drop(d)
    #One-hot encode categorical variables
    data_enc = pd.get_dummies(data_enc, prefix=data_enc.columns, drop_first=True)
        
    for col in range(len(data_yn.columns)):
        #Replace values of "Yes" with 1 and values of "No" with 0
        for inx, val in enumerate(data_yn.iloc[:, col]):
            if val == 'Yes':
                data_yn.iloc[inx, col] = 1
            else:
                data_yn.iloc[inx, col] = 0
    #Determine levels of ordinal variables
    scale_mapper = {
        "Area" : {
            "Rural" : 0,
            "Suburban" : 0.5,
            "Urban" : 1
        },
        "Initial_admin" : {
            "Emergency Admission" : 1,
            "Observation Admission" : 0.5,
            "Elective Admission" : 0
        },
        "Complication_risk" : {
            "Low" : 0,
            "Medium" : 0.5,
            "High" : 1
        }
    }
    #Replace values with numerical equivalents specified above
    for col in data_ord.columns:
        data_ord[col] = data_ord[col].copy().replace(scale_mapper[col])
    
    mm = MinMaxScaler() #Instantiate the MinMaxScaler method
    data_num[data_num.columns] = mm.fit_transform(data_num) #Normalize the data
    
    #The use of an inner join preserves the dropping of rows performed on data_num
    #The data_yn dataframe is converted to a numeric datatype, int32, before joining
    data_clean = data_num.copy().join(data_enc, how='inner').join(data_yn.astype('int32'), how='inner').join(data_ord, how='inner').join(data_tar, how='inner')
    
    return data_clean #Returns a fully prepared data set

In [6]:
data_clean = meddata_preprocessing(data_num, data_enc, data_yn, data_ord, data_tar)
data_clean

Unnamed: 0,Children,Age,Income,VitD_levels,Marital_Married,Marital_Never Married,Marital_Separated,Marital_Widowed,Gender_Male,Gender_Nonbinary,...,Hyperlipidemia,BackPain,Anxiety,Allergic_rhinitis,Reflux_esophagitis,Asthma,Area,Initial_admin,Complication_risk,Initial_days
0,0.125,0.492958,0.417305,0.562756,0,0,0,0,1,0,...,0,1,1,1,0,1,0.5,1.0,0.5,10.585770
1,0.375,0.464789,0.225268,0.550632,1,0,0,0,0,0,...,0,0,0,0,1,0,1.0,1.0,1.0,15.129562
2,0.375,0.492958,0.068645,0.497410,0,0,0,1,0,0,...,0,0,0,0,0,0,0.5,0.0,0.5,4.772177
3,0.000,0.845070,0.191156,0.408150,1,0,0,0,1,0,...,0,0,0,0,1,1,0.5,0.0,0.5,1.714879
4,0.125,0.056338,0.005097,0.460128,0,0,0,1,0,0,...,1,0,0,1,0,0,0.0,0.0,0.0,1.254807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.250,0.098592,0.221220,0.432505,0,0,0,1,1,0,...,0,0,1,0,1,0,1.0,1.0,0.5,51.561220
9996,0.500,0.971831,0.071605,0.504615,0,0,0,1,1,0,...,0,0,0,0,0,1,1.0,0.0,0.5,68.668240
9997,0.375,0.380282,0.317553,0.441440,0,0,1,0,0,0,...,0,0,1,1,0,0,0.0,0.0,1.0,70.154180
9998,0.375,0.352113,0.142680,0.609113,0,0,0,0,1,0,...,0,1,0,0,0,0,0.0,1.0,0.5,63.356900


In [7]:
data_clean.to_csv('medical_data_prepared_task_2.csv')

In [8]:
X = data_clean.drop(labels=['Initial_days'], axis=1).copy()
y = data_clean['Initial_days'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
rf_reg = RandomForestRegressor()
rf_reg.fit(X_train, y_train)
pred = rf_reg.predict(X_test)
print(mean_squared_error(y_test, pred))

715.0815899016233


In [10]:
rf_reg.score(X_test, y_test)

-0.029063107310114633