In [1]:
using DataFrames, StatsBase 

Useful Functions

In [2]:
#function that calculates out-of-sample error
#w = parameter array fit to model
#model = function that fits a model on a dataset
#test_set = dataset being tested
function out_sample_error(w,model,test_set)
    test_size = size(test_set,1)
    model_X = model(test_set)[2]
    model_output = round(model_X*w)
    model_error = 0
    for i in 1:test_size
        if test_set[:Survived][i] != model_output[i]
            model_error = model_error + 1
        end
    end
    return model_error / test_size
end

#function that runs a bootstrap estimation on the training set
#model = function that fits a model on a dataset
#resample_num = number of samples
function bootstrap_estimate(model,training_data
    ;resample_num = 1000)
    w_bs = 0
    td_size = size(training_data,1)
    for i in 1:resample_num
        #randomly samples from training set
        train_sample = training_data[sample(1:td_size,round(Int,.5*td_size)), :] 
        #incrememts bootstrap parameter by parameter of new sample
        w_bs = w_bs + model(train_sample)[1]
    end
    #finds average parameter among trials
    w_bs = w_bs / resample_num
    return w_bs
end 

#function that partitions data into train and test sets,
#runs model on train and checks error on test
#model = function that fits a model on a dataset
#training_data = dataset to be partitioned and crossvalidated
#k = number of partitions 
function cross_validate(model, training_data;
    k = 5)
    error_cv = 0
    td_size = size(training_data,1)
    #size of 1 partition when they are k partitions 
    partition = round(Int,td_size / k)
    for i in 1:k
        #creates training and test sets for this iteration
        s = (i-1)*partition + 1 
        if i == k 
            f = td_size
        else
            f = i*partition
        end
        #test partition with size 1/kth of dataset
        round_test = training_data[s:f,:]
        round_train_1 = training_data[1:s-1,:]
        round_train_2 = training_data[f+1:td_size,:]
        #training partition
        round_train = [round_train_1;round_train_2]
        #w when model is run on training set
        round_w = model(round_train)[1]
        #error in w when model run on test set
        round_error = out_sample_error(round_w,model,round_test)
        error_cv = error_cv + round_error
    end
    #average error among partitions 
    error_cv = error_cv / k
    return error_cv
end

#helper function for split_data
function build_subset(index,data)
    subset = similar(data,0)
    for i in 1:length(index)
        new_row = DataArray(data[data[:x] .== index[i],:])
        push!(subset,new_row)
    end
    return subset
end 

#splits dataset into train and test sets
#splits data based on the discharge identifier column :x (integer)
#split_raio = ratio of training set size to entire dataset size
function split_data(data_set;split_ratio = .8)
    n = size(data_set,1)
    split = Int(round(split_ratio*n))
    rand_index = shuffle(data_set[:x])
    train_index = rand_index[1:split]
    test_index = rand_index[split + 1:n]
    train = build_subset(train_index,data_set)
    test = build_subset(test_index,data_set)
    return(train,test)
end 


split_data (generic function with 1 method)

Analysis

In [3]:
#import data
heart_attack = readtable("heart_attack.csv")

Unnamed: 0,x,Health_Service_Area,Hospital_County,Operating_Certificate_Number,Facility_Id,Facility_Name,Age_Group,Zip_Code_3_digits,Gender,Race,Ethnicity,Length_of_Stay,Admit_Day_of_Week,Type_of_Admission,Patient_Disposition,Discharge_Year,Discharge_Day_of_Week,CCS_Diagnosis_Code,CCS_Diagnosis_Description,CCS_Procedure_Code,CCS_Procedure_Description,APR_DRG_Code,APR_DRG_Description,APR_MDC_Code,APR_MDC_Description,APR_Severity_of_Illness_Code,APR_Severity_of_Illness_Description,APR_Risk_of_Mortality,APR_Medical_Surgical_Description,Payment_Typology_1,Payment_Typology_2,Payment_Typology_3,Attending_Provider_License_Number,Operating_Provider_License_Number,Other_Provider_License_Number,Birth_Weight,Abortion_Edit_Indicator,Emergency_Department_Indicator,Total_Charges,Total_Costs,Survived,Male,White,Black,Other_Race,private_insurance,medicare,medicaid,old50_69,old70
1,3577,Western NY,Cattaraugus,401001,66,Olean General Hospital,70 or Older,147,M,White,Not Span/Hispanic,9,TUE,Emergency,Short-term Hospital,2012,THU,107,CARDIAC ARREST & VF,227,"OT DX PRC (INTERVW,EVAL",196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,2,Moderate,Moderate,Medical,Medicare,Medicaid,,264217,149079,,0,N,Y,$18461.03,$5148.18,1,1,1,0,0,0,1,0,0,1
2,5411,Western NY,Cattaraugus,401001,66,Olean General Hospital,18 to 29,147,M,White,Not Span/Hispanic,2,TUE,Emergency,Expired,2012,THU,107,CARDIAC ARREST & VF,227,"OT DX PRC (INTERVW,EVAL",196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,3,Major,Moderate,Medical,Self-Pay,,,141658,3053,,0,N,Y,$16729.38,$7543.35,0,1,1,0,0,0,0,0,0,0
3,8169,Western NY,Chautauqua,601000,98,Brooks Memorial Hospital,70 or Older,147,M,White,Not Span/Hispanic,1,SAT,Emergency,Expired,2012,SUN,107,CARDIAC ARREST & VF,229,NONOP RMVL FOREIGN BODY,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,3,Major,Extreme,Medical,Medicare,,,239319,217335,217335,0,N,Y,$7145.49,$6382.73,0,1,1,0,0,0,1,0,0,1
4,9390,Western NY,Chautauqua,601000,98,Brooks Memorial Hospital,70 or Older,141,F,White,Not Span/Hispanic,2,MON,Urgent,Expired,2012,WED,107,CARDIAC ARREST & VF,216,RESP INTUB/MECH VENTIL,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,3,Major,Major,Medical,Blue Cross/Blue Shield,,,261189,240432,,0,N,N,$6343.38,$10022.09,0,0,1,0,0,0,0,0,0,1
5,16311,Western NY,Chautauqua,602001,103,Woman's Christian Association,50 to 69,147,M,White,Not Span/Hispanic,5,FRI,Urgent,Home or Self Care,2012,WED,107,CARDIAC ARREST & VF,225,CONV OF CARDIAC RHYTHM,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,4,Extreme,Major,Medical,Medicare,Medicare,,138697,129581,,0,N,Y,$18324.66,$9180.61,1,1,1,0,0,0,1,0,1,0
6,17014,Western NY,Chautauqua,602001,103,Woman's Christian Association,70 or Older,147,M,White,Not Span/Hispanic,2,WED,Urgent,Home or Self Care,2012,FRI,107,CARDIAC ARREST & VF,227,"OT DX PRC (INTERVW,EVAL",196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,2,Moderate,Moderate,Medical,Medicare,Medicare,,142873,172738,,0,N,Y,$6871.94,$2752.50,1,1,1,0,0,0,1,0,0,1
7,17754,Western NY,Chautauqua,602001,103,Woman's Christian Association,70 or Older,147,M,White,Not Span/Hispanic,4,FRI,Urgent,Skilled Nursing Home,2012,TUE,107,CARDIAC ARREST & VF,177,COMP AXIAL TOMOGR (CT),196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,2,Moderate,Moderate,Medical,Medicare,Medicare,,3876,130444,,0,N,Y,$7905.27,$3959.88,1,1,1,0,0,0,1,0,0,1
8,19217,Western NY,Chautauqua,427000,114,TLC Health Network Lake Shore Hospital,70 or Older,141,M,White,Not Span/Hispanic,1,TUE,Emergency,Expired,2012,TUE,107,CARDIAC ARREST & VF,216,RESP INTUB/MECH VENTIL,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,3,Major,Major,Medical,Medicare,,,200012,118047,,0,N,Y,$3810.29,$1638.08,0,1,1,0,0,0,1,0,0,1
9,21080,Western NY,Erie,1401014,207,Buffalo General Hospital,50 to 69,141,M,White,Not Span/Hispanic,3,TUE,Urgent,Home or Self Care,2012,FRI,107,CARDIAC ARREST & VF,48,CARDIAC PACEMAKER/DEFIB,161,CARDIAC DEFIBRILLATOR & HEART ASSIST IMPLANT,5,Diseases and Disorders of the Circulatory System,1,Minor,Minor,Surgical,Blue Cross/Blue Shield,Self-Pay,,144605,144605,,0,N,N,$61974.35,$28926.80,1,1,1,0,0,0,0,0,1,0
10,21508,Western NY,Erie,1401014,207,Buffalo General Hospital,70 or Older,140,F,White,Not Span/Hispanic,3,MON,Urgent,Home or Self Care,2012,THU,107,CARDIAC ARREST & VF,62,OT DX CARDIOVASC PRCS,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,2,Moderate,Moderate,Medical,Medicare,Private Health Insurance,Self-Pay,229275,229275,,0,N,N,$7754.39,$3803.11,1,0,1,0,0,0,1,0,0,1


Data Cleaning

In [4]:
#hospital stays with length > 120 days labeled as "120 +"
#change these to length 120
heart_attack[heart_attack[:Length_of_Stay] .== "120 +",12] = "120"

#convert length of stay value to float
heart_attack[:Length_of_Stay] = float(heart_attack[:Length_of_Stay])

1072-element Array{Float64,1}:
  9.0
  2.0
  1.0
  2.0
  5.0
  2.0
  4.0
  1.0
  3.0
  3.0
 47.0
  6.0
  4.0
  ⋮  
  6.0
 21.0
  5.0
  1.0
  1.0
  1.0
  2.0
  1.0
  2.0
 20.0
 21.0
  1.0

Split Into Train and Test

In [5]:
#only looking at NYC hospital data
NYC_data = heart_attack[heart_attack[:Health_Service_Area] .== "New York City",:]
train = split_data(NYC_data)[1]
test = split_data(NYC_data)[2]

Unnamed: 0,x,Health_Service_Area,Hospital_County,Operating_Certificate_Number,Facility_Id,Facility_Name,Age_Group,Zip_Code_3_digits,Gender,Race,Ethnicity,Length_of_Stay,Admit_Day_of_Week,Type_of_Admission,Patient_Disposition,Discharge_Year,Discharge_Day_of_Week,CCS_Diagnosis_Code,CCS_Diagnosis_Description,CCS_Procedure_Code,CCS_Procedure_Description,APR_DRG_Code,APR_DRG_Description,APR_MDC_Code,APR_MDC_Description,APR_Severity_of_Illness_Code,APR_Severity_of_Illness_Description,APR_Risk_of_Mortality,APR_Medical_Surgical_Description,Payment_Typology_1,Payment_Typology_2,Payment_Typology_3,Attending_Provider_License_Number,Operating_Provider_License_Number,Other_Provider_License_Number,Birth_Weight,Abortion_Edit_Indicator,Emergency_Department_Indicator,Total_Charges,Total_Costs,Survived,Male,White,Black,Other_Race,private_insurance,medicare,medicaid,old50_69,old70
1,1604997,New York City,Manhattan,7002017,1450,Lenox Hill Hospital,70 or Older,100,F,White,Not Span/Hispanic,3.0,TUE,Emergency,Expired,2012,FRI,107,CARDIAC ARREST & VF,216,RESP INTUB/MECH VENTIL,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,4,Extreme,Extreme,Medical,Medicare,Private Health Insurance,,184033,239777,,0,N,Y,$41288.36,$11749.60,0,0,1,0,0,0,1,0,0,1
2,1903831,New York City,Queens,7003000,1626,Elmhurst Hospital Center,50 to 69,113,M,White,Not Span/Hispanic,11.0,MON,Emergency,Expired,2012,FRI,107,CARDIAC ARREST & VF,225,CONV OF CARDIAC RHYTHM,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,4,Extreme,Extreme,Medical,Medicare,Medicare,Medicaid,164296,164296,,0,N,Y,$52637.74,$36686.03,0,1,1,0,0,0,1,0,1,0
3,2124919,New York City,Richmond,7004010,1738,Richmond University Medical Center,50 to 69,103,F,Black/African American,Not Span/Hispanic,1.0,SAT,Emergency,Expired,2012,SUN,107,CARDIAC ARREST & VF,177,COMP AXIAL TOMOGR (CT),196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,3,Major,Major,Medical,Medicare,Medicaid,,134763,134763,,0,N,Y,$15734.38,$3282.26,0,0,0,1,0,0,1,0,1,0
4,1284650,New York City,Kings,7001019,1304,NYU Lutheran Medical Center,70 or Older,112,M,White,Not Span/Hispanic,1.0,WED,Emergency,Expired,2012,WED,107,CARDIAC ARREST & VF,225,CONV OF CARDIAC RHYTHM,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,4,Extreme,Extreme,Medical,Medicare,Medicaid,,246946,246946,212481,0,N,Y,$9228.33,$4421.88,0,1,1,0,0,0,1,0,0,1
5,1163948,New York City,Bronx,7000006,3058,Montefiore Med Center - Jack D Weiler Hosp of A Einstein College Div,50 to 69,104,F,Other Race,Spanish/Hispanic,4.0,SUN,Emergency,Home or Self Care,2012,THU,107,CARDIAC ARREST & VF,227,"OT DX PRC (INTERVW,EVAL",196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,4,Extreme,Major,Medical,Private Health Insurance,Self-Pay,,217774,217774,,0,N,Y,$38346.31,$15813.00,1,0,0,0,1,1,0,0,1,0
6,1782937,New York City,Manhattan,7002053,1463,NYU Hospitals Center,70 or Older,OOS,M,Other Race,Not Span/Hispanic,7.0,WED,Urgent,Home w/ Home Health Services,2012,WED,107,CARDIAC ARREST & VF,48,CARDIAC PACEMAKER/DEFIB,161,CARDIAC DEFIBRILLATOR & HEART ASSIST IMPLANT,5,Diseases and Disorders of the Circulatory System,2,Moderate,Major,Surgical,Medicaid,,,248717,159739,,0,N,N,$372898.42,$78258.68,1,1,0,0,1,0,0,1,0,1
7,1403716,New York City,Kings,7001035,1318,Wyckoff Heights Medical Center,70 or Older,113,F,White,Unknown,6.0,WED,Emergency,Expired,2012,TUE,107,CARDIAC ARREST & VF,216,RESP INTUB/MECH VENTIL,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,3,Major,Extreme,Medical,Medicare,Medicaid,Self-Pay,178351,223552,,0,N,Y,$36516.64,$10896.61,0,0,1,0,0,0,1,0,0,1
8,1024361,New York City,Bronx,7000006,1169,Montefiore Medical Center - Henry & Lucy Moses Div,50 to 69,104,M,Other Race,Spanish/Hispanic,3.0,SAT,Emergency,Home or Self Care,2012,TUE,107,CARDIAC ARREST & VF,48,CARDIAC PACEMAKER/DEFIB,161,CARDIAC DEFIBRILLATOR & HEART ASSIST IMPLANT,5,Diseases and Disorders of the Circulatory System,1,Minor,Minor,Surgical,Private Health Insurance,Self-Pay,,250183,110785,,0,N,Y,$87057.65,$24756.44,1,1,0,0,1,1,0,0,1,0
9,1869177,New York City,Manhattan,7002032,1469,Mount Sinai St. Lukes,70 or Older,100,F,Black/African American,Spanish/Hispanic,8.0,SAT,Emergency,Expired,2012,SUN,107,CARDIAC ARREST & VF,216,RESP INTUB/MECH VENTIL,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,4,Extreme,Extreme,Medical,Medicare,Medicare,Medicaid,166696,166696,,0,N,Y,$95699.58,$28923.60,0,0,0,1,0,0,1,0,0,1
10,1248820,New York City,Kings,7001016,1301,Kings County Hospital Center,70 or Older,112,F,Other Race,Spanish/Hispanic,23.0,SAT,Emergency,Skilled Nursing Home,2012,MON,107,CARDIAC ARREST & VF,34,TRACHEOSTOMY; TEMP/PERM,5,TRACHEOSTOMY W MV 96+ HOURS W/O EXTENSIVE PROCEDURE,5,Diseases and Disorders of the Circulatory System,3,Major,Major,Surgical,Medicare,Medicare,Medicaid,142307,155307,,0,N,Y,$90683.32,$78908.91,1,0,0,0,1,0,1,0,0,1


Fit Models on Training Data

In [6]:
#MODEL ONE
#conditions on severity of illness and length of stay
function model_one(training_data)
    td_size = size(training_data,1)
    X = [training_data[:APR_Severity_of_Illness_Code].data training_data[:Length_of_Stay] ones(td_size)]
    y = training_data[:Survived].data
    w = X\y
    return (w,X)
end 

#MODEL TWO
#conditions on severity of illness, length of stay, and gender
function model_two(training_data)
    td_size = size(training_data,1)
    X = [training_data[:APR_Severity_of_Illness_Code].data training_data[:Length_of_Stay] training_data[:Male] ones(td_size)]
    y = training_data[:Survived].data
    w = X\y
    return (w,X)
end 

#MODEL THREE
#conditions on severity of illness, length of stay, gender, and age
function model_three(training_data)
    td_size = size(training_data,1)
    #binary value, if age is >= 70
    X = [training_data[:APR_Severity_of_Illness_Code].data training_data[:Length_of_Stay] training_data[:Male] training_data[:old70] ones(td_size)]
    y = training_data[:Survived].data
    w = X\y
    return (w,X)
end 

model_three (generic function with 1 method)

In [7]:
#parameters w for each model
w1 = model_one(train)[1]
w2 = model_two(train)[1]
w3 = model_three(train)[1]

5-element Array{Float64,1}:
 -0.261695 
  0.0214423
  0.165245 
 -0.168826 
  1.07584  

Cross Validation of Models

In [8]:
@show cross_validate(model_one, train)
@show cross_validate(model_two, train) 
@show cross_validate(model_three, train) 

cross_validate(model_one,train) = 0.22273082942097028
cross_validate(model_two,train) = 0.20320813771517998
cross_validate(model_three,train) = 0.18928012519561815


0.18928012519561815

Bootstrap Estimate for Models

In [9]:
w1_bs = bootstrap_estimate(model_one,train)
w2_bs = bootstrap_estimate(model_two,train)
w3_bs = bootstrap_estimate(model_three,train)

5-element Array{Float64,1}:
 -0.263623 
  0.0224991
  0.163289 
 -0.165949 
  1.07377  

Check Model Performance on Validation Set

In [10]:
@show out_sample_error(w1,model_one,test)
@show out_sample_error(w1_bs,model_one,test)
@show out_sample_error(w2,model_two,test)
@show out_sample_error(w2_bs,model_two,test)
@show out_sample_error(w3,model_three,test)
@show out_sample_error(w3_bs,model_three,test)

out_sample_error(w1,model_one,test) = 0.16666666666666666
out_sample_error(w1_bs,model_one,test) = 0.16666666666666666
out_sample_error(w2,model_two,test) = 0.1
out_sample_error(w2_bs,model_two,test) = 0.1111111111111111
out_sample_error(w3,model_three,test) = 0.12222222222222222
out_sample_error(w3_bs,model_three,test) = 0.13333333333333333


0.13333333333333333