In [11]:
using DataFrames, StatsBase 

Useful Functions

In [12]:
#function that calculates out-of-sample error
#w = parameter array fit to model
#model = function that fits a model on a dataset
#test_set = dataset being tested
function out_sample_error(w,model,test_set)
    test_size = size(test_set,1)
    model_X = model(test_set)[2]
    model_output = round(model_X*w)
    model_error = 0
    for i in 1:test_size
        if test_set[:Survived][i] != model_output[i]
            model_error = model_error + 1
        end
    end
    return model_error / test_size
end

#function that runs a bootstrap estimation on the training set
#model = function that fits a model on a dataset
#resample_num = number of samples
function bootstrap_estimate(model,training_data
    ;resample_num = 1000)
    w_bs = 0
    td_size = size(training_data,1)
    for i in 1:resample_num
        #randomly samples from training set
        train_sample = training_data[sample(1:td_size,round(Int,.5*td_size)), :] 
        #incrememts bootstrap parameter by parameter of new sample
        w_bs = w_bs + model(train_sample)[1]
    end
    #finds average parameter among trials
    w_bs = w_bs / resample_num
    return w_bs
end 

#function that partitions data into train and test sets,
#runs model on train and checks error on test
#model = function that fits a model on a dataset
#training_data = dataset to be partitioned and crossvalidated
#k = number of partitions 
function cross_validate(model, training_data;
    k = 5)
    error_cv = 0
    td_size = size(training_data,1)
    #size of 1 partition when they are k partitions 
    partition = round(Int,td_size / k)
    for i in 1:k
        #creates training and test sets for this iteration
        s = (i-1)*partition + 1 
        if i == k 
            f = td_size
        else
            f = i*partition
        end
        #test partition with size 1/kth of dataset
        round_test = training_data[s:f,:]
        round_train_1 = training_data[1:s-1,:]
        round_train_2 = training_data[f+1:td_size,:]
        #training partition
        round_train = [round_train_1;round_train_2]
        #w when model is run on training set
        round_w = model(round_train)[1]
        #error in w when model run on test set
        round_error = out_sample_error(round_w,model,round_test)
        error_cv = error_cv + round_error
    end
    #average error among partitions 
    error_cv = error_cv / k
    return error_cv
end

#helper function for split_data
function build_subset(index,data)
    subset = similar(data,0)
    for i in 1:length(index)
        new_row = DataArray(data[data[:x] .== index[i],:])
        push!(subset,new_row)
    end
    return subset
end 

#splits dataset into train and test sets
#splits data based on the discharge identifier column :x (integer)
#split_raio = ratio of training set size to entire dataset size
function split_data(data_set;split_ratio = .8)
    n = size(data_set,1)
    split = Int(round(split_ratio*n))
    rand_index = shuffle(data_set[:x])
    train_index = rand_index[1:split]
    test_index = rand_index[split + 1:n]
    train = build_subset(train_index,data_set)
    test = build_subset(test_index,data_set)
    return(train,test)
end 




split_data (generic function with 1 method)

Analysis

In [13]:
#import data
heart_attack = readtable("heart_attack.csv")

Unnamed: 0,x,Health_Service_Area,Hospital_County,Operating_Certificate_Number,Facility_Id,Facility_Name,Age_Group,Zip_Code_3_digits,Gender,Race,Ethnicity,Length_of_Stay,Admit_Day_of_Week,Type_of_Admission,Patient_Disposition,Discharge_Year,Discharge_Day_of_Week,CCS_Diagnosis_Code,CCS_Diagnosis_Description,CCS_Procedure_Code,CCS_Procedure_Description,APR_DRG_Code,APR_DRG_Description,APR_MDC_Code,APR_MDC_Description,APR_Severity_of_Illness_Code,APR_Severity_of_Illness_Description,APR_Risk_of_Mortality,APR_Medical_Surgical_Description,Payment_Typology_1,Payment_Typology_2,Payment_Typology_3,Attending_Provider_License_Number,Operating_Provider_License_Number,Other_Provider_License_Number,Birth_Weight,Abortion_Edit_Indicator,Emergency_Department_Indicator,Total_Charges,Total_Costs,Survived,Male,White,Black,Other_Race,private_insurance,medicare,medicaid,old50_69,old70
1,3577,Western NY,Cattaraugus,401001,66,Olean General Hospital,70 or Older,147,M,White,Not Span/Hispanic,9,TUE,Emergency,Short-term Hospital,2012,THU,107,CARDIAC ARREST & VF,227,"OT DX PRC (INTERVW,EVAL",196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,2,Moderate,Moderate,Medical,Medicare,Medicaid,,264217,149079,,0,N,Y,$18461.03,$5148.18,1,1,1,0,0,0,1,0,0,1
2,5411,Western NY,Cattaraugus,401001,66,Olean General Hospital,18 to 29,147,M,White,Not Span/Hispanic,2,TUE,Emergency,Expired,2012,THU,107,CARDIAC ARREST & VF,227,"OT DX PRC (INTERVW,EVAL",196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,3,Major,Moderate,Medical,Self-Pay,,,141658,3053,,0,N,Y,$16729.38,$7543.35,0,1,1,0,0,0,0,0,0,0
3,8169,Western NY,Chautauqua,601000,98,Brooks Memorial Hospital,70 or Older,147,M,White,Not Span/Hispanic,1,SAT,Emergency,Expired,2012,SUN,107,CARDIAC ARREST & VF,229,NONOP RMVL FOREIGN BODY,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,3,Major,Extreme,Medical,Medicare,,,239319,217335,217335,0,N,Y,$7145.49,$6382.73,0,1,1,0,0,0,1,0,0,1
4,9390,Western NY,Chautauqua,601000,98,Brooks Memorial Hospital,70 or Older,141,F,White,Not Span/Hispanic,2,MON,Urgent,Expired,2012,WED,107,CARDIAC ARREST & VF,216,RESP INTUB/MECH VENTIL,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,3,Major,Major,Medical,Blue Cross/Blue Shield,,,261189,240432,,0,N,N,$6343.38,$10022.09,0,0,1,0,0,0,0,0,0,1
5,16311,Western NY,Chautauqua,602001,103,Woman's Christian Association,50 to 69,147,M,White,Not Span/Hispanic,5,FRI,Urgent,Home or Self Care,2012,WED,107,CARDIAC ARREST & VF,225,CONV OF CARDIAC RHYTHM,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,4,Extreme,Major,Medical,Medicare,Medicare,,138697,129581,,0,N,Y,$18324.66,$9180.61,1,1,1,0,0,0,1,0,1,0
6,17014,Western NY,Chautauqua,602001,103,Woman's Christian Association,70 or Older,147,M,White,Not Span/Hispanic,2,WED,Urgent,Home or Self Care,2012,FRI,107,CARDIAC ARREST & VF,227,"OT DX PRC (INTERVW,EVAL",196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,2,Moderate,Moderate,Medical,Medicare,Medicare,,142873,172738,,0,N,Y,$6871.94,$2752.50,1,1,1,0,0,0,1,0,0,1
7,17754,Western NY,Chautauqua,602001,103,Woman's Christian Association,70 or Older,147,M,White,Not Span/Hispanic,4,FRI,Urgent,Skilled Nursing Home,2012,TUE,107,CARDIAC ARREST & VF,177,COMP AXIAL TOMOGR (CT),196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,2,Moderate,Moderate,Medical,Medicare,Medicare,,3876,130444,,0,N,Y,$7905.27,$3959.88,1,1,1,0,0,0,1,0,0,1
8,19217,Western NY,Chautauqua,427000,114,TLC Health Network Lake Shore Hospital,70 or Older,141,M,White,Not Span/Hispanic,1,TUE,Emergency,Expired,2012,TUE,107,CARDIAC ARREST & VF,216,RESP INTUB/MECH VENTIL,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,3,Major,Major,Medical,Medicare,,,200012,118047,,0,N,Y,$3810.29,$1638.08,0,1,1,0,0,0,1,0,0,1
9,21080,Western NY,Erie,1401014,207,Buffalo General Hospital,50 to 69,141,M,White,Not Span/Hispanic,3,TUE,Urgent,Home or Self Care,2012,FRI,107,CARDIAC ARREST & VF,48,CARDIAC PACEMAKER/DEFIB,161,CARDIAC DEFIBRILLATOR & HEART ASSIST IMPLANT,5,Diseases and Disorders of the Circulatory System,1,Minor,Minor,Surgical,Blue Cross/Blue Shield,Self-Pay,,144605,144605,,0,N,N,$61974.35,$28926.80,1,1,1,0,0,0,0,0,1,0
10,21508,Western NY,Erie,1401014,207,Buffalo General Hospital,70 or Older,140,F,White,Not Span/Hispanic,3,MON,Urgent,Home or Self Care,2012,THU,107,CARDIAC ARREST & VF,62,OT DX CARDIOVASC PRCS,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,2,Moderate,Moderate,Medical,Medicare,Private Health Insurance,Self-Pay,229275,229275,,0,N,N,$7754.39,$3803.11,1,0,1,0,0,0,1,0,0,1


Data Cleaning

In [14]:
#hospital stays with length > 120 days labeled as "120 +"
#change these to length 120
heart_attack[heart_attack[:Length_of_Stay] .== "120 +",12] = "120"

#convert length of stay value to float
heart_attack[:Length_of_Stay] = float(heart_attack[:Length_of_Stay])

1072-element Array{Float64,1}:
  9.0
  2.0
  1.0
  2.0
  5.0
  2.0
  4.0
  1.0
  3.0
  3.0
 47.0
  6.0
  4.0
  ⋮  
  6.0
 21.0
  5.0
  1.0
  1.0
  1.0
  2.0
  1.0
  2.0
 20.0
 21.0
  1.0

Split Into Train and Test

In [15]:
#only looking at NYC hospital data
NYC_data = heart_attack[heart_attack[:Health_Service_Area] .== "New York City",:]
train = split_data(NYC_data)[1]
test = split_data(NYC_data)[2]

Unnamed: 0,x,Health_Service_Area,Hospital_County,Operating_Certificate_Number,Facility_Id,Facility_Name,Age_Group,Zip_Code_3_digits,Gender,Race,Ethnicity,Length_of_Stay,Admit_Day_of_Week,Type_of_Admission,Patient_Disposition,Discharge_Year,Discharge_Day_of_Week,CCS_Diagnosis_Code,CCS_Diagnosis_Description,CCS_Procedure_Code,CCS_Procedure_Description,APR_DRG_Code,APR_DRG_Description,APR_MDC_Code,APR_MDC_Description,APR_Severity_of_Illness_Code,APR_Severity_of_Illness_Description,APR_Risk_of_Mortality,APR_Medical_Surgical_Description,Payment_Typology_1,Payment_Typology_2,Payment_Typology_3,Attending_Provider_License_Number,Operating_Provider_License_Number,Other_Provider_License_Number,Birth_Weight,Abortion_Edit_Indicator,Emergency_Department_Indicator,Total_Charges,Total_Costs,Survived,Male,White,Black,Other_Race,private_insurance,medicare,medicaid,old50_69,old70
1,1968792,New York City,Queens,7003003,1629,Jamaica Hospital Medical Center,30 to 49,114,F,Other Race,Unknown,1.0,SUN,Emergency,Expired,2012,SUN,107,CARDIAC ARREST & VF,216,RESP INTUB/MECH VENTIL,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,4,Extreme,Extreme,Medical,Medicare,Medicaid,,192298,192298,,0,N,Y,$8720.00,$3799.26,0,0,0,0,1,0,1,0,0,0
2,1912122,New York City,Queens,7003000,1626,Elmhurst Hospital Center,50 to 69,113,F,Other Race,Not Span/Hispanic,1.0,SAT,Emergency,Home or Self Care,2012,SUN,107,CARDIAC ARREST & VF,227,"OT DX PRC (INTERVW,EVAL",196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,3,Major,Moderate,Medical,Medicaid,Medicaid,,219103,232135,,0,N,Y,$4859.37,$3386.75,1,0,0,0,1,0,0,1,1,0
3,1094822,New York City,Bronx,7000014,1176,SBH Health System,30 to 49,100,M,Black/African American,Unknown,4.0,SAT,Emergency,Expired,2012,WED,107,CARDIAC ARREST & VF,216,RESP INTUB/MECH VENTIL,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,4,Extreme,Extreme,Medical,Self-Pay,,,149375,149375,,0,N,Y,$51202.11,$19671.47,0,1,0,1,0,0,0,0,0,0
4,1706853,New York City,Manhattan,7002024,1456,Mount Sinai Hospital,50 to 69,100,M,Other Race,Not Span/Hispanic,1.0,SUN,Emergency,Expired,2012,SUN,107,CARDIAC ARREST & VF,216,RESP INTUB/MECH VENTIL,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,3,Major,Major,Medical,Private Health Insurance,Medicaid,Self-Pay,184576,184576,,0,N,Y,$16588.80,$7462.75,0,1,0,0,1,1,0,0,1,0
5,1912687,New York City,Queens,7003000,1626,Elmhurst Hospital Center,70 or Older,113,F,Other Race,Not Span/Hispanic,5.0,SAT,Emergency,Expired,2012,THU,107,CARDIAC ARREST & VF,216,RESP INTUB/MECH VENTIL,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,4,Extreme,Extreme,Medical,Medicare,Medicare,Blue Cross/Blue Shield,228949,228949,,0,N,Y,$31035.28,$21630.13,0,0,0,0,1,0,1,0,0,1
6,2045483,New York City,Queens,7003010,1637,New York Hospital Medical Center of Queens,70 or Older,113,F,White,Spanish/Hispanic,27.0,TUE,Emergency,Skilled Nursing Home,2012,MON,107,CARDIAC ARREST & VF,48,CARDIAC PACEMAKER/DEFIB,161,CARDIAC DEFIBRILLATOR & HEART ASSIST IMPLANT,5,Diseases and Disorders of the Circulatory System,2,Moderate,Major,Surgical,Medicare,Medicaid,Self-Pay,166440,206004,,0,N,Y,$232717.61,$113991.19,1,0,1,0,0,0,1,0,0,1
7,1609266,New York City,Manhattan,7002017,1450,Lenox Hill Hospital,18 to 29,104,M,Other Race,Spanish/Hispanic,1.0,FRI,Elective,Home or Self Care,2012,SAT,107,CARDIAC ARREST & VF,62,OT DX CARDIOVASC PRCS,191,CARDIAC CATHETERIZATION W CIRC DISORD EXC ISCHEMIC HEART DISEASE,5,Diseases and Disorders of the Circulatory System,3,Major,Major,Medical,Private Health Insurance,,,247537,247537,,0,N,N,$31919.28,$9714.07,1,1,0,0,1,1,0,0,0,0
8,2149327,New York City,Richmond,7004003,1740,Staten Island University Hosp-North,70 or Older,103,M,Other Race,Not Span/Hispanic,7.0,SAT,Emergency,Home w/ Home Health Services,2012,SAT,107,CARDIAC ARREST & VF,48,CARDIAC PACEMAKER/DEFIB,161,CARDIAC DEFIBRILLATOR & HEART ASSIST IMPLANT,5,Diseases and Disorders of the Circulatory System,3,Major,Extreme,Surgical,Medicare,Medicare,,204466,199001,,0,N,Y,$162937.73,$95338.57,1,1,0,0,1,0,1,0,0,1
9,1186500,New York City,Kings,7001002,1286,Brookdale Hospital Medical Center,70 or Older,112,M,White,Not Span/Hispanic,1.0,TUE,Emergency,Expired,2012,TUE,107,CARDIAC ARREST & VF,216,RESP INTUB/MECH VENTIL,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,3,Major,Major,Medical,Medicare,,,113717,113717,113717,0,N,Y,$6212.99,$3148.96,0,1,1,0,0,0,1,0,0,1
10,1473003,New York City,Manhattan,7002000,1437,New York Downtown Hospital,50 to 69,100,M,Other Race,Not Span/Hispanic,1.0,THU,Emergency,Expired,2012,FRI,107,CARDIAC ARREST & VF,216,RESP INTUB/MECH VENTIL,196,CARDIAC ARREST,5,Diseases and Disorders of the Circulatory System,3,Major,Major,Medical,Medicare,Self-Pay,,184722,184722,,0,N,Y,$8352.94,$3366.17,0,1,0,0,1,0,1,0,1,0


Fit Models on Training Data

In [16]:
#MODEL ONE
#conditions on severity of illness and length of stay
function model_one(training_data)
    td_size = size(training_data,1)
    X = [training_data[:APR_Severity_of_Illness_Code].data training_data[:Length_of_Stay] ones(td_size)]
    y = training_data[:Survived].data
    w = X\y
    return (w,X)
end 

#MODEL TWO
#conditions on severity of illness, length of stay, and gender
function model_two(training_data)
    td_size = size(training_data,1)
    X = [training_data[:APR_Severity_of_Illness_Code].data training_data[:Length_of_Stay] training_data[:Male] ones(td_size)]
    y = training_data[:Survived].data
    w = X\y
    return (w,X)
end 

#MODEL THREE
#conditions on severity of illness, length of stay, gender, and age
function model_three(training_data)
    td_size = size(training_data,1)
    #binary value, if age is >= 70
    X = [training_data[:APR_Severity_of_Illness_Code].data training_data[:Length_of_Stay] training_data[:Male] training_data[:old70] ones(td_size)]
    y = training_data[:Survived].data
    w = X\y
    return (w,X)
end 



model_three (generic function with 1 method)

In [17]:
#parameters w for each model
w1 = model_one(train)[1]
w2 = model_two(train)[1]
w3 = model_three(train)[1]

5-element Array{Float64,1}:
 -0.2532   
  0.0191878
  0.189943 
 -0.182002 
  1.05583  

Cross Validation of Models

In [18]:
@show cross_validate(model_one, train)
@show cross_validate(model_two, train) 
@show cross_validate(model_three, train) 

cross_validate(model_one,train) = 0.21177621283255088
cross_validate(model_two,train) = 0.19237089201877933
cross_validate(model_three,train) = 0.22844287949921754


0.22844287949921754

Bootstrap Estimate for Models

In [19]:
w1_bs = bootstrap_estimate(model_one,train)
w2_bs = bootstrap_estimate(model_two,train)
w3_bs = bootstrap_estimate(model_three,train)

5-element Array{Float64,1}:
 -0.25665 
  0.021066
  0.191289
 -0.176353
  1.05403 

Check Model Performance on Validation Set

In [20]:
@show out_sample_error(w1,model_one,test)
@show out_sample_error(w1_bs,model_one,test)
@show out_sample_error(w2,model_two,test)
@show out_sample_error(w2_bs,model_two,test)
@show out_sample_error(w3,model_three,test)
@show out_sample_error(w3_bs,model_three,test)

out_sample_error(w1,model_one,test) = 0.2222222222222222
out_sample_error(w1_bs,model_one,test) = 0.2222222222222222
out_sample_error(w2,model_two,test) = 0.17777777777777778
out_sample_error(w2_bs,model_two,test) = 0.17777777777777778
out_sample_error(w3,model_three,test) = 0.2
out_sample_error(w3_bs,model_three,test) = 0.2


0.2