In [1]:
using DataFrames

In [2]:
#Cleaning helper functions

# Expand one categorical column into a table of +1/-1 indicator columns.
#
# Pre Condition: missing values have been removed from the column, since every
# distinct entry becomes a column name.
#
# Arguments:
#   data        - table holding the column to encode
#   column_name - name of the categorical column
#
# Returns a new DataFrame with one Float64 column per distinct entry. Row i
# holds 1.0 in the column named after that row's entry and -1.0 in all others.
function one_hot_encode(data::DataFrame, column_name::Symbol)
    source = data[column_name]
    nrows = size(data, 1)
    encoded = DataFrame()
    # every indicator starts at -1.0; the matching one is switched on below
    for level in unique(source)
        encoded[Symbol(level)] = fill(-1.0, nrows)
    end
    # switch on the indicator that corresponds to each row's own entry
    for (row, level) in enumerate(source)
        encoded[row, Symbol(level)] = 1.0
    end
    return encoded
end

# Drop rows whose categorical entries are placeholders and turn the text-valued
# numeric columns into floats.
#
# Pre Condition: data is heart attack data from SPARCS dataset
#
# Rows are removed when
#   Health_Service_Area is ""
#   Length_of_Stay      is "120 +"
#   Payment_Typology_1  is "Miscellaneous/Other" or "Unknown"
#   Patient_Disposition is "Another Type Not Listed"
#   Type_of_Admission   is "Not Available"
#   Race                is "Unknown"
# Length_of_Stay and Total_Charges are then converted with `float`.
#
# Returns the filtered table.
function remove_missing_data(data::DataFrame)
    # keep only the rows of `df` whose `col` entry differs from `unwanted`
    keep_unless(df, col, unwanted) = df[df[col] .!= unwanted, :]

    cdata = copy(data)
    cdata = keep_unless(cdata, :Health_Service_Area, "")
    cdata = keep_unless(cdata, :Length_of_Stay, "120 +")
    cdata[:Length_of_Stay] = float(cdata[:Length_of_Stay])

    placeholders = [(:Payment_Typology_1, "Miscellaneous/Other"),
                    (:Payment_Typology_1, "Unknown"),
                    (:Patient_Disposition, "Another Type Not Listed"),
                    (:Type_of_Admission, "Not Available"),
                    (:Race, "Unknown")]
    for (col, unwanted) in placeholders
        cdata = keep_unless(cdata, col, unwanted)
    end

    # The first character of every charge is discarded before conversion
    # (presumably a leading currency symbol -- confirm against the raw file).
    # An entry with nothing left after that is recorded as "0"; impute better.
    charges = cdata[:Total_Charges]
    for i in 1:size(cdata, 1)
        amount = charges[i][2:end]
        charges[i] = amount == "" ? "0" : amount
    end
    cdata[:Total_Charges] = float(charges)
    return cdata
end



one_hot_encode (generic function with 1 method)

In [4]:
# Build the modelling table: one-hot encode the categorical columns, carry the
# numeric columns over unchanged and append a constant offset column.
#
# Pre_Condition: data is heart_attack data from SPARCS dataset
#
# Returns a new DataFrame made of
#   - the +1/-1 indicator columns produced by one_hot_encode for each
#     categorical column, minus the column named :F
#   - Length_of_Stay, Total_Charges and APR_Severity_of_Illness_Code
#   - :offset, a column of ones
function clean(data::DataFrame)
    filtered = remove_missing_data(data)

    encoded = DataFrame()
    for col in [:Health_Service_Area, :Age_Group, :Hospital_County,
                :Patient_Disposition, :Gender, :Race,
                :Type_of_Admission, :Payment_Typology_1]
        encoded = hcat(encoded, one_hot_encode(filtered, col))
    end
    # presumably the "F" indicator from Gender, dropped as redundant -- confirm
    delete!(encoded, :F)

    for col in [:Length_of_Stay, :Total_Charges, :APR_Severity_of_Illness_Code]
        encoded[col] = filtered[col]
    end
    encoded[:offset] = ones(size(encoded, 1))
    return encoded
end

clean (generic function with 1 method)

Hip Replacement Data Cleaning

In [5]:
# Load the raw hip replacement records and run them through the shared cleaner.
HR = clean(readtable("HR_raw.csv"))
# Drop the :U indicator column (presumably Gender "U" -- confirm).
delete!(HR, :U)
# Keep only the rows whose total charges are below 250000.
below_cap = HR[:Total_Charges] .< 250000
HR = HR[below_cap, :]
writetable("HR_clean.csv", HR)

Depression Data Cleaning

In [24]:
# Load the raw depression records, clean them and write the result to disk.
depression = clean(readtable("depression_raw.csv"))
writetable("depression_clean.csv", depression)

Heart Attack Data Cleaning

In [24]:
heart_attack = readtable("heart_attack_raw.csv")

# Simplified disposition categories: each list names the raw
# Patient_Disposition values that are folded into one label below.
continued_care = ["Short-term Hospital","Skilled Nursing Home",
    "Home w/ Home Health Services", "Left Against Medical Advice",
    "Federal Health Care Facility", "Inpatient Rehabilitation Facility"]
hospice = ["Hospice - Home","Hospice - Medical Facility"]
other = ["Another Type Not Listed", "Court/Law Enforcement",
    "Facility w/ Custodial/Supportive Care",  "Cancer Center or Children's Hospital",
    "Psychiatric Hospital or Unit of Hosp"]

# Overwrite each raw disposition with its simplified label; values that
# appear in none of the lists are left as they are.
for row in 1:size(heart_attack, 1)
    disposition = heart_attack[row, :Patient_Disposition]
    if disposition in continued_care
        heart_attack[row, :Patient_Disposition] = "Continued_Care"
    elseif disposition in hospice
        heart_attack[row, :Patient_Disposition] = "Hospice"
    elseif disposition in other
        heart_attack[row, :Patient_Disposition] = "Other"
    end
end

#clean data
heart_attack = clean(heart_attack)
n = size(heart_attack,1)

# Disposition indicator columns produced by clean(), paired with the integer
# class code stored in :outcome. The same table drives both the coding loop
# and the column removal so the two cannot drift apart.
outcome_codes = [(:Continued_Care, 1),
                 (:Expired, 2),
                 (Symbol("Home or Self Care"), 3),
                 (:Hospice, 4),
                 (:Other, 5)]

# Collapse the +1/-1 disposition indicators into one integer class per row.
# The vector has a concrete element type and starts at 0 ("unclassified"),
# so a row that matches no indicator is detected here instead of being left
# as an undefined reference that only fails when the table is written.
outcome = zeros(Int, n)
for i in 1:n
    for (col, code) in outcome_codes
        if heart_attack[i, col] == 1.0
            outcome[i] = code
        end
    end
    if outcome[i] == 0
        error("row $i has a Patient_Disposition outside the five outcome categories")
    end
end

heart_attack[:outcome] = outcome
# the indicators are redundant once :outcome carries the class
for (col, code) in outcome_codes
    delete!(heart_attack, col)
end


writetable("multi_heart_attack_clean.csv", heart_attack)

5-element Array{String,1}:
 "Another Type Not Listed"              
 "Court/Law Enforcement"                
 "Facility w/ Custodial/Supportive Care"
 "Cancer Center or Children's Hospital" 
 "Psychiatric Hospital or Unit of Hosp" 