In [1]:
using DataFrames

In [3]:
# Expand one categorical column into a set of {1, -1} indicator columns.
#
# Arguments:
#   data        - DataFrame holding the column to encode
#   column_name - Symbol naming that column
#
# Returns a new DataFrame with one Float64 column per distinct value found in
# `data[column_name]`; each column is named `Symbol(value)` and columns appear in
# the order `unique` reports the values. Row i holds 1.0 in the column that
# matches its own value and -1.0 in every other column.
#
# Pre Condition: missing values have been removed, since every distinct entry
# becomes a column name.
function one_hot_encode(data::DataFrame, column_name::Symbol)
    labels = data[column_name]   # the categorical column being expanded
    nrows = size(data, 1)        # number of rows in the input
    encoded = DataFrame()        # accumulates the indicator columns
    # start every indicator at -1.0 ...
    for level in unique(labels)
        encoded[Symbol(level)] = fill(-1.0, nrows)
    end
    # ... then flip the single matching indicator in each row to 1.0
    for row in 1:nrows
        encoded[row, Symbol(labels[row])] = 1.0
    end
    return encoded
end

#clean = one_hot_encode(data,:Health_Service_Area)

one_hot_encode (generic function with 1 method)

In [4]:
# Drop rows carrying placeholder ("unknown"-style) values and turn the
# Length_of_Stay and Total_Charges columns into floating-point columns.
#
# Arguments:
#   data - DataFrame to filter; it is copied first and the copy is what gets
#          filtered and returned
#
# Returns the filtered DataFrame.
#
# Pre Condition: data is heart attack data from SPARCS dataset
function remove_missing_data(data::DataFrame)
    kept = copy(data)

    # Length of Stay: the "120 +" rows are removed before the column is converted
    kept = kept[kept[:Length_of_Stay] .!= "120 +", :]
    kept[:Length_of_Stay] = float(kept[:Length_of_Stay])

    # Remaining (column, placeholder) pairs, applied in this order; a row is
    # dropped as soon as its column equals the placeholder.
    placeholders = [(:Payment_Typology_1, "Miscellaneous/Other"),
                    (:Payment_Typology_1, "Unknown"),
                    (:Patient_Disposition, "Another Type Not Listed"),
                    (:Type_of_Admission, "Not Available"),
                    (:Race, "Unknown")]
    for (col, marker) in placeholders
        kept = kept[kept[col] .!= marker, :]
    end

    # Total Charges: discard the first character of every entry (presumably a
    # leading currency symbol -- TODO confirm against the raw file). An entry with
    # nothing left afterwards is replaced by "0" (impute better).
    charges = kept[:Total_Charges]
    for i in 1:size(kept, 1)
        amount = charges[i][2:end]
        charges[i] = isempty(amount) ? "0" : amount
    end
    kept[:Total_Charges] = float(charges)
    return kept
end

remove_missing_data (generic function with 1 method)

In [15]:
# Build the modelling table: filter the raw data with `remove_missing_data`,
# one-hot encode every categorical column with `one_hot_encode`, then append the
# numeric columns and a constant offset column.
#
# Arguments:
#   data - raw DataFrame
#
# Returns a DataFrame containing the {1, -1} indicator columns (minus :F),
# :Length_of_Stay, :Total_Charges, :APR_Severity_of_Illness_Code and an :offset
# column filled with ones.
#
# Pre_Condition: data is heart_attack data from SPARCS dataset
function clean(data::DataFrame)
    filtered = remove_missing_data(data)

    # indicator columns are appended in this order
    encoded = DataFrame()
    for name in [:Health_Service_Area, :Age_Group, :Hospital_County,
                 :Patient_Disposition, :Gender, :Race,
                 :Type_of_Admission, :Payment_Typology_1]
        encoded = hcat(encoded, one_hot_encode(filtered, name))
    end

    # Drops the :F indicator column. NOTE(review): this assumes some encoded
    # column (presumably Gender) contains the value "F" -- confirm.
    delete!(encoded, :F)

    # numeric columns are carried over unchanged from the filtered data
    for name in [:Length_of_Stay, :Total_Charges, :APR_Severity_of_Illness_Code]
        encoded[name] = filtered[name]
    end

    # constant column of ones
    encoded[:offset] = ones(size(encoded, 1))
    return encoded
end



clean (generic function with 1 method)

In [23]:
# Read the raw CSV, run it through `clean`, and write the result out as a new CSV.
# `heart_attack` ends up bound to the cleaned table.
heart_attack = clean(readtable("heart_attack_raw.csv"))
writetable("heart_attack_clean.csv", heart_attack)