In [1]:
#This program uses the bootstrapping approach to estimate the parameters of an ODE system.
using Pkg, NBInclude, Plots, XLSX, Dates, DataFrames, Random, Statistics
using DifferentialEquations, Sundials, ForwardDiff, Optim
using Distributions, ProgressMeter, Distributed, StaticArrays, LaTeXStrings
@nbinclude("ODE_models.ipynb")
#@nbinclude("Norm_functions.ipynb")
@nbinclude("ODE_Fit.ipynb")
@nbinclude("ODE_Fit_IPN.ipynb")
@nbinclude("HelperFunctions.ipynb")
#@nbinclude("More_helpers.ipynb")

data_to_df (generic function with 1 method)

In [2]:
"""
    ODE_Param_Est(; ODE_model, ODE_vars, vars_to_fit, paramIC_dict, data_dict, kwargs...)

Estimate the free parameters and initial conditions (ICs) of an ODE system by
bootstrapping: the observed data set and `BS_samples - 1` simulated data sets are each
fitted from `num_x0` initial guesses, the best fit of every data set is kept, and the
resulting ODE solutions are summarised as percentile (prediction) bands.

# Keywords
- `ODE_model::Function`: the ODE right-hand side; forwarded to the fitting routine and
  to `ODE_sol`.
- `ODE_vars`: symbols naming the ODE variables.
- `vars_to_fit`: the subset of `ODE_vars` that is fitted; also used as the column
  selection of the observed-data sheet.
- `paramIC_dict::Dict`: reads the keys `:file_path`, `:param_sheet` and `:IC_sheet`.
  Both sheets must have six columns (name, "Fixed?", lower bound, initial guess,
  upper bound, interpretation); rows are split on "Fixed?" being "Yes" or "No".
- `data_dict::Dict`: reads the keys `:file_path`, `:data_sheet` and `:date_format`
  (default "mm-dd-yyyy"). The sheet must contain a `Date` column.
- `f_calc_ICs::Function`, `N0::Real`, `norm::Function`, `norm_weights`, `norm_scale`,
  `integrator_options::Dict`: forwarded unchanged to the fitting routine
  (`f_calc_ICs`, `N0` and `integrator_options` also to `ODE_sol`).
- `func_sim_data::Function`: called as `func_sim_data(data_obs_fit)` to create each
  simulated data set.
- `func_get_x0::Function`: called as `func_get_x0(x_LBs, x_UBs)` with the *scaled*
  bounds to create each random initial guess.
- `fit_range`: the first and last date to fit, as strings in `date_format`.
- `date_obs_last::String`: the last observed date to plot (observed but not fitted).
- `forecast_until::String`: the last date for which the ODE is solved.
- `optimizer`: `ConjugateGradient`, `GradientDescent` or `IPNewton`, given either as
  the type or as an already constructed instance (an instance is used as is).
- `constraints::Dict`: must contain `:con_func`, `:con_jac`, `:con_hess`, `:con_LBs`
  and `:con_UBs`; they are combined with the scaled box bounds into a
  `TwiceDifferentiableConstraints` object.
- `BS_samples::Int64`, `num_x0::Int64`: number of data sets and of initial guesses
  per data set (both at least 1).
- `pred_band_pct`: width of the plotted prediction band, in [0,100].
- `x_pctiles::Array`: percentiles (in [0,100]) computed for every optimized value.
- `optimizer_options::Dict`: optional keys `:xtol_abs`, `:ftol_rel`, `:gtol_abs`,
  `:max_iter`, `:max_time`, `:show_trace`, `:show_every`.
- `save_options::Dict`: optional keys `:save_figs` and `:figs_folder`
  (`:save_params` and `:params_workbook` are read but currently unused).

# Returns
A named tuple with the fields `minimizers_df`, `var_pctiles_dict`, `data_obs_fit`,
`data_obs_nf`, `t_all`, `t_fit`, `t_nf`, `x0_list` and `sim_data_list`.

# Throws
An `ErrorException` (via `error`) for invalid arguments, dates that are missing from
the data sheet, missing constraint keys, or a data set for which no optimization
produced a usable result.
"""
function ODE_Param_Est(;ODE_model::Function, ODE_vars::Array{Symbol}, vars_to_fit::Array{Symbol},
                        paramIC_dict::Dict, data_dict::Dict, f_calc_ICs::Function, func_sim_data::Function,
                        fit_range::Array{String,1}, date_obs_last::String, forecast_until::String, N0::Real,
                        norm::Function, norm_weights::Array{Real,1} = Vector{Real}(fill(1.0, 1_000_000)),
                        norm_scale::Real = 1.0,
                        optimizer::Any = ConjugateGradient(), constraints::Dict = Dict(), func_get_x0::Function,
                        BS_samples::Int64 = 1, num_x0::Int64 = 1, pred_band_pct = 90, x_pctiles::Array,
                        integrator_options::Dict, optimizer_options::Dict, save_options::Dict)

    #Note on norm_weights: the default must be built with an *integer* length (fill(1.0, 1e6)
    #is a MethodError) and must be a Vector{Real} to satisfy the (invariant) type annotation.

    #############################################
    #STEP 0: Some Error Handling and other stuff
    #############################################

    if !(issubset(Set(vars_to_fit), Set(ODE_vars)))
        error("Error: vars_to_fit must be a subset of ODE_vars")
    end

    if (pred_band_pct < 0 || pred_band_pct > 100)
        error("Error: pred_band_pct must be in [0,100]")
    end

    if (minimum(x_pctiles) < 0 || maximum(x_pctiles) > 100)
        error("Error: x_pctiles can only contain values in [0,100]")
    end

    #The first entries of x0_list and sim_data_list are overwritten below, so both must exist
    if (BS_samples < 1 || num_x0 < 1)
        error("Error: BS_samples and num_x0 must both be at least 1")
    end

    #Create an array of the variables (Symbols) NOT fitted
    #(only needed by the commented-out STEP 16 below)
    vars_not_fit = collect(setdiff(Set(ODE_vars), Set(vars_to_fit)))

    #The arguments related to saving the outputs
    save_figs = get(save_options, :save_figs, true)  #defaults to true
    save_params = get(save_options, :save_params, true) #defaults to true (currently unused)
    figs_folder = get(save_options, :figs_folder, "C:/Users/Michael/Documents/COVID-19/BS_plots/")
    #Read from :params_workbook; currently unused
    params_folder = get(save_options, :params_workbook, "C:/Users/Michael/Documents/COVID-19/test.xlsx")


    ##############################################
    #STEP 1: Parse the Observed (*Real*) Data
    ##############################################

    #Get the file path of the Excel Workbook
    data_path = get(data_dict, :file_path, "data_path is invalid")

    #Get the name of the spreadsheet containing the observed data
    data_sheet = get(data_dict, :data_sheet, "data_sheet is invalid")

    #Get the format of the dates in the spreadsheet (default is "mm-dd-yyyy")
    date_format = get(data_dict, :date_format, "mm-dd-yyyy")

    #Read in the Excel workbook
    xf_data = XLSX.readxlsx(data_path)

    #Select the spreadsheet
    data_obs = xf_data[data_sheet]

    #Put the spreadsheet data into a DataFrame
    data_obs_all = DataFrame(XLSX.gettable(data_obs, header = true)...)

    #Convert the first column to the "Date" type
    data_obs_all[!,1] = convert.(Date, data_obs_all.Date)

    #Create Date objects for the first and last days to be fitted
    date_fit_first, date_fit_last = Date.(fit_range[1:2], date_format)

    #Convert date_obs_last from a String to a Date
    #(stored under a new name so the String argument does not change type)
    date_obs_end = Date(date_obs_last, date_format)

    #Convert forecast_until from a String to a Date
    date_pred_last = Date(forecast_until, date_format)

    #Find the row numbers of date_fit_first, date_fit_last, date_obs_last.
    #findfirst returns nothing when the date is absent, which is reported explicitly.
    row_fit_first = findfirst(isequal(date_fit_first), data_obs_all.Date)
    if row_fit_first === nothing
        error("Error: The first date in fit_range was not found in the spreadsheet.")
    end

    row_fit_last = findfirst(isequal(date_fit_last), data_obs_all.Date)
    if row_fit_last === nothing
        error("Error: The second date in fit_range was not found in the spreadsheet.")
    end

    row_obs_last = findfirst(isequal(date_obs_end), data_obs_all.Date)
    if row_obs_last === nothing
        error("Error: The date entered for date_obs_last was not found in the spreadsheet.")
    end

    #The dates for the *fitted* data
    dates_fit = data_obs_all.Date[row_fit_first:row_fit_last]

    #The dates of *not* fitted but observed data that we want to plot at the very end
    dates_nf = data_obs_all.Date[row_fit_last+1:row_obs_last]

    #The observed data that will be fitted (does not include the Date column)
    data_obs_fit = data_obs_all[row_fit_first:row_fit_last, vars_to_fit]

    #The observed data that will *NOT* be fitted, but *will be plotted*
    data_obs_nf = data_obs_all[row_fit_last+1:row_obs_last, vars_to_fit]

    #Note 2/5/21: No need to scale the data here because ODE_Fit takes care of this.


    ##############################################
    #STEP 2: Convert the Dates to Integer values
    ##############################################

    #Note: The t values here are the number of days since date_fit_first.

    #Calculate the Rata Die value of date_fit_first
    t0_rata = Dates.datetime2rata(date_fit_first)

    #The t values for the data to be *fitted*
    t_fit = Array{Float64}(Dates.datetime2rata.(dates_fit) .- t0_rata)

    #The t values for the data *not* fitted
    t_nf = Array{Float64}(Dates.datetime2rata.(dates_nf) .- t0_rata)

    #The t value of date_pred_last (days since date_fit_first)
    t_pred_last = Dates.datetime2rata(date_pred_last) - t0_rata

    #All t values for which we'll solve & plot the ODE at the very end
    t_all = collect(0.0:1:t_pred_last)
    #2/5/21: Maybe make the step size smaller to get a denser solution?


    #########################################
    #STEP 3: Parse the Parameter and IC Data
    #########################################

    #Get the path of the Excel workbook containing the Param & IC data
    ParIC_path = get(paramIC_dict, :file_path, "Par/IC file_path not found")

    #Get the names of the parameter & IC sheets
    param_sheet = get(paramIC_dict, :param_sheet, "param_sheet not found")
    IC_sheet = get(paramIC_dict, :IC_sheet, "IC_sheet not found")

    #Read in the Excel workbook
    xf_ParIC = XLSX.readxlsx(ParIC_path)

    #Get the Excel sheets
    param_wkst = xf_ParIC[param_sheet]
    IC_wkst = xf_ParIC[IC_sheet]

    #Put the worksheet data into DataFrames
    param_df = DataFrame(XLSX.gettable(param_wkst, header = true)...,)
    IC_df = DataFrame(XLSX.gettable(IC_wkst, header = true)...,)

    #Rename the columns of param_df and IC_df (just in case)
    rename!(param_df,["Parameter","Fixed?","Lower Bound","Initial Guess","Upper Bound","Interpretation"])
    rename!(IC_df,["IC","Fixed?","Lower Bound","Initial Guess","Upper Bound","Interpretation"])

    #Names of the optimized values: parameters first, then ICs/IC ratios
    param_opt_names = param_df[param_df."Fixed?" .== "No", 1]
    IC_opt_names = IC_df[IC_df."Fixed?" .== "No", 1]
    x_names = convert.(String, [param_opt_names; IC_opt_names])

    #Specify the column types
    col_types = [String, String, Real, Real, Real, String]
    for i=1:6
        param_df[!,i] = convert.(Union{Missing,col_types[i]}, param_df[:,i])
        IC_df[!,i] = convert.(Union{Missing,col_types[i]}, IC_df[:,i])
    end

    #Get the indices of the different groups (fixed/optimized, params/ICs)
    param_opt_indices = findall(param_df[:,"Fixed?"] .== "No")
    param_fix_indices = findall(param_df[:,"Fixed?"] .== "Yes")
    IC_opt_indices = findall(IC_df[:,"Fixed?"] .== "No")
    IC_fix_indices = findall(IC_df[:,"Fixed?"] .== "Yes")

    #Get number of params, ICs of each type
    num_params_opt = length(param_opt_indices)
    num_params_fix = length(param_fix_indices)
    num_ICs_opt = length(IC_opt_indices)       #(really the number of ICs/IC ratios)
    num_ICs_fix = length(IC_fix_indices)       #(really the number of ICs/IC ratios)

    num_vars_opt = num_params_opt + num_ICs_opt


    ############################################################################
    #STEP 4: Get the box constraints for the optimized values and *scale* them
    ############################################################################

    #Lower & Upper Bounds (optimized parameters)
    param_LBs = param_df[param_df."Fixed?" .== "No", "Lower Bound"]
    param_UBs = param_df[param_df."Fixed?" .== "No", "Upper Bound"]

    #Lower & Upper Bounds (optimized ICs/IC ratios)
    IC_LBs = IC_df[IC_df."Fixed?" .== "No", "Lower Bound"]
    IC_UBs = IC_df[IC_df."Fixed?" .== "No", "Upper Bound"]

    #All UBs (to pass to ODE_fit)
    UBs = [param_UBs; IC_UBs]

    #Scaled bounds (optimized parameters): every value is divided by its upper bound,
    #so the scaled upper bounds are all 1.
    param_LBs_scaled = param_LBs ./ param_UBs
    param_UBs_scaled = ones(length(param_UBs))

    #Scaled bounds (optimized ICs/IC ratios)
    IC_LBs_scaled = IC_LBs ./ IC_UBs
    IC_UBs_scaled = ones(length(IC_UBs))

    #The Lower & Upper Bound arrays for the optimizer
    x_LBs = [param_LBs_scaled; IC_LBs_scaled]
    x_UBs = [param_UBs_scaled; IC_UBs_scaled]


    ###########################################################
    #STEP 5: Get all the FIXED values and put them in an array
    ###########################################################

    #Get all the fixed values from the spreadsheet
    params_fix = param_df[param_df."Fixed?" .== "Yes", "Initial Guess"]
    ICs_fix = IC_df[IC_df."Fixed?" .== "Yes", "Initial Guess"]

    #All the fixed parameters/ICs (from the spreadsheet) in the objective function
    fixed_args = Array{Real}([params_fix; ICs_fix])

    num_params = num_params_opt + num_params_fix       #The total number of params
    num_ICs_and_IC_ratios = num_ICs_opt + num_ICs_fix  #The total number of ICs/IC ratios in the spreadsheet


    ##########################################################
    #STEP 6: Generate a list of random initial guesses
    ##########################################################

    #Get the initial guesses given in the spreadsheet
    par_init_first = param_df[param_df."Fixed?" .== "No", "Initial Guess"]
    IC_init_first = IC_df[IC_df."Fixed?" .== "No", "Initial Guess"]

    #Generate list of random initial guesses
    Random.seed!(1234) #set seed for reproducibility
    x0_list = [func_get_x0(x_LBs, x_UBs) for i=1:num_x0]
    x0_list[1] = [par_init_first; IC_init_first] ./ UBs

    #Note: We use the (scaled) initial guesses in the spreadsheet as the first x0 value.

    ##########################################
    #STEP 7: Generate the Simulated Data Sets
    ##########################################

    Random.seed!(5678)  #Set seed for reproducibility
    sim_data_list = [func_sim_data(data_obs_fit) for i=1:BS_samples]
    sim_data_list[1] = data_obs_fit
    #Note: We use the original (observed) data set as the first one to fit


    #####################################################################
    #STEP 8: Get all of the options for the optimization and integration
    #####################################################################

    local opt_alg::Any

    #The optimizer may be given as a type (e.g. IPNewton) or as an instance (e.g. the
    #default ConjugateGradient()). Comparing an instance to the type with == is always
    #false, so we dispatch on the type instead and keep a user-supplied instance as is.
    opt_type = optimizer isa Type ? optimizer : typeof(optimizer)

    #Choose which function to call depending on the chosen optimizer...add options for LBFGS and BFGS?
    if opt_type <: ConjugateGradient
        fit_ODE = FirstOrderFit
        opt_alg = optimizer isa Type ? ConjugateGradient() : optimizer
    elseif opt_type <: GradientDescent
        fit_ODE = FirstOrderFit
        opt_alg = optimizer isa Type ? GradientDescent() : optimizer
    elseif opt_type <: IPNewton
        fit_ODE = SecondOrderFit
        opt_alg = optimizer isa Type ? IPNewton() : optimizer
    else
        error("Error: You entered an invalid optimizer!")
    end

    #The options for Optim.Options()
    xtol_abs = get(optimizer_options, :xtol_abs, 1e-10)
    ftol_rel = get(optimizer_options, :ftol_rel, 1e-4)
    gtol_abs = get(optimizer_options, :gtol_abs, 1e-8)
    max_iter = get(optimizer_options, :max_iter, 1000)
    max_time = get(optimizer_options, :max_time, 3000)
    show_trace = get(optimizer_options, :show_trace, true)
    show_every = get(optimizer_options, :show_every, 5)

    optim_options = Optim.Options(x_tol = xtol_abs, f_tol = ftol_rel, g_tol = gtol_abs, iterations = max_iter,
                                  time_limit = max_time,  show_trace = show_trace, show_every = show_every,
                                  allow_f_increases = true, successive_f_tol = 4)

    #The constraints for Optim. Report missing entries explicitly (the default empty Dict
    #would otherwise fail with a bare KeyError).
    con_keys = (:con_func, :con_jac, :con_hess, :con_LBs, :con_UBs)
    missing_con_keys = [k for k in con_keys if !haskey(constraints, k)]
    if !isempty(missing_con_keys)
        error("Error: constraints is missing the key(s): " * join(string.(missing_con_keys), ", "))
    end

    con_func = constraints[:con_func]
    con_jac = constraints[:con_jac]
    con_hess = constraints[:con_hess]
    con_LBs = constraints[:con_LBs]
    con_UBs = constraints[:con_UBs]
    cons = TwiceDifferentiableConstraints(con_func, con_jac, con_hess, x_LBs, x_UBs, con_LBs, con_UBs)

    #Collect optim_options, cons, and (almost) all of the other options for ODE_Fit
    fit_args = (ODE_model = ODE_model,
                ODE_vars = ODE_vars,
                t_obs = t_fit,
                t_all = t_all,
                x_LBs = x_LBs,
                x_UBs = x_UBs,
                UBs = UBs,
                param_opt_indices = param_opt_indices,
                param_fix_indices = param_fix_indices,
                IC_opt_indices = IC_opt_indices,
                IC_fix_indices = IC_fix_indices,
                params_fix = params_fix,
                ICs_fix = ICs_fix,
                f_calc_ICs = f_calc_ICs,
                N0 = N0,
                norm = norm,
                norm_weights = norm_weights,
                norm_scale = norm_scale,
                optimizer = opt_alg,
                make_plots = false,
                constraints = cons,
                integrator_options = integrator_options,
                optim_options = optim_options,
                convergence_report = false)


    ##############################################################
    #STEP 9: Create some arrays for storing optimization output
    #######  and a couple other set-up things.
    ##############################################################

    minimums_all = []    #store all the minimum objective values found
    minimizers_all = []   #stores all the sets of parameters obtained via the optimizations

    minimums_best = []   #the best minimums found for each data set
    minimizers_best = []  #stores the best set of parameters found for each simulated data set

    autodiff_fails = 0    #to keep track of number of times autodifferentiation failed
    sample_num = 1

    #Display progress bar
    p = Progress(num_x0 * BS_samples, 1, "Optimizations in progress...", 50)


    ###############################################################
    #STEP 10: Perform the Optimizations on the simulated data sets
    ###############################################################

    #Loop over the simulated data sets
    t = @elapsed for sim_data in sim_data_list

        minimums_sample = []    #to store the obj func values for each sample
        minimizers_sample = []  #to store the minimizers of each sample

        #Loop over the initial guesses
        for x0 in x0_list

            optim_out = fit_ODE(;fit_args...,x0 = x0, data_obs = sim_data, autodiff = :forward)

            #Check the type of the output if it errors
            if (typeof(optim_out.minimum) != Float64) || (typeof(optim_out.minimizer) != Array{Float64,1})
                println("Uh oh, there was an error: typeof(optim_out.minimum) = ", typeof(optim_out.minimum))
                println("Uh oh, there was an error: typeof(optim_out.minimizer) = ", typeof(optim_out.minimizer))
            else
                push!(minimums_sample, optim_out.minimum)     #Add the obj func value to the list
                push!(minimizers_sample, optim_out.minimizer) #Add the minimizer to the list
                push!(minimums_all, optim_out.minimum)        #Add the minimum to the "master" list
                push!(minimizers_all, optim_out.minimizer)    #Add the minimizer to the "master" list

                if optim_out.AD_fail == true
                    autodiff_fails += 1
                end
            end

            next!(p) #Update the progress bar
        end

        #If every optimization of this sample was rejected above there is nothing to choose from
        if isempty(minimums_sample)
            error("Error: none of the optimizations for sample $(sample_num) returned a usable result.")
        end

        best_minimum = minimum(minimums_sample)                   #Get best minimum found from the sample
        idx_best = findall(minimums_sample .== best_minimum)[1]   #Get the index of the best minimum/minimizer
        best_minimizer = minimizers_sample[idx_best]              #Get the best minimizer from the sample

        push!(minimums_best, best_minimum)       #Add the best minimum from the sample to the list of best minimums
        push!(minimizers_best, best_minimizer)   #Add the best minimizer from the sample to the list of best minimizers

        println("Sample ", sample_num, " best min: ", best_minimum)
        sample_num += 1

    end


    #To do: report total optimization time in hrs, mins, seconds
    println("\nTotal optimization time: ", round(t), " seconds")
    println("Average time per optimization: ", round(t / (num_x0 * BS_samples), sigdigits = 4), " seconds\n")
    println("Number of autodiff fails: ", autodiff_fails,"\n")

    ##########################################################################################
    #STEP 11: Compute the ODE solutions for the best minimizers (to make the prediction bands)
    ##########################################################################################

    soln_dfs = []  #List to store DataFrames containing the solutions.

    #minimizers_best holds exactly one entry per data set (BS_samples entries in total)
    for x_best in minimizers_best
        #Get the solution for the best minimizer of this sample.
        #The integrator options come from the integrator_options keyword argument.
        sol = ODE_sol(ODE_model, ODE_vars, x_best, params_fix, ICs_fix, param_opt_indices,
                      param_fix_indices, IC_opt_indices, IC_fix_indices, param_UBs, IC_UBs, f_calc_ICs,
                      N0, t_all, integrator_options)
        push!(soln_dfs,sol)
    end

    #Make a dictionary of the solutions organized by *variable*
    var_dfs = sort_by_var(soln_dfs)

    ###########################################################
    #STEP 12: Compute the percentiles for the prediction bands
    ###########################################################

    band_pct_low  = (50 - pred_band_pct/2)/100     #Lowest percentile to plot
    band_pct_high = (50 + pred_band_pct/2)/100     #Highest percentile to plot
    band_pct_med = 0.50
    #Note: We divide the percentiles by 100 b.c. percentiles_by_row() expects numbers in [0,1]

    #Dictionary to store variable percentile data.
    var_pctiles_dict = Dict()   #Dict will store one DataFrame for each ODE variable.

    #Get DataFrame of percentiles for each variable and store it in dictionary
    for var in ODE_vars
        df = select(var_dfs[var], Not(:t))
        pctiles_df = percentiles_by_row(df; low = band_pct_low, med = band_pct_med, high = band_pct_high)
        insertcols!(pctiles_df, 1, :t => t_all)  #Add the t column back in
        var_pctiles_dict[var] = pctiles_df
    end

    #############################################################
    #STEP 13: Plot the prediction band for each fitted variable.
    #############################################################

    #Convert the first date fitted to a string in the desired format (for labelling the plots)
    date_str_first = Dates.format(date_fit_first, date_format)

    for var in vars_to_fit

        #Select the DataFrame of percentiles
        var_pctiles_df  = var_pctiles_dict[var]

        #Get the values for each percentile (columns are looked up by the percentile as a string)
        var_pct_low  = var_pctiles_df[:,string(band_pct_low)]
        var_pct_med  = var_pctiles_df[:,string(band_pct_med)]
        var_pct_high = var_pctiles_df[:,string(band_pct_high)]

        #Plot the prediction band (aka "cloud")
        fig = plot(t_all, var_pct_low, fillrange = var_pct_high, fillalpha = 0.25, c = 4, lw = 0,
                   label = "$(pred_band_pct)% prediction band",
                   xlabel = L"t \textrm{ (days since %$date_str_first)}",
                   ylabel = "Number",
                   legend = :topleft,
                   title = L"%$var",
                   fontfamily="serif-roman",
                   dpi = 90)


        ##################################################################
        #STEP 14: Plot the observed data (on top of the prediction bands)
        ##################################################################

        #Plot the fitted observed data
        plot!(t_fit, data_obs_fit[:,var],
              line = :scatter,
              ms = 2,           #markersize
              msw = 0,          #markerstrokewidth
              label = L"%$var \textrm{ (observed, fit)}")

        #Plot the not fitted observed data
        plot!(t_nf, data_obs_nf[:,var],
              line = :scatter,
              ms = 2,
              msw = 0,
              label = L"%$var \textrm{ (observed, not fit)}")

        display(fig)   #display the figure in the notebook

        ##########################################################################
        #STEP 15: Save the plots to the specified file path (if save_figs == true)
        ##########################################################################
        if save_figs == true
            plot!(fig, dpi = 300)   #save using high resolution
            path = figs_folder * string(var) * ".png"
            savefig(fig, path)
        end

        #For some reason, setting dpi > 100 causes the plot display to be HUGE.
        #Strangely, increasing the dpi increases the figure dimensions as well as the resolution...Why?
    end #vars in vars_to_fit


#     #################################################################
#     #STEP 16: Plot the prediction bands for the NOT fitted variables
#     #################################################################

#     for var in vars_not_fit

#        #Select the DataFrame of percentiles
#        var_pctiles_df  = var_pctiles_dict[var]

#        #Get the values for each percentile
#        var_pct_low  = var_pctiles_df[:,string(band_pct_low)]
#        var_pct_med  = var_pctiles_df[:,string(band_pct_med)]
#        var_pct_high = var_pctiles_df[:,string(band_pct_high)]

#        #Plot the prediction band (aka "cloud")
#        fig = plot(t_all, var_pct_low, fillrange = var_pct_high, fillalpha = 0.25, c = 4, lw = 0,
#                   label = "$(pred_band_pct)% prediction band",
#                   xlabel = L"t \textrm{ (days since %$date_str_first)}",
#                   ylabel = "Number",
#                   legend = :topleft,
#                   title = L"%$var",
#                   fontfamily="serif-roman")

#        display(fig)

#        if save_figs == true
#           plot!(fig, dpi = 300)   #save using high resolution
#           path = figs_folder * string(var) * ".png"
#           savefig(fig, path)
#        end

#     end #for vars in vars_not_fit

    ########################################################
    #STEP 17: Organize the best minimizers into a DataFrame
    ########################################################

    #Organize the BEST minimizers in a DataFrame
    minimizers_df = DataFrame(Array{Float64}(undef, length(minimizers_best), length(x_names)), x_names);

    for i=1:length(minimizers_best)
        minimizers_df[i,:] = UBs .* minimizers_best[i]   #Note that we scale the values back
    end

    #########################################################################
    #STEP 18: Create DataFrame of percentiles based on the *best* minimizers
    #########################################################################

    #The column names (percentile values) for our percentile DataFrame
    str_pctiles = [string(pct) for pct in x_pctiles]

    #Create an empty DataFrame to store the percentile data
    par_pctiles = DataFrame(Array{Float64}(undef, length(x_names), length(str_pctiles)), str_pctiles)

    #quantile expects probabilities in [0,1], hence the division by 100
    for i=1:length(x_names)
        par_pctiles[i,:] = quantile(minimizers_df[:,i], (x_pctiles ./ 100))
    end

    #Insert a column for the parameter/IC names
    insertcols!(par_pctiles, 1, :Parameter => x_names)

    #Insert a column for the objective function value at each minimum
    insertcols!(minimizers_df, 1, :obj_value => minimums_best)

    ###############################################################################
    #STEP 19: Print the parameter percentiles DataFrame and return the other results.
    ###############################################################################
    println(par_pctiles)
    return (minimizers_df = minimizers_df, var_pctiles_dict = var_pctiles_dict,
            data_obs_fit = data_obs_fit, data_obs_nf = data_obs_nf, t_all = t_all,
            t_fit = t_fit, t_nf = t_nf, x0_list = x0_list, sim_data_list = sim_data_list)

end

ODE_Param_Est (generic function with 1 method)