### Set analysis options as variables

In [0]:
%r
# Analysis options shared by the cells below.
volume_root <- "/Volumes/idm_dhs_recode_dev_01" # root that is searched for .dta files
result_dir <- "/dbfs/tmp/result"                # passed to the scripts as --output-dir
dry_run <- FALSE                                # TRUE: log commands instead of running them
version <- "7"                                  # matched against data file names

# Remove every file currently under the result directory (recursively).
files <- list.files(path = result_dir, full.names = TRUE, recursive = TRUE)
file.remove(files)


### Get metadata from filename

In [0]:
%r
#' Extract the survey file type code from a data file name.
#'
#' The base name must start with a two-letter prefix immediately followed by
#' one of the codes IR, MR, PR, HR, KR, BR or CR (for example `NGIR7BFL.DTA`).
#' Matching is case-insensitive so that lower-case file names are recognised
#' as well, in line with the case-insensitive comparisons used elsewhere in
#' this notebook.
#'
#' @param filename A single file name or path; only the base name is examined.
#' @return The two-letter file type in upper case, or `NA_character_` when the
#'   name does not follow the expected pattern.
get_file_type <- function(filename) {
  base <- basename(filename)

  # regmatches() yields the full match followed by the captured group, or an
  # empty vector when the pattern does not match.
  m <- regexec(
    "^[A-Z]{2}(IR|MR|PR|HR|KR|BR|CR)",
    base,
    ignore.case = TRUE,
    perl = TRUE
  )
  parts <- regmatches(base, m)[[1]]

  if (length(parts) < 2) {
    return(NA_character_)
  }
  toupper(parts[2])
}

#' Return the first two characters of a file's base name.
#'
#' The notebook treats this prefix as the country code of a data file.
#'
#' @param filename Character vector of file names or paths.
#' @return Character vector of the same length holding the two-character
#'   prefixes, with their case unchanged.
get_country_code <- function(filename) {
  substr(basename(filename), start = 1, stop = 2)
}

### Find chapter folders (each must provide run_indicators.R) and detect their input files

In [0]:
%r

#' Locate the chapter folders that can be run.
#'
#' Looks at the immediate sub-directories of the project root reported by
#' `here::here()`, keeps those whose name starts with "Chap", and of those
#' only the ones that contain a `run_indicators.R` file.
#'
#' @return A list of directory paths named by chapter folder name; an empty
#'   list when no folder qualifies.
find_chapter_folders <- function() {
  candidates <- list.dirs(here::here(), recursive = FALSE)
  candidates <- candidates[startsWith(basename(candidates), "Chap")]

  runnable <- candidates[file.exists(file.path(candidates, "run_indicators.R"))]
  if (length(runnable) == 0) {
    return(list())
  }
  stats::setNames(as.list(runnable), basename(runnable))
}

#' Detect which survey file types a chapter's run_indicators.R refers to.
#'
#' A type counts as required when any line of the script contains the
#' lower-case command-line flag (e.g. `--ir=`) or the upper-case type code
#' followed later on the same line by `dta` or `DTA`.
#'
#' @param chapter_path Path of the chapter folder.
#' @return A list with one logical per file type (IR, MR, PR, HR, KR, BR, CR);
#'   every entry is FALSE when the script does not exist.
check_input_files <- function(chapter_path) {
  file_types <- c("IR", "MR", "PR", "HR", "KR", "BR", "CR")
  script_file <- file.path(chapter_path, "run_indicators.R")

  # With no script there are no lines to scan, so every type ends up FALSE.
  script_lines <- if (file.exists(script_file)) {
    readLines(script_file)
  } else {
    character(0)
  }

  is_referenced <- function(file_type) {
    pattern <- sprintf(
      "--%s=|%s.*dta|%s.*DTA",
      tolower(file_type), file_type, file_type
    )
    any(grepl(pattern, script_lines))
  }

  stats::setNames(lapply(file_types, is_referenced), file_types)
}

### Process files for country

In [0]:
%r
#' Run a chapter's run_indicators.R for one country's data files.
#'
#' For every file type the chapter requires, the paths in `files_by_type` are
#' narrowed to those whose two-letter prefix equals `country_code` and whose
#' base name also contains the global `version` after the country code. Each
#' remaining file becomes one `Rscript` invocation of the chapter's
#' `run_indicators.R` with `--<type>=<path>` and `--output-dir=<result_dir>`.
#'
#' All parameters are collected before anything is executed; the "no data" and
#' execution steps run once per call, after the loop over file types.
#'
#' Reads the globals `version` and `result_dir` and calls `get_country_code()`.
#'
#' @param country_code Two-letter file prefix; compared case-insensitively.
#' @param chapter_info List with elements `name`, `path` and `required_files`
#'   (a named list of logicals keyed by file type).
#' @param files_by_type Named list mapping a file type to a character vector
#'   of data file paths.
#' @param dry_run When TRUE the commands are only logged, not executed.
#' @return TRUE when every command ran with exit status 0 (or was logged in a
#'   dry run); FALSE when a required file type has no file for the country,
#'   no parameters could be built, the script is missing, a command exited
#'   with a non-zero status, or an error occurred.
process_country_chapter <- function(country_code, chapter_info, files_by_type, dry_run = TRUE) {
  chapter_name <- chapter_info$name
  chapter_path <- chapter_info$path
  required_files <- chapter_info$required_files
  message(sprintf("Processing %s for chapter %s", country_code, chapter_name))

  tryCatch({
    cmd_params <- character(0)

    for (file_type in names(required_files)) {
      if (!isTRUE(required_files[[file_type]])) {
        next
      }

      matching_files <- files_by_type[[file_type]]
      message(paste0("matching total:", length(matching_files)))
      if (is.null(matching_files)) {
        # No file of this type exists for any country; nothing to add here.
        next
      }

      # Filter by country code
      same_country <- tolower(get_country_code(matching_files)) == tolower(country_code)
      country_filtered <- matching_files[same_country]
      if (length(country_filtered) == 0) {
        message(sprintf("No %s files found for country %s", file_type, country_code))
        return(FALSE)
      }

      # Further filter by version in the filename
      pattern <- paste0("(?i)", tolower(country_code), ".*", version)
      version_filtered <- country_filtered[
        grepl(pattern, basename(country_filtered), perl = TRUE)
      ]

      for (local_path in version_filtered) {
        tryCatch({
          message(sprintf("Processing local file: %s", local_path))

          # normalizePath() yields an absolute path and, with mustWork = TRUE,
          # raises an error for a missing file, which is reported below.
          abs_path <- normalizePath(local_path, mustWork = TRUE)

          # Quote the path so that spaces or shell metacharacters survive the
          # trip through system().
          cmd_param <- sprintf("--%s=%s", tolower(file_type), shQuote(abs_path))
          cmd_params <- c(cmd_params, cmd_param)
        }, error = function(e) {
          message(sprintf("Failed to process file %s: %s", local_path, conditionMessage(e)))
        })
      }
    }

    if (length(cmd_params) == 0) {
      message(sprintf("No data files found for country %s, skipping", country_code))
      return(FALSE)
    }

    script_path <- file.path(chapter_path, "run_indicators.R")
    if (!file.exists(script_path)) {
      message(sprintf("No run_indicators.R found for chapter %s", chapter_name))
      return(FALSE)
    }

    all_ok <- TRUE
    for (cmd_param in cmd_params) {
      cmd <- paste(
        "Rscript",
        shQuote(script_path),
        cmd_param,
        sprintf("--output-dir=%s", shQuote(result_dir))
      )
      if (dry_run) {
        message(sprintf("Would execute: %s", cmd))
        next
      }

      message(sprintf("Executing: %s", cmd))
      status <- system(cmd)
      if (!identical(as.integer(status), 0L)) {
        message(sprintf("Command exited with status %s: %s", status, cmd))
        all_ok <- FALSE
      }
    }
    all_ok
  }, error = function(e) {
    message(sprintf(
      "Error processing %s for chapter %s: %s",
      country_code, chapter_name, conditionMessage(e)
    ))
    FALSE
  })
}

### Run the DHS indicator process

In [0]:
%r
# Packages this notebook depends on; abort early if any is not installed.
required_packages <- c("AzureStor", "AzureAuth", "tidyverse", "parallel", "optparse", "here", "fs")
is_installed <- vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  stop(paste("Missing required packages:", paste(missing_packages, collapse = ", ")))
}

message("Starting DHS data analysis...")
message("Current working directory: ", getwd())

# Attach every package, hiding their start-up banners.
suppressPackageStartupMessages({
  for (pkg in required_packages) {
    library(pkg, character.only = TRUE)
  }
})

# Pilot countries; the columns file_prefix and country_name are used below.
countries <- read.csv("countries.csv", stringsAsFactors = FALSE)

# Discover the chapter folders that provide a run_indicators.R script.
message("Finding chapter folders with run_indicators.R...")
chapter_folders <- find_chapter_folders()
message(sprintf("Found %d valid chapter folders", length(chapter_folders)))

# For each chapter, record its name, its path and the file types it requires.
chapter_info <- list()
for (name in names(chapter_folders)) {
  path <- chapter_folders[[name]]
  required_files <- check_input_files(path)
  if (is.null(required_files)) {
    next
  }

  message(sprintf("Chapter %s requires the following files:", name))
  required_flags <- unlist(required_files)
  for (file_type in names(required_flags)[required_flags]) {
    message(sprintf("  %s: Required", file_type))
  }

  chapter_info[[name]] <- list(
    name = name,
    path = path,
    required_files = required_files
  )
}

# Every .dta file (any letter case) anywhere below the volume root.
files <- list.files(
  volume_root,
  pattern = "\\.dta$",
  full.names = TRUE,
  recursive = TRUE,
  ignore.case = TRUE
)
message(sprintf("Found %d total files", length(files)))

# Keep only the files whose path contains a raw_dtas/ component.
files <- grep("raw_dtas/", files, value = TRUE)
all_files <- files

# for debugging: one row with one column per file path
df <- as.data.frame(t(all_files))
write.csv(df, "all_files.csv", row.names = FALSE)

message(sprintf("Found %d files in DBFS volumes", length(all_files)))

# Group the data files by survey file type; files whose name does not yield a
# type are left out. as.character() keeps the vapply() result type fixed no
# matter which kind of NA get_file_type() returns for a non-matching name.
file_types <- vapply(
  all_files,
  function(file_path) as.character(get_file_type(file_path)),
  character(1),
  USE.NAMES = FALSE
)
has_type <- !is.na(file_types)
files_by_type <- split(all_files[has_type], file_types[has_type])

# Distinct two-letter prefixes present in the volume.
country_codes <- unique(get_country_code(all_files))

# Compare prefixes case-insensitively, both for the set operations and for
# selecting the rows to report, so upper-case prefixes in the config are
# reported correctly.
configured_prefixes <- tolower(countries$file_prefix)
found_prefixes <- tolower(country_codes)
missing_prefixes <- setdiff(configured_prefixes, found_prefixes)
available_prefixes <- intersect(configured_prefixes, found_prefixes)

# str() prints to stdout and returns NULL, so the values are pasted into the
# messages directly instead.
message("---------0000--------")
message(" countries found:", paste(country_codes, collapse = ", "))
message("---------1111-------")
message(" required :", paste(countries$file_prefix, collapse = ", "))
message("---------2222-------")
message(" available for processing:", paste(available_prefixes, collapse = ", "))

if (length(missing_prefixes) > 0) {
  message("The following file prefixes are in config but not found in Volume")
  print(countries[configured_prefixes %in% missing_prefixes, c("file_prefix", "country_name")])
} else {
  message("All configured file prefixes are present in Databricks Volume.")
}
results <- list()

# One task per (country, chapter) pair. The list is created once, before the
# chapter loop, so tasks from every chapter accumulate and `tasks` exists even
# when no chapter was found.
tasks <- list()
for (ch_name in names(chapter_info)) {
  ch_data <- chapter_info[[ch_name]]
  message(sprintf("Processing chapter %s", ch_name))
  required_types <- names(ch_data$required_files)[unlist(ch_data$required_files)]
  message(sprintf("Chapter %s requires file types: %s", ch_name, paste(required_types, collapse = ", ")))

  # loop over all countries found in volume
  for (country in country_codes) {
    task_id <- paste(country, ch_name, sep = "_")
    tasks[[task_id]] <- list(country_code = country, chapter_info = ch_data)
  }
}
# message(paste(tasks, collapse="\n"))

# Run every task in order. Each run returns both its outcome and its duration:
# assigning to an outer vector from inside the lapply() closure would only
# change a local copy.
task_runs <- lapply(seq_along(tasks), function(i) {
  task <- tasks[[i]]
  start <- Sys.time()

  # Use the value returned by process_country_chapter() as the outcome; it
  # reports failures by returning FALSE rather than by raising an error.
  ok <- tryCatch(
    isTRUE(process_country_chapter(
      task$country_code,
      task$chapter_info,
      files_by_type,
      dry_run
    )),
    error = function(e) {
      message(sprintf("Error processing country %s: %s",
                      task$country_code, conditionMessage(e)))
      FALSE
    }
  )

  list(
    ok = ok,
    secs = as.numeric(difftime(Sys.time(), start, units = "secs"))
  )
})

ch_results <- lapply(task_runs, `[[`, "ok")
times <- vapply(task_runs, `[[`, numeric(1), "secs")

results       <- c(results, ch_results)
success_count <- sum(unlist(results))
message(sprintf("Completed processing. Success: %d/%d",
                success_count, length(results)))

# see a quick run summary:
data.frame(
  country    = vapply(tasks, `[[`, "", "country_code"),
  chapter    = vapply(tasks, function(task) task$chapter_info$name, ""),
  success    = vapply(ch_results, isTRUE, logical(1)),
  time_secs  = times
)



In [0]:
import shutil
import os

# Build an index.html that links to every HTML report of the chapter, then zip
# the chapter results and move the archive to FileStore.
source_dir = "/dbfs/tmp/result/Chap07_FP"
reports_dir = os.path.join(source_dir, "reports")
index_file = os.path.join(source_dir, "index.html")

# Regular files in the reports folder whose name ends in ".html".
html_files = []
for entry in os.listdir(reports_dir):
    if not entry.endswith(".html"):
        continue
    if os.path.isfile(os.path.join(reports_dir, entry)):
        html_files.append(entry)

# Page header, one list item per report (sorted by file name), page footer.
html_lines = [
    "<!DOCTYPE html>",
    "<html>",
    "<head><meta charset='utf-8'><title>Family Planning Indicators Report</title></head>",
    "<body>",
    "<h1>Family Planning Indicators Report</h1>",
    "<ul>",
]
for file in sorted(html_files):
    # The link text is the part of the file name before the first underscore.
    prefix = file.split("_")[0]
    href = os.path.join("reports", file)
    html_lines.append(f'  <li><a href="{href}">{prefix}</a></li>')
html_lines += ["</ul>", "</body>", "</html>"]

# Write index.html
with open(index_file, "w", encoding="utf-8") as f:
    f.write("\n".join(html_lines))

print(f"index.html created at {index_file}")

# Zip the chapter results (make_archive appends ".zip" to the base name) and
# move the archive to FileStore.
output_zip = "/tmp/data"
shutil.make_archive(output_zip, 'zip', source_dir)
shutil.move(output_zip + ".zip", "/dbfs/FileStore/data.zip")

print("download link: https://adb-1652658079176617.17.azuredatabricks.net/files/data.zip")
