In [None]:
# This R script automates the process of converting large CSV files from a ZIP archive into 
# the Parquet format for more efficient storage and querying. It extracts CSVs, converts them,
# and repackages the Parquet files into a new ZIP file, enhancing data portability and 
# performance. The script uses local directories for speed, cleans up resources after processing, 
# and ensures compatibility across systems by adjusting file paths.

# Load necessary libraries
library(data.table)
library(arrow)
library(zip)


In [4]:

# Define the path to the zip file and output paths # use a local path in order to make the file to run fast
zip_path <- "C:/Users/Ulf/Downloads/GLORIA_MRIOs_59_2022.zip"
#zip_path <- "G:/Shared drives/MoreScope Team Folder/30 Technology/MRIO/GLORIA/Version 59/GLORIA_MRIOs_59_2022.zip"
#zip_path <- "G:/Shared drives/MoreScope Team Folder/30 Technology/MRIO/GLORIA/Version 59/my_zipped_file.zip"

temp_dir <- tempdir()  # Using the system's temporary directory
new_zip_path <- sub("\\.zip$", "_parquet.zip", zip_path)  # Destination path for new zip

# Ensure the temporary directory is clean
if (length(list.files(temp_dir)) > 0) {
    unlink(temp_dir, recursive = TRUE)  # Remove existing files if any
}
dir.create(temp_dir)

# List contents of the ZIP file using the zip package
files_to_extract <- zip_list(zip_path)

# Check if there are files to process
if (is.null(files_to_extract) || nrow(files_to_extract) == 0) {
    cat("No files to process. Check the zip archive and path.\n")
    return()  # Exit the function or script if no files to process
}

# Initialize progress bar
pb <- txtProgressBar(min = 0, max = nrow(files_to_extract), style = 3)

# Store all parquet paths for later zipping
parquet_paths <- character()

# Process each file sequentially
for (i in seq_len(nrow(files_to_extract))) {
    file_name <- files_to_extract$filename[i]
    
    # Unzip only the current file
    unzip(zip_path, files = file_name, exdir = temp_dir)
    
    # Full path for the extracted file
    file_path <- file.path(temp_dir, file_name)
    
    # Read the CSV file
    data <- fread(file_path)
    
    # Convert to Parquet and save it temporarily
    parquet_path <- sub("\\.csv$", ".parquet", file_path)
    write_parquet(as.data.frame(data), parquet_path)
    parquet_paths <- c(parquet_paths, parquet_path)
    
    # Clear data from memory and run garbage collector
    rm(data)
    gc()
    
    # Update the progress bar
    setTxtProgressBar(pb, i)
}

# Close the progress bar
close(pb)

# Zip all the parquet files together at the end
# Change to the temporary directory to ensure all operations are localized
setwd(temp_dir)

# Confirm current directory (to ensure the script operates in the correct directory)
print(getwd())

# List all Parquet files in the current directory
parquet_files <- list.files(pattern = "\\.parquet$", full.names = TRUE)

# Check if any Parquet files were found
if (length(parquet_files) == 0) {
    cat("No parquet files found in the directory.\n")
} else {
    # Print out the names of the files to be zipped
    cat("Listing Parquet files:\n")
    print(parquet_files)

    # Remove './' prefix if present to ensure compatibility with Windows
    parquet_files <- gsub("^\\./", "", parquet_files)

    # Zip all the Parquet files into a new zip file
    zip(zipfile = new_zip_path, files = parquet_files)

    cat("All Parquet files have been processed and zipped into: ", new_zip_path, "\n")
}


"'C:\Users\Ulf\AppData\Local\Temp\Rtmp6PfGFW' already exists"


[1] "C:/Users/Ulf/AppData/Local/Temp/Rtmp6PfGFW"
Listing Parquet files:
 [1] "./20240110_120secMother_AllCountries_002_T-Results_2022_059_Markup001(full).parquet"
 [2] "./20240110_120secMother_AllCountries_002_T-Results_2022_059_Markup002(full).parquet"
 [3] "./20240110_120secMother_AllCountries_002_T-Results_2022_059_Markup003(full).parquet"
 [4] "./20240110_120secMother_AllCountries_002_T-Results_2022_059_Markup004(full).parquet"
 [5] "./20240110_120secMother_AllCountries_002_T-Results_2022_059_Markup005(full).parquet"
 [6] "./20240110_120secMother_AllCountries_002_V-Results_2022_059_Markup001(full).parquet"
 [7] "./20240110_120secMother_AllCountries_002_Y-Results_2022_059_Markup001(full).parquet"
 [8] "./20240110_120secMother_AllCountries_002_Y-Results_2022_059_Markup002(full).parquet"
 [9] "./20240110_120secMother_AllCountries_002_Y-Results_2022_059_Markup003(full).parquet"
[10] "./20240110_120secMother_AllCountries_002_Y-Results_2022_059_Markup004(full).parquet"
[11] "./20240110_1