In [0]:
import java.util.zip.{ZipInputStream, ZipEntry}
import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, FileStatus, Path}
import org.apache.hadoop.conf.Configuration
import org.apache.spark.sql.types._

In [0]:
// Hadoop configuration taken from the SparkContext of the active `spark` session.
val hconf: Configuration = {
  val sc = spark.sparkContext
  sc.hadoopConfiguration
}

// FileSystem handle resolved from `hconf`; the helper functions in this file go through `fs`.
val fs: FileSystem = FileSystem.get(hconf)

In [0]:
/**
 * Builds a schema in which every given column is a nullable string field.
 *
 * @param cols column names, in the order they should appear in the schema
 * @return a `StructType` with one `StringType` field per name
 */
def structFrom(cols: Seq[String]): StructType = {
  val fields = for (name <- cols) yield StructField(name, StringType, nullable = true)
  StructType(fields)
}

In [0]:
/**
 * Creates `path` through `fs.mkdirs` when `fs` reports that it does not exist yet.
 *
 * @param path directory to create if missing
 */
def ensureDir(path: Path): Unit = {
  val missing = !fs.exists(path)
  if (missing) {
    fs.mkdirs(path) // the Boolean result of mkdirs is discarded
  }
}

In [0]:
/**
 * Copies everything that can be read from `in` into `out`, then flushes and closes `out`.
 *
 * `out` is closed on every path, including when reading or writing throws, so the
 * handle is not leaked on failure. `in` is never closed here; the caller keeps
 * ownership of it (this lets a `ZipInputStream` be reused for the next entry).
 *
 * @param in         source stream; read until `read` stops returning a positive count
 * @param out        destination stream; always closed before this function returns or throws
 * @param bufferSize size in bytes of the copy buffer (default 1 MiB); should be positive,
 *                   since with a zero-length buffer `read` returns 0 and nothing is copied
 */
def copyStream(in: java.io.InputStream, out: FSDataOutputStream, bufferSize: Int = 1024 * 1024): Unit = {
  try {
    val buf = new Array[Byte](bufferSize)
    var n = in.read(buf)
    while (n > 0) { out.write(buf, 0, n); n = in.read(buf) }
    out.hflush()
  } finally {
    // Close even when the copy fails so the output handle is released.
    out.close()
  }
}

In [0]:
/**
 * Extracts the `.csv` entries of every `.zip` file located directly under `srcZipDir`
 * into `dstExtractDir`, going through the shared `fs` handle.
 *
 * For each zip a marker file `_extracted/<base>.done` is written under `dstExtractDir`
 * once all of its entries were processed; zips that already have a marker are skipped
 * unless `overwrite` is set.
 *
 * Entries are flattened: only the last path segment of the entry name is used as the
 * output file name, so two entries with the same file name in different folders of one
 * zip map to the same output file (the later one is skipped, or replaces the earlier
 * one when `overwrite` is true). With `preserveZipName = false` the same applies across
 * different zips.
 *
 * If copying an entry fails, the partially written output file is removed before the
 * error propagates. Otherwise a later run with `overwrite = false` would find the
 * truncated file, treat it as already extracted and mark the zip as done.
 *
 * @param srcZipDir       directory that is listed (non-recursively) for `.zip` files
 * @param dstExtractDir   root directory for extracted files and the `_extracted` markers
 * @param preserveZipName when true, files go to `dstExtractDir/<base>`; otherwise
 *                        directly into `dstExtractDir`
 * @param overwrite       when true, re-processes zips that have a marker and replaces
 *                        existing output files
 */
def unzipAll(
  srcZipDir: String,
  dstExtractDir: String,
  preserveZipName: Boolean = true,
  overwrite: Boolean = false
): Unit = {

  val srcPath = new Path(srcZipDir)
  val dstPath = new Path(dstExtractDir)
  val doneDir = new Path(dstPath, "_extracted")

  ensureDir(dstPath); ensureDir(doneDir)

  // Copies the current entry of `zin` into `outFile`; removes `outFile` again if the copy fails.
  def extractEntry(zin: ZipInputStream, outFile: Path): Unit = {
    val out: FSDataOutputStream = fs.create(outFile, overwrite)
    var completed = false
    try {
      copyStream(zin, out) // copyStream flushes and closes `out` on success
      completed = true
    } finally {
      if (!completed) {
        // Best-effort cleanup; must not hide the exception that caused the failure.
        try { out.close() } catch { case scala.util.control.NonFatal(_) => () }
        try {
          fs.delete(outFile, false)
          ()
        } catch {
          case scala.util.control.NonFatal(_) =>
            println(s"[unzip] AVISO: não foi possível remover arquivo parcial: ${outFile.toString}")
        }
      }
    }
  }

  val zips: Array[FileStatus] = fs.listStatus(srcPath)
    .filter(st => st.isFile && st.getPath.getName.toLowerCase.endsWith(".zip"))

  println(s"[unzip] Zips encontrados: ${zips.length} em $srcZipDir")

  zips.foreach { z =>
    val zipPath = z.getPath
    val zipName = zipPath.getName
    // stripSuffix is case-sensitive, while the filter above is not: a file named
    // "X.ZIP" keeps its extension in `base` (directory "X.ZIP", marker "X.ZIP.done").
    val base    = zipName.stripSuffix(".zip")
    val marker  = new Path(doneDir, s"$base.done")

    if (fs.exists(marker) && !overwrite) {
      println(s"[unzip] PULANDO (done): $zipName")
    } else {
      println(s"[unzip] Processando: $zipName")
      // Same for every entry of this zip; the directory itself is only created
      // once a CSV entry is actually found.
      val outDir = if (preserveZipName) new Path(dstPath, base) else dstPath
      var extracted = 0

      val fin: FSDataInputStream = fs.open(zipPath)
      try {
        val zin = new ZipInputStream(fin)
        try {
          var entry: ZipEntry = zin.getNextEntry
          while (entry != null) {
            if (!entry.isDirectory && entry.getName.toLowerCase.endsWith(".csv")) {
              ensureDir(outDir)
              // Keep only the file name: drops any folder structure (either separator).
              val outFile = new Path(outDir, entry.getName.split("[/\\\\]").last)

              if (!fs.exists(outFile) || overwrite) {
                extractEntry(zin, outFile)
                println(s"[unzip] Extraído: ${outFile.toString}")
                extracted += 1
              } else {
                println(s"[unzip] Já existia: ${outFile.toString}")
              }
            }
            zin.closeEntry()
            entry = zin.getNextEntry
          }

          // Write the completion marker only after every entry was processed.
          val markerOut = fs.create(marker, true)
          try markerOut.writeUTF(s"done at ${java.time.Instant.now}")
          finally markerOut.close()

          println(s"[unzip] OK: $zipName | CSVs: $extracted")
        } finally {
          zin.close()
        }
      } finally {
        fin.close()
      }
    }
  }
}