-
Notifications
You must be signed in to change notification settings - Fork 2
/
FacadeService.scala
358 lines (304 loc) · 13 KB
/
FacadeService.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
package se.lu.nateko.cp.data.services.etcfacade
import akka.Done
import akka.NotUsed
import akka.event.LoggingAdapter
import akka.stream.Materializer
import akka.stream.scaladsl.FileIO
import akka.stream.scaladsl.Flow
import akka.stream.scaladsl.Keep
import akka.stream.scaladsl.Sink
import akka.stream.scaladsl.Source
import akka.util.ByteString
import se.lu.nateko.cp.data.EtcFacadeConfig
import se.lu.nateko.cp.data.api.ChecksumError
import se.lu.nateko.cp.data.api.CpDataException
import se.lu.nateko.cp.data.api.Utils.iterateChildren
import se.lu.nateko.cp.data.api.dataFail
import se.lu.nateko.cp.data.formats.TimeSeriesStreams
import se.lu.nateko.cp.data.formats.zip
import se.lu.nateko.cp.data.services.upload.UploadResult
import se.lu.nateko.cp.data.services.upload.UploadService
import se.lu.nateko.cp.data.streams.DigestFlow
import se.lu.nateko.cp.data.streams.ZipEntryFlow
import se.lu.nateko.cp.data.streams.ZipEntrySource
import se.lu.nateko.cp.data.utils.akka.Debouncer
import se.lu.nateko.cp.data.utils.akka.done
import se.lu.nateko.cp.meta.core.crypto.Md5Sum
import se.lu.nateko.cp.meta.core.crypto.Sha256Sum
import se.lu.nateko.cp.meta.core.etcupload.DataType
import se.lu.nateko.cp.meta.core.etcupload.EtcUploadMetadata
import se.lu.nateko.cp.meta.core.etcupload.StationId
import java.io.File
import java.nio.charset.StandardCharsets
import java.nio.file.Files
import java.nio.file.Path
import java.nio.file.Paths
import java.nio.file.StandardCopyOption.REPLACE_EXISTING
import java.nio.file.StandardOpenOption
import java.nio.file.attribute.BasicFileAttributes
import java.time.Duration
import java.time.Instant
import java.time.LocalDate
import java.time.LocalDateTime
import java.time.LocalTime
import java.time.ZoneOffset
import scala.concurrent.ExecutionContext
import scala.concurrent.Future
import scala.concurrent.duration.DurationInt
import scala.util.Failure
import scala.util.Success
import scala.util.Try
import scala.util.Using
/**
* Encodes the behaviour and logic of the ETC logger data upload facade.
* Main features:
* - integrity control with MD5 checksums
* - staging area for files uploaded from the loggers
* - upload to CP, if the ETC metadata for the filename is available on the meta service
* - packaging EC and PHEN half-hourly files into daily packages (zip archives)
* - version handling in the case of re-uploads of files with the same filename
* - automatic upload retries for all the files in staging
*
* EC and PHEN file packaging and submission is done in the following way.
* 1) If upon upload of a half-hourly file a certain daily package becomes complete (48 files for a particular station, logger, and file number),
* and if no previous uploads of this daily package were performed, then the package is uploaded is triggered through a debouncer
* with 10 minutes delay. Further potential half-hourly uploads for this package will debounce the upload. After a successful upload,
* the half-hourly files are removed.
* 2) At {@code FacadeService.ForceEcUploadTime} time of day, all half-hourly EC files in staging are supplemented with the latest
* previously-uploaded files (if any, and only by files from half-hourly slots not represented in staging), packaged and uploaded.
* Upon successful upload, the corresponding half-hourly files are purged from staging.
* 3) After the daily forced EC upload, old files (older than {@code FacadeService.OldFileMaxAge}) are purged from staging.
*/
class FacadeService(val config: EtcFacadeConfig, upload: UploadService)(using mat: Materializer):
  import FacadeService._
  import mat.executionContext

  // Coalesces repeated upload triggers for the same daily package: every new trigger
  // postpones the actual upload by another 10 minutes (see the class scaladoc, point 1).
  private val debouncer =
    val scheduler = upload.meta.system.scheduler
    Debouncer[EtcFilename, Done](10.minutes, scheduler, "Daily package upload")

  private val metaClient = upload.meta
  private val log = upload.log

  // Constructor side effect: make sure the staging-area root folder exists.
  Files.createDirectories(Paths.get(config.folder))

  // Scheduled retries of uploads for files left in staging; cancelled on JVM shutdown.
  private[this] val retries = new RetryLogic(this, log).schedule()
  sys.addShutdownHook(retries.cancel())

  /** Staging path of a file: inside its station's folder, named by the ETC filename. */
  def getFilePath(file: EtcFilename) = getStationFolder(file.station).resolve(file.toString)

  /** Per-station staging subfolder, named by the station id. */
  def getStationFolder(station: StationId) = Paths.get(config.folder, station.id)

  /** Path where a ready-to-upload object (named by its SHA-256 id) is kept. */
  def getObjectSource(station: StationId, hash: Sha256Sum): Path =
    getStationFolder(station).resolve(hash.id)

  /**
   * Sink for the bytes of a single logger upload of file `fn`.
   * The raw bytes are MD5-digested and checked against `md5`; the (possibly
   * preprocessed) bytes are written to a temp file. On checksum match the temp file
   * is moved into staging and a CP upload is attempted (fire-and-forget); on any
   * failure the temp file is deleted. The materialized Future fails on MD5 mismatch.
   */
  def getFileSink(fn: EtcFilename, md5: Md5Sum): Sink[ByteString, Future[Done]] =
    val tmpPath = Files.createTempFile(fn.toString + ".", "")
    val targetFile = getFilePath(fn)
    Files.createDirectories(targetFile.getParent)

    // "Commits" the upload by moving the fully-written temp file into staging.
    def transactUpload(): Done =
      Files.move(tmpPath, targetFile, REPLACE_EXISTING)
      Done

    val preprocessing: Flow[ByteString, ByteString, NotUsed] = fn.dataType match
      case DataType.SAHEAT =>
        // Append a numeric station id (hashCode of the StationId) to every data line.
        TimeSeriesStreams.linesFromUtf8Binary.map{line =>
          val stationId = fn.station.hashCode
          ByteString(s"$line,$stationId\r\n" , ByteString.UTF_8)
        }
      case DataType.PHEN =>
        // PHEN uploads arrive as single-entry zips; store the unzipped content.
        ZipEntryFlow.singleEntryUnzip
      case _ =>
        Flow.apply[ByteString]

    // NB: the MD5 digest is taken on the raw incoming bytes, BEFORE preprocessing.
    Flow.apply[ByteString]
      .viaMat(DigestFlow.md5)(Keep.right)
      .via(preprocessing)
      .toMat(FileIO.toPath(tmpPath)){
        (md5Fut, ioFut) => {
          for(
            md5Actual <- md5Fut;
            _ <- ioFut;
            done <- if(md5Actual == md5) Future(transactUpload()) else Future.failed(
              new ChecksumError(s"Expected MD5 checksum $md5, got $md5Actual")
            )
          ) yield done
        }.andThen{
          case Success(_) =>
            logExternalUpload(fn)
            if fn.time.isDefined && fn.extension.equalsIgnoreCase("zip") then
              setLastModifiedFromZipContents(targetFile, log)
            // Fire-and-forget: failures are logged by performUpload's error handler.
            performUploadIfNotTest(targetFile, fn, false)
          case Failure(_) => Files.deleteIfExists(tmpPath)
        }
      }
  end getFileSink

  /** Purges staged files older than FacadeService.OldFileMaxAge for the station. */
  def cleanupVeryOldFiles(station: StationId): Unit =
    deleteOldEtcFiles(getStationFolder(station))

  // Uploads from the configured test station are staged but never forwarded to CP.
  private[etcfacade] def performUploadIfNotTest(file: Path, fn: EtcFilename, forceDaily: Boolean): Future[Done] =
    if(fn.station == config.testStation) done else performUpload(file, fn, forceDaily)

  /**
   * Uploads `file` directly if it is not packageable into a daily archive; otherwise
   * (debounced) assembles the daily package from freshly staged half-hourlies plus
   * entries of previously uploaded package versions, and uploads it when complete
   * (or when forced for a past day). Freshly staged half-hourlies are deleted from
   * staging after a successful package upload; failures are logged via handleErrors.
   */
  private def performUpload(file: Path, fn: EtcFilename, forceDaily: Boolean): Future[Done] =
    fn.toDaily.fold(performEtcUpload(file, fn, None)){ daily =>
      debouncer.debounce(daily){
        getUploadedHalfHourlies(daily).flatMap{uploaded =>
          val stationFolder = getStationFolder(fn.station)
          val fresh = getZippableDailies(stationFolder, daily)
          // freshly staged entries override previously uploaded ones with the same name
          val filePackage = uploaded ++ fresh
          val isFullPackage: Boolean = packageIsComplete(filePackage)
          if !Files.exists(file) then done //the triggering file disappeared meanwhile
          else if isFullPackage && uploaded.isEmpty || forceDaily && isFromBeforeToday(daily)
          then
            zipToArchive(filePackage, daily).flatMap{
              (zipFile, hash) =>
                if !uploaded.isEmpty then log.info(
                  s"ETC facade will upload a new-version object $hash for daily file $daily"
                )
                performEtcUpload(zipFile, daily, Some(hash)).andThen{
                  case Success(_) =>
                    // package uploaded: the staged half-hourlies are no longer needed
                    fresh.foreach{(hhFn, _) =>
                      val hhFile = stationFolder.resolve(hhFn.toString)
                      Files.deleteIfExists(hhFile)
                    }
                  case Failure(_) =>
                    // clean up both the temp zip and the (possibly moved) object source
                    Files.deleteIfExists(zipFile)
                    val srcPath = getObjectSource(daily.station, hash)
                    Files.deleteIfExists(srcPath)
                }
            }
          else done //no uploads for incomplete or previously incomplete packages, unless forced
        }
      }
    }.andThen(handleErrors(fn.toString))

  /**
   * Registers the upload with the meta service, then uploads the object itself.
   * The SHA-256 of `file` is computed by streaming it, unless supplied in `hashOpt`.
   * The file is moved to its hash-named object-source location before the upload.
   */
  private def performEtcUpload(
    file: Path,
    fn: EtcFilename,
    hashOpt: Option[Sha256Sum],
  ): Future[Done] = hashOpt
    .map(Future.successful)
    .getOrElse(FileIO
      .fromPath(file)
      .viaMat(DigestFlow.sha256)(Keep.right)
      .to(Sink.ignore)
      .run()
    )
    .map(getUploadMeta(fn, _))
    .flatMap(etcMeta => metaClient.registerEtcUpload(etcMeta).map(_ => etcMeta))
    .flatMap{etcMeta =>
      val srcPath = getObjectSource(fn.station, etcMeta.hashSum)
      Files.move(file, srcPath, REPLACE_EXISTING)
      uploadDataObject(srcPath, fn.station, etcMeta.hashSum)
    }

  /**
   * Streams the object-source file into the internal upload service sink.
   * Deletes the source file on success; wraps any failure with station/hash context.
   */
  private def uploadDataObject(srcPath: Path, station: StationId, hash: Sha256Sum): Future[Done] = upload
    .getEtcSink(hash)
    .flatMap(FileIO.fromPath(srcPath).runWith)
    .flatMap{res =>
      res.makeReport.fold(
        errMsg => dataFail(errMsg),
        _ => done
      )
    }
    .transform(
      ok => {Files.delete(srcPath); ok},
      err => new Exception(s"ETC facade failure during internal object upload. Station $station, object $hash", err)
    )

  /** uploadDataObject for a staged object, with failures additionally logged. */
  private[etcfacade] def uploadDataObjectHandleErrors(station: StationId, hash: Sha256Sum): Future[Done] =
    val srcPath = getObjectSource(station, hash)
    uploadDataObject(srcPath, station, hash).andThen(
      handleErrors(hash.base64Url)
    )

  /**
   * Fetches info on previously uploaded versions of the daily package from the meta
   * service and lists the half-hourly zip entries of those with the proper daily
   * format. A half-hourly entry is taken from the first (ordered by submission end
   * time) uploaded version that contains it. Fails if `daily` is not a daily file.
   */
  private def getUploadedHalfHourlies(daily: EtcFilename): Future[DailyPackage] =
    EtcFilename
      .dailyFileFormats
      .get(daily.dataType)
      .fold(
        Future.failed(CpDataException(s"Not a daily file: $daily"))
      )(dailyFormat =>
        metaClient.getSameFilenameInfo(daily.toString).map(
          _.sortBy(_.submissionEnd).foldLeft(Map.empty){(acc, sfi) =>
            if sfi.format != dailyFormat then acc
            else
              val zipFile = upload.getFile(Some(sfi.format), sfi.hash, true)
              val halfHourlies = zip.listEntries(zipFile).get
                .flatMap(zentry => EtcFilename.parse(zentry.getName).toOption.map(_ -> zentry))
                .collect{
                  case (fn, zentry) if !acc.contains(fn) =>
                    fn -> ZipEntrySource.fileEntry(zipFile, zentry)
                }
              acc ++ halfHourlies
          }
        )
      )

  private def appendError(msg: String): Unit = appendLogMsgToFile(msg, "errorLog.txt")

  private def logExternalUpload(fn: EtcFilename): Unit = appendLogMsgToFile(fn.toString, "externalUploadsLog.txt")

  // Appends a timestamped line to a log file in the staging root, creating it if absent.
  private def appendLogMsgToFile(msg: String, fileName: String): Unit = {
    val msgFile = Paths.get(config.folder, fileName)
    val msgBytes = s"${Instant.now}\t$msg\n".getBytes(StandardCharsets.UTF_8)
    Files.write(msgFile, msgBytes, StandardOpenOption.APPEND, StandardOpenOption.CREATE)
  }

  // Logs upload failures both to the facade's error-log file and to the system log.
  private def handleErrors(uploadedObj: String): PartialFunction[Try[Done], Unit] =
    case Failure(err) =>
      appendError(s"Error while uploading $uploadedObj : " + UploadResult.extractMessage(err))
      log.error(err, s"ETC facade error while uploading $uploadedObj")

end FacadeService
object FacadeService:
  import ZipEntryFlow._

  /** A staged file's path paired with its parsed ETC filename. */
  type EtcFileInfo = (Path, EtcFilename)
  /** Contents of a daily zip package: half-hourly filename -> zip file entry. */
  type DailyPackage = Map[EtcFilename, FileEntry]

  /** Time of day when incomplete EC daily packages are force-uploaded. */
  val ForceEcUploadTime = LocalTime.of(4, 0) //is to be interpreted as UTC time
  /** Staged files older than this are purged (see cleanupVeryOldFiles). */
  val OldFileMaxAge = Duration.ofDays(30)

  /**
   * Derives the ETC upload metadata for a file from its name and content hash.
   * Half-hourly files (with a time in the name) span the 30 minutes up to that time;
   * daily files (no time) span the whole calendar day of the file's date.
   */
  def getUploadMeta(file: EtcFilename, hashSum: Sha256Sum) = EtcUploadMetadata(
    hashSum = hashSum,
    fileName = file.toString,
    station = file.station,
    logger = file.loggerNumber,
    dataType = file.dataType,
    fileId = file.fileNumber,
    acquisitionStart = file.time
      .map(LocalDateTime.of(file.date, _).minusMinutes(30))
      .getOrElse(LocalDateTime.of(file.date, LocalTime.MIN)),
    acquisitionStop = file.time
      .map(LocalDateTime.of(file.date, _))
      .getOrElse(LocalDateTime.of(file.date.plusDays(1), LocalTime.MIN))
  )

  /** Lists children of `folder` whose names parse as ETC filenames. */
  private def getEtcFiles(folder: Path): Vector[EtcFileInfo] = iterateChildren(folder){_
    .flatMap(p => EtcFilename.parse(p.getFileName.toString).toOption.map((p, _)))
    .toVector
  }

  /** Deletes ETC files in `folder` older than OldFileMaxAge (by creation time). */
  private def deleteOldEtcFiles(folder: Path): Unit =
    val now = Instant.now()
    getEtcFiles(folder).foreach: (path, filename) =>
      // age is measured from the file-system creation time, not last-modified time
      val created = Files.readAttributes(path, classOf[BasicFileAttributes]).creationTime().toInstant
      val age = Duration.between(created, now)
      if(age.compareTo(OldFileMaxAge) > 0) Files.deleteIfExists(path)

  /** Staged half-hourly files in `folder` that belong to the given daily package. */
  def getZippableDailies(folder: Path, dailyFile: EtcFilename): DailyPackage =
    getEtcFiles(folder).collect{
      case (path, fn) if fn.toDaily.contains(dailyFile) =>
        fn -> ZipEntryFlow.entryFromFile(path)
    }.toMap

  /**
   * Streams the package entries (sorted by entry name) into a temp zip file,
   * computing its SHA-256 on the fly. The temp file is deleted on failure.
   * @return the temp zip's path paired with its SHA-256 hash
   */
  def zipToArchive(files: DailyPackage, fn: EtcFilename)(using Materializer, ExecutionContext): Future[(Path, Sha256Sum)] =
    val tmpFile = Files.createTempFile(fn.toString, "")
    val fileEntries: Seq[FileEntry] = files.values.toSeq.sortBy(_._1.getName)

    val alreadyCompressed = fileEntries.forall{ (zentry, _) =>
      // BUGFIX: String.split(String) takes a regex, so split(".") matched every
      // character and always produced an empty array, making `ext` always None.
      // The Char overload splits on the literal dot.
      val ext = zentry.getName.split('.').lastOption.map(_.toLowerCase)
      ext.fold(false)(compressedExtensions.contains)
    }
    // if all entries are already-compressed formats, store them uncompressed
    // (presumably compression level 0 means "stored" — confirm against ZipEntryFlow)
    val compression: Option[Compression] = if(alreadyCompressed) Some(0) else None

    getMultiEntryZipStream(Source(fileEntries), compression)
      .viaMat(DigestFlow.sha256)(Keep.right)
      .toMat(FileIO.toPath(tmpFile))(Keep.both)
      .mapMaterializedValue{
        case (hashFut, ioFut) => {
          for(
            hash <- hashFut;
            _ <- ioFut
          ) yield tmpFile -> hash
        }.andThen{
          case Failure(_) =>
            Files.deleteIfExists(tmpFile)
        }
      }
      .run()

  /**
   * Sets the zip file's last-modified time from that of its FIRST entry.
   * Any failure (e.g. an empty zip) is only logged, never thrown.
   */
  def setLastModifiedFromZipContents(zipFile: Path, log: LoggingAdapter): Unit =
    Using(zip.open(zipFile.toFile))(
      _.entries().nextElement().getLastModifiedTime
    )
    .map(Files.setLastModifiedTime(zipFile, _))
    .failed
    .foreach{
      log.error(_, "Could not set last modified date of the uploaded zip from its contents")
    }

  // file extensions of formats that are already compressed (no point deflating them)
  val compressedExtensions = Set("zip", "jpg", "jpeg", "gz")

  /** True if the file's date is strictly before today's UTC date. */
  def isFromBeforeToday(fn: EtcFilename): Boolean = LocalDate.now(ZoneOffset.UTC).compareTo(fn.date) > 0

  // A daily package is complete when all 48 half-hourly slots are represented.
  private def packageIsComplete(pack: DailyPackage): Boolean =
    pack.keysIterator.flatMap(_.slot).toSet.size == 48

end FacadeService