-
Notifications
You must be signed in to change notification settings - Fork 14
/
orchestrator.py
364 lines (330 loc) · 13.7 KB
/
orchestrator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
import logging
import os
from typing import Optional
from uuid import UUID
from celery import states
from django.contrib.auth import get_user_model
from django.db.models import Q
from django.db.transaction import rollback
from django.utils import timezone
from django.utils.module_loading import import_string
from django_celery_results.models import TaskResult
from geonode.base.enumerations import STATE_INVALID, STATE_PROCESSED, STATE_RUNNING
from geonode.resource.models import ExecutionRequest
from geonode.upload.models import Upload
from rest_framework import serializers
from importer.api.exception import ImportException
from importer.api.serializer import ImporterSerializer
from importer.celery_app import importer_app
from importer.handlers.base import BaseHandler
from importer.utils import error_handler
from importer.utils import ImporterRequestAction as ira
logger = logging.getLogger(__name__)
class ImportOrchestrator:
    """
    Main import object. Is responsible to handle all the execution steps.

    Using the ExecutionRequest object, will extrapolate the information and
    it call the next step of the import pipeline.

    Params:
        enable_legacy_upload_status: default=True. If true, will save the
            upload progress also in the legacy upload system (the
            ``geonode.upload.models.Upload`` table) in addition to the
            ExecutionRequest tracking.
    """

    def __init__(self, enable_legacy_upload_status=True) -> None:
        # Feature toggle: when True, status changes are mirrored into the
        # legacy Upload model (see update_execution_request_status).
        self.enable_legacy_upload_status = enable_legacy_upload_status
def get_handler(self, _data) -> Optional[BaseHandler]:
    """
    Return an instance of the first registered handler that declares
    support for the given payload.

    Returns None when no registered handler matches, in which case the
    caller falls back to the legacy upload system.
    """
    matching = next(
        (
            handler_cls
            for handler_cls in BaseHandler.get_registry()
            if handler_cls.can_handle(_data)
        ),
        None,
    )
    if matching is not None:
        return matching()
    logger.error("Handler not found, fallback on the legacy upload system")
    return None
def get_serializer(self, _data) -> serializers.Serializer:
    """
    Return the first serializer class provided by a registered handler
    for the given payload; fall back to the default ImporterSerializer
    when none of the handlers supplies one.
    """
    candidates = (
        handler_cls.has_serializer(_data)
        for handler_cls in BaseHandler.get_registry()
    )
    serializer_cls = next((found for found in candidates if found), None)
    if serializer_cls:
        return serializer_cls
    logger.info("specific serializer not found, fallback on the default one")
    return ImporterSerializer
def load_handler(self, module_path):
    """
    Import and return the handler class located at the given dotted
    ``module_path``.

    Raises:
        ImportException: when the dotted path cannot be imported.
    """
    try:
        return import_string(module_path)
    except Exception as e:
        # Chain the original exception so the real import failure
        # (missing module, syntax error, ...) is not lost from the traceback.
        raise ImportException(
            detail=f"The handler is not available: {module_path}"
        ) from e
def get_execution_object(self, exec_id):
    """
    Return the ExecutionRequest object with the detail about the
    current execution.

    Raises:
        ImportException: when no execution with the given UUID exists.
    """
    # Single .first() call instead of exists() + first(), which would
    # issue two separate queries against the database.
    req = ExecutionRequest.objects.filter(exec_id=exec_id).first()
    if req is None:
        raise ImportException("The selected UUID does not exists")
    return req
def perform_next_step(
    self,
    execution_id: str,
    action: str,
    handler_module_path: str,
    step: Optional[str] = None,
    layer_name: Optional[str] = None,
    alternate: Optional[str] = None,
    **kwargs,
) -> None:
    """
    It takes the executionRequest detail to extract which was the last step
    and take from the task_lists provided by the ResourceType handler
    which will be the following step. If empty a None is returned, otherwise
    in async the next step is called.

    Args:
        execution_id: UUID (or its string form) of the ExecutionRequest.
        action: import action name used to select the handler task list.
        handler_module_path: dotted path of the handler to load.
        step: step to resume from; when None the step stored on the
            ExecutionRequest is used.
        layer_name / alternate: when both are provided, the step is
            executed for that specific layer and both values are forwarded
            to the celery task.
        **kwargs: extra keyword arguments forwarded to the celery task.

    Raises:
        Exception: any failure marks the execution as failed and is
            re-raised to the caller.
    """
    try:
        _exec_obj = self.get_execution_object(str(execution_id))
        if step is None:
            step = _exec_obj.step
        # retrieve the task list for the resource_type
        tasks = self.load_handler(handler_module_path).get_task_list(action=action)
        # getting the index of the NEXT task (raises ValueError if `step`
        # is not part of the handler's task list — caught below as failure)
        _index = tasks.index(step) + 1
        if _index == 1:
            """
            Means that the first task is available and we set the executions state as running
            So is updated only at the beginning keeping it in a consistent state
            """
            self.update_execution_request_status(
                execution_id=str(_exec_obj.exec_id),
                status=ExecutionRequest.STATUS_RUNNING,
            )
        # finding in the task_list the last step done; empty slice when
        # `step` was already the last entry of the list
        remaining_tasks = tasks[_index:] if not _index >= len(tasks) else []
        if not remaining_tasks:
            # The list of task is empty, it means that the process is finished
            self.evaluate_execution_progress(
                execution_id, handler_module_path=handler_module_path
            )
            return
        # getting the next step to perform
        next_step = next(iter(remaining_tasks))
        # calling the next step for the resource
        # defining the tasks parameter for the step
        task_params = (str(execution_id), handler_module_path, action)
        logger.info(f"STARTING NEXT STEP {next_step}")
        if layer_name and alternate:
            # if the layer and alternate is provided, means that we are executing the step specifically for a layer
            # so we add this information to the task_parameters to be sent to the next step
            logger.info(
                f"STARTING NEXT STEP {next_step} for resource: {layer_name}, alternate {alternate}"
            )
            """
            If layer name and alternate are provided, are sent as an argument
            for the next task step
            """
            # NOTE(review): the per-layer signature differs from the default
            # one (it also carries next_step/layer_name/alternate) — each
            # celery task presumably expects its own arity; confirm against
            # the task definitions before changing.
            task_params = (
                str(execution_id),
                next_step,
                layer_name,
                alternate,
                handler_module_path,
                action,
            )
        # continuing to the next step: dispatch asynchronously via celery
        importer_app.tasks.get(next_step).apply_async(task_params, kwargs)
        return execution_id
    except StopIteration:
        # means that the expected list of steps is completed
        logger.info("The whole list of tasks has been processed")
        self.set_as_completed(execution_id)
        return
    except Exception as e:
        # any other failure: persist the failure state, then re-raise
        self.set_as_failed(execution_id, reason=error_handler(e, execution_id))
        raise e
def set_as_failed(self, execution_id, reason=None):
    """
    Mark the ExecutionRequest identified by ``execution_id`` as failed,
    stamping the finish/update timestamps, storing ``reason`` in the log
    and mirroring the invalid state to the legacy upload record.
    """
    failure_kwargs = {
        "execution_id": str(execution_id),
        "status": ExecutionRequest.STATUS_FAILED,
        "finished": timezone.now(),
        "last_updated": timezone.now(),
        "log": reason,
        "legacy_status": STATE_INVALID,
    }
    self.update_execution_request_status(**failure_kwargs)
def set_as_partially_failed(self, execution_id, reason=None):
    """
    Mark the ExecutionRequest as failed while reporting that only some
    layers could not be imported; ``reason`` is the iterable of failed
    layer names joined into the log message.
    """
    partial_failure_kwargs = {
        "execution_id": str(execution_id),
        "status": ExecutionRequest.STATUS_FAILED,
        "finished": timezone.now(),
        "last_updated": timezone.now(),
        "log": f"The execution is completed, but the following layers are not imported: \n {', '.join(reason)}. Check the logs for additional infos",
        "legacy_status": STATE_INVALID,
    }
    self.update_execution_request_status(**partial_failure_kwargs)
def set_as_completed(self, execution_id):
    """
    Mark the ExecutionRequest identified by ``execution_id`` as
    successfully finished, stamping the completion timestamps and
    mirroring the processed state to the legacy upload record.
    """
    completion_kwargs = {
        "execution_id": str(execution_id),
        "status": ExecutionRequest.STATUS_FINISHED,
        "finished": timezone.now(),
        "last_updated": timezone.now(),
        "legacy_status": STATE_PROCESSED,
    }
    self.update_execution_request_status(**completion_kwargs)
def evaluate_execution_progress(
    self, execution_id, _log=None, handler_module_path=None
):
    # Deferred import — presumably to avoid a circular import between
    # this module and importer.models; confirm before moving to top level.
    from importer.models import ResourceHandlerInfo

    """
    The execution id is a mandatory argument for the task.
    We use that to filter out all the task execution that are still in progress:
    if any is failed, we raise it.

    Decides the final state of an execution by inspecting the celery
    TaskResult rows that mention the execution id:
      - tasks still pending  -> defer via _evaluate_last_dataset
      - at least one failure -> partially failed (if some layers imported)
                                or failed
      - otherwise            -> defer via _evaluate_last_dataset
    """
    _exec = self.get_execution_object(execution_id)
    # how many datasets the client declared vs how many were registered
    expected_dataset = _exec.input_params.get("total_layers", 0)
    actual_dataset = ResourceHandlerInfo.objects.filter(
        execution_request=_exec
    ).count()
    is_last_dataset = actual_dataset >= expected_dataset
    execution_id = str(execution_id)  # force it as string to be sure
    # celery stores task args/kwargs as text; the exec id may appear either
    # in its canonical dashed form or with dashes replaced by underscores
    lower_exec_id = execution_id.replace("-", "_").lower()
    exec_result = TaskResult.objects.filter(
        Q(task_args__icontains=lower_exec_id)
        | Q(task_kwargs__icontains=lower_exec_id)
        | Q(result__icontains=lower_exec_id)
        | Q(task_args__icontains=execution_id)
        | Q(task_kwargs__icontains=execution_id)
        | Q(result__icontains=execution_id)
    )
    # True when at least one resource was successfully registered
    _has_data = ResourceHandlerInfo.objects.filter(
        execution_request__exec_id=execution_id
    ).exists()
    # .all() is needed since we want to have the last status on the DB without take in consideration the cache
    if (
        exec_result.all()
        .exclude(Q(status=states.SUCCESS) | Q(status=states.FAILURE))
        .exists()
    ):
        # some tasks are neither SUCCESS nor FAILURE -> still running
        self._evaluate_last_dataset(
            is_last_dataset, _log, execution_id, handler_module_path
        )
    elif exec_result.all().filter(status=states.FAILURE).exists():
        """
        Should set it fail if all the execution are done and at least 1 is failed
        """
        # failed = [x.task_id for x in exec_result.filter(status=states.FAILURE)]
        # _log_message = f"For the execution ID {execution_id} The following celery task are failed: {failed}"
        if _has_data:
            # some layers imported, some failed -> partial failure;
            # deduplicate the failed layer names coming from output_params
            log = list(
                set(
                    self.get_execution_object(execution_id).output_params.get(
                        "failed_layers", ["Unknown"]
                    )
                )
            )
            logger.error(log)
            self.set_as_partially_failed(execution_id=execution_id, reason=log)
            self._last_step(execution_id, handler_module_path)
        elif is_last_dataset:
            self.set_as_failed(execution_id=execution_id, reason=_log)
        elif expected_dataset == 1 and not _has_data:
            # single-dataset import that produced nothing -> hard failure
            self.set_as_failed(execution_id=execution_id, reason=_log)
    else:
        # every task finished successfully
        self._evaluate_last_dataset(
            is_last_dataset, _log, execution_id, handler_module_path
        )
def _evaluate_last_dataset(
    self, is_last_dataset, _log, execution_id, handler_module_path
):
    """
    Finalize the execution when the last dataset has been processed:
    mark it failed when ``_log`` carries an ErrorDetail, otherwise run the
    handler's last step and mark the execution completed. When more
    datasets are still expected, just log and keep waiting.
    """
    if not is_last_dataset:
        logger.info(
            f"Execution progress with id {execution_id} is not finished yet, continuing"
        )
        return
    if _log and "ErrorDetail" in _log:
        self.set_as_failed(execution_id=execution_id, reason=_log)
        return
    logger.info(
        f"Execution with ID {execution_id} is completed. All tasks are done"
    )
    self._last_step(execution_id, handler_module_path)
    self.set_as_completed(execution_id)
def create_execution_request(
    self,
    user: get_user_model,
    func_name: str,
    step: str,
    input_params: dict = None,
    resource=None,
    legacy_upload_name="",
    action=None,
    name=None,
    source=None,
) -> UUID:
    """
    Create an execution request for the user. Return the UUID of the request.

    Args:
        user: owner of the execution.
        func_name: name of the import function being tracked.
        step: initial pipeline step stored on the request.
        input_params: request parameters (also copied into the legacy
            Upload metadata). Defaults to an empty dict.
        resource: optional geonode resource to attach.
        legacy_upload_name: explicit name for the legacy Upload row;
            when empty, the base_file's basename is used.
        action / name / source: stored verbatim on the ExecutionRequest.
    """
    # Avoid the shared-mutable-default pitfall: a dict default would be
    # reused (and potentially mutated) across calls.
    if input_params is None:
        input_params = {}
    execution = ExecutionRequest.objects.create(
        user=user,
        geonode_resource=resource,
        func_name=func_name,
        step=step,
        input_params=input_params,
        action=action,
        name=name,
        source=source,
    )
    if self.enable_legacy_upload_status:
        # getting the package name from the base_filename; fall back to an
        # empty name instead of crashing when files/base_file is missing
        base_file = input_params.get("files", {}).get("base_file") or ""
        Upload.objects.create(
            name=legacy_upload_name or os.path.basename(base_file),
            state=STATE_RUNNING,
            user=user,
            metadata={
                **{
                    "func_name": func_name,
                    "step": step,
                    "exec_id": str(execution.exec_id),
                },
                **input_params,
            },
        )
    return execution.exec_id
def update_execution_request_status(
    self,
    execution_id,
    status=None,
    legacy_status=STATE_RUNNING,
    celery_task_request=None,
    **kwargs,
):
    """
    Update the execution request status and also the legacy upload status if the
    feature toggle is enabled.

    Args:
        execution_id: id of the ExecutionRequest to update.
        status: new ExecutionRequest status; skipped when None.
        legacy_status: state mirrored onto the legacy Upload row.
        celery_task_request: when provided, its args are persisted on the
            matching TaskResult row.
        **kwargs: extra ExecutionRequest fields to update (log, finished, ...).
    """
    if status is not None:
        kwargs["status"] = status
    ExecutionRequest.objects.filter(exec_id=execution_id).update(**kwargs)
    if self.enable_legacy_upload_status:
        # NOTE(review): this REPLACES the legacy Upload.metadata with the
        # update kwargs + exec_id rather than merging with the existing
        # metadata — confirm this is intentional before relying on
        # metadata written at creation time surviving an update.
        Upload.objects.filter(metadata__contains=execution_id).update(
            state=legacy_status,
            complete=True,
            metadata={**kwargs, **{"exec_id": execution_id}},
        )
    if celery_task_request:
        TaskResult.objects.filter(task_id=celery_task_request.id).update(
            task_args=celery_task_request.args
        )
def _last_step(self, execution_id, handler_module_path):
    """
    Last hookable step for each handler before the execution is marked
    as completed. Handlers customize it by overriding perform_last_step.
    No-op when no handler module path is provided.
    """
    if handler_module_path:
        handler = self.load_handler(handler_module_path)
        return handler.perform_last_step(execution_id)
    return None
orchestrator = ImportOrchestrator(enable_legacy_upload_status=False)