Skip to content

Commit

Permalink
Merge pull request #11196 from MinaProtocol/better-intg-test-exit-codes
Browse files Browse the repository at this point in the history
Better intg test exit codes
  • Loading branch information
mrmr1993 committed Jun 7, 2022
2 parents 27573b7 + 8d69662 commit ea4506c
Show file tree
Hide file tree
Showing 9 changed files with 218 additions and 105 deletions.
9 changes: 9 additions & 0 deletions src/app/test_executive/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,12 @@ Any Integration test first creates a whole new testnet from scratch, and then ru
- structured log events are not an integration test construct, they are defined in various places around the protocol code. For example, the `Rejecting_command_for_reason` structured event is defined in `network_pool/transaction_pool.ml`.
- The structured log events that matter to the integration test are in `src/lib/integration_test_lib/event_type.ml`. On the integration-test side, events are triggered by logic defined in each event type's `parse` function, which parses messages from the logs, often by matching against exact strings.
- Please bear in mind that the nodes on GCP run the image that you link in your argument; they do NOT run whatever code you have locally. Only the test executive itself runs from local code. If you make a change to the protocol code, it first needs to be pushed to CI, where CI will bake a fresh image; that image can then be obtained and run on the nodes.

# Exit codes

- Exit code `4` will be returned if not all pods were assigned to nodes and ready in time.
- Exit code `5` will be returned if some pods could not be found.
- Exit code `6` will be returned if `kubectl` exited with a non-zero code or a signal while attempting to retrieve logs.
- Exit code `10` will be returned if `kubectl` exited with a non-zero code or a signal while attempting to run a command in a container. This exit code is the general case of such errors; the subsequent exit codes are preferred in more specific cases.
- Exit code `11` will be returned if `kubectl` exited with a non-zero code or a signal while attempting to run a node's `start.sh` script in a container.
- Exit code `12` will be returned if `kubectl` exited with a non-zero code or a signal while attempting to run a node's `stop.sh` script in a container.
36 changes: 18 additions & 18 deletions src/app/test_executive/test_executive.ml
Original file line number Diff line number Diff line change
Expand Up @@ -160,17 +160,21 @@ let report_test_errors ~log_error_set ~internal_error_set =
false
in
Print.eprintf "\n" ;
let result =
let exit_code =
if test_failed then (
color_eprintf Bash_colors.red
"The test has failed. See the above errors for details.\n\n" ;
false )
match (internal_error_set.exit_code, log_error_set.exit_code) with
| None, None ->
Some 1
| Some exit_code, _ | None, Some exit_code ->
Some exit_code )
else (
color_eprintf Bash_colors.green "The test has completed successfully.\n\n" ;
true )
None )
in
let%bind () = Writer.(flushed (Lazy.force stderr)) in
return result
return exit_code

(* TODO: refactor cleanup system (smells like a monad for composing linear resources would help a lot) *)

Expand All @@ -192,15 +196,15 @@ let dispatch_cleanup ~logger ~pause_cleanup_func ~network_cleanup_func
let open Test_error.Set in
combine [ test_error_set; of_hard_or_error log_engine_cleanup_result ]
in
let%bind test_was_successful =
let%bind exit_code =
report_test_errors ~log_error_set ~internal_error_set
in
let%bind () = pause_cleanup_func () in
let%bind () =
Option.value_map !net_manager_ref ~default:Deferred.unit
~f:network_cleanup_func
in
if not test_was_successful then exit 1 else Deferred.unit
Deferred.Option.map ~f:exit (return exit_code) >>| ignore
in
match !cleanup_deferred_ref with
| Some deferred ->
Expand Down Expand Up @@ -298,20 +302,20 @@ let main inputs =
~test_result:(Malleable_error.hard_error_string "fatal error") )
in
Monitor.try_with ~here:[%here] ~extract_exn:false (fun () ->
let init_result =
let open Deferred.Or_error.Let_syntax in
let lift = Deferred.map ~f:Or_error.return in
let open Malleable_error.Let_syntax in
let%bind network, dsl =
let lift = Deferred.bind ~f:Malleable_error.or_hard_error in
[%log trace] "initializing network manager" ;
let%bind net_manager =
lift @@ Engine.Network_manager.create ~logger network_config
Engine.Network_manager.create ~logger network_config
in
net_manager_ref := Some net_manager ;
[%log trace] "deploying network" ;
let%bind network =
lift @@ Engine.Network_manager.deploy net_manager
in
let%bind network = Engine.Network_manager.deploy net_manager in
[%log trace] "initializing log engine" ;
let%map log_engine = Engine.Log_engine.create ~logger ~network in
let%map log_engine =
lift @@ Engine.Log_engine.create ~logger ~network
in
log_engine_ref := Some log_engine ;
let event_router =
Dsl.Event_router.create ~logger
Expand All @@ -330,10 +334,6 @@ let main inputs =
in
(network, dsl)
in
let open Malleable_error.Let_syntax in
let%bind network, dsl =
Deferred.bind init_result ~f:Malleable_error.or_hard_error
in
[%log trace] "initializing network abstraction" ;
let%bind () = Engine.Network.initialize_infra ~logger network in

Expand Down
121 changes: 80 additions & 41 deletions src/lib/integration_test_cloud_engine/kubernetes_network.ml
Original file line number Diff line number Diff line change
Expand Up @@ -38,38 +38,49 @@ module Node = struct
Option.value container_id ~default:info.primary_container_id
in
let%bind cwd = Unix.getcwd () in
Integration_test_lib.Util.run_cmd_exn cwd "kubectl"
Integration_test_lib.Util.run_cmd_or_hard_error ~exit_code:6 cwd "kubectl"
(base_kube_args config @ [ "logs"; "-c"; container_id; pod_id ])

let run_in_container ?container_id ~cmd { pod_id; config; info; _ } =
let run_in_container ?(exit_code = 10) ?container_id ~cmd t =
let { pod_id; config; info; _ } = t in
let container_id =
Option.value container_id ~default:info.primary_container_id
in
let%bind cwd = Unix.getcwd () in
Integration_test_lib.Util.run_cmd_exn cwd "kubectl"
Integration_test_lib.Util.run_cmd_or_hard_error ~exit_code cwd "kubectl"
( base_kube_args config
@ [ "exec"; "-c"; container_id; "-i"; pod_id; "--" ]
@ cmd )

let cp_string_to_container_file ?container_id ~str ~dest t =
let { pod_id; config; info; _ } = t in
let container_id =
Option.value container_id ~default:info.primary_container_id
in
let tmp_file, oc =
Caml.Filename.open_temp_file ~temp_dir:Filename.temp_dir_name
"integration_test_cp_string" ".tmp"
in
Out_channel.output_string oc str ;
Out_channel.close oc ;
let%bind cwd = Unix.getcwd () in
let dest_file = sprintf "%s/%s:%s" config.namespace pod_id dest in
Integration_test_lib.Util.run_cmd_or_error cwd "kubectl"
(base_kube_args config @ [ "cp"; "-c"; container_id; tmp_file; dest_file ])

let start ~fresh_state node : unit Malleable_error.t =
let open Deferred.Let_syntax in
let open Malleable_error.Let_syntax in
let%bind () =
if fresh_state then
Deferred.ignore_m
(run_in_container node ~cmd:[ "sh"; "-c"; "rm -rf .mina-config/*" ])
else Deferred.return ()
in
let%bind () =
Deferred.ignore_m (run_in_container node ~cmd:[ "/start.sh" ])
run_in_container node ~cmd:[ "sh"; "-c"; "rm -rf .mina-config/*" ]
>>| ignore
else Malleable_error.return ()
in
Malleable_error.return ()
run_in_container ~exit_code:11 node ~cmd:[ "/start.sh" ] >>| ignore

let stop node =
let open Deferred.Let_syntax in
let%bind () =
Deferred.ignore_m (run_in_container node ~cmd:[ "/stop.sh" ])
in
Malleable_error.return ()
let open Malleable_error.Let_syntax in
run_in_container ~exit_code:12 node ~cmd:[ "/stop.sh" ] >>| ignore

let logger_metadata node =
[ ("namespace", `String node.config.namespace)
Expand Down Expand Up @@ -594,26 +605,54 @@ module Node = struct
[%log info] "Dumping archive data from (node: %s, container: %s)" t.pod_id
mina_archive_container_id ;
let%map data =
Deferred.bind ~f:Malleable_error.return
(run_in_container t ~container_id:mina_archive_container_id
~cmd:
[ "pg_dump"
; "--create"
; "--no-owner"
; "postgres://postgres:foobar@archive-1-postgresql:5432/archive"
] )
run_in_container t ~container_id:mina_archive_container_id
~cmd:
[ "pg_dump"
; "--create"
; "--no-owner"
; "postgres://postgres:foobar@archive-1-postgresql:5432/archive"
]
in
[%log info] "Dumping archive data to file %s" data_file ;
Out_channel.with_file data_file ~f:(fun out_ch ->
Out_channel.output_string out_ch data )

let run_replayer ~logger (t : t) =
[%log info] "Running replayer on archived data (node: %s, container: %s)"
t.pod_id mina_archive_container_id ;
let open Malleable_error.Let_syntax in
let%bind accounts =
run_in_container t
~cmd:[ "jq"; "-c"; ".ledger.accounts"; "/config/daemon.json" ]
in
let replayer_input =
sprintf
{| { "genesis_ledger": { "accounts": %s, "add_genesis_winner": true }} |}
accounts
in
let dest = "replayer-input.json" in
let%bind _res =
Deferred.bind ~f:Malleable_error.return
(cp_string_to_container_file t ~container_id:mina_archive_container_id
~str:replayer_input ~dest )
in
run_in_container t ~container_id:mina_archive_container_id
~cmd:
[ "mina-replayer"
; "--archive-uri"
; "postgres://postgres:foobar@archive-1-postgresql:5432/archive"
; "--input-file"
; dest
; "--output-file"
; "/dev/null"
; "--continue-on-error"
]

let dump_mina_logs ~logger (t : t) ~log_file =
let open Malleable_error.Let_syntax in
[%log info] "Dumping container logs from (node: %s, container: %s)" t.pod_id
t.info.primary_container_id ;
let%map logs =
Deferred.bind ~f:Malleable_error.return (get_logs_in_container t)
in
let%map logs = get_logs_in_container t in
[%log info] "Dumping container log to file %s" log_file ;
Out_channel.with_file log_file ~f:(fun out_ch ->
Out_channel.output_string out_ch logs )
Expand All @@ -623,9 +662,7 @@ module Node = struct
[%log info]
"Dumping precomputed blocks from logs for (node: %s, container: %s)"
t.pod_id t.info.primary_container_id ;
let%bind logs =
Deferred.bind ~f:Malleable_error.return (get_logs_in_container t)
in
let%bind logs = get_logs_in_container t in
(* kubectl logs may include non-log output, like "Using password from environment variable" *)
let log_lines =
String.split logs ~on:'\n'
Expand Down Expand Up @@ -740,18 +777,20 @@ module Workload = struct

let get_nodes t ~config =
let%bind cwd = Unix.getcwd () in
let open Malleable_error.Let_syntax in
let%bind app_id =
Integration_test_lib.Util.run_cmd_exn cwd "kubectl"
( base_kube_args config
@ [ "get"
; "deployment"
; t.workload_id
; "-o"
; "jsonpath={.spec.selector.matchLabels.app}"
] )
Deferred.bind ~f:Malleable_error.or_hard_error
(Integration_test_lib.Util.run_cmd_or_error cwd "kubectl"
( base_kube_args config
@ [ "get"
; "deployment"
; t.workload_id
; "-o"
; "jsonpath={.spec.selector.matchLabels.app}"
] ) )
in
let%map pod_ids_str =
Integration_test_lib.Util.run_cmd_exn cwd "kubectl"
Integration_test_lib.Util.run_cmd_or_hard_error cwd "kubectl"
( base_kube_args config
@ [ "get"; "pod"; "-l"; "app=" ^ app_id; "-o"; "name" ] )
in
Expand Down Expand Up @@ -874,7 +913,7 @@ let initialize_infra ~logger network =
poll (n + 1)
else (
[%log fatal] "Not all pods were assigned to nodes and ready in time." ;
Malleable_error.hard_error_string
Malleable_error.hard_error_string ~exit_code:4
"Some pods either were not assigned to nodes or did not deploy \
properly." )
in
Expand All @@ -895,7 +934,7 @@ let initialize_infra ~logger network =
"Not all pods were found when querying namespace; this indicates a \
deployment error. Refusing to continue. Expected pods: [%s]"
(String.Set.elements all_pods |> String.concat ~sep:"; ") ;
Malleable_error.hard_error_string
Malleable_error.hard_error_string ~exit_code:5
"Some pods were not found in namespace." )
else if any_pods_are_not_running then (
let failed_pod_statuses =
Expand Down
49 changes: 34 additions & 15 deletions src/lib/integration_test_cloud_engine/mina_automation.ml
Original file line number Diff line number Diff line change
Expand Up @@ -347,9 +347,13 @@ module Network_manager = struct

let run_cmd_exn t prog args = Util.run_cmd_exn t.testnet_dir prog args

let run_cmd_or_hard_error t prog args =
Util.run_cmd_or_hard_error t.testnet_dir prog args

let create ~logger (network_config : Network_config.t) =
let open Malleable_error.Let_syntax in
let%bind all_namespaces_str =
Util.run_cmd_exn "/" "kubectl"
Util.run_cmd_or_hard_error "/" "kubectl"
[ "get"; "namespaces"; "-ojsonpath={.items[*].metadata.name}" ]
in
let all_namespaces = String.split ~on:' ' all_namespaces_str in
Expand All @@ -364,21 +368,23 @@ module Network_manager = struct
then
let%bind () =
if network_config.debug_arg then
Util.prompt_continue
"Existing namespace of same name detected, pausing startup. \
Enter [y/Y] to continue on and remove existing namespace, start \
clean, and run the test; press Cntrl-C to quit out: "
Deferred.bind ~f:Malleable_error.return
(Util.prompt_continue
"Existing namespace of same name detected, pausing startup. \
Enter [y/Y] to continue on and remove existing namespace, \
start clean, and run the test; press Cntrl-C to quit out: " )
else
Deferred.return
Malleable_error.return
([%log info]
"Existing namespace of same name detected; removing to start \
clean" )
in
Util.run_cmd_exn "/" "kubectl"
Util.run_cmd_or_hard_error "/" "kubectl"
[ "delete"; "namespace"; network_config.terraform.testnet_name ]
>>| Fn.const ()
else return ()
in
let open Deferred.Let_syntax in
let%bind () =
if%bind File_system.dir_exists testnet_dir then (
[%log info] "Old terraform directory found; removing to start clean" ;
Expand Down Expand Up @@ -482,16 +488,20 @@ module Network_manager = struct
~f:(fun { keypair; _ } -> keypair)
}
in
let open Malleable_error.Let_syntax in
[%log info] "Initializing terraform" ;
let%bind _ = run_cmd_exn t "terraform" [ "init" ] in
let%map _ = run_cmd_exn t "terraform" [ "validate" ] in
let%bind _ = run_cmd_or_hard_error t "terraform" [ "init" ] in
let%map _ = run_cmd_or_hard_error t "terraform" [ "validate" ] in
t

let deploy t =
let open Malleable_error.Let_syntax in
let logger = t.logger in
if t.deployed then failwith "network already deployed" ;
[%log info] "Deploying network" ;
let%bind _ = run_cmd_exn t "terraform" [ "apply"; "-auto-approve" ] in
let%bind _ =
run_cmd_or_hard_error t "terraform" [ "apply"; "-auto-approve" ]
in
t.deployed <- true ;
let config : Kubernetes_network.config =
{ testnet_name = t.testnet_name
Expand All @@ -501,20 +511,25 @@ module Network_manager = struct
}
in
let%map seeds =
Deferred.List.concat_map t.seed_workloads
Malleable_error.List.map t.seed_workloads
~f:(Kubernetes_network.Workload.get_nodes ~config)
>>| List.concat
and block_producers =
Deferred.List.concat_map t.block_producer_workloads
Malleable_error.List.map t.block_producer_workloads
~f:(Kubernetes_network.Workload.get_nodes ~config)
>>| List.concat
and snark_coordinators =
Deferred.List.concat_map t.snark_coordinator_workloads
Malleable_error.List.map t.snark_coordinator_workloads
~f:(Kubernetes_network.Workload.get_nodes ~config)
>>| List.concat
and snark_workers =
Deferred.List.concat_map t.snark_worker_workloads
Malleable_error.List.map t.snark_worker_workloads
~f:(Kubernetes_network.Workload.get_nodes ~config)
>>| List.concat
and archive_nodes =
Deferred.List.concat_map t.archive_workloads
Malleable_error.List.map t.archive_workloads
~f:(Kubernetes_network.Workload.get_nodes ~config)
>>| List.concat
in
let all_nodes =
seeds @ block_producers @ snark_coordinators @ snark_workers
Expand Down Expand Up @@ -564,4 +579,8 @@ module Network_manager = struct
[%log' info t.logger] "Cleaning up network configuration" ;
let%bind () = File_system.remove_dir t.testnet_dir in
Deferred.unit

let destroy t =
Deferred.Or_error.try_with (fun () -> destroy t)
|> Deferred.bind ~f:Malleable_error.or_hard_error
end

0 comments on commit ea4506c

Please sign in to comment.