chore(tests): heavy integration to use large dataset (#429)
* chore(tests): heavy integration to use large dataset

* fix(cfg): `lazy_static` persists across calls (bad), and is not needed (#431)

* chore(tests): heavy integration tests to use large dataset
Jon-Becker committed Jun 6, 2024
1 parent 773acc6 commit e8ef5b1
Showing 9 changed files with 259 additions and 215 deletions.
4 changes: 4 additions & 0 deletions .config/nextest.toml
@@ -1,2 +1,6 @@
 [profile.default]
 retries = { backoff = "exponential", count = 2, delay = "2s", jitter = true }
+
+[[profile.default.overrides]]
+filter = 'test(heavy_integration_test)'
+retries = 0
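
Note: nextest's `test()` filterset predicate matches any test whose name contains the given substring, so this override pins retries to zero for exactly the heavy tests introduced below. A minimal sketch of a test the filter would match (illustrative only):

// Illustrative sketch: a test matched by filter = 'test(heavy_integration_test)'.
// nextest matches against the full test name, e.g. integration_tests::heavy_integration_test.
#[test]
#[ignore] // excluded from normal runs; the nightly workflow opts in with `-- --ignored`
fn heavy_integration_test() {
    // long-running work against the large dataset; a flaky failure here is
    // expensive to repeat, hence retries = 0 for this test alone
}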
75 changes: 41 additions & 34 deletions .github/workflows/heavy-integration.yml
@@ -1,45 +1,52 @@
 name: heavy integration
 
 on:
-  schedule:
-    # Runs at 10PM utc
-    - cron: "0 22 * * *"
-  workflow_dispatch:
+    schedule:
+        # Runs at 10PM utc
+        - cron: "0 22 * * *"
+    workflow_dispatch:
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
+    group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+    cancel-in-progress: true
 
 env:
-  CARGO_TERM_COLOR: always
+    CARGO_TERM_COLOR: always
 
 jobs:
-  heavy-integration:
-    name: heavy (long-running) integration tests
-    runs-on: ubuntu-latest
-    timeout-minutes: 120
+    heavy-integration:
+        name: heavy (long-running) integration tests
+        runs-on: ubuntu-latest
+        timeout-minutes: 120
 
-    steps:
-      - uses: actions/checkout@v3
-      - uses: dtolnay/rust-toolchain@stable
-      - uses: taiki-e/install-action@nextest
-      - name: Run Tests
-        run: |
-          cargo nextest r --no-fail-fast --release --nocapture -- --ignored
+        steps:
+            - uses: actions/checkout@v3
+            - uses: dtolnay/rust-toolchain@stable
+            - uses: taiki-e/install-action@nextest
+            - name: Fetch Dataset
+              run: |
+                  # download from https://jbecker.dev/data/largest1k.tar.gz
+                  wget https://jbecker.dev/data/largest1k.tar.gz
+
+                  # extract the dataset
+                  tar -xvf largest1k.tar.gz
+            - name: Run Tests
+              run: |
+                  cargo nextest r --no-fail-fast --release --nocapture -- --ignored
 
-  # If any of the jobs fail, this will create a high-priority issue to signal so.
-  issue:
-    name: Open an issue
-    runs-on: ubuntu-latest
-    needs: heavy-integration
-    if: ${{ failure() }}
-    steps:
-      - uses: actions/checkout@v4
-      - uses: JasonEtco/create-an-issue@v2
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          WORKFLOW_URL: |
-            ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-        with:
-          update_existing: true
-          filename: .github/INTEGRATION_FAILURE.md
+    # If any of the jobs fail, this will create a high-priority issue to signal so.
+    issue:
+        name: Open an issue
+        runs-on: ubuntu-latest
+        needs: heavy-integration
+        if: ${{ failure() }}
+        steps:
+            - uses: actions/checkout@v4
+            - uses: JasonEtco/create-an-issue@v2
+              env:
+                  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+                  WORKFLOW_URL: |
+                      ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+              with:
+                  update_existing: true
+                  filename: .github/INTEGRATION_FAILURE.md
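
To run the heavy suite locally, the Fetch Dataset step can be mirrored with a small helper. A sketch using only std, shelling out to the same wget and tar commands as the workflow (assumes both tools are on PATH and the repository root is the working directory):

// Local stand-in for the workflow's Fetch Dataset step (assumption: wget and
// tar are installed; run from the repository root).
use std::process::Command;

fn main() {
    let fetched = Command::new("wget")
        .arg("https://jbecker.dev/data/largest1k.tar.gz")
        .status()
        .expect("failed to spawn wget");
    assert!(fetched.success(), "download failed");

    let extracted = Command::new("tar")
        .args(["-xvf", "largest1k.tar.gz"])
        .status()
        .expect("failed to spawn tar");
    assert!(extracted.success(), "extraction failed");
}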
2 changes: 2 additions & 0 deletions .gitignore
@@ -34,3 +34,5 @@ false/*
 
 *.svg
 *.sh
+
+largest1k
65 changes: 7 additions & 58 deletions crates/cfg/src/core/graph.rs
@@ -1,19 +1,9 @@
-use std::{collections::HashMap, sync::Mutex};
-
 use ethers::prelude::U256;
-use eyre::{eyre, OptionExt, Result};
+use eyre::{OptionExt, Result};
 use heimdall_common::utils::strings::encode_hex_reduced;
 use heimdall_vm::ext::exec::VMTrace;
 use petgraph::{matrix_graph::NodeIndex, Graph};
 
-use lazy_static::lazy_static;
-
-lazy_static! {
-    static ref INSTRUCTION_NODE_MAP: Mutex<HashMap<u128, NodeIndex<u32>>> =
-        Mutex::new(HashMap::new());
-    static ref CONNECTING_EDGES: Mutex<Vec<String>> = Mutex::new(Vec::new());
-}
-
 /// convert a symbolic execution [`VMTrace`] into a [`Graph`] of blocks, illustrating the
 /// control-flow graph found by the symbolic execution engine.
 // TODO: should this be a trait for VMTrace to implement?
@@ -56,53 +46,12 @@ pub fn build_cfg(
         cfg_node.push_str(&format!("{}\n", &assembly));
     }
 
-    // check if the map already contains the current node
-    let mut instruction_node_map =
-        INSTRUCTION_NODE_MAP.lock().map_err(|_| eyre!("failed to lock instruction node map"))?;
-    let chunk_index = match vm_trace.operations.first() {
-        Some(operation) => operation.last_instruction.instruction,
-        None => 0,
-    };
-
-    match instruction_node_map.get(&chunk_index) {
-        Some(node_index) => {
-            // this node already exists, so we need to add an edge to it.
-            if let Some(parent_node) = parent_node {
-                // check if the edge already exists
-                let mut connecting_edges = CONNECTING_EDGES
-                    .lock()
-                    .map_err(|_| eyre!("failed to lock connecting edges"))?;
-                let edge = format!("{} -> {}", parent_node.index(), node_index.index());
-                if !connecting_edges.contains(&edge) {
-                    contract_cfg.add_edge(parent_node, *node_index, jump_taken.to_string());
-                    connecting_edges.push(edge);
-                }
-                drop(connecting_edges)
-            }
-        }
-        None => {
-            // this node does not exist, so we need to add it to the map and the graph
-            let node_index = contract_cfg.add_node(cfg_node);
-
-            if let Some(parent_node) = parent_node {
-                // check if the edge already exists
-                let mut connecting_edges = CONNECTING_EDGES
-                    .lock()
-                    .map_err(|_| eyre!("failed to lock connecting edges"))?;
-                let edge = format!("{} -> {}", parent_node.index(), node_index.index());
-                if !connecting_edges.contains(&edge) {
-                    contract_cfg.add_edge(parent_node, node_index, jump_taken.to_string());
-                    connecting_edges.push(edge);
-                }
-                drop(connecting_edges)
-            }
-
-            instruction_node_map.insert(chunk_index, node_index);
-            parent_node = Some(node_index);
-        }
-    };
-
-    drop(instruction_node_map);
+    // add the node to the graph
+    let node_index = contract_cfg.add_node(cfg_node);
+    if let Some(parent_node) = parent_node {
+        contract_cfg.update_edge(parent_node, node_index, jump_taken.to_string());
+    }
+    parent_node = Some(node_index);
 
     // recurse into the children of the VMTrace map
    for child in vm_trace.children.iter() {
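
The replacement works because petgraph's `Graph::update_edge` already deduplicates: if an edge from `parent_node` to `node_index` exists, its weight is updated in place rather than a second edge being added, which is what the removed `CONNECTING_EDGES` set was emulating (while leaking state across calls). A minimal sketch (node labels invented):

// Minimal sketch of the deduplication update_edge provides, replacing the
// removed CONNECTING_EDGES bookkeeping; node labels are made up.
use petgraph::Graph;

fn main() {
    let mut cfg: Graph<String, String> = Graph::new();
    let entry = cfg.add_node(String::from("PUSH1 0x80"));
    let body = cfg.add_node(String::from("JUMPDEST"));

    cfg.update_edge(entry, body, String::from("true"));
    cfg.update_edge(entry, body, String::from("true")); // updates in place, no duplicate

    assert_eq!(cfg.edge_count(), 1);
}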
71 changes: 70 additions & 1 deletion crates/core/tests/test_cfg.rs
@@ -1,7 +1,11 @@
 #[cfg(test)]
 mod integration_tests {
-    use heimdall_cfg::CFGArgs;
+    use std::path::PathBuf;
+
+    use heimdall_cfg::{cfg, CFGArgs, CFGArgsBuilder};
     use heimdall_common::utils::io::file::delete_path;
+    use petgraph::dot::Dot;
+    use serde_json::Value;
 
     #[tokio::test]
     async fn test_cfg_simple() {
@@ -57,4 +61,69 @@ mod integration_tests {
             assert!(output.contains(line))
         }
     }
+
+    #[tokio::test]
+    #[ignore]
+    async fn heavy_integration_test() {
+        let root_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+            .parent()
+            .expect("no parent")
+            .parent()
+            .expect("no parent")
+            .to_owned();
+
+        // if the ./largest1k directory does not exist, download it from https://jbecker.dev/data/largest1k.tar.gz
+        let dataset_dir = root_dir.join("largest1k");
+        if !dataset_dir.exists() {
+            eprintln!("dataset not found in root, skipping test");
+            std::process::exit(0);
+        }
+
+        // list files in root_dir
+        let contracts = std::fs::read_dir(dataset_dir)
+            .expect("failed to read dataset directory")
+            .map(|res| {
+                // HashMap from filename (without extension) to bytecode (from serde_json::Value)
+                res.map(|e| {
+                    let path = e.path();
+                    let filename = path
+                        .file_stem()
+                        .expect("no file stem")
+                        .to_str()
+                        .expect("no file stem")
+                        .to_owned();
+
+                    // read contents as json and parse to serde_json::Value
+                    let contents_json: Value = serde_json::from_str(
+                        &std::fs::read_to_string(path).expect("failed to read file"),
+                    )
+                    .expect("failed to parse json");
+                    let bytecode = contents_json["code"].as_str().expect("no bytecode").to_owned();
+
+                    (filename, bytecode)
+                })
+            })
+            .collect::<Result<Vec<_>, std::io::Error>>()
+            .expect("failed to collect files");
+
+        for (contract_address, bytecode) in contracts {
+            println!("Generating CFG for contract {contract_address}");
+            let args = CFGArgsBuilder::new()
+                .target(bytecode)
+                .timeout(10000)
+                .output(String::from("./output/tests/cfg/integration"))
+                .build()
+                .expect("failed to build args");
+
+            let _ = cfg(args)
+                .await
+                .map_err(|e| {
+                    eprintln!("failed to generate cfg for contract {contract_address}: {e}");
+                    e
+                })
+                .expect("failed to generate cfg");
+        }
+
+        delete_path(&String::from("./output/tests/cfg/integration"));
+    }
 }
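
The test assumes each file in largest1k is named after a contract address and contains a JSON object whose "code" field holds the runtime bytecode. A self-contained sketch of that per-file parse (the bytecode fragment is invented):

// Sketch of the parse performed for each dataset file; the bytecode here is a
// made-up fragment, real files carry full runtime bytecode under "code".
use serde_json::Value;

fn main() {
    let raw = r#"{"code": "0x6080604052"}"#;
    let parsed: Value = serde_json::from_str(raw).expect("failed to parse json");
    let bytecode = parsed["code"].as_str().expect("no bytecode").to_owned();
    assert_eq!(bytecode, "0x6080604052");
}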
16 changes: 5 additions & 11 deletions crates/core/tests/test_decode.rs
@@ -1,6 +1,7 @@
 #[cfg(test)]
-mod tests {
-    use heimdall_decoder::DecodeArgs;
+mod integration_tests {
+    use heimdall_common::utils::{sync::blocking_await, threading::task_pool};
+    use heimdall_decoder::{DecodeArgs, DecodeArgsBuilder};
+    use serde_json::Value;
 
     #[tokio::test]
     async fn test_decode_transfer() {
@@ -31,17 +32,10 @@ mod tests {
         };
         let _ = heimdall_decoder::decode(args).await;
     }
-}
-
-mod integration_tests {
-    use heimdall_common::utils::{sync::blocking_await, threading::task_pool};
-    use heimdall_decoder::DecodeArgsBuilder;
-    use serde_json::Value;
 
     /// Thorough testing for decode across a large number of transactions.
     #[test]
     #[ignore]
-    fn heavy_test_decode_thorough() {
+    fn heavy_integration_test() {
         let rpc_url = std::env::var("RPC_URL").unwrap_or_else(|_| {
             println!("RPC_URL not set, skipping test");
             std::process::exit(0);
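
The renamed test keeps its shape: it exits early when RPC_URL is unset, then fans decode work out over a thread pool via heimdall_common's task_pool and blocking_await helpers. Those helper signatures are not shown in this diff, so here is a rough stand-in using plain std threads (transaction hashes are placeholders):

// Rough stand-in for the test's parallel fan-out; the real code uses
// heimdall_common's task_pool/blocking_await rather than raw threads, and
// fetches real transaction hashes over RPC_URL. Placeholders throughout.
use std::thread;

fn main() {
    let txids = vec![String::from("0xplaceholder1"), String::from("0xplaceholder2")];
    let handles: Vec<_> = txids
        .into_iter()
        .map(|txid| {
            thread::spawn(move || {
                // in the real test: build DecodeArgs via DecodeArgsBuilder and
                // call heimdall_decoder::decode(args)
                println!("decoding {txid}");
            })
        })
        .collect();
    for handle in handles {
        handle.join().expect("decode worker panicked");
    }
}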
(diffs for the remaining 3 changed files not shown)
