# Setup

## Julia

If they are not installed yet, install the `IJulia` package and the other dependencies in the Julia runtime environment:
```sh
$ julia

               _
   _       _ _(_)_     |  Documentation: https://docs.julialang.org
  (_)     | (_) (_)    |
   _ _   _| |_  __ _   |  Type "?" for help, "]?" for Pkg help.
  | | | | | | |/ _` |  |
  | | |_| | | | (_| |  |  Version 1.8.1 (2022-09-06)
 _/ |\__'_|_|_|\__'_|  |  Official https://julialang.org/ release
|__/                   |

julia> using Pkg

...

julia> Pkg.add("IJulia")

...

julia> Pkg.add("HTTP")

...

julia> Pkg.add("JSON")

...
```

## Find available Arweave nodes

Find available Arweave nodes and generate `nodes.json`. This step needs to be rerun once or twice a day when working on this notebook. The generated file is then used later in the notebook. This takes a couple of minutes to run:

```sh
$ cargo run -r --bin crawl-peers -- --max-concurrency 400 --max-count 100 --max-depth 3 --req-timeout-secs 4 > nodes.json
```

## Start Jupyter Notebook

```sh
$ cd /path/to/this/project/
$ jupyter-notebook
```

In [1]:
using HTTP;
using JSON;

In [2]:
# Fetch the offset record for transaction `tx` from arweave.net's
# /tx/<id>/offset endpoint.
# Returns a Dict with :start_offset, :end_offset and :size, each parsed to
# Int64; the start offset is derived as the reported offset minus the size.
function getoffset(tx)
    response = HTTP.get("https://arweave.net/tx/$tx/offset")
    payload = JSON.parse(String(response.body))
    txsize = parse(Int64, payload["size"])
    last_offset = parse(Int64, payload["offset"])
    return Dict(
        :start_offset => last_offset - txsize,
        :end_offset => last_offset,
        :size => txsize,
    )
end

getoffset (generic function with 1 method)

In [3]:
# Transaction ID examined by the rest of the notebook; this one is a bundle.
tx = "hUUFmn3FIlPEFkZwpcsaEJxMMQ2YQnl22NXAXUUTODY";

# Alternative: a smaller transaction that is not a bundle (uncomment to use).
# tx = "2tbslSgd8sV3pPMC4ZxSO05yZc4aUBAy7YEculqbp4M";

In [4]:
# Unpack the offsets of `tx` into the globals used by the later cells.
# Plain indexing is used on purpose: a missing key raises a KeyError instead
# of silently turning into 0, and the Dict is not mutated.
offsets = getoffset(tx);
start_offset = offsets[:start_offset];
end_offset = offsets[:end_offset];
tx_size = offsets[:size];

# Show the three values as the cell result.
Dict(:size => tx_size, :end_offset => end_offset, :start_offset => start_offset)

Dict{Symbol, Int64} with 3 entries:
  :size         => 449120499
  :end_offset   => 100482896210409
  :start_offset => 100482447089910

In [5]:
# Load the peer list written by the crawl-peers step and turn every address
# into an HTTP base URL. Only the first line of nodes.json is parsed.
# The do-block form closes the file even if reading throws.
first_line = open("nodes.json") do io
    readline(io)
end;
nodes = map(address -> "http://$address", JSON.parse(first_line));

nodes

63-element Vector{String}:
 "http://116.203.213.56:1984"
 "http://65.108.206.14:1984"
 "http://159.69.251.19:1984"
 "http://88.99.139.151:1984"
 "http://45.153.35.151:1984"
 "http://188.166.192.169:1984"
 "http://23.88.65.188:1984"
 "http://157.230.102.219:1984"
 "http://18.169.106.95:1984"
 "http://89.58.45.74:1984"
 "http://65.21.146.138:1984"
 "http://47.90.203.43:1984"
 "http://138.68.42.181:1984"
 ⋮
 "http://135.181.137.241:1984"
 "http://65.21.204.231:1984"
 "http://65.108.126.145:1984"
 "http://65.21.201.96:1984"
 "http://165.227.34.27:1984"
 "http://60.162.132.189:1984"
 "http://46.189.145.50:1984"
 "http://211.21.182.11:1984"
 "http://65.108.15.151:1984"
 "http://206.83.144.16:1984"
 "http://206.83.144.15:1984"
 "http://98.180.9.81:1984"

In [6]:
# Number of 256 * 1024 sized pieces needed to cover tx_size, rounded up.
# cld is exact integer ceiling division, so no floating-point round trip.
num_chunks = cld(tx_size, 256 * 1024);

In [7]:
# Offsets spaced 256 * 1024 apart, beginning at start_offset.
# NOTE(review): the length is num_chunks - 2, so this yields two fewer
# offsets than num_chunks; the reason is not evident from this notebook —
# confirm that the tail of the transaction is meant to be left out.
chunks = collect(range(start_offset; length = num_chunks - 2, step = 256 * 1024))

1712-element Vector{Int64}:
 100482447089910
 100482447352054
 100482447614198
 100482447876342
 100482448138486
 100482448400630
 100482448662774
 100482448924918
 100482449187062
 100482449449206
 100482449711350
 100482449973494
 100482450235638
               ⋮
 100482892734710
 100482892996854
 100482893258998
 100482893521142
 100482893783286
 100482894045430
 100482894307574
 100482894569718
 100482894831862
 100482895094006
 100482895356150
 100482895618294

In [8]:
# Ask the node at `base_address` for one entry of its data sync record at
# `start_offset` (GET <base_address>/data_sync_record/<start_offset>/1).
# Returns the parsed JSON body when the status is in 200..399 and an empty
# vector otherwise, so a failing or unreachable node contributes nothing.
function getdatasyncrecord(start_offset, base_address)
    url = "$base_address/data_sync_record/$start_offset/1"
    try
        # NOTE(review): the Content-Type request header is kept exactly as it
        # was; presumably the node uses it to choose a JSON reply — confirm.
        res = HTTP.get(url, ["Content-Type" => "application/json"])
        if 200 <= res.status < 400
            return JSON.parse(String(res.body))
        end
    catch e
        # A user interrupt must stop the whole scan rather than be treated
        # as just another unreachable node.
        e isa InterruptException && rethrow()
        # Best effort: the node is skipped, but the reason stays discoverable
        # when debug logging is enabled.
        @debug("data_sync_record request failed", url, exception = e)
    end
    return []
end

getdatasyncrecord (generic function with 1 method)

In [9]:
# Query every known node for its sync record at the transaction's start
# offset, keeping each answer paired with the node it came from.
data_sync_records = map(nodes) do node
    (node, getdatasyncrecord(start_offset, node))
end;

In [10]:
# Reduce each node's sync-record answer to a single [value, key] pair of
# Int64 offsets, which the later cells treat as [range start, range end].
# A node whose answer does not contain exactly one entry gets an empty vector.
nodes_with_ranges = map(data_sync_records) do (node, records)
    # Flatten the list of records into its individual key => value entries.
    entries = collect(Base.Iterators.flatten(records))
    spans = [[parse(Int64, p.second), parse(Int64, p.first)] for p in entries]
    return length(spans) == 1 ? (node, spans[1]) : (node, [])
end;


In [11]:
# Keep the nodes whose reported range spans the whole transaction: it starts
# at or before start_offset and ends at or after end_offset.
nodes_with_full_data = filter(nodes_with_ranges) do (_, span)
    # Nodes without a usable range carry an empty vector and are dropped.
    length(span) == 2 || return false
    lower, upper = span
    return lower <= start_offset && end_offset <= upper
end

6-element Vector{Tuple{String, Vector}}:
 ("http://23.88.65.188:1984", [100482019270902, 100486707716342])
 ("http://162.220.53.21:1984", [100482447089910, 100482896404726])
 ("http://65.21.204.231:1984", [100482019270902, 100483578503414])
 ("http://65.21.201.96:1984", [100482019270902, 100486707716342])
 ("http://206.83.144.16:1984", [100482019270902, 100486707716342])
 ("http://206.83.144.15:1984", [100482019270902, 100485453095158])

In [12]:
# Keep the nodes whose reported range overlaps the transaction at all.
# Two intervals overlap exactly when each one starts before the other ends.
# This also accepts a range lying strictly inside the transaction, which a
# check that only looks at the two transaction boundaries would miss.
nodes_with_partial_data = filter(nodes_with_ranges) do (_, span)
    # Nodes without a usable range carry an empty vector and are dropped.
    length(span) == 2 && span[1] < end_offset && span[2] > start_offset
end

6-element Vector{Tuple{String, Vector}}:
 ("http://23.88.65.188:1984", [100482019270902, 100486707716342])
 ("http://162.220.53.21:1984", [100482447089910, 100482896404726])
 ("http://65.21.204.231:1984", [100482019270902, 100483578503414])
 ("http://65.21.201.96:1984", [100482019270902, 100486707716342])
 ("http://206.83.144.16:1984", [100482019270902, 100486707716342])
 ("http://206.83.144.15:1984", [100482019270902, 100485453095158])

In [13]:
# Entries of nodes_with_partial_data that are not also in nodes_with_full_data.
only_nodes_with_partial_data = setdiff(nodes_with_partial_data, nodes_with_full_data)

Tuple{String, Vector}[]