
Commit

Merge branch 'master' into raw
Glutexo committed Apr 7, 2023
2 parents 05f5b60 + f2c4aa4 commit 92599f6
Showing 14 changed files with 231 additions and 164 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/elixir.yml
@@ -10,7 +10,7 @@ jobs:
build:

name: Build and test
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04

steps:
- name: Check out the repository
3 changes: 2 additions & 1 deletion .gitignore
@@ -25,4 +25,5 @@ onigumo-*.tar
# Temporary files, for example, from tests.
/tmp/

-onigumo
+# Ignore onigumo escript file
+/onigumo
16 changes: 8 additions & 8 deletions README.md
@@ -26,13 +26,13 @@ flowchart LR
onigumo_operator --> spider_materialization[MATERIALIZER]
-    subgraph Onigumo
+    subgraph "Onigumo (kernel)"
onigumo_operator
onigumo_downloader
onigumo_parser
end
-    subgraph Spider
+    subgraph "Spider (application)"
spider_operator
spider_parser
spider_materialization
@@ -51,14 +51,14 @@ The Operator’s job is to:

### Downloader ###

-Downloads the contents and metadata of unprocessed URL addresses.
+The Downloader fetches and saves the contents and metadata of the unprocessed URLs.

-The _downloader_’s job consists of:
+The Downloader’s job is to:

-1. reading the URLs to download,
-2. checking the already downloaded URLs,
-3. downloading the URL contents and any metadata,
-4. saving the downloaded data.
+1. read URLs for download,
+2. check for already downloaded URLs,
+3. fetch each URL’s contents along with its metadata,
+4. save the downloaded data.

### Parser ###

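
The four Downloader steps above map onto the stream pipeline in lib/onigumo/downloader.ex later in this diff. A minimal sketch of how they compose (step 2, skipping already downloaded URLs, is not visible in the hunks shown here):

    # Hedged sketch, not the shipped implementation: steps 1, 3 and 4 as a
    # lazy stream over the URLs read from the input file.
    root_path = File.cwd!()

    root_path
    |> Onigumo.Downloader.load_urls()                               # 1. read URLs
    |> Stream.map(&Onigumo.Downloader.download_url(&1, root_path))  # 3. fetch, 4. save
    |> Stream.run()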
7 changes: 7 additions & 0 deletions lib/cli.ex
@@ -0,0 +1,7 @@
defmodule Onigumo.CLI do
def main([component]) do
module = Module.safe_concat("Onigumo", component)
root_path = File.cwd!()
module.main(root_path)
end
end
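
A usage sketch for the new CLI entry point: Module.safe_concat/2 resolves only to module atoms that already exist, so a typo in the component name raises ArgumentError instead of leaking new atoms.

    # Assumes the escript is invoked as `onigumo Downloader`, so that
    # main/1 receives ["Downloader"].
    Onigumo.CLI.main(["Downloader"])
    # resolves to Onigumo.Downloader and calls its main/1 with File.cwd!()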
6 changes: 3 additions & 3 deletions lib/onigumo.ex → lib/onigumo/downloader.ex
@@ -6,11 +6,11 @@ defmodule Onigumo.Downloader do
def main(root_path) do
http_client().start()

-    download_urls_from_file(root_path)
+    create_download_stream(root_path)
|> Stream.run()
end

-  def download_urls_from_file(root_path) do
+  def create_download_stream(root_path) do
root_path
|> load_urls()
|> Stream.map(&download_url(&1, root_path))
@@ -51,7 +51,7 @@ defmodule Onigumo.Downloader do

def create_file_name(url) do
suffix = Application.get_env(:onigumo, :downloaded_suffix)
-    Hash.md5(url, :hex) <> suffix
+    Onigumo.Utilities.Hash.md5(url, :hex) <> suffix
end

defp http_client() do
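
The rename makes the laziness explicit: create_download_stream/1 only builds the stream, and nothing is fetched until it is forced. A minimal sketch, assuming an input file of newline-separated URLs under the given root:

    # Building the pipeline performs no I/O by itself.
    stream = Onigumo.Downloader.create_download_stream(File.cwd!())
    Stream.run(stream)  # forces the stream: downloads and writes each URL body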
6 changes: 0 additions & 6 deletions lib/onigumo_cli.ex

This file was deleted.

7 changes: 7 additions & 0 deletions lib/spider/html.ex
@@ -0,0 +1,7 @@
defmodule Onigumo.Spider.HTML do
def find_links(document) do
Floki.parse_document!(document)
|> Floki.find("a")
|> Floki.attribute("href")
end
end
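
A usage sketch for the new helper: Floki parses the document, selects the anchor elements, and collects their href attribute values.

    html = ~s(<a href="https://example.com">Example</a> <a>no href</a>)
    Onigumo.Spider.HTML.find_links(html)
    # => ["https://example.com"] (anchors without an href contribute nothing)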
2 changes: 1 addition & 1 deletion lib/hash.ex → lib/utilities/hash.ex
@@ -1,4 +1,4 @@
-defmodule Hash do
+defmodule Onigumo.Utilities.Hash do
def md5(data, fmt) do
hash(:md5, data)
|> format(fmt)
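
A usage sketch for the renamed module. The format/2 clause for :hex is truncated in this view, so lowercase Base.encode16 output is an assumption based on the file names the Downloader produces:

    Onigumo.Utilities.Hash.md5("hello", :hex)
    # => "5d41402abc4b2a76b9719d911017c592" (the well-known MD5 of "hello")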
5 changes: 4 additions & 1 deletion mix.exs
@@ -25,7 +25,10 @@ defmodule Onigumo.MixProject do
# {:dep_from_hexpm, "~> 0.3.0"},
# {:dep_from_git, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"},
{:httpoison, "~> 1.8"},
-      {:mox, "~> 1.0", only: :test}
+      {:mox, "~> 1.0", only: :test},
+
+      # Spider toolbox dependencies
+      {:floki, "~> 0.32"}
]
end

2 changes: 2 additions & 0 deletions mix.lock
@@ -1,6 +1,8 @@
%{
"certifi": {:hex, :certifi, "2.8.0", "d4fb0a6bb20b7c9c3643e22507e42f356ac090a1dcea9ab99e27e0376d695eba", [:rebar3], [], "hexpm", "6ac7efc1c6f8600b08d625292d4bbf584e14847ce1b6b5c44d983d273e1097ea"},
"floki": {:hex, :floki, "0.32.1", "dfe3b8db3b793939c264e6f785bca01753d17318d144bd44b407fb3493acaa87", [:mix], [{:html_entities, "~> 0.5.0", [hex: :html_entities, repo: "hexpm", optional: false]}], "hexpm", "d4b91c713e4a784a3f7b1e3cc016eefc619f6b1c3898464222867cafd3c681a3"},
"hackney": {:hex, :hackney, "1.18.0", "c4443d960bb9fba6d01161d01cd81173089686717d9490e5d3606644c48d121f", [:rebar3], [{:certifi, "~>2.8.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~>6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~>1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~>1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "9afcda620704d720db8c6a3123e9848d09c87586dc1c10479c42627b905b5c5e"},
"html_entities": {:hex, :html_entities, "0.5.2", "9e47e70598da7de2a9ff6af8758399251db6dbb7eebe2b013f2bbd2515895c3c", [:mix], [], "hexpm", "c53ba390403485615623b9531e97696f076ed415e8d8058b1dbaa28181f4fdcc"},
"httpoison": {:hex, :httpoison, "1.8.0", "6b85dea15820b7804ef607ff78406ab449dd78bed923a49c7160e1886e987a3d", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "28089eaa98cf90c66265b6b5ad87c59a3729bea2e74e9d08f9b51eb9729b3c3a"},
"idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
32 changes: 16 additions & 16 deletions test/hash_test.exs
@@ -51,31 +51,31 @@ defmodule HashTest do
}
]

test("hash MD5 known value in hexadecimal") do
for {data, hash_hex, _} <- @known_md5s do
hash = Hash.md5(data, :hex)
assert(hash == hash_hex)
for {data, hash_hex, _} <- @known_md5s do
test("hash MD5 #{inspect(data)} in hexadecimal") do
hash = Onigumo.Utilities.Hash.md5(unquote(data), :hex)
assert(hash == unquote(hash_hex))
end
end

test("hash MD5 known value in binary") do
for {data, _, hash_bin} <- @known_md5s do
hash = Hash.md5(data, :bin)
assert(hash == hash_bin)
for {data, _, hash_bin} <- @known_md5s do
test("hash MD5 #{inspect(data)} in binary") do
hash = Onigumo.Utilities.Hash.md5(unquote(data), :bin)
assert(hash == unquote(hash_bin))
end
end

test("format a binary hash") do
for {format, hash} <- @formatted_hashes do
formatted = Hash.format(@binary_hash, format)
assert(formatted == hash)
for {format, hash} <- @formatted_hashes do
test("format #{inspect(@binary_hash)} in #{inspect(format)}") do
formatted = Onigumo.Utilities.Hash.format(@binary_hash, unquote(format))
assert(formatted == unquote(hash))
end
end

test("hash a known value") do
for {func, known_hash} <- @known_hashes do
computed_hash = Hash.hash(func, @known_hash_data)
assert(computed_hash == known_hash)
for {func, known_hash} <- @known_hashes do
test("hash #{inspect(@known_hash_data)} with #{inspect(func)}") do
computed_hash = Onigumo.Utilities.Hash.hash(unquote(func), @known_hash_data)
assert(computed_hash == unquote(known_hash))
end
end
end
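
The rewrite above replaces each single test looping over a module attribute with a module-level comprehension that defines one test per data point at compile time, so every case is reported, and can fail, independently. The pattern in isolation (a standalone sketch, not project code):

    defmodule GeneratedCasesTest do
      use ExUnit.Case

      # The module-level `for` runs at compile time; `unquote/1` injects
      # each value into the quoted test body (ExUnit's "unquote fragments"
      # pattern), yielding one named test per tuple.
      for {input, expected} <- [{"a", "A"}, {"b", "B"}] do
        test "upcases #{inspect(input)}" do
          assert String.upcase(unquote(input)) == unquote(expected)
        end
      end
    end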
150 changes: 150 additions & 0 deletions test/onigumo_downloader_test.exs
@@ -0,0 +1,150 @@
defmodule OnigumoDownloaderTest do
use ExUnit.Case
import Mox

@urls [
"http://onigumo.local/hello.html",
"http://onigumo.local/bye.html"
]
@slices [0..1, 0..-1]

setup(:verify_on_exit!)

describe("Onigumo.Downloader.main/1") do
@tag :tmp_dir
test("run Downloader", %{tmp_dir: tmp_dir}) do
expect(HTTPoisonMock, :start, fn -> nil end)
expect(HTTPoisonMock, :get!, length(@urls), &prepare_response/1)

input_path_env = Application.get_env(:onigumo, :input_path)
input_path_tmp = Path.join(tmp_dir, input_path_env)
input_file_content = prepare_input(@urls)
File.write!(input_path_tmp, input_file_content)

Onigumo.Downloader.main(tmp_dir)

Enum.map(@urls, &assert_downloaded(&1, tmp_dir))
end
end

describe("Onigumo.Downloader.create_download_stream/1") do
@tag :tmp_dir
test("download URLs from the input file with a created stream", %{tmp_dir: tmp_dir}) do
expect(HTTPoisonMock, :get!, length(@urls), &prepare_response/1)

input_path_env = Application.get_env(:onigumo, :input_path)
input_path_tmp = Path.join(tmp_dir, input_path_env)
input_file_content = prepare_input(@urls)
File.write!(input_path_tmp, input_file_content)

Onigumo.Downloader.create_download_stream(tmp_dir) |> Stream.run()

Enum.map(@urls, &assert_downloaded(&1, tmp_dir))
end
end

describe("Onigumo.Downloader.download_url/2") do
@tag :tmp_dir
test("download a URL", %{tmp_dir: tmp_dir}) do
expect(HTTPoisonMock, :get!, &prepare_response/1)

input_url = Enum.at(@urls, 0)
Onigumo.Downloader.download_url(input_url, tmp_dir)

output_file_name = Onigumo.Downloader.create_file_name(input_url)
output_path = Path.join(tmp_dir, output_file_name)
read_output = File.read!(output_path)
expected_output = body(input_url)
assert(read_output == expected_output)
end
end

describe("Onigumo.Downloader.get_url/1") do
test("get response by HTTP request") do
expect(HTTPoisonMock, :get!, &prepare_response/1)

url = Enum.at(@urls, 0)
get_response = Onigumo.Downloader.get_url(url)
expected_response = prepare_response(url)
assert(get_response == expected_response)
end
end

describe("Onigumo.Downloader.get_body/1") do
test("extract body from URL response") do
url = Enum.at(@urls, 0)
response = prepare_response(url)
get_body = Onigumo.Downloader.get_body(response)
expected_body = body(url)
assert(get_body == expected_body)
end
end

describe("Onigumo.Downloader.write_response/2") do
@tag :tmp_dir
test("write response to file", %{tmp_dir: tmp_dir}) do
response = "Response!"
output_file_name = "body.html"
output_path = Path.join(tmp_dir, output_file_name)
Onigumo.Downloader.write_response(response, output_path)

read_output = File.read!(output_path)
assert(read_output == response)
end
end

describe("Onigumo.Downloader.load_urls/1") do
for slice <- @slices do
@tag :tmp_dir
test("load URLs #{inspect(slice)} from a file", %{tmp_dir: tmp_dir}) do
input_urls = Enum.slice(@urls, unquote(Macro.escape(slice)))

input_path_env = Application.get_env(:onigumo, :input_path)
input_path_tmp = Path.join(tmp_dir, input_path_env)
input_file_content = prepare_input(input_urls)
File.write!(input_path_tmp, input_file_content)

loaded_urls = Onigumo.Downloader.load_urls(tmp_dir) |> Enum.to_list()

assert(loaded_urls == input_urls)
end
end
end

describe("Onigumo.Downloader.create_file_name/1") do
test("create file name from URL") do
input_url = "https://onigumo.local/hello.html"
created_file_name = Onigumo.Downloader.create_file_name(input_url)

input_url_hash = Onigumo.Utilities.Hash.md5(input_url, :hex)
downloaded_suffix = Application.get_env(:onigumo, :downloaded_suffix)
expected_file_name = input_url_hash <> downloaded_suffix

assert(created_file_name == expected_file_name)
end
end

defp prepare_response(url) do
%HTTPoison.Response{
status_code: 200,
body: body(url)
}
end

defp prepare_input(urls) do
Enum.map(urls, &(&1 <> "\n"))
|> Enum.join()
end

defp body(url) do
"Body from: #{url}\n"
end

defp assert_downloaded(url, tmp_dir) do
file_name = Onigumo.Downloader.create_file_name(url)
output_path = Path.join(tmp_dir, file_name)
read_output = File.read!(output_path)
expected_output = body(url)
assert(read_output == expected_output)
end
end
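
These tests depend on Mox wiring that is not part of this diff: HTTPoisonMock must be defined against a behaviour declaring the mocked start/0 and get!/1, and the Downloader's http_client/0 must read it from the application environment. An assumed sketch of that setup:

    # test/test_helper.exs (assumption; not shown in this commit)
    Mox.defmock(HTTPoisonMock, for: HTTPoison.Base)

    # config/test.exs (assumption)
    import Config
    config :onigumo, :http_client, HTTPoisonMock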