-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
57 lines (45 loc) · 1.5 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
'use strict'
const { uniq, concat, isEmpty } = require('lodash')
const getHTML = require('html-get')
const cheerio = require('cheerio')
const matcher = require('matcher')
const aigle = require('aigle')
const { URL } = require('url')
const path = require('path')
const { normalizeUrl } = require('@metascraper/helpers')
// Matches a string that is exactly ".xml", ignoring case; it is tested
// against a file extension, not against a whole URL.
const REGEX_URL_XML = /^\.xml$/i
// Selector for the elements whose text content is collected as a URL
// (presumably the sitemap `<loc>` tag — confirm against the feeds consumed).
const XML_SELECTOR = 'loc'
/**
 * Build a callback that returns the trimmed text of the element bound to
 * `this`, suitable for passing to a cheerio `.map()` call.
 *
 * A regular `function` is returned on purpose: the element arrives through
 * `this`, which an arrow function could not receive.
 *
 * @param {Function} $ - Wrapper function (a loaded cheerio instance) applied to `this`.
 * @returns {Function} Callback yielding the element's text without surrounding whitespace.
 */
const getText = $ => {
  return function () {
    const element = $(this)
    const rawText = element.text()
    return rawText.trim()
  }
}
/**
 * Tell whether a URL points at an `.xml` resource.
 *
 * Only the path portion is inspected: everything from the first `?` or `#`
 * onwards is discarded before the extension is read. Without that step a
 * query string or fragment becomes part of the "extension", so
 * `/sitemap.xml?page=2` would be rejected and `/download?file=feed.xml`
 * would be accepted.
 *
 * @param {string} url - Absolute or relative URL.
 * @returns {boolean} `true` when the path extension is `.xml` (case-insensitive).
 */
const isXmlUrl = url => {
  const [pathname] = url.split(/[?#]/, 1)
  return REGEX_URL_XML.test(path.extname(pathname))
}
/**
 * Fetch the XML document at `url` and collect the URLs listed in its
 * `loc` elements, recursing into entries that themselves point at `.xml`
 * files.
 *
 * @param {string} url - Absolute URL of the XML document to fetch.
 * @param {object} [options]
 * @param {object} [options.cheerioOpts={}] - Merged over `{ xmlMode: true }` when calling `cheerio.load`.
 * @param {string|string[]|false} [options.whitelist=false] - Pattern(s) handed to `matcher`;
 *   entries matching any of them are skipped.
 * @param {...*} [options.opts] - Every remaining option is forwarded to `html-get`.
 * @returns {Promise<Set<string>>} Unique URLs in discovery order.
 */
const xmlUrls = async (url, { cheerioOpts = {}, whitelist = false, ...opts } = {}) => {
  const { origin: baseUrl } = new URL(url)
  const { html } = await getHTML(url, opts)
  const $ = cheerio.load(html, { xmlMode: true, ...cheerioOpts })
  const locations = uniq(
    $(XML_SELECTOR)
      .map(getText($))
      .get()
  )

  // `cheerioOpts` and `whitelist` were destructured out of `opts` above, so
  // they must be put back for the recursive call; otherwise nested documents
  // would be parsed with default settings and bypass the whitelist filter.
  const nestedOpts = { ...opts, cheerioOpts, whitelist }

  const iterator = async (set, location) => {
    // `match` stays `false` when no whitelist was supplied, which `isEmpty`
    // treats as empty, so nothing is skipped in that case.
    const match = !isEmpty(whitelist) && matcher([location], concat(whitelist))
    if (!isEmpty(match)) return set

    const found = isXmlUrl(location)
      ? await xmlUrls(location, nestedOpts)
      : [normalizeUrl(baseUrl, location)]
    return new Set([...set, ...found])
  }

  return aigle.reduce(locations, iterator, new Set())
}
module.exports = async (urls, opts) => {
const collection = concat(urls)
const iterator = async (set, url) => {
const urls = Array.from(await xmlUrls(url, opts))
return new Set([...set, ...urls])
}
const set = await aigle.reduce(collection, iterator, new Set())
return Array.from(set)
}
module.exports.isXmlUrl = isXmlUrl