Add removeDuplicates option
Kikobeats committed Jul 6, 2018
1 parent 9337816 commit 09282e8
Showing 4 changed files with 129 additions and 34 deletions.
9 changes: 8 additions & 1 deletion README.md
@@ -46,7 +46,7 @@ See [examples](/examples).

## API

### links([options])
### htmlUrls([options])

#### options

@@ -73,6 +73,13 @@ Default: `[]`

A list of links to be excluded from the final output. It supports regex patterns.

##### removeDuplicates

Type: `boolean`<br>
Default: `true`

Remove duplicate links detected across all HTML tags.
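
A minimal usage sketch of the option (assuming the package is consumed as `html-urls` and that each result is a `{ url, normalizedUrl }` object, as the snapshots below suggest; the HTML snippet is illustrative):

```js
const htmlUrls = require('html-urls')

const html = `
  <a href="https://google.com">first</a>
  <a href="https://google.com">second</a>
  <a href="https://facebook.com">third</a>
`

// Default behaviour (removeDuplicates: true): repeated links should collapse into a single entry.
console.log(htmlUrls({ html }))

// Opting out keeps every occurrence found in the markup.
console.log(htmlUrls({ html, removeDuplicates: false }))
```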

## License

**html-urls** © [Kiko Beats](https://kikobeats.com), released under the [MIT](https://github.com/Kikobeats/html-urls/blob/master/LICENSE.md) License.<br>
41 changes: 41 additions & 0 deletions __snapshots__/index.js.snap-shot
@@ -234,3 +234,44 @@ exports['${tag} (${attributeName}) 27'] = [
}
]

exports['invariant wwww 1'] = [
{
"url": "https://www.google.com",
"normalizedUrl": "https://www.google.com"
},
{
"url": "https://google.com",
"normalizedUrl": "https://google.com"
},
{
"url": "https://facebook.com",
"normalizedUrl": "https://facebook.com"
}
]

exports['from same tag 2'] = [
{
"url": "https://google.com",
"normalizedUrl": "https://google.com"
},
{
"url": "https://google.com",
"normalizedUrl": "https://google.com"
},
{
"url": "https://facebook.com",
"normalizedUrl": "https://facebook.com"
}
]

exports['from different tags 2'] = [
{
"url": "https://google.com",
"normalizedUrl": "https://google.com"
},
{
"url": "https://google.com",
"normalizedUrl": "https://google.com"
}
]

47 changes: 31 additions & 16 deletions src/index.js
@@ -33,7 +33,7 @@ const TAGS = {
}

const reduceSelector = (collection, fn, acc = []) => {
collection.each(function (index, element) {
collection.each(function () {
acc = fn(acc, this)
})
return acc
@@ -53,30 +53,44 @@ const getLink = ({ url, el, attribute }) => {
}
}

const getLinksByAttribute = ({ selector, attribute, url, whitelist }) => {
return reduceSelector(
selector,
(acc, el) => {
const link = getLink({ url, el, attribute })
const uid = get(link, UID)
if (isEmpty(link)) return acc
const isAlreadyAdded = includes(acc, item => get(item, UID) === uid)
if (isAlreadyAdded) return acc
const match = whitelist && matcher([uid], whitelist)
return isEmpty(match) ? concat(acc, link) : acc
},
[]
)
const createGetLinksByAttribute = ({ removeDuplicates }) => {
const has = removeDuplicates
? (acc, uid) => includes(acc, item => get(item, UID) === uid)
: () => false

return ({ selector, attribute, url, whitelist }) =>
reduceSelector(
selector,
(acc, el) => {
const link = getLink({ url, el, attribute })
const uid = get(link, UID)
if (isEmpty(link)) return acc
const isAlreadyAdded = has(acc, uid)
if (isAlreadyAdded) return acc
const match = whitelist && matcher([uid], whitelist)
return isEmpty(match) ? concat(acc, link) : acc
},
[]
)
}

const createAdd = ({ removeDuplicates }) =>
removeDuplicates
? (acc, links) => uniqBy(concat(acc, links), UID)
: (acc, links) => concat(acc, links)

module.exports = ({
html = '',
url = '',
whitelist = false,
removeDuplicates = true,
cheerioOpts = {}
} = {}) => {
const $ = cheerio.load(html, cheerioOpts)

const add = createAdd({ removeDuplicates })
const getLinksByAttribute = createGetLinksByAttribute({ removeDuplicates })

return reduce(
TAGS,
(acc, htmlTags, attribute) => {
@@ -86,7 +100,8 @@ module.exports = ({
url,
whitelist
})
return uniqBy(concat(acc, links), UID)

return add(acc, links)
},
[]
)
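
As a standalone illustration of the two merge strategies selected by `createAdd` (a sketch using lodash directly; `UID` is assumed here to resolve to `'normalizedUrl'`, matching the shape of the snapshot entries, since the real constant is not shown in this diff):

```js
const { concat, uniqBy } = require('lodash')

// Assumed key for deduplication; the real UID constant is not shown in this diff.
const UID = 'normalizedUrl'

const acc = [{ url: 'https://google.com', normalizedUrl: 'https://google.com' }]
const links = [
  { url: 'https://google.com', normalizedUrl: 'https://google.com' },
  { url: 'https://facebook.com', normalizedUrl: 'https://facebook.com' }
]

// removeDuplicates: true -> merge and keep the first entry per UID.
console.log(uniqBy(concat(acc, links), UID).length) // 2

// removeDuplicates: false -> plain concatenation, duplicates preserved.
console.log(concat(acc, links).length) // 3
```
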
66 changes: 49 additions & 17 deletions test/index.js
@@ -24,27 +24,47 @@ describe('html links', () => {
snapshot(getLinks({ html }))
})

describe('normalization', () => {
describe('remove duplicate urls', () => {
it('from same tag', () => {
const html = generateHtml({
urls: [
'https://google.com',
'https://google.com',
'https://facebook.com'
]
})
snapshot(getLinks({ html }))
describe('remove duplicate urls', () => {
it('from same tag', () => {
const html = generateHtml({
urls: [
'https://google.com',
'https://google.com',
'https://facebook.com'
]
})
it('from different tags', () => {
const html = generateHtml({
urls: ['https://google.com'],
links: ['https://google.com']
})
snapshot(getLinks({ html }))
snapshot(getLinks({ html }))
})
it('from different tags', () => {
const html = generateHtml({
urls: ['https://google.com'],
links: ['https://google.com']
})
snapshot(getLinks({ html }))
})
})

describe('non remove duplicate urls', () => {
it('from same tag', () => {
const html = generateHtml({
urls: [
'https://google.com',
'https://google.com',
'https://facebook.com'
]
})
snapshot(getLinks({ html, removeDuplicates: false }))
})
it('from different tags', () => {
const html = generateHtml({
urls: ['https://google.com'],
links: ['https://google.com']
})
snapshot(getLinks({ html, removeDuplicates: false }))
})
})

describe('normalization', () => {
it('invariant final slash', () => {
const html = generateHtml({
urls: [
@@ -57,6 +77,18 @@
snapshot(getLinks({ html }))
})

it('invariant wwww', () => {
const html = generateHtml({
urls: [
'https://www.google.com',
'https://google.com',
'https://facebook.com'
]
})

snapshot(getLinks({ html }))
})

it('query string parameters position are not relevant', () => {
const html = generateHtml({
urls: [