Skip to content

Commit

Permalink
fix: only crawl exact host matches (#99)
Browse files Browse the repository at this point in the history
  • Loading branch information
JustinBeckwith committed Nov 3, 2019
1 parent f8eefcb commit 846f545
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 3 deletions.
12 changes: 10 additions & 2 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import * as http from 'http';
import enableDestroy = require('server-destroy');

import { getLinks } from './links';
import { URL } from 'url';

const finalhandler = require('finalhandler');
const serveStatic = require('serve-static');
Expand Down Expand Up @@ -161,9 +162,16 @@ export class LinkChecker extends EventEmitter {
this.emit('pagestart', opts.url);
const urls = getLinks(data, opts.url);
for (const url of urls) {
// only crawl links that start with the same host
const crawl =
let crawl =
opts.checkOptions.recurse! && url.startsWith(opts.checkOptions.path);
// only crawl links that start with the same host
if (crawl) {
try {
const parsedUrl = new URL(url);
const pathUrl = new URL(opts.checkOptions.path);
crawl = crawl && parsedUrl.host === pathUrl.host;
} catch {}
}
await this.crawl({
url,
crawl,
Expand Down
2 changes: 1 addition & 1 deletion src/links.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ const linksAttr = {
srcset: ['img', 'source'],
} as { [index: string]: string[] };

export function getLinks(source: string, baseUrl: string) {
export function getLinks(source: string, baseUrl: string): string[] {
const $ = cheerio.load(source);
const links = new Array<string>();
Object.keys(linksAttr).forEach(attr => {
Expand Down
10 changes: 10 additions & 0 deletions test/fixtures/baseurl/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<html>
<body>
<!--
Ensure we don't follow links to hosts whose name merely begins with
the base host's name (e.g. fake.local.br vs. fake.local) but is a different host:
https://github.com/JustinBeckwith/linkinator/issues/89
-->
<a href="http://fake.local.br/deep.html">not quite right</a>
</body>
</html>
21 changes: 21 additions & 0 deletions test/test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import * as assert from 'assert';
import * as gaxios from 'gaxios';
import * as nock from 'nock';
import * as sinon from 'sinon';
import * as path from 'path';

import { check, LinkState } from '../src';

Expand Down Expand Up @@ -149,4 +150,24 @@ describe('linkinator', () => {
assert.ok(results.passed);
scopes.forEach(x => x.done());
});

// Regression test: a host that merely shares a string prefix with the base
// host (fake.local.br vs. fake.local) must not be treated as the same host.
it('should only follow links on the same origin domain', async () => {
const scopes = [
// Base page: GET / on fake.local is answered with the baseurl fixture, served as HTML.
nock('http://fake.local')
.get('/')
.replyWithFile(200, path.resolve('test/fixtures/baseurl/index.html'), {
'content-type': 'text/html',
}),
// Look-alike host: only a HEAD interceptor for /deep.html is registered here.
nock('http://fake.local.br')
.head('/deep.html')
.reply(200),
];
const results = await check({
path: 'http://fake.local',
recurse: true,
});
// Two results expected — presumably the base page plus the single link in the fixture.
assert.strictEqual(results.links.length, 2);
assert.ok(results.passed);
// Confirm every registered nock scope was satisfied.
scopes.forEach(x => x.done());
});
});

0 comments on commit 846f545

Please sign in to comment.