
Commit

fix: ignore all non-http[s] links (#111)
JustinBeckwith authored Nov 17, 2019
1 parent 1d10a00 commit 2f28114
Showing 4 changed files with 64 additions and 31 deletions.
66 changes: 47 additions & 19 deletions src/index.ts
@@ -38,7 +38,7 @@ export interface CrawlResult {
 }
 
 interface CrawlOptions {
-  url: string;
+  url: URL;
   parent?: string;
   crawl: boolean;
   results: LinkResult[];
@@ -58,7 +58,6 @@ export class LinkChecker extends EventEmitter {
    */
   async check(options: CheckOptions) {
     options.linksToSkip = options.linksToSkip || [];
-    options.linksToSkip.push('^mailto:', '^irc:', '^data:');
     let server: http.Server | undefined;
     if (!options.path.startsWith('http')) {
       const port = options.port || 5000 + Math.round(Math.random() * 1000);
@@ -74,7 +73,7 @@ export class LinkChecker extends EventEmitter {
     const results = new Array<LinkResult>();
     queue.add(async () => {
       await this.crawl({
-        url: options.path,
+        url: new URL(options.path),
         crawl: true,
         checkOptions: options,
         results,
@@ -119,21 +118,35 @@ export class LinkChecker extends EventEmitter {
    */
   private async crawl(opts: CrawlOptions): Promise<void> {
     // Check to see if we've already scanned this url
-    if (opts.cache.has(opts.url)) {
+    if (opts.cache.has(opts.url.href)) {
       return;
     }
+    opts.cache.add(opts.url.href);
+
+    // explicitly skip non-http[s] links before making the request
+    const proto = opts.url.protocol;
+    if (proto !== 'http:' && proto !== 'https:') {
+      const r = {
+        url: opts.url.href,
+        status: 0,
+        state: LinkState.SKIPPED,
+        parent: opts.parent,
+      };
+      opts.results.push(r);
+      this.emit('link', r);
+      return;
+    }
-    opts.cache.add(opts.url);
 
-    // Check for links that should be skipped
+    // Check for user configured links that should be skipped
     const skips = opts.checkOptions
       .linksToSkip!.map(linkToSkip => {
-        return new RegExp(linkToSkip).test(opts.url);
+        return new RegExp(linkToSkip).test(opts.url.href);
       })
       .filter(match => !!match);
 
     if (skips.length > 0) {
       const result: LinkResult = {
-        url: opts.url,
+        url: opts.url.href,
         state: LinkState.SKIPPED,
         parent: opts.parent,
       };
@@ -150,7 +163,7 @@
     try {
       let res = await gaxios.request<string>({
         method: opts.crawl ? 'GET' : 'HEAD',
-        url: opts.url,
+        url: opts.url.href,
         responseType: opts.crawl ? 'text' : 'stream',
         validateStatus: () => true,
       });
@@ -159,7 +172,7 @@
       if (res.status === 405) {
         res = await gaxios.request<string>({
           method: 'GET',
-          url: opts.url,
+          url: opts.url.href,
           responseType: 'stream',
           validateStatus: () => true,
         });
@@ -176,7 +189,7 @@
       // request failure: invalid domain name, etc.
     }
     const result: LinkResult = {
-      url: opts.url,
+      url: opts.url.href,
       status,
       state,
       parent: opts.parent,
@@ -187,28 +200,43 @@
     // If we need to go deeper, scan the next level of depth for links and crawl
     if (opts.crawl && shouldRecurse) {
       this.emit('pagestart', opts.url);
-      const urls = getLinks(data, opts.url);
-      for (const url of urls) {
-        let crawl =
-          opts.checkOptions.recurse! && url.startsWith(opts.checkOptions.path);
+      const urlResults = getLinks(data, opts.url.href);
+      for (const result of urlResults) {
+        // if there was some sort of problem parsing the link while
+        // creating a new URL obj, treat it as a broken link.
+        if (!result.url) {
+          const r = {
+            url: result.link,
+            status: 0,
+            state: LinkState.BROKEN,
+            parent: opts.url.href,
+          };
+          opts.results.push(r);
+          this.emit('link', r);
+          continue;
+        }
+
+        let crawl = (opts.checkOptions.recurse! &&
+          result.url &&
+          result.url.href.startsWith(opts.checkOptions.path)) as boolean;
+
         // only crawl links that start with the same host
         if (crawl) {
           try {
-            const parsedUrl = new URL(url);
             const pathUrl = new URL(opts.checkOptions.path);
-            crawl = crawl && parsedUrl.host === pathUrl.host;
+            crawl = crawl && result.url!.host === pathUrl.host;
           } catch {}
         }
 
         opts.queue.add(async () => {
           await this.crawl({
-            url,
+            url: result.url!,
             crawl,
             cache: opts.cache,
             results: opts.results,
             checkOptions: opts.checkOptions,
             queue: opts.queue,
-            parent: opts.url,
+            parent: opts.url.href,
           });
         });
       }
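
Note on the approach: the previous code filtered mailto:, irc:, and data: links by pushing regexes onto linksToSkip; the new code parses every link into a URL object up front and skips anything whose protocol is not http: or https:. A minimal standalone sketch of that classification, where the helper name isCheckable is hypothetical and only for illustration:

// Sketch of the protocol check added to crawl() above; not linkinator's API.
function isCheckable(link: string, base: string): boolean {
  try {
    const url = new URL(link, base); // WHATWG URL parser
    // url.protocol keeps the trailing colon, e.g. 'mailto:' or 'http:'
    return url.protocol === 'http:' || url.protocol === 'https:';
  } catch {
    return false; // unparseable links are reported as broken, not skipped
  }
}

isCheckable('mailto:team@example.com', 'http://example.com'); // false
isCheckable('/thing.html', 'http://example.com'); // true
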
22 changes: 14 additions & 8 deletions src/links.ts
@@ -27,7 +27,13 @@ const linksAttr = {
   srcset: ['img', 'source'],
 } as { [index: string]: string[] };
 
-export function getLinks(source: string, baseUrl: string): string[] {
+export interface ParsedUrl {
+  link: string;
+  error?: Error;
+  url?: URL;
+}
+
+export function getLinks(source: string, baseUrl: string): ParsedUrl[] {
   const $ = cheerio.load(source);
   const links = new Array<string>();
   Object.keys(linksAttr).forEach(attr => {
@@ -39,7 +45,7 @@ export function getLinks(source: string, baseUrl: string): string[] {
   });
   const sanitized = links
     .filter(link => !!link)
-    .map(link => normalizeLink(link, baseUrl));
+    .map(link => parseLink(link, baseUrl));
   return sanitized;
 }

@@ -54,12 +60,12 @@ function parseAttr(name: string, value: string): string[] {
   }
 }
 
-function normalizeLink(link: string, baseUrl: string): string {
+function parseLink(link: string, baseUrl: string): ParsedUrl {
   try {
-    const slink = new URL(link, baseUrl);
-    slink.hash = '';
-    return slink.href;
-  } catch (e) {
-    return link;
+    const url = new URL(link, baseUrl);
+    url.hash = '';
+    return { link, url };
+  } catch (error) {
+    return { link, error };
   }
 }
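
With this change, getLinks returns ParsedUrl records rather than bare strings, so callers can tell a successfully parsed link apart from one that failed URL parsing. A short usage sketch, under the assumption that parseLink were exported (in the module above it is file-private):

// Sketch: consuming ParsedUrl results; assumes parseLink is callable here.
const base = 'http://example.com';
for (const href of ['/thing.html', '//////']) {
  const r = parseLink(href, base);
  if (r.url) {
    console.log('parsed:', r.url.href); // http://example.com/thing.html
  } else {
    console.log('broken:', r.link, r.error && r.error.message);
  }
}
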
3 changes: 1 addition & 2 deletions test/fixtures/malformed/index.html
@@ -1,6 +1,5 @@
 <html>
   <body>
-    <a href="http://fake.local
-/thing.html">oh noes</a>
+    <a href="//////">oh noes</a>
   </body>
 </html>
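
Why this fixture changed: the WHATWG URL parser strips embedded tabs and newlines, so the old href (split across a newline) would now parse cleanly, while '//////' is a scheme-relative reference with an empty host, which the parser rejects for http(s) URLs. A quick illustration, using example.com as a placeholder base:

// The old fixture's newline is removed during parsing:
new URL('http://fake.local\n/thing.html').href; // 'http://fake.local/thing.html'

// The new fixture cannot be parsed and throws in Node.js:
new URL('//////', 'http://example.com'); // TypeError [ERR_INVALID_URL]: Invalid URL
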
4 changes: 2 additions & 2 deletions test/test.ts
@@ -110,7 +110,7 @@ describe('linkinator', () => {
       path: 'test/fixtures/recurse',
       recurse: true,
     });
-    assert.strictEqual(results.links.length, 5);
+    assert.strictEqual(results.links.length, 4);
     scope.done();
   });
 
@@ -122,7 +122,7 @@ describe('linkinator', () => {
     assert.strictEqual(results.links.length, 2);
   });
 
-  it('should not folow non-http[s] links', async () => {
+  it('should not follow non-http[s] links', async () => {
     // includes mailto, data urls, and irc
     const results = await check({ path: 'test/fixtures/protocols' });
     assert.ok(results.passed);
