Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We鈥檒l occasionally send you account related emails.

Already on GitHub? Sign in to your account

core(canonical): move canonical audit to LinkElements #7080

Merged
merged 10 commits into from
Mar 11, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions lighthouse-cli/test/cli/__snapshots__/index-test.js.snap
Original file line number Diff line number Diff line change
Expand Up @@ -1132,9 +1132,6 @@ Object {
Object {
"path": "seo/embedded-content",
},
Object {
"path": "seo/canonical",
},
Object {
"path": "seo/robots-txt",
},
Expand Down
6 changes: 3 additions & 3 deletions lighthouse-cli/test/fixtures/seo/seo-failure-cases.html
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@
<!-- no <meta name="description" content=""> -->
<!-- FAIL(is-crawlable): noindex on root page, vary the case to make sure we are case insensitive -->
<meta name="RoBots" content="nofollow, NOINDEX, all">
<!-- FAIL(hreflang): invalid language code -->
<link rel="alternate" hreflang="xx" href="https://xx.example.com" />
<!-- FAIL(hreflang): invalid language code, vary the case to make sure we are case insensitive -->
<link rel="alTeRnate" hreflang="xx" href="https://xx.example.com" />
<!-- FAIL(hreflang): spece before a valid code -->
<link rel="alternate" href="http://example.com/" hreflang=" x-default" />
<!-- FAIL(canonical): multiple canonical URLs provided (another one is in the HTTP header) -->
<link rel="canonical" href="https://example.com/" />
<link rel="canonical" href="https://example.com/other" />
</head>
<body>
<h1>SEO</h1>
Expand Down
5 changes: 5 additions & 0 deletions lighthouse-cli/test/fixtures/seo/seo-tester.html
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@
</style>
</head>
<body>
<!-- PASS(hreflang): should ignore links in the body -->
<link rel="alternate" hreflang="nonsense" href="https://nonsense.example.com" />
<!-- PASS(canonical): should ignore links in the body -->
<link rel="canonical" href="http://localhost:10200/seo/" />

<h1>SEO</h1>

<h2>Anchor text</h2>
Expand Down
2 changes: 1 addition & 1 deletion lighthouse-cli/test/smokehouse/seo/expectations.js
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ module.exports = [
},
'canonical': {
score: 0,
explanation: 'Multiple conflicting URLs (https://example.com, https://example.com/)',
explanation: 'Multiple conflicting URLs (https://example.com/other, https://example.com/)',
},
},
},
Expand Down
290 changes: 158 additions & 132 deletions lighthouse-core/audits/seo/canonical.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@
'use strict';

const Audit = require('../audit');
const LinkHeader = require('http-link-header');
const URL = require('../../lib/url-shim');
const MainResource = require('../../computed/main-resource.js');
const LINK_HEADER = 'link';
const i18n = require('../../lib/i18n/i18n.js');

const UIStrings = {
Expand Down Expand Up @@ -37,50 +35,27 @@ const UIStrings = {

const str_ = i18n.createMessageInstanceIdFn(__filename, UIStrings);

/**
* @param {string} headerValue
* @returns {Array<string>}
*/
function getCanonicalLinksFromHeader(headerValue) {
const linkHeader = LinkHeader.parse(headerValue);

return linkHeader.get('rel', 'canonical').map(c => c.uri);
}

/**
* @param {string} headerValue
* @returns {Array<string>}
*/
function getHreflangsFromHeader(headerValue) {
const linkHeader = LinkHeader.parse(headerValue);

return linkHeader.get('rel', 'alternate').map(h => h.uri);
}

/**
* Returns true if given string is a valid absolute or relative URL
* @param {string} url
* @returns {boolean}
*/
function isValidRelativeOrAbsoluteURL(url) {
try {
new URL(url, 'https://example.com/');
return true;
} catch (e) {
return false;
}
}

/**
* Returns a primary domain for provided URL (e.g. http://www.example.com -> example.com).
* Note that it does not take second-level domains into account (.co.uk).
* @param {URL} url
* @returns {string}
*/
function getPrimaryDomain(url) {
return url.hostname.split('.').slice(-2).join('.');
return url.hostname
.split('.')
.slice(-2)
.join('.');
}

/**
* @typedef CanonicalURLData
* @property {Set<string>} uniqueCanonicalURLs
* @property {Set<string>} hreflangURLs
* @property {LH.Artifacts.LinkElement|undefined} invalidCanonicalLink
* @property {LH.Artifacts.LinkElement|undefined} relativeCanonicallink
*/

class Canonical extends Audit {
/**
* @return {LH.Audit.Meta}
Expand All @@ -91,111 +66,162 @@ class Canonical extends Audit {
title: str_(UIStrings.title),
failureTitle: str_(UIStrings.failureTitle),
description: str_(UIStrings.description),
requiredArtifacts: ['Canonical', 'Hreflang', 'URL'],
requiredArtifacts: ['LinkElements', 'URL'],
};
}

/**
* @param {LH.Artifacts.LinkElement[]} linkElements
* @return {CanonicalURLData}
*/
static collectCanonicalURLs(linkElements) {
/** @type {Set<string>} */
const uniqueCanonicalURLs = new Set();
/** @type {Set<string>} */
const hreflangURLs = new Set();

/** @type {LH.Artifacts.LinkElement|undefined} */
let invalidCanonicalLink;
/** @type {LH.Artifacts.LinkElement|undefined} */
let relativeCanonicallink;
for (const link of linkElements) {
// Links in the body aren't canonical references for SEO, skip them
/** @see https://html.spec.whatwg.org/multipage/links.html#body-ok */
if (link.source === 'body') continue;
patrickhulce marked this conversation as resolved.
Show resolved Hide resolved

if (link.rel === 'canonical') {
// Links that don't have an href aren't canonical references for SEO, skip them
if (!link.hrefRaw) continue;
patrickhulce marked this conversation as resolved.
Show resolved Hide resolved

// Links that had an hrefRaw but didn't have a valid href were invalid, flag them
if (!link.href) invalidCanonicalLink = link;
// Links that had a valid href but didn't have a valid hrefRaw must have been relatively resolved, flag them
else if (!URL.isValid(link.hrefRaw)) relativeCanonicallink = link;
patrickhulce marked this conversation as resolved.
Show resolved Hide resolved
// Otherwise, it was a valid canonical URL
else uniqueCanonicalURLs.add(link.href);
} else if (link.rel === 'alternate') {
if (link.href && link.hreflang) hreflangURLs.add(link.href);
}
}

return {uniqueCanonicalURLs, hreflangURLs, invalidCanonicalLink, relativeCanonicallink};
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are some of these plural and some are singular? :)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

short: parity and I'm only willing to do so much cleanup for an unrelated goal :P

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

short: parity and I'm only willing to do so much cleanup for an unrelated goal :P

haha, very fair :)

}

/**
* @param {CanonicalURLData} canonicalURLData
* @return {LH.Audit.Product|undefined}
*/
static findInvalidCanonicalURLReason(canonicalURLData) {
const {uniqueCanonicalURLs, invalidCanonicalLink, relativeCanonicallink} = canonicalURLData;

// the canonical link is totally invalid
if (invalidCanonicalLink) {
return {
rawValue: false,
explanation: str_(UIStrings.explanationInvalid, {url: invalidCanonicalLink.hrefRaw}),
};
}

// the canonical link is valid, but it's relative which isn't allowed
if (relativeCanonicallink) {
return {
rawValue: false,
explanation: str_(UIStrings.explanationRelative, {url: relativeCanonicallink.hrefRaw}),
};
}

/** @type {string[]} */
const canonicalURLs = Array.from(uniqueCanonicalURLs);

// there's no canonical URL at all, we're done
if (canonicalURLs.length === 0) {
return {
rawValue: true,
notApplicable: true,
};
}

// we have multiple conflicting canonical URls, we're done
if (canonicalURLs.length > 1) {
return {
rawValue: false,
explanation: str_(UIStrings.explanationConflict, {urlList: canonicalURLs.join(', ')}),
};
}
}

/**
* @param {CanonicalURLData} canonicalURLData
* @param {URL} canonicalURL
* @param {URL} baseURL
* @return {LH.Audit.Product|undefined}
*/
static findCommonCanonicalURLMistakes(canonicalURLData, canonicalURL, baseURL) {
const {hreflangURLs} = canonicalURLData;

// cross-language or cross-country canonicals are a common issue
if (
hreflangURLs.has(baseURL.href) &&
hreflangURLs.has(canonicalURL.href) &&
baseURL.href !== canonicalURL.href
) {
return {
rawValue: false,
explanation: str_(UIStrings.explanationPointsElsewhere, {url: baseURL.href}),
};
}

// bing and yahoo don't allow canonical URLs pointing to different domains, it's also
// a common mistake to publish a page with canonical pointing to e.g. a test domain or localhost
if (getPrimaryDomain(canonicalURL) !== getPrimaryDomain(baseURL)) {
return {
rawValue: false,
explanation: str_(UIStrings.explanationDifferentDomain, {url: canonicalURL}),
};
}

// another common mistake is to have canonical pointing from all pages of the website to its root
if (
canonicalURL.origin === baseURL.origin &&
canonicalURL.pathname === '/' &&
baseURL.pathname !== '/'
) {
return {
rawValue: false,
explanation: str_(UIStrings.explanationRoot),
};
}
}

/**
* @param {LH.Artifacts} artifacts
* @param {LH.Audit.Context} context
* @return {Promise<LH.Audit.Product>}
*/
static audit(artifacts, context) {
static async audit(artifacts, context) {
const devtoolsLog = artifacts.devtoolsLogs[Audit.DEFAULT_PASS];

return MainResource.request({devtoolsLog, URL: artifacts.URL}, context)
.then(mainResource => {
const baseURL = new URL(mainResource.url);
/** @type {Array<string>} */
let canonicals = [];
/** @type {Array<string>} */
let hreflangs = [];

mainResource.responseHeaders && mainResource.responseHeaders
.filter(h => h.name.toLowerCase() === LINK_HEADER)
.forEach(h => {
canonicals = canonicals.concat(getCanonicalLinksFromHeader(h.value));
hreflangs = hreflangs.concat(getHreflangsFromHeader(h.value));
});

for (const canonical of artifacts.Canonical) {
if (canonical !== null) {
canonicals.push(canonical);
}
}
// we should only fail if there are multiple conflicting URLs
// see: https://github.com/GoogleChrome/lighthouse/issues/3178#issuecomment-381181762
canonicals = Array.from(new Set(canonicals));

artifacts.Hreflang.forEach(({href}) => hreflangs.push(href));

hreflangs = hreflangs
.filter(href => isValidRelativeOrAbsoluteURL(href))
.map(href => (new URL(href, baseURL)).href); // normalize URLs

if (canonicals.length === 0) {
return {
rawValue: true,
notApplicable: true,
};
}

if (canonicals.length > 1) {
return {
rawValue: false,
explanation: str_(UIStrings.explanationConflict, {urlList: canonicals.join(', ')}),
};
}

const canonical = canonicals[0];

if (!isValidRelativeOrAbsoluteURL(canonical)) {
return {
rawValue: false,
explanation: str_(UIStrings.explanationInvalid, {url: canonical}),
};
}

if (!URL.isValid(canonical)) {
return {
rawValue: false,
explanation: str_(UIStrings.explanationRelative, {url: canonical}),
};
}

const canonicalURL = new URL(canonical);

// cross-language or cross-country canonicals are a common issue
if (hreflangs.includes(baseURL.href) && hreflangs.includes(canonicalURL.href) &&
baseURL.href !== canonicalURL.href) {
return {
rawValue: false,
explanation: str_(UIStrings.explanationPointsElsewhere, {url: baseURL.href}),
};
}

// bing and yahoo don't allow canonical URLs pointing to different domains, it's also
// a common mistake to publish a page with canonical pointing to e.g. a test domain or localhost
if (getPrimaryDomain(canonicalURL) !== getPrimaryDomain(baseURL)) {
return {
rawValue: false,
explanation: str_(UIStrings.explanationDifferentDomain, {url: canonicalURL}),
};
}

// another common mistake is to have canonical pointing from all pages of the website to its root
if (canonicalURL.origin === baseURL.origin &&
canonicalURL.pathname === '/' && baseURL.pathname !== '/') {
return {
rawValue: false,
explanation: str_(UIStrings.explanationRoot),
};
}

return {
rawValue: true,
};
});
const mainResource = await MainResource.request({devtoolsLog, URL: artifacts.URL}, context);
const baseURL = new URL(mainResource.url);
const canonicalURLData = Canonical.collectCanonicalURLs(artifacts.LinkElements);

// First we'll check that there was a single valid canonical URL.
const invalidURLAuditProduct = Canonical.findInvalidCanonicalURLReason(canonicalURLData);
if (invalidURLAuditProduct) return invalidURLAuditProduct;

// There was a single valid canonical URL, so now we'll just check for common mistakes.
const canonicalURL = new URL([...canonicalURLData.uniqueCanonicalURLs][0]);
const mistakeAuditProduct = Canonical.findCommonCanonicalURLMistakes(
canonicalURLData,
canonicalURL,
baseURL
);

if (mistakeAuditProduct) return mistakeAuditProduct;

return {
rawValue: true,
};
}
}

Expand Down
7 changes: 2 additions & 5 deletions lighthouse-core/computed/main-resource.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
'use strict';

const makeComputedArtifact = require('./computed-artifact.js');
const URL = require('../lib/url-shim.js');
const NetworkAnalyzer = require('../lib/dependency-graph/simulator/network-analyzer.js');
const NetworkRecords = require('./network-records.js');

/**
Expand All @@ -22,10 +22,7 @@ class MainResource {
static async compute_(data, context) {
const finalUrl = data.URL.finalUrl;
const requests = await NetworkRecords.request(data.devtoolsLog, context);
// equalWithExcludedFragments is expensive, so check that the finalUrl starts with the request first
const mainResource = requests.find(request => finalUrl.startsWith(request.url) &&
URL.equalWithExcludedFragments(request.url, finalUrl));

const mainResource = NetworkAnalyzer.findMainDocument(requests, finalUrl);
if (!mainResource) {
throw new Error('Unable to identify the main resource');
}
Expand Down
Loading