GoogleChrome · brendankenny · Mar 11, 2019 · Jan 23, 2019 · Jan 23, 2019 · Jan 23, 2019
diff --git a/lighthouse-cli/test/cli/__snapshots__/index-test.js.snap b/lighthouse-cli/test/cli/__snapshots__/index-test.js.snap
@@ -1132,9 +1132,6 @@ Object {
         Object {
           "path": "seo/embedded-content",
         },
-        Object {
-          "path": "seo/canonical",
-        },
         Object {
           "path": "seo/robots-txt",
         },

diff --git a/lighthouse-cli/test/fixtures/seo/seo-failure-cases.html b/lighthouse-cli/test/fixtures/seo/seo-failure-cases.html
@@ -14,12 +14,12 @@
   <!-- no <meta name="description" content=""> -->
   <!-- FAIL(is-crawlable): noindex on root page, vary the case to make sure we are case insensitive -->
   <meta name="RoBots" content="nofollow, NOINDEX, all">
-  <!-- FAIL(hreflang): invalid language code -->
-  <link rel="alternate" hreflang="xx" href="https://xx.example.com" />
+  <!-- FAIL(hreflang): invalid language code, vary the case to make sure we are case insensitive -->
+  <link rel="alTeRnate" hreflang="xx" href="https://xx.example.com" />
   <!-- FAIL(hreflang): spece before a valid code -->
   <link rel="alternate" href="http://example.com/" hreflang=" x-default" />
   <!-- FAIL(canonical): multiple canonical URLs provided (another one is in the HTTP header) -->
-  <link rel="canonical" href="https://example.com/" />
+  <link rel="canonical" href="https://example.com/other" />
 </head>
 <body>
   <h1>SEO</h1>

diff --git a/lighthouse-cli/test/fixtures/seo/seo-tester.html b/lighthouse-cli/test/fixtures/seo/seo-tester.html
@@ -27,6 +27,11 @@
   </style>
 </head>
 <body>
+  <!-- PASS(hreflang): should ignore links in the body -->
+  <link rel="alternate" hreflang="nonsense" href="https://nonsense.example.com" />
+  <!-- PASS(canonical): should ignore links in the body -->
+  <link rel="canonical" href="http://localhost:10200/seo/" />
+
   <h1>SEO</h1>
 
   <h2>Anchor text</h2>

diff --git a/lighthouse-cli/test/smokehouse/seo/expectations.js b/lighthouse-cli/test/smokehouse/seo/expectations.js
@@ -137,7 +137,7 @@ module.exports = [
       },
       'canonical': {
         score: 0,
-        explanation: 'Multiple conflicting URLs (https://example.com, https://example.com/)',
+        explanation: 'Multiple conflicting URLs (https://example.com/other, https://example.com/)',
       },
     },
   },

diff --git a/lighthouse-core/audits/seo/canonical.js b/lighthouse-core/audits/seo/canonical.js
@@ -6,10 +6,8 @@
 'use strict';
 
 const Audit = require('../audit');
-const LinkHeader = require('http-link-header');
 const URL = require('../../lib/url-shim');
 const MainResource = require('../../computed/main-resource.js');
-const LINK_HEADER = 'link';
 const i18n = require('../../lib/i18n/i18n.js');
 
 const UIStrings = {
@@ -37,50 +35,27 @@ const UIStrings = {
 
 const str_ = i18n.createMessageInstanceIdFn(__filename, UIStrings);
 
-/**
- * @param {string} headerValue
- * @returns {Array<string>}
- */
-function getCanonicalLinksFromHeader(headerValue) {
-  const linkHeader = LinkHeader.parse(headerValue);
-
-  return linkHeader.get('rel', 'canonical').map(c => c.uri);
-}
-
-/**
- * @param {string} headerValue
- * @returns {Array<string>}
- */
-function getHreflangsFromHeader(headerValue) {
-  const linkHeader = LinkHeader.parse(headerValue);
-
-  return linkHeader.get('rel', 'alternate').map(h => h.uri);
-}
-
-/**
- * Returns true if given string is a valid absolute or relative URL
- * @param {string} url
- * @returns {boolean}
- */
-function isValidRelativeOrAbsoluteURL(url) {
-  try {
-    new URL(url, 'https://example.com/');
-    return true;
-  } catch (e) {
-    return false;
-  }
-}
-
 /**
  * Returns a primary domain for provided URL (e.g. http://www.example.com -> example.com).
  * Note that it does not take second-level domains into account (.co.uk).
  * @param {URL} url
  * @returns {string}
  */
 function getPrimaryDomain(url) {
-  return url.hostname.split('.').slice(-2).join('.');
+  return url.hostname
+    .split('.')
+    .slice(-2)
+    .join('.');
 }
 
+/**
+ * @typedef CanonicalURLData
+ * @property {Set<string>} uniqueCanonicalURLs
+ * @property {Set<string>} hreflangURLs
+ * @property {LH.Artifacts.LinkElement|undefined} invalidCanonicalLink
+ * @property {LH.Artifacts.LinkElement|undefined} relativeCanonicallink
+ */
+
 class Canonical extends Audit {
   /**
    * @return {LH.Audit.Meta}
@@ -91,111 +66,162 @@ class Canonical extends Audit {
       title: str_(UIStrings.title),
       failureTitle: str_(UIStrings.failureTitle),
       description: str_(UIStrings.description),
-      requiredArtifacts: ['Canonical', 'Hreflang', 'URL'],
+      requiredArtifacts: ['LinkElements', 'URL'],
     };
   }
 
+  /**
+   * @param {LH.Artifacts.LinkElement[]} linkElements
+   * @return {CanonicalURLData}
+   */
+  static collectCanonicalURLs(linkElements) {
+    /** @type {Set<string>} */
+    const uniqueCanonicalURLs = new Set();
+    /** @type {Set<string>} */
+    const hreflangURLs = new Set();
+
+    /** @type {LH.Artifacts.LinkElement|undefined} */
+    let invalidCanonicalLink;
+    /** @type {LH.Artifacts.LinkElement|undefined} */
+    let relativeCanonicallink;
+    for (const link of linkElements) {
+      // Links in the body aren't canonical references for SEO, skip them
+      /** @see https://html.spec.whatwg.org/multipage/links.html#body-ok */
+      if (link.source === 'body') continue;
+
+      if (link.rel === 'canonical') {
+        // Links that don't have an href aren't canonical references for SEO, skip them
+        if (!link.hrefRaw) continue;
+
+        // Links that had an hrefRaw but didn't have a valid href were invalid, flag them
+        if (!link.href) invalidCanonicalLink = link;
+        // Links that had a valid href but didn't have a valid hrefRaw must have been relatively resolved, flag them
+        else if (!URL.isValid(link.hrefRaw)) relativeCanonicallink = link;
+        // Otherwise, it was a valid canonical URL
+        else uniqueCanonicalURLs.add(link.href);
+      } else if (link.rel === 'alternate') {
+        if (link.href && link.hreflang) hreflangURLs.add(link.href);
+      }
+    }
+
+    return {uniqueCanonicalURLs, hreflangURLs, invalidCanonicalLink, relativeCanonicallink};
+  }
+
+  /**
+   * @param {CanonicalURLData} canonicalURLData
+   * @return {LH.Audit.Product|undefined}
+   */
+  static findInvalidCanonicalURLReason(canonicalURLData) {
+    const {uniqueCanonicalURLs, invalidCanonicalLink, relativeCanonicallink} = canonicalURLData;
+
+    // the canonical link is totally invalid
+    if (invalidCanonicalLink) {
+      return {
+        rawValue: false,
+        explanation: str_(UIStrings.explanationInvalid, {url: invalidCanonicalLink.hrefRaw}),
+      };
+    }
+
+    // the canonical link is valid, but it's relative which isn't allowed
+    if (relativeCanonicallink) {
+      return {
+        rawValue: false,
+        explanation: str_(UIStrings.explanationRelative, {url: relativeCanonicallink.hrefRaw}),
+      };
+    }
+
+    /** @type {string[]} */
+    const canonicalURLs = Array.from(uniqueCanonicalURLs);
+
+    // there's no canonical URL at all, we're done
+    if (canonicalURLs.length === 0) {
+      return {
+        rawValue: true,
+        notApplicable: true,
+      };
+    }
+
+    // we have multiple conflicting canonical URls, we're done
+    if (canonicalURLs.length > 1) {
+      return {
+        rawValue: false,
+        explanation: str_(UIStrings.explanationConflict, {urlList: canonicalURLs.join(', ')}),
+      };
+    }
+  }
+
+  /**
+   * @param {CanonicalURLData} canonicalURLData
+   * @param {URL} canonicalURL
+   * @param {URL} baseURL
+   * @return {LH.Audit.Product|undefined}
+   */
+  static findCommonCanonicalURLMistakes(canonicalURLData, canonicalURL, baseURL) {
+    const {hreflangURLs} = canonicalURLData;
+
+    // cross-language or cross-country canonicals are a common issue
+    if (
+      hreflangURLs.has(baseURL.href) &&
+      hreflangURLs.has(canonicalURL.href) &&
+      baseURL.href !== canonicalURL.href
+    ) {
+      return {
+        rawValue: false,
+        explanation: str_(UIStrings.explanationPointsElsewhere, {url: baseURL.href}),
+      };
+    }
+
+    // bing and yahoo don't allow canonical URLs pointing to different domains, it's also
+    // a common mistake to publish a page with canonical pointing to e.g. a test domain or localhost
+    if (getPrimaryDomain(canonicalURL) !== getPrimaryDomain(baseURL)) {
+      return {
+        rawValue: false,
+        explanation: str_(UIStrings.explanationDifferentDomain, {url: canonicalURL}),
+      };
+    }
+
+    // another common mistake is to have canonical pointing from all pages of the website to its root
+    if (
+      canonicalURL.origin === baseURL.origin &&
+      canonicalURL.pathname === '/' &&
+      baseURL.pathname !== '/'
+    ) {
+      return {
+        rawValue: false,
+        explanation: str_(UIStrings.explanationRoot),
+      };
+    }
+  }
+
   /**
    * @param {LH.Artifacts} artifacts
    * @param {LH.Audit.Context} context
    * @return {Promise<LH.Audit.Product>}
    */
-  static audit(artifacts, context) {
+  static async audit(artifacts, context) {
     const devtoolsLog = artifacts.devtoolsLogs[Audit.DEFAULT_PASS];
 
-    return MainResource.request({devtoolsLog, URL: artifacts.URL}, context)
-      .then(mainResource => {
-        const baseURL = new URL(mainResource.url);
-        /** @type {Array<string>} */
-        let canonicals = [];
-        /** @type {Array<string>} */
-        let hreflangs = [];
-
-        mainResource.responseHeaders && mainResource.responseHeaders
-          .filter(h => h.name.toLowerCase() === LINK_HEADER)
-          .forEach(h => {
-            canonicals = canonicals.concat(getCanonicalLinksFromHeader(h.value));
-            hreflangs = hreflangs.concat(getHreflangsFromHeader(h.value));
-          });
-
-        for (const canonical of artifacts.Canonical) {
-          if (canonical !== null) {
-            canonicals.push(canonical);
-          }
-        }
-        // we should only fail if there are multiple conflicting URLs
-        // see: https://github.com/GoogleChrome/lighthouse/issues/3178#issuecomment-381181762
-        canonicals = Array.from(new Set(canonicals));
-
-        artifacts.Hreflang.forEach(({href}) => hreflangs.push(href));
-
-        hreflangs = hreflangs
-          .filter(href => isValidRelativeOrAbsoluteURL(href))
-          .map(href => (new URL(href, baseURL)).href); // normalize URLs
-
-        if (canonicals.length === 0) {
-          return {
-            rawValue: true,
-            notApplicable: true,
-          };
-        }
-
-        if (canonicals.length > 1) {
-          return {
-            rawValue: false,
-            explanation: str_(UIStrings.explanationConflict, {urlList: canonicals.join(', ')}),
-          };
-        }
-
-        const canonical = canonicals[0];
-
-        if (!isValidRelativeOrAbsoluteURL(canonical)) {
-          return {
-            rawValue: false,
-            explanation: str_(UIStrings.explanationInvalid, {url: canonical}),
-          };
-        }
-
-        if (!URL.isValid(canonical)) {
-          return {
-            rawValue: false,
-            explanation: str_(UIStrings.explanationRelative, {url: canonical}),
-          };
-        }
-
-        const canonicalURL = new URL(canonical);
-
-        // cross-language or cross-country canonicals are a common issue
-        if (hreflangs.includes(baseURL.href) && hreflangs.includes(canonicalURL.href) &&
-          baseURL.href !== canonicalURL.href) {
-          return {
-            rawValue: false,
-            explanation: str_(UIStrings.explanationPointsElsewhere, {url: baseURL.href}),
-          };
-        }
-
-        // bing and yahoo don't allow canonical URLs pointing to different domains, it's also
-        // a common mistake to publish a page with canonical pointing to e.g. a test domain or localhost
-        if (getPrimaryDomain(canonicalURL) !== getPrimaryDomain(baseURL)) {
-          return {
-            rawValue: false,
-            explanation: str_(UIStrings.explanationDifferentDomain, {url: canonicalURL}),
-          };
-        }
-
-        // another common mistake is to have canonical pointing from all pages of the website to its root
-        if (canonicalURL.origin === baseURL.origin &&
-          canonicalURL.pathname === '/' && baseURL.pathname !== '/') {
-          return {
-            rawValue: false,
-            explanation: str_(UIStrings.explanationRoot),
-          };
-        }
-
-        return {
-          rawValue: true,
-        };
-      });
+    const mainResource = await MainResource.request({devtoolsLog, URL: artifacts.URL}, context);
+    const baseURL = new URL(mainResource.url);
+    const canonicalURLData = Canonical.collectCanonicalURLs(artifacts.LinkElements);
+
+    // First we'll check that there was a single valid canonical URL.
+    const invalidURLAuditProduct = Canonical.findInvalidCanonicalURLReason(canonicalURLData);
+    if (invalidURLAuditProduct) return invalidURLAuditProduct;
+
+    // There was a single valid canonical URL, so now we'll just check for common mistakes.
+    const canonicalURL = new URL([...canonicalURLData.uniqueCanonicalURLs][0]);
+    const mistakeAuditProduct = Canonical.findCommonCanonicalURLMistakes(
+      canonicalURLData,
+      canonicalURL,
+      baseURL
+    );
+
+    if (mistakeAuditProduct) return mistakeAuditProduct;
+
+    return {
+      rawValue: true,
+    };
   }
 }
 

diff --git a/lighthouse-core/computed/main-resource.js b/lighthouse-core/computed/main-resource.js
@@ -6,7 +6,7 @@
 'use strict';
 
 const makeComputedArtifact = require('./computed-artifact.js');
-const URL = require('../lib/url-shim.js');
+const NetworkAnalyzer = require('../lib/dependency-graph/simulator/network-analyzer.js');
 const NetworkRecords = require('./network-records.js');
 
 /**
@@ -22,10 +22,7 @@ class MainResource {
   static async compute_(data, context) {
     const finalUrl = data.URL.finalUrl;
     const requests = await NetworkRecords.request(data.devtoolsLog, context);
-    // equalWithExcludedFragments is expensive, so check that the finalUrl starts with the request first
-    const mainResource = requests.find(request => finalUrl.startsWith(request.url) &&
-      URL.equalWithExcludedFragments(request.url, finalUrl));
-
+    const mainResource = NetworkAnalyzer.findMainDocument(requests, finalUrl);
     if (!mainResource) {
       throw new Error('Unable to identify the main resource');
     }