diff --git a/core/computed/entity-classification.js b/core/computed/entity-classification.js index 4629f084d3ec..dbaeea5f96c7 100644 --- a/core/computed/entity-classification.js +++ b/core/computed/entity-classification.js @@ -59,7 +59,7 @@ class EntityClassification { // Make up an entity only for valid http/https URLs. if (!parsedUrl.protocol.startsWith('http')) return; - const rootDomain = Util.getRootDomain(url); + const rootDomain = UrlUtils.getRootDomain(url); if (!rootDomain) return; if (entityCache.has(rootDomain)) return entityCache.get(rootDomain); diff --git a/core/computed/resource-summary.js b/core/computed/resource-summary.js index 8ee62cf59833..ed50bd4629a7 100644 --- a/core/computed/resource-summary.js +++ b/core/computed/resource-summary.js @@ -9,7 +9,7 @@ import {makeComputedArtifact} from './computed-artifact.js'; import {NetworkRecords} from './network-records.js'; import {NetworkRequest} from '../lib/network-request.js'; import {Budget} from '../config/budget.js'; -import {Util} from '../../shared/util.js'; +import UrlUtils from '../lib/url-utils.js'; /** @typedef {{count: number, resourceSize: number, transferSize: number}} ResourceEntry */ @@ -59,7 +59,7 @@ class ResourceSummary { firstPartyHosts = budget.options.firstPartyHostnames; } else { firstPartyHosts = classifiedEntities.firstParty?.domains.map(domain => `*.${domain}`) || - [`*.${Util.getRootDomain(URLArtifact.finalDisplayedUrl)}`]; + [`*.${UrlUtils.getRootDomain(URLArtifact.finalDisplayedUrl)}`]; } networkRecords.filter(record => { diff --git a/core/lib/url-utils.js b/core/lib/url-utils.js index 09ded7e032c5..761efbe7fd56 100644 --- a/core/lib/url-utils.js +++ b/core/lib/url-utils.js @@ -4,6 +4,8 @@ * SPDX-License-Identifier: Apache-2.0 */ +import {getDomain} from 'tldts'; + import {Util} from '../../shared/util.js'; import {LighthouseError} from './lh-error.js'; @@ -99,6 +101,16 @@ class UrlUtils { } } + /** + * Returns a primary domain for provided hostname (e.g. www.example.com -> example.com). + * @param {string|URL} url hostname or URL object + * @return {string} + */ + static getRootDomain(url) { + const parsedUrl = Util.createOrReturnURL(url); + return getDomain(parsedUrl.href) || parsedUrl.hostname; + } + /** * Check if rootDomains matches * @@ -120,8 +132,8 @@ class UrlUtils { } // get the string before the tld - const urlARootDomain = Util.getRootDomain(urlAInfo); - const urlBRootDomain = Util.getRootDomain(urlBInfo); + const urlARootDomain = UrlUtils.getRootDomain(urlAInfo); + const urlBRootDomain = UrlUtils.getRootDomain(urlBInfo); return urlARootDomain === urlBRootDomain; } diff --git a/core/test/lib/url-utils-test.js b/core/test/lib/url-utils-test.js index be75ebaabbc3..943abb8abcc1 100644 --- a/core/test/lib/url-utils-test.js +++ b/core/test/lib/url-utils-test.js @@ -396,4 +396,32 @@ describe('UrlUtils', () => { }).toThrow('INVALID_URL'); }); }); + + describe('getRootDomain', () => { + it('returns the correct rootDomain from a string from PSL', () => { + assert.equal(UrlUtils.getRootDomain('https://www.example.com/index.html'), 'example.com'); + assert.equal(UrlUtils.getRootDomain('https://example.com'), 'example.com'); + assert.equal(UrlUtils.getRootDomain('https://www.example.co.uk'), 'example.co.uk'); + assert.equal(UrlUtils.getRootDomain('https://example.com.br/app/'), 'example.com.br'); + assert.equal(UrlUtils.getRootDomain('https://example.tokyo.jp'), 'example.tokyo.jp'); + assert.equal(UrlUtils.getRootDomain('https://sub.example.com'), 'example.com'); + assert.equal(UrlUtils.getRootDomain('https://sub.example.tokyo.jp'), 'example.tokyo.jp'); + assert.equal(UrlUtils.getRootDomain('http://localhost'), 'localhost'); + assert.equal(UrlUtils.getRootDomain('http://localhost:8080'), 'localhost'); + assert.equal(UrlUtils.getRootDomain('https://www.hydro.mb.ca'), 'hydro.mb.ca'); + }); + + it('returns the correct rootDomain from an URL object', () => { + assert.equal(UrlUtils.getRootDomain(new URL('https://www.example.com/index.html')), 'example.com'); + assert.equal(UrlUtils.getRootDomain(new URL('https://example.com')), 'example.com'); + assert.equal(UrlUtils.getRootDomain(new URL('https://www.example.co.uk')), 'example.co.uk'); + assert.equal(UrlUtils.getRootDomain(new URL('https://example.com.br/app/')), 'example.com.br'); + assert.equal(UrlUtils.getRootDomain(new URL('https://example.tokyo.jp')), 'example.tokyo.jp'); + assert.equal(UrlUtils.getRootDomain(new URL('https://sub.example.com')), 'example.com'); + assert.equal(UrlUtils.getRootDomain(new URL('https://sub.example.tokyo.jp')), 'example.tokyo.jp'); + assert.equal(UrlUtils.getRootDomain(new URL('http://localhost')), 'localhost'); + assert.equal(UrlUtils.getRootDomain(new URL('http://localhost:8080')), 'localhost'); + assert.equal(UrlUtils.getRootDomain(new URL('https://www.hydro.mb.ca')), 'hydro.mb.ca'); + }); + }); }); diff --git a/package.json b/package.json index 0e096a4ea180..509757a52de8 100644 --- a/package.json +++ b/package.json @@ -204,6 +204,7 @@ "semver": "^5.3.0", "speedline-core": "^1.4.3", "third-party-web": "^0.24.0", + "tldts": "^6.0.22", "ws": "^7.0.0", "yargs": "^17.3.1", "yargs-parser": "^21.0.0" diff --git a/report/renderer/report-ui-features.js b/report/renderer/report-ui-features.js index e9c0994fe705..b9b81b4e2dfd 100644 --- a/report/renderer/report-ui-features.js +++ b/report/renderer/report-ui-features.js @@ -322,7 +322,7 @@ export class ReportUIFeatures { * @return {Array} */ _getThirdPartyRows(rowEls, finalDisplayedUrl) { - const finalDisplayedUrlRootDomain = Util.getRootDomain(finalDisplayedUrl); + const finalDisplayedUrlEntity = Util.getEntityFromUrl(finalDisplayedUrl, this.json.entities); const firstPartyEntityName = this.json.entities?.find(e => e.isFirstParty === true)?.name; /** @type {Array} */ @@ -337,7 +337,8 @@ export class ReportUIFeatures { if (!urlItem) continue; const datasetUrl = urlItem.dataset.url; if (!datasetUrl) continue; - const isThirdParty = Util.getRootDomain(datasetUrl) !== finalDisplayedUrlRootDomain; + const isThirdParty = + Util.getEntityFromUrl(datasetUrl, this.json.entities) !== finalDisplayedUrlEntity; if (!isThirdParty) continue; } diff --git a/shared/test/util-test.js b/shared/test/util-test.js index 44640acd6d03..35629e7d5d23 100644 --- a/shared/test/util-test.js +++ b/shared/test/util-test.js @@ -9,38 +9,38 @@ import assert from 'assert/strict'; import {Util} from '../util.js'; describe('util helpers', () => { - describe('getTld', () => { + describe('getPseudoTld', () => { it('returns the correct tld', () => { - assert.equal(Util.getTld('example.com'), '.com'); - assert.equal(Util.getTld('example.co.uk'), '.co.uk'); - assert.equal(Util.getTld('example.com.br'), '.com.br'); - assert.equal(Util.getTld('example.tokyo.jp'), '.jp'); + assert.equal(Util.getPseudoTld('example.com'), '.com'); + assert.equal(Util.getPseudoTld('example.co.uk'), '.co.uk'); + assert.equal(Util.getPseudoTld('example.com.br'), '.com.br'); + assert.equal(Util.getPseudoTld('example.tokyo.jp'), '.jp'); }); }); - describe('getRootDomain', () => { + describe('getPseudoRootDomain', () => { it('returns the correct rootDomain from a string', () => { - assert.equal(Util.getRootDomain('https://www.example.com/index.html'), 'example.com'); - assert.equal(Util.getRootDomain('https://example.com'), 'example.com'); - assert.equal(Util.getRootDomain('https://www.example.co.uk'), 'example.co.uk'); - assert.equal(Util.getRootDomain('https://example.com.br/app/'), 'example.com.br'); - assert.equal(Util.getRootDomain('https://example.tokyo.jp'), 'tokyo.jp'); - assert.equal(Util.getRootDomain('https://sub.example.com'), 'example.com'); - assert.equal(Util.getRootDomain('https://sub.example.tokyo.jp'), 'tokyo.jp'); - assert.equal(Util.getRootDomain('http://localhost'), 'localhost'); - assert.equal(Util.getRootDomain('http://localhost:8080'), 'localhost'); + assert.equal(Util.getPseudoRootDomain('https://www.example.com/index.html'), 'example.com'); + assert.equal(Util.getPseudoRootDomain('https://example.com'), 'example.com'); + assert.equal(Util.getPseudoRootDomain('https://www.example.co.uk'), 'example.co.uk'); + assert.equal(Util.getPseudoRootDomain('https://example.com.br/app/'), 'example.com.br'); + assert.equal(Util.getPseudoRootDomain('https://example.tokyo.jp'), 'tokyo.jp'); + assert.equal(Util.getPseudoRootDomain('https://sub.example.com'), 'example.com'); + assert.equal(Util.getPseudoRootDomain('https://sub.example.tokyo.jp'), 'tokyo.jp'); + assert.equal(Util.getPseudoRootDomain('http://localhost'), 'localhost'); + assert.equal(Util.getPseudoRootDomain('http://localhost:8080'), 'localhost'); }); it('returns the correct rootDomain from an URL object', () => { - assert.equal(Util.getRootDomain(new URL('https://www.example.com/index.html')), 'example.com'); - assert.equal(Util.getRootDomain(new URL('https://example.com')), 'example.com'); - assert.equal(Util.getRootDomain(new URL('https://www.example.co.uk')), 'example.co.uk'); - assert.equal(Util.getRootDomain(new URL('https://example.com.br/app/')), 'example.com.br'); - assert.equal(Util.getRootDomain(new URL('https://example.tokyo.jp')), 'tokyo.jp'); - assert.equal(Util.getRootDomain(new URL('https://sub.example.com')), 'example.com'); - assert.equal(Util.getRootDomain(new URL('https://sub.example.tokyo.jp')), 'tokyo.jp'); - assert.equal(Util.getRootDomain(new URL('http://localhost')), 'localhost'); - assert.equal(Util.getRootDomain(new URL('http://localhost:8080')), 'localhost'); + assert.equal(Util.getPseudoRootDomain(new URL('https://www.example.com/index.html')), 'example.com'); + assert.equal(Util.getPseudoRootDomain(new URL('https://example.com')), 'example.com'); + assert.equal(Util.getPseudoRootDomain(new URL('https://www.example.co.uk')), 'example.co.uk'); + assert.equal(Util.getPseudoRootDomain(new URL('https://example.com.br/app/')), 'example.com.br'); + assert.equal(Util.getPseudoRootDomain(new URL('https://example.tokyo.jp')), 'tokyo.jp'); + assert.equal(Util.getPseudoRootDomain(new URL('https://sub.example.com')), 'example.com'); + assert.equal(Util.getPseudoRootDomain(new URL('https://sub.example.tokyo.jp')), 'tokyo.jp'); + assert.equal(Util.getPseudoRootDomain(new URL('http://localhost')), 'localhost'); + assert.equal(Util.getPseudoRootDomain(new URL('http://localhost:8080')), 'localhost'); }); }); diff --git a/shared/util.js b/shared/util.js index 871ef4b06f9d..fcc51eb13dce 100644 --- a/shared/util.js +++ b/shared/util.js @@ -77,6 +77,23 @@ class Util { return details; } + /** + * Given the entity classification dataset and a URL, identify the entity. + * @param {string} url + * @param {LH.Result.Entities=} entities + * @return {LH.Result.LhrEntity|string} + */ + static getEntityFromUrl(url, entities) { + // If it's a pre-v10 LHR, we don't have entities, so match against the root-ish domain + if (!entities) { + return Util.getPseudoRootDomain(url); + } + + const entity = entities.find(e => e.origins.find(origin => url.startsWith(origin))); + // This fallback case would be unexpected, but leaving for safety. + return entity || Util.getPseudoRootDomain(url); + } + /** * Split a string by markdown code spans (enclosed in `backticks`), splitting * into segments that were enclosed in backticks (marked as `isCode === true`) @@ -292,11 +309,12 @@ class Util { /** * Gets the tld of a domain + * This function is used only while rendering pre-10.0 LHRs. * * @param {string} hostname * @return {string} tld */ - static getTld(hostname) { + static getPseudoTld(hostname) { const tlds = hostname.split('.').slice(-2); if (!listOfTlds.includes(tlds[0])) { @@ -308,12 +326,16 @@ class Util { /** * Returns a primary domain for provided hostname (e.g. www.example.com -> example.com). + * As it doesn't consult the Public Suffix List, it can sometimes lose detail. + * See the `listOfTlds` comment above for more. + * This function is used only while rendering pre-10.0 LHRs. See UrlUtils.getRootDomain + * for the current method that makes use of PSL. * @param {string|URL} url hostname or URL object * @return {string} */ - static getRootDomain(url) { + static getPseudoRootDomain(url) { const hostname = Util.createOrReturnURL(url).hostname; - const tld = Util.getTld(hostname); + const tld = Util.getPseudoTld(hostname); // tld is .com or .co.uk which means we means that length is 1 to big // .com => 2 & .co.uk => 3 diff --git a/yarn.lock b/yarn.lock index a23a03348cba..b5c7ee29803e 100644 --- a/yarn.lock +++ b/yarn.lock @@ -6938,6 +6938,18 @@ through@2, "through@>=2.2.7 <3", through@^2.3.8: resolved "https://registry.yarnpkg.com/through/-/through-2.3.8.tgz#0dd4c9ffaabc357960b1b724115d7e0e86a2e1f5" integrity sha1-DdTJ/6q8NXlgsbckEV1+Doai4fU= +tldts-core@^6.0.22: + version "6.0.22" + resolved "https://registry.yarnpkg.com/tldts-core/-/tldts-core-6.0.22.tgz#1f4d43eb75f1f2e89e488776128abd7b3bd3f1b6" + integrity sha512-5m5+f69JzLj+QP+5DVgBv0fKjAE0zJaU8kBWx6dN+Tm9cm+OHNDIVNf2dmy3WL+ujECROIPJZHNAr+74hm8ujA== + +tldts@^6.0.22: + version "6.0.22" + resolved "https://registry.yarnpkg.com/tldts/-/tldts-6.0.22.tgz#9a2833b196ebb6704085b0cd07fdfc205eb4d3bd" + integrity sha512-dBxlzF/sbr8DBCI6To3gMUzTgoz7P8qrnZsfF+nYGkjEfcPaOUkwtJMjLzde4dN7xyjDLMIS5+uxChhYaFzRKw== + dependencies: + tldts-core "^6.0.22" + tmp@^0.2.1: version "0.2.1" resolved "https://registry.yarnpkg.com/tmp/-/tmp-0.2.1.tgz#8457fc3037dcf4719c251367a1af6500ee1ccf14"