Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

core(entity-classification): integrate public-suffix-list into LH #15641

Merged
merged 4 commits into from
Dec 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion core/computed/entity-classification.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ class EntityClassification {
// Make up an entity only for valid http/https URLs.
if (!parsedUrl.protocol.startsWith('http')) return;

const rootDomain = Util.getRootDomain(url);
const rootDomain = UrlUtils.getRootDomain(url);
if (!rootDomain) return;
if (entityCache.has(rootDomain)) return entityCache.get(rootDomain);

Expand Down
4 changes: 2 additions & 2 deletions core/computed/resource-summary.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import {makeComputedArtifact} from './computed-artifact.js';
import {NetworkRecords} from './network-records.js';
import {NetworkRequest} from '../lib/network-request.js';
import {Budget} from '../config/budget.js';
import {Util} from '../../shared/util.js';
import UrlUtils from '../lib/url-utils.js';

/** @typedef {{count: number, resourceSize: number, transferSize: number}} ResourceEntry */

Expand Down Expand Up @@ -59,7 +59,7 @@ class ResourceSummary {
firstPartyHosts = budget.options.firstPartyHostnames;
} else {
firstPartyHosts = classifiedEntities.firstParty?.domains.map(domain => `*.${domain}`) ||
[`*.${Util.getRootDomain(URLArtifact.finalDisplayedUrl)}`];
[`*.${UrlUtils.getRootDomain(URLArtifact.finalDisplayedUrl)}`];
}

networkRecords.filter(record => {
Expand Down
16 changes: 14 additions & 2 deletions core/lib/url-utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
* SPDX-License-Identifier: Apache-2.0
*/

import {getDomain} from 'tldts';

import {Util} from '../../shared/util.js';
import {LighthouseError} from './lh-error.js';

Expand Down Expand Up @@ -99,6 +101,16 @@
}
}

/**
* Returns a primary domain for provided hostname (e.g. www.example.com -> example.com).
* @param {string|URL} url hostname or URL object
* @return {string}
*/
static getRootDomain(url) {
alexnj marked this conversation as resolved.
Show resolved Hide resolved
const parsedUrl = Util.createOrReturnURL(url);
return getDomain(parsedUrl.href) || parsedUrl.hostname;
}

/**
* Check if rootDomains matches
*
Expand All @@ -120,8 +132,8 @@
}

// get the string before the tld
const urlARootDomain = Util.getRootDomain(urlAInfo);
const urlBRootDomain = Util.getRootDomain(urlBInfo);
const urlARootDomain = UrlUtils.getRootDomain(urlAInfo);
const urlBRootDomain = UrlUtils.getRootDomain(urlBInfo);

Check warning on line 136 in core/lib/url-utils.js

View check run for this annotation

Codecov / codecov/patch

core/lib/url-utils.js#L135-L136

Added lines #L135 - L136 were not covered by tests

return urlARootDomain === urlBRootDomain;
}
Expand Down
28 changes: 28 additions & 0 deletions core/test/lib/url-utils-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -396,4 +396,32 @@ describe('UrlUtils', () => {
}).toThrow('INVALID_URL');
});
});

describe('getRootDomain', () => {
it('returns the correct rootDomain from a string from PSL', () => {
assert.equal(UrlUtils.getRootDomain('https://www.example.com/index.html'), 'example.com');
assert.equal(UrlUtils.getRootDomain('https://example.com'), 'example.com');
assert.equal(UrlUtils.getRootDomain('https://www.example.co.uk'), 'example.co.uk');
assert.equal(UrlUtils.getRootDomain('https://example.com.br/app/'), 'example.com.br');
assert.equal(UrlUtils.getRootDomain('https://example.tokyo.jp'), 'example.tokyo.jp');
assert.equal(UrlUtils.getRootDomain('https://sub.example.com'), 'example.com');
assert.equal(UrlUtils.getRootDomain('https://sub.example.tokyo.jp'), 'example.tokyo.jp');
assert.equal(UrlUtils.getRootDomain('http://localhost'), 'localhost');
assert.equal(UrlUtils.getRootDomain('http://localhost:8080'), 'localhost');
assert.equal(UrlUtils.getRootDomain('https://www.hydro.mb.ca'), 'hydro.mb.ca');
});

it('returns the correct rootDomain from an URL object', () => {
assert.equal(UrlUtils.getRootDomain(new URL('https://www.example.com/index.html')), 'example.com');
assert.equal(UrlUtils.getRootDomain(new URL('https://example.com')), 'example.com');
assert.equal(UrlUtils.getRootDomain(new URL('https://www.example.co.uk')), 'example.co.uk');
assert.equal(UrlUtils.getRootDomain(new URL('https://example.com.br/app/')), 'example.com.br');
assert.equal(UrlUtils.getRootDomain(new URL('https://example.tokyo.jp')), 'example.tokyo.jp');
assert.equal(UrlUtils.getRootDomain(new URL('https://sub.example.com')), 'example.com');
assert.equal(UrlUtils.getRootDomain(new URL('https://sub.example.tokyo.jp')), 'example.tokyo.jp');
assert.equal(UrlUtils.getRootDomain(new URL('http://localhost')), 'localhost');
assert.equal(UrlUtils.getRootDomain(new URL('http://localhost:8080')), 'localhost');
assert.equal(UrlUtils.getRootDomain(new URL('https://www.hydro.mb.ca')), 'hydro.mb.ca');
});
});
});
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@
"semver": "^5.3.0",
"speedline-core": "^1.4.3",
"third-party-web": "^0.24.0",
"tldts": "^6.0.22",
"ws": "^7.0.0",
"yargs": "^17.3.1",
"yargs-parser": "^21.0.0"
Expand Down
5 changes: 3 additions & 2 deletions report/renderer/report-ui-features.js
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ export class ReportUIFeatures {
* @return {Array<HTMLElement>}
*/
_getThirdPartyRows(rowEls, finalDisplayedUrl) {
const finalDisplayedUrlRootDomain = Util.getRootDomain(finalDisplayedUrl);
const finalDisplayedUrlEntity = Util.getEntityFromUrl(finalDisplayedUrl, this.json.entities);
const firstPartyEntityName = this.json.entities?.find(e => e.isFirstParty === true)?.name;

/** @type {Array<HTMLElement>} */
Expand All @@ -337,7 +337,8 @@ export class ReportUIFeatures {
if (!urlItem) continue;
const datasetUrl = urlItem.dataset.url;
if (!datasetUrl) continue;
const isThirdParty = Util.getRootDomain(datasetUrl) !== finalDisplayedUrlRootDomain;
const isThirdParty =
Util.getEntityFromUrl(datasetUrl, this.json.entities) !== finalDisplayedUrlEntity;
if (!isThirdParty) continue;
}

Expand Down
48 changes: 24 additions & 24 deletions shared/test/util-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,38 +9,38 @@ import assert from 'assert/strict';
import {Util} from '../util.js';

describe('util helpers', () => {
describe('getTld', () => {
describe('getPseudoTld', () => {
it('returns the correct tld', () => {
assert.equal(Util.getTld('example.com'), '.com');
assert.equal(Util.getTld('example.co.uk'), '.co.uk');
assert.equal(Util.getTld('example.com.br'), '.com.br');
assert.equal(Util.getTld('example.tokyo.jp'), '.jp');
assert.equal(Util.getPseudoTld('example.com'), '.com');
assert.equal(Util.getPseudoTld('example.co.uk'), '.co.uk');
assert.equal(Util.getPseudoTld('example.com.br'), '.com.br');
assert.equal(Util.getPseudoTld('example.tokyo.jp'), '.jp');
});
});

describe('getRootDomain', () => {
describe('getPseudoRootDomain', () => {
it('returns the correct rootDomain from a string', () => {
assert.equal(Util.getRootDomain('https://www.example.com/index.html'), 'example.com');
assert.equal(Util.getRootDomain('https://example.com'), 'example.com');
assert.equal(Util.getRootDomain('https://www.example.co.uk'), 'example.co.uk');
assert.equal(Util.getRootDomain('https://example.com.br/app/'), 'example.com.br');
assert.equal(Util.getRootDomain('https://example.tokyo.jp'), 'tokyo.jp');
assert.equal(Util.getRootDomain('https://sub.example.com'), 'example.com');
assert.equal(Util.getRootDomain('https://sub.example.tokyo.jp'), 'tokyo.jp');
assert.equal(Util.getRootDomain('http://localhost'), 'localhost');
assert.equal(Util.getRootDomain('http://localhost:8080'), 'localhost');
assert.equal(Util.getPseudoRootDomain('https://www.example.com/index.html'), 'example.com');
assert.equal(Util.getPseudoRootDomain('https://example.com'), 'example.com');
assert.equal(Util.getPseudoRootDomain('https://www.example.co.uk'), 'example.co.uk');
assert.equal(Util.getPseudoRootDomain('https://example.com.br/app/'), 'example.com.br');
assert.equal(Util.getPseudoRootDomain('https://example.tokyo.jp'), 'tokyo.jp');
assert.equal(Util.getPseudoRootDomain('https://sub.example.com'), 'example.com');
assert.equal(Util.getPseudoRootDomain('https://sub.example.tokyo.jp'), 'tokyo.jp');
assert.equal(Util.getPseudoRootDomain('http://localhost'), 'localhost');
assert.equal(Util.getPseudoRootDomain('http://localhost:8080'), 'localhost');
});

it('returns the correct rootDomain from an URL object', () => {
assert.equal(Util.getRootDomain(new URL('https://www.example.com/index.html')), 'example.com');
assert.equal(Util.getRootDomain(new URL('https://example.com')), 'example.com');
assert.equal(Util.getRootDomain(new URL('https://www.example.co.uk')), 'example.co.uk');
assert.equal(Util.getRootDomain(new URL('https://example.com.br/app/')), 'example.com.br');
assert.equal(Util.getRootDomain(new URL('https://example.tokyo.jp')), 'tokyo.jp');
assert.equal(Util.getRootDomain(new URL('https://sub.example.com')), 'example.com');
assert.equal(Util.getRootDomain(new URL('https://sub.example.tokyo.jp')), 'tokyo.jp');
assert.equal(Util.getRootDomain(new URL('http://localhost')), 'localhost');
assert.equal(Util.getRootDomain(new URL('http://localhost:8080')), 'localhost');
assert.equal(Util.getPseudoRootDomain(new URL('https://www.example.com/index.html')), 'example.com');
assert.equal(Util.getPseudoRootDomain(new URL('https://example.com')), 'example.com');
assert.equal(Util.getPseudoRootDomain(new URL('https://www.example.co.uk')), 'example.co.uk');
assert.equal(Util.getPseudoRootDomain(new URL('https://example.com.br/app/')), 'example.com.br');
assert.equal(Util.getPseudoRootDomain(new URL('https://example.tokyo.jp')), 'tokyo.jp');
assert.equal(Util.getPseudoRootDomain(new URL('https://sub.example.com')), 'example.com');
assert.equal(Util.getPseudoRootDomain(new URL('https://sub.example.tokyo.jp')), 'tokyo.jp');
assert.equal(Util.getPseudoRootDomain(new URL('http://localhost')), 'localhost');
assert.equal(Util.getPseudoRootDomain(new URL('http://localhost:8080')), 'localhost');
});
});

Expand Down
28 changes: 25 additions & 3 deletions shared/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,23 @@ class Util {
return details;
}

/**
* Given the entity classification dataset and a URL, identify the entity.
* @param {string} url
* @param {LH.Result.Entities=} entities
* @return {LH.Result.LhrEntity|string}
*/
static getEntityFromUrl(url, entities) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the subtlety here kinda goes over my head..

so we use the new PSL-powered getrootdomain in core/c/entity-classification
but out in report we use this method which leverages the boring-basic getrootdomain..

ya?

this does fix #15623 ? my brain is losing track of when the url data is set vs tweaked for display...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On the audit side, we use the PSL library. We don't want to carry that much JS payload to the report though. Fortunately, there are only a few places on the report side that uses getRootDomain() — to convert a URL to its entity name for entity-grouping/lookup, and for legacy report third-party filtering, for example. For those, we already have keyed the entity-classification set with root-domains as entity names, so I rewrote them as entity-name comparisons.

This weird |string return value is to support pre-10.0 LHRs, where we dont have entity classification data. For those, we fall back to getLegacyRootDomain and convert them to string comparisons.

It does fix #15623. The entity identified is the proper domain (vs. mb.ca before). Did you give it a try?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok cool thanks.

Did you give it a try?

nope but i knew you would've.

// If it's a pre-v10 LHR, we don't have entities, so match against the root-ish domain
if (!entities) {
alexnj marked this conversation as resolved.
Show resolved Hide resolved
return Util.getPseudoRootDomain(url);
}

const entity = entities.find(e => e.origins.find(origin => url.startsWith(origin)));
// This fallback case would be unexpected, but leaving for safety.
return entity || Util.getPseudoRootDomain(url);
}

/**
* Split a string by markdown code spans (enclosed in `backticks`), splitting
* into segments that were enclosed in backticks (marked as `isCode === true`)
Expand Down Expand Up @@ -292,11 +309,12 @@ class Util {

/**
* Gets the tld of a domain
* This function is used only while rendering pre-10.0 LHRs.
*
* @param {string} hostname
* @return {string} tld
*/
static getTld(hostname) {
static getPseudoTld(hostname) {
const tlds = hostname.split('.').slice(-2);

if (!listOfTlds.includes(tlds[0])) {
Expand All @@ -308,12 +326,16 @@ class Util {

/**
* Returns a primary domain for provided hostname (e.g. www.example.com -> example.com).
* As it doesn't consult the Public Suffix List, it can sometimes lose detail.
* See the `listOfTlds` comment above for more.
* This function is used only while rendering pre-10.0 LHRs. See UrlUtils.getRootDomain
* for the current method that makes use of PSL.
* @param {string|URL} url hostname or URL object
alexnj marked this conversation as resolved.
Show resolved Hide resolved
* @return {string}
*/
static getRootDomain(url) {
static getPseudoRootDomain(url) {
const hostname = Util.createOrReturnURL(url).hostname;
const tld = Util.getTld(hostname);
const tld = Util.getPseudoTld(hostname);

// tld is .com or .co.uk which means we means that length is 1 to big
// .com => 2 & .co.uk => 3
Expand Down
12 changes: 12 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -6938,6 +6938,18 @@ through@2, "through@>=2.2.7 <3", through@^2.3.8:
resolved "https://registry.yarnpkg.com/through/-/through-2.3.8.tgz#0dd4c9ffaabc357960b1b724115d7e0e86a2e1f5"
integrity sha1-DdTJ/6q8NXlgsbckEV1+Doai4fU=

tldts-core@^6.0.22:
version "6.0.22"
resolved "https://registry.yarnpkg.com/tldts-core/-/tldts-core-6.0.22.tgz#1f4d43eb75f1f2e89e488776128abd7b3bd3f1b6"
integrity sha512-5m5+f69JzLj+QP+5DVgBv0fKjAE0zJaU8kBWx6dN+Tm9cm+OHNDIVNf2dmy3WL+ujECROIPJZHNAr+74hm8ujA==

tldts@^6.0.22:
version "6.0.22"
resolved "https://registry.yarnpkg.com/tldts/-/tldts-6.0.22.tgz#9a2833b196ebb6704085b0cd07fdfc205eb4d3bd"
integrity sha512-dBxlzF/sbr8DBCI6To3gMUzTgoz7P8qrnZsfF+nYGkjEfcPaOUkwtJMjLzde4dN7xyjDLMIS5+uxChhYaFzRKw==
dependencies:
tldts-core "^6.0.22"

tmp@^0.2.1:
version "0.2.1"
resolved "https://registry.yarnpkg.com/tmp/-/tmp-0.2.1.tgz#8457fc3037dcf4719c251367a1af6500ee1ccf14"
Expand Down
Loading