Skip to content

Commit

Permalink
Fix home page filtering for summary requests (#156)
Browse files Browse the repository at this point in the history
* Fix home page filtering for summary requests

* Add `metadata` attribute to `status_info` to make it available for filtering downstream

* Update unit tests

* Check metadata object type
  • Loading branch information
giancarloaf authored Dec 18, 2022
1 parent 15aec22 commit 83b3455
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 3 deletions.
4 changes: 3 additions & 1 deletion modules/transformation.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ def initialize_status_info(file_name, page):
"rank": utils.clamp_integer(metadata["rank"]) if metadata.get("rank") else None,
"date": "{:%Y_%m_%d}".format(date),
"client": metadata.get("layout", client_name).lower(),
"metadata": metadata,
}

@staticmethod
Expand Down Expand Up @@ -219,6 +220,7 @@ def summarize_entry(entry, first_url, first_html_url, entry_number, status_info)
"date": status_info["date"],
"pageid": status_info["pageid"],
"crawlid": status_info["crawlid"],
"metadata": status_info["metadata"],
# we use this below for expAge calculation
"startedDateTime": utils.datetime_to_epoch(
entry["startedDateTime"], status_info
Expand Down Expand Up @@ -442,7 +444,7 @@ def import_page(page, status_info):
)

return {
"metadata": json.dumps(page.get("_metadata")), # TODO TEST ME
"metadata": json.dumps(status_info["metadata"]),
"client": status_info["client"],
"date": status_info["date"],
"pageid": status_info["pageid"],
Expand Down
5 changes: 4 additions & 1 deletion modules/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,10 @@ def is_home_page(element):
metadata = element.get("metadata")
if metadata:
# use metadata.crawl_depth starting from 2022-05
return json.loads(metadata).get("crawl_depth", 0) == 0
if isinstance(metadata, dict):
return metadata.get("crawl_depth", 0) == 0
else:
return json.loads(metadata).get("crawl_depth", 0) == 0
else:
# legacy crawl data is all home-page only (i.e. no secondary pages)
return True
Expand Down
11 changes: 10 additions & 1 deletion test/test_transformation.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ class TestHarJsonToSummary(TestCase):
"rank": 10000000,
"date": "2022_01_01",
"client": "desktop",
"metadata": {
"layout": "Desktop",
"page_id": 12345,
"rank": 10000000,
"tested_url": "https://www.test.com/",
},
}

file_name_fixture = "chrome-Jan_1_2022/220101_Dx1.har.gz"
Expand Down Expand Up @@ -88,6 +94,7 @@ def test_initialize_status_info_missing_metadata(self):
):
expected_status_info = copy.deepcopy(self.expected_status_info_fixture)
expected_status_info[status_info_key] = expected
expected_status_info["metadata"].pop(metadata_key, None)

page = copy.deepcopy(self.page_fixture)
page["_metadata"].pop(metadata_key, None)
Expand All @@ -110,8 +117,10 @@ def test_initialize_status_info_default_testid(self):
def test_initialize_status_info_default_layout(self):
page = copy.deepcopy(self.page_fixture)
page["_metadata"].pop("layout", None)
expected_status_info = copy.deepcopy(self.expected_status_info_fixture)
expected_status_info["metadata"].pop("layout", None)
self.assertDictEqual(
self.expected_status_info_fixture,
expected_status_info,
HarJsonToSummary.initialize_status_info(self.file_name_fixture, page),
)

Expand Down

0 comments on commit 83b3455

Please sign in to comment.