From f1819585b0a4980a9305689c93be772c4a424084 Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Mon, 4 Nov 2024 09:28:22 -0500 Subject: [PATCH 1/2] Use DeepDiff library for a/b record diffing Why these changes are being introduced: Originally, the library jsondiff was used for getting a diff between the A and B versions of a record. This was selected based on prior use and experience. For the most part, it was a nice fit, but it lacked the ability to ignore changes in array order when reporting a diff. This resulted in a high number of records showing a diff when the only change was a differently ordered array. How this addresses that need: * The DeepDiff library has built-in support for 'ignore_order' when generating diffs, making it a great fit * DeepDiff is equally fast for generating diffs * Our primary, current use of diffs is to highlight records where a TIMDEX field changed, which is conveniently a "root" field on the JSON object. DeepDiff provides an attribute on the diff object that returns an explicit list of modified "root" fields, which is just what we need, further simplifying some logic in ABDiff to parse this information. Overall, DeepDiff is a more configurable library, with built-in options that align closely with our use case. 
Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-373 --- Pipfile | 2 +- Pipfile.lock | 409 +++++++++++++++------------------ abdiff/core/calc_ab_diffs.py | 57 +++-- abdiff/core/calc_ab_metrics.py | 31 ++- pyproject.toml | 2 +- tests/test_calc_ab_diffs.py | 61 ++++- 6 files changed, 299 insertions(+), 263 deletions(-) diff --git a/Pipfile b/Pipfile index 8cf9c20..8f87bda 100644 --- a/Pipfile +++ b/Pipfile @@ -14,8 +14,8 @@ pygit2 = "*" types-docker = "*" types-pygit2 = "*" flask = "*" -jsondiff = "*" boto3 = "*" +deepdiff = "*" [dev-packages] black = "*" diff --git a/Pipfile.lock b/Pipfile.lock index f6fa651..1ff4aa9 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "f69aed4c7e9fc70ed7cd5699d9903216597857c291f59d98715b3602e796033e" + "sha256": "4811e3bc6c1ebe8265070fcf866a2b88abf2ed5c77182164fdb978c2758bac23" }, "pipfile-spec": 6, "requires": { @@ -26,20 +26,20 @@ }, "boto3": { "hashes": [ - "sha256:a839ce09a844d92e0039f95851e88da9df80c89ebb4c7818b3e78247fd97a8a7", - "sha256:c9bab807b372d5b076d6aeb1d6513131fa0b74e32d8895128f8568b6521296ea" + "sha256:a9c0955df0b52b43749d81bde159343a40ea2a3537a46049336fe8193871b18e", + "sha256:f4124548bb831e13504e805f2fbbfcee06df42fffea0655862c6eb9b95d6d1be" ], "index": "pypi", "markers": "python_version >= '3.8'", - "version": "==1.35.46" + "version": "==1.35.53" }, "botocore": { "hashes": [ - "sha256:8bbc9a55cae65a8db7f2e33ff087f4dbfc13fce868e8e3c5273ce9af367a555a", - "sha256:8c0ff5fdd611a28f5752189d171c69690dbc484fa06d74376890bb0543ec3dc1" + "sha256:12869640f2f9fab3152ea312a6906d5bc6ae15522cc74b6367ee1c273269a28b", + "sha256:e610ae076ad1eaed5680d3990493659bbabdffd67b15c61d8373a23e4bc41062" ], "markers": "python_version >= '3.8'", - "version": "==1.35.46" + "version": "==1.35.53" }, "certifi": { "hashes": [ @@ -242,6 +242,15 @@ "markers": "python_version >= '3.7'", "version": "==8.1.7" }, + "deepdiff": { + "hashes": [ + 
"sha256:245599a4586ab59bb599ca3517a9c42f3318ff600ded5e80a3432693c8ec3c4b", + "sha256:42e99004ce603f9a53934c634a57b04ad5900e0d8ed0abb15e635767489cbc05" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==8.0.1" + }, "docker": { "hashes": [ "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c", @@ -451,15 +460,6 @@ "markers": "python_version >= '3.7'", "version": "==1.0.1" }, - "jsondiff": { - "hashes": [ - "sha256:658d162c8a86ba86de26303cd86a7b37e1b2c1ec98b569a60e2ca6180545f7fe", - "sha256:b1f0f7e2421881848b1d556d541ac01a91680cfcc14f51a9b62cdf4da0e56722" - ], - "index": "pypi", - "markers": "python_version >= '3.8'", - "version": "==2.2.1" - }, "markupsafe": { "hashes": [ "sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4", @@ -583,9 +583,17 @@ "sha256:faa88bc527d0f097abdc2c663cddf37c05a1c2f113716601555249805cf573f1", "sha256:fc44e3c68ff00fd991b59092a54350e6e4911152682b4782f68070985aa9e648" ], - "markers": "python_version >= '3.10'", + "markers": "python_version >= '3.12'", "version": "==2.1.2" }, + "orderly-set": { + "hashes": [ + "sha256:52a18b86aaf3f5d5a498bbdb27bf3253a4e5c57ab38e5b7a56fa00115cd28448", + "sha256:f7a37c95a38c01cdfe41c3ffb62925a318a2286ea0a41790c057fc802aec54da" + ], + "markers": "python_version >= '3.8'", + "version": "==5.2.2" + }, "pandas": { "hashes": [ "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a", @@ -637,46 +645,52 @@ }, "pyarrow": { "hashes": [ - "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a", - "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca", - "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597", - "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c", - "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb", - "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977", - 
"sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3", - "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687", - "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7", - "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204", - "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28", - "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087", - "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15", - "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc", - "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2", - "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155", - "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df", - "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22", - "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a", - "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b", - "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03", - "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda", - "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07", - "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204", - "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b", - "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c", - "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545", - "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655", - "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420", - "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5", - "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4", - "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8", 
- "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053", - "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145", - "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047", - "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8" + "sha256:00178509f379415a3fcf855af020e3340254f990a8534294ec3cf674d6e255fd", + "sha256:03f40b65a43be159d2f97fd64dc998f769d0995a50c00f07aab58b0b3da87e1f", + "sha256:082ba62bdcb939824ba1ce10b8acef5ab621da1f4c4805e07bfd153617ac19d4", + "sha256:09f30690b99ce34e0da64d20dab372ee54431745e4efb78ac938234a282d15f9", + "sha256:2333f93260674e185cfbf208d2da3007132572e56871f451ba1a556b45dae6e2", + "sha256:28f9c39a56d2c78bf6b87dcc699d520ab850919d4a8c7418cd20eda49874a2ea", + "sha256:2c664ab88b9766413197733c1720d3dcd4190e8fa3bbdc3710384630a0a7207b", + "sha256:2c992716cffb1088414f2b478f7af0175fd0a76fea80841b1706baa8fb0ebaad", + "sha256:2e549a748fa8b8715e734919923f69318c953e077e9c02140ada13e59d043310", + "sha256:320ae9bd45ad7ecc12ec858b3e8e462578de060832b98fc4d671dee9f10d9954", + "sha256:336addb8b6f5208be1b2398442c703a710b6b937b1a046065ee4db65e782ff5a", + "sha256:3ac24b2be732e78a5a3ac0b3aa870d73766dd00beba6e015ea2ea7394f8b4e55", + "sha256:45476490dd4adec5472c92b4d253e245258745d0ccaabe706f8d03288ed60a79", + "sha256:4c381857754da44326f3a49b8b199f7f87a51c2faacd5114352fc78de30d3aba", + "sha256:4d5ca5d707e158540312e09fd907f9f49bacbe779ab5236d9699ced14d2293b8", + "sha256:58a62549a3e0bc9e03df32f350e10e1efb94ec6cf63e3920c3385b26663948ce", + "sha256:5f0510608ccd6e7f02ca8596962afb8c6cc84c453e7be0da4d85f5f4f7b0328a", + "sha256:603cd8ad4976568954598ef0a6d4ed3dfb78aff3d57fa8d6271f470f0ce7d34f", + "sha256:606e9a3dcb0f52307c5040698ea962685fb1c852d72379ee9412be7de9c5f9e2", + "sha256:616ea2826c03c16e87f517c46296621a7c51e30400f6d0a61be645f203aa2b93", + "sha256:66dcc216ebae2eb4c37b223feaf82f15b69d502821dde2da138ec5a3716e7463", + 
"sha256:6dd1b52d0d58dd8f685ced9971eb49f697d753aa7912f0a8f50833c7a7426319", + "sha256:871b292d4b696b09120ed5bde894f79ee2a5f109cb84470546471df264cae136", + "sha256:8c70c1965cde991b711a98448ccda3486f2a336457cf4ec4dca257a926e149c9", + "sha256:8f40ec677e942374e3d7f2fad6a67a4c2811a8b975e8703c6fd26d3b168a90e2", + "sha256:907ee0aa8ca576f5e0cdc20b5aeb2ad4d3953a3b4769fc4b499e00ef0266f02f", + "sha256:a1824f5b029ddd289919f354bc285992cb4e32da518758c136271cf66046ef22", + "sha256:a6aa027b1a9d2970cf328ccd6dbe4a996bc13c39fd427f502782f5bdb9ca20f5", + "sha256:a71ab0589a63a3e987beb2bc172e05f000a5c5be2636b4b263c44034e215b5d7", + "sha256:b30a927c6dff89ee702686596f27c25160dd6c99be5bcc1513a763ae5b1bfc03", + "sha256:b46591222c864e7da7faa3b19455196416cd8355ff6c2cc2e65726a760a3c420", + "sha256:b5bd7fd32e3ace012d43925ea4fc8bd1b02cc6cc1e9813b518302950e89b5a22", + "sha256:bc1daf7c425f58527900876354390ee41b0ae962a73ad0959b9d829def583bb1", + "sha256:bc97316840a349485fbb137eb8d0f4d7057e1b2c1272b1a20eebbbe1848f5122", + "sha256:be08af84808dff63a76860847c48ec0416928a7b3a17c2f49a072cac7c45efbd", + "sha256:d5795e37c0a33baa618c5e054cd61f586cf76850a251e2b21355e4085def6280", + "sha256:d6331f280c6e4521c69b201a42dd978f60f7e129511a55da9e0bfe426b4ebb8d", + "sha256:dc892be34dbd058e8d189b47db1e33a227d965ea8805a235c8a7286f7fd17d3a", + "sha256:e7ab04f272f98ebffd2a0661e4e126036f6936391ba2889ed2d44c5006237802", + "sha256:eb7e3abcda7e1e6b83c2dc2909c8d045881017270a119cc6ee7fdcfe71d02df8", + "sha256:f1a198a50c409ab2d009fbf20956ace84567d67f2c5701511d4dd561fae6f32e", + "sha256:fe92efcdbfa0bcf2fa602e466d7f2905500f33f09eb90bf0bcf2e6ca41b574c8" ], "index": "pypi", - "markers": "python_version >= '3.8'", - "version": "==17.0.0" + "markers": "python_version >= '3.9'", + "version": "==18.0.0" }, "pycparser": { "hashes": [ @@ -737,65 +751,6 @@ ], "version": "==2024.2" }, - "pyyaml": { - "hashes": [ - "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff", - 
"sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48", - "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086", - "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e", - "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133", - "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5", - "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484", - "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee", - "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5", - "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68", - "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a", - "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf", - "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99", - "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8", - "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85", - "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19", - "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc", - "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a", - "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1", - "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317", - "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c", - "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631", - "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d", - "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652", - "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5", - "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e", - "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b", 
- "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8", - "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476", - "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706", - "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", - "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237", - "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b", - "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083", - "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180", - "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425", - "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e", - "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f", - "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725", - "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183", - "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab", - "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774", - "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725", - "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e", - "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5", - "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d", - "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290", - "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44", - "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed", - "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4", - "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba", - "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12", - 
"sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4" - ], - "markers": "python_version >= '3.8'", - "version": "==6.0.2" - }, "requests": { "hashes": [ "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760", @@ -856,11 +811,11 @@ }, "types-setuptools": { "hashes": [ - "sha256:2e48ff3acd4919471e80d5e3f049cce5c177e108d5d36d2d4cee3fa4d4104258", - "sha256:86ea31b5f6df2c6b8f2dc8ae3f72b213607f62549b6fa2ed5866e5299f968694" + "sha256:2949913a518d5285ce00a3b7d88961c80a6e72ffb8f3da0a3f5650ea533bd45e", + "sha256:6721ac0f1a620321e2ccd87a9a747c4a383dc381f78d894ce37f2455b45fcf1c" ], "markers": "python_version >= '3.8'", - "version": "==75.2.0.20241019" + "version": "==75.2.0.20241025" }, "tzdata": { "hashes": [ @@ -880,11 +835,11 @@ }, "werkzeug": { "hashes": [ - "sha256:02c9eb92b7d6c06f31a782811505d2157837cea66aaede3e217c7c27c039476c", - "sha256:34f2371506b250df4d4f84bfe7b0921e4762525762bbd936614909fe25cd7306" + "sha256:8cd39dfbdfc1e051965f156163e2974e52c210f130810e9ad36858f0fd3edad4", + "sha256:a71124d1ef06008baafa3d266c02f56e1836a5984afd6dd6c9230669d60d9fb5" ], - "markers": "python_version >= '3.8'", - "version": "==3.0.4" + "markers": "python_version >= '3.9'", + "version": "==3.1.1" } }, "develop": { @@ -1212,12 +1167,12 @@ }, "ipython": { "hashes": [ - "sha256:0d0d15ca1e01faeb868ef56bc7ee5a0de5bd66885735682e8a322ae289a13d1a", - "sha256:530ef1e7bb693724d3cdc37287c80b07ad9b25986c007a53aa1857272dac3f35" + "sha256:0188a1bd83267192123ccea7f4a8ed0a78910535dbaa3f37671dca76ebd429c8", + "sha256:40b60e15b22591450eef73e40a027cf77bd652e757523eebc5bd7c7c498290eb" ], "index": "pypi", "markers": "python_version >= '3.10'", - "version": "==8.28.0" + "version": "==8.29.0" }, "jedi": { "hashes": [ @@ -1237,42 +1192,42 @@ }, "mypy": { "hashes": [ - "sha256:02dcfe270c6ea13338210908f8cadc8d31af0f04cee8ca996438fe6a97b4ec66", - "sha256:0dcc1e843d58f444fce19da4cce5bd35c282d4bde232acdeca8279523087088a", - 
"sha256:0e6fe449223fa59fbee351db32283838a8fee8059e0028e9e6494a03802b4004", - "sha256:1230048fec1380faf240be6385e709c8570604d2d27ec6ca7e573e3bc09c3735", - "sha256:186e0c8346efc027ee1f9acf5ca734425fc4f7dc2b60144f0fbe27cc19dc7931", - "sha256:19bf51f87a295e7ab2894f1d8167622b063492d754e69c3c2fed6563268cb42a", - "sha256:20db6eb1ca3d1de8ece00033b12f793f1ea9da767334b7e8c626a4872090cf02", - "sha256:389e307e333879c571029d5b93932cf838b811d3f5395ed1ad05086b52148fb0", - "sha256:3d7d4371829184e22fda4015278fbfdef0327a4b955a483012bd2d423a788801", - "sha256:427878aa54f2e2c5d8db31fa9010c599ed9f994b3b49e64ae9cd9990c40bd635", - "sha256:4ee5932370ccf7ebf83f79d1c157a5929d7ea36313027b0d70a488493dc1b179", - "sha256:5fcde63ea2c9f69d6be859a1e6dd35955e87fa81de95bc240143cf00de1f7f81", - "sha256:673ba1140a478b50e6d265c03391702fa11a5c5aff3f54d69a62a48da32cb811", - "sha256:8135ffec02121a75f75dc97c81af7c14aa4ae0dda277132cfcd6abcd21551bfd", - "sha256:843826966f1d65925e8b50d2b483065c51fc16dc5d72647e0236aae51dc8d77e", - "sha256:94b2048a95a21f7a9ebc9fbd075a4fcd310410d078aa0228dbbad7f71335e042", - "sha256:96af62050971c5241afb4701c15189ea9507db89ad07794a4ee7b4e092dc0627", - "sha256:9fb83a7be97c498176fb7486cafbb81decccaef1ac339d837c377b0ce3743a7f", - "sha256:9fe20f89da41a95e14c34b1ddb09c80262edcc295ad891f22cc4b60013e8f78d", - "sha256:a5a437c9102a6a252d9e3a63edc191a3aed5f2fcb786d614722ee3f4472e33f6", - "sha256:a7b76fa83260824300cc4834a3ab93180db19876bce59af921467fd03e692810", - "sha256:b16fe09f9c741d85a2e3b14a5257a27a4f4886c171d562bc5a5e90d8591906b8", - "sha256:b947097fae68004b8328c55161ac9db7d3566abfef72d9d41b47a021c2fba6b1", - "sha256:ce561a09e3bb9863ab77edf29ae3a50e65685ad74bba1431278185b7e5d5486e", - "sha256:d34167d43613ffb1d6c6cdc0cc043bb106cac0aa5d6a4171f77ab92a3c758bcc", - "sha256:d54d840f6c052929f4a3d2aab2066af0f45a020b085fe0e40d4583db52aab4e4", - "sha256:d90da248f4c2dba6c44ddcfea94bb361e491962f05f41990ff24dbd09969ce20", - "sha256:dc6e2a2195a290a7fd5bac3e60b586d77fc88e986eba7feced8b778c373f9afe", 
- "sha256:de5b2a8988b4e1269a98beaf0e7cc71b510d050dce80c343b53b4955fff45f19", - "sha256:e10ba7de5c616e44ad21005fa13450cd0de7caaa303a626147d45307492e4f2d", - "sha256:f59f1dfbf497d473201356966e353ef09d4daec48caeacc0254db8ef633a28a5", - "sha256:f5b3936f7a6d0e8280c9bdef94c7ce4847f5cdfc258fbb2c29a8c1711e8bb96d" + "sha256:0246bcb1b5de7f08f2826451abd947bf656945209b140d16ed317f65a17dc7dc", + "sha256:0291a61b6fbf3e6673e3405cfcc0e7650bebc7939659fdca2702958038bd835e", + "sha256:0730d1c6a2739d4511dc4253f8274cdd140c55c32dfb0a4cf8b7a43f40abfa6f", + "sha256:07de989f89786f62b937851295ed62e51774722e5444a27cecca993fc3f9cd74", + "sha256:100fac22ce82925f676a734af0db922ecfea991e1d7ec0ceb1e115ebe501301a", + "sha256:164f28cb9d6367439031f4c81e84d3ccaa1e19232d9d05d37cb0bd880d3f93c2", + "sha256:20c7ee0bc0d5a9595c46f38beb04201f2620065a93755704e141fcac9f59db2b", + "sha256:3790ded76f0b34bc9c8ba4def8f919dd6a46db0f5a6610fb994fe8efdd447f73", + "sha256:39bb21c69a5d6342f4ce526e4584bc5c197fd20a60d14a8624d8743fffb9472e", + "sha256:3ddb5b9bf82e05cc9a627e84707b528e5c7caaa1c55c69e175abb15a761cec2d", + "sha256:3e38b980e5681f28f033f3be86b099a247b13c491f14bb8b1e1e134d23bb599d", + "sha256:4bde84334fbe19bad704b3f5b78c4abd35ff1026f8ba72b29de70dda0916beb6", + "sha256:51f869f4b6b538229c1d1bcc1dd7d119817206e2bc54e8e374b3dfa202defcca", + "sha256:581665e6f3a8a9078f28d5502f4c334c0c8d802ef55ea0e7276a6e409bc0d82d", + "sha256:5c7051a3461ae84dfb5dd15eff5094640c61c5f22257c8b766794e6dd85e72d5", + "sha256:5d5092efb8516d08440e36626f0153b5006d4088c1d663d88bf79625af3d1d62", + "sha256:6607e0f1dd1fb7f0aca14d936d13fd19eba5e17e1cd2a14f808fa5f8f6d8f60a", + "sha256:7029881ec6ffb8bc233a4fa364736789582c738217b133f1b55967115288a2bc", + "sha256:7b2353a44d2179846a096e25691d54d59904559f4232519d420d64da6828a3a7", + "sha256:7bcb0bb7f42a978bb323a7c88f1081d1b5dee77ca86f4100735a6f541299d8fb", + "sha256:7bfd8836970d33c2105562650656b6846149374dc8ed77d98424b40b09340ba7", + 
"sha256:7f5b7deae912cf8b77e990b9280f170381fdfbddf61b4ef80927edd813163732", + "sha256:8a21be69bd26fa81b1f80a61ee7ab05b076c674d9b18fb56239d72e21d9f4c80", + "sha256:9c250883f9fd81d212e0952c92dbfcc96fc237f4b7c92f56ac81fd48460b3e5a", + "sha256:9f73dba9ec77acb86457a8fc04b5239822df0c14a082564737833d2963677dbc", + "sha256:a0affb3a79a256b4183ba09811e3577c5163ed06685e4d4b46429a271ba174d2", + "sha256:a4c1bfcdbce96ff5d96fc9b08e3831acb30dc44ab02671eca5953eadad07d6d0", + "sha256:a6789be98a2017c912ae6ccb77ea553bbaf13d27605d2ca20a76dfbced631b24", + "sha256:a7b44178c9760ce1a43f544e595d35ed61ac2c3de306599fa59b38a6048e1aa7", + "sha256:bde31fc887c213e223bbfc34328070996061b0833b0a4cfec53745ed61f3519b", + "sha256:c5fc54dbb712ff5e5a0fca797e6e0aa25726c7e72c6a5850cfd2adbc1eb0a372", + "sha256:de2904956dac40ced10931ac967ae63c5089bd498542194b436eb097a9f77bc8" ], "index": "pypi", "markers": "python_version >= '3.8'", - "version": "==1.12.1" + "version": "==1.13.0" }, "mypy-extensions": { "hashes": [ @@ -1346,7 +1301,7 @@ "sha256:faa88bc527d0f097abdc2c663cddf37c05a1c2f113716601555249805cf573f1", "sha256:fc44e3c68ff00fd991b59092a54350e6e4911152682b4782f68070985aa9e648" ], - "markers": "python_version >= '3.10'", + "markers": "python_version >= '3.12'", "version": "==2.1.2" }, "packaging": { @@ -1439,55 +1394,61 @@ }, "pyarrow": { "hashes": [ - "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a", - "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca", - "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597", - "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c", - "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb", - "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977", - "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3", - "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687", - 
"sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7", - "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204", - "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28", - "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087", - "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15", - "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc", - "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2", - "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155", - "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df", - "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22", - "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a", - "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b", - "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03", - "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda", - "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07", - "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204", - "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b", - "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c", - "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545", - "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655", - "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420", - "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5", - "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4", - "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8", - "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053", - "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145", 
- "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047", - "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8" + "sha256:00178509f379415a3fcf855af020e3340254f990a8534294ec3cf674d6e255fd", + "sha256:03f40b65a43be159d2f97fd64dc998f769d0995a50c00f07aab58b0b3da87e1f", + "sha256:082ba62bdcb939824ba1ce10b8acef5ab621da1f4c4805e07bfd153617ac19d4", + "sha256:09f30690b99ce34e0da64d20dab372ee54431745e4efb78ac938234a282d15f9", + "sha256:2333f93260674e185cfbf208d2da3007132572e56871f451ba1a556b45dae6e2", + "sha256:28f9c39a56d2c78bf6b87dcc699d520ab850919d4a8c7418cd20eda49874a2ea", + "sha256:2c664ab88b9766413197733c1720d3dcd4190e8fa3bbdc3710384630a0a7207b", + "sha256:2c992716cffb1088414f2b478f7af0175fd0a76fea80841b1706baa8fb0ebaad", + "sha256:2e549a748fa8b8715e734919923f69318c953e077e9c02140ada13e59d043310", + "sha256:320ae9bd45ad7ecc12ec858b3e8e462578de060832b98fc4d671dee9f10d9954", + "sha256:336addb8b6f5208be1b2398442c703a710b6b937b1a046065ee4db65e782ff5a", + "sha256:3ac24b2be732e78a5a3ac0b3aa870d73766dd00beba6e015ea2ea7394f8b4e55", + "sha256:45476490dd4adec5472c92b4d253e245258745d0ccaabe706f8d03288ed60a79", + "sha256:4c381857754da44326f3a49b8b199f7f87a51c2faacd5114352fc78de30d3aba", + "sha256:4d5ca5d707e158540312e09fd907f9f49bacbe779ab5236d9699ced14d2293b8", + "sha256:58a62549a3e0bc9e03df32f350e10e1efb94ec6cf63e3920c3385b26663948ce", + "sha256:5f0510608ccd6e7f02ca8596962afb8c6cc84c453e7be0da4d85f5f4f7b0328a", + "sha256:603cd8ad4976568954598ef0a6d4ed3dfb78aff3d57fa8d6271f470f0ce7d34f", + "sha256:606e9a3dcb0f52307c5040698ea962685fb1c852d72379ee9412be7de9c5f9e2", + "sha256:616ea2826c03c16e87f517c46296621a7c51e30400f6d0a61be645f203aa2b93", + "sha256:66dcc216ebae2eb4c37b223feaf82f15b69d502821dde2da138ec5a3716e7463", + "sha256:6dd1b52d0d58dd8f685ced9971eb49f697d753aa7912f0a8f50833c7a7426319", + "sha256:871b292d4b696b09120ed5bde894f79ee2a5f109cb84470546471df264cae136", + 
"sha256:8c70c1965cde991b711a98448ccda3486f2a336457cf4ec4dca257a926e149c9", + "sha256:8f40ec677e942374e3d7f2fad6a67a4c2811a8b975e8703c6fd26d3b168a90e2", + "sha256:907ee0aa8ca576f5e0cdc20b5aeb2ad4d3953a3b4769fc4b499e00ef0266f02f", + "sha256:a1824f5b029ddd289919f354bc285992cb4e32da518758c136271cf66046ef22", + "sha256:a6aa027b1a9d2970cf328ccd6dbe4a996bc13c39fd427f502782f5bdb9ca20f5", + "sha256:a71ab0589a63a3e987beb2bc172e05f000a5c5be2636b4b263c44034e215b5d7", + "sha256:b30a927c6dff89ee702686596f27c25160dd6c99be5bcc1513a763ae5b1bfc03", + "sha256:b46591222c864e7da7faa3b19455196416cd8355ff6c2cc2e65726a760a3c420", + "sha256:b5bd7fd32e3ace012d43925ea4fc8bd1b02cc6cc1e9813b518302950e89b5a22", + "sha256:bc1daf7c425f58527900876354390ee41b0ae962a73ad0959b9d829def583bb1", + "sha256:bc97316840a349485fbb137eb8d0f4d7057e1b2c1272b1a20eebbbe1848f5122", + "sha256:be08af84808dff63a76860847c48ec0416928a7b3a17c2f49a072cac7c45efbd", + "sha256:d5795e37c0a33baa618c5e054cd61f586cf76850a251e2b21355e4085def6280", + "sha256:d6331f280c6e4521c69b201a42dd978f60f7e129511a55da9e0bfe426b4ebb8d", + "sha256:dc892be34dbd058e8d189b47db1e33a227d965ea8805a235c8a7286f7fd17d3a", + "sha256:e7ab04f272f98ebffd2a0661e4e126036f6936391ba2889ed2d44c5006237802", + "sha256:eb7e3abcda7e1e6b83c2dc2909c8d045881017270a119cc6ee7fdcfe71d02df8", + "sha256:f1a198a50c409ab2d009fbf20956ace84567d67f2c5701511d4dd561fae6f32e", + "sha256:fe92efcdbfa0bcf2fa602e466d7f2905500f33f09eb90bf0bcf2e6ca41b574c8" ], "index": "pypi", - "markers": "python_version >= '3.8'", - "version": "==17.0.0" + "markers": "python_version >= '3.9'", + "version": "==18.0.0" }, "pyarrow-stubs": { "hashes": [ - "sha256:b7c1085d8cf3224c0fd105a16d063ea3b76d8b38d42044a13200b3e2cbf7c302", - "sha256:c59c8163e30f3146d6b0b060aa67623d5d18b292b17e8a53016906c499920b3c" + "sha256:71255538eaa2d5fc85626a520c642285206acb43eccade36e724a4c1c5153299", + "sha256:d0a7b3f661ebddf53291df959648c84d109055aa48b245686f1f1878c249a2b8" ], "index": "pypi", "markers": "python_version >= 
'3.8' and python_version < '4'", - "version": "==17.9" + "version": "==17.11" }, "pygments": { "hashes": [ @@ -1583,37 +1544,37 @@ }, "ruff": { "hashes": [ - "sha256:0cdf20c2b6ff98e37df47b2b0bd3a34aaa155f59a11182c1303cce79be715628", - "sha256:10842f69c245e78d6adec7e1db0a7d9ddc2fff0621d730e61657b64fa36f207e", - "sha256:194d6c46c98c73949a106425ed40a576f52291c12bc21399eb8f13a0f7073495", - "sha256:1eb54986f770f49edb14f71d33312d79e00e629a57387382200b1ef12d6a4ef9", - "sha256:211d877674e9373d4bb0f1c80f97a0201c61bcd1e9d045b6e9726adc42c156aa", - "sha256:214b88498684e20b6b2b8852c01d50f0651f3cc6118dfa113b4def9f14faaf06", - "sha256:47a86360cf62d9cd53ebfb0b5eb0e882193fc191c6d717e8bef4462bc3b9ea2b", - "sha256:496494d350c7fdeb36ca4ef1c9f21d80d182423718782222c29b3e72b3512737", - "sha256:4b406c2dce5be9bad59f2de26139a86017a517e6bcd2688da515481c05a2cb11", - "sha256:630fce3fefe9844e91ea5bbf7ceadab4f9981f42b704fae011bb8efcaf5d84be", - "sha256:82c2579b82b9973a110fab281860403b397c08c403de92de19568f32f7178598", - "sha256:9af971fe85dcd5eaed8f585ddbc6bdbe8c217fb8fcf510ea6bca5bdfff56040e", - "sha256:ab7d98c7eed355166f367597e513a6c82408df4181a937628dbec79abb2a1fe4", - "sha256:b641c7f16939b7d24b7bfc0be4102c56562a18281f84f635604e8a6989948914", - "sha256:d71672336e46b34e0c90a790afeac8a31954fd42872c1f6adaea1dff76fd44f9", - "sha256:dc452ba6f2bb9cf8726a84aa877061a2462afe9ae0ea1d411c53d226661c601d", - "sha256:f6c968509f767776f524a8430426539587d5ec5c662f6addb6aa25bc2e8195ec", - "sha256:ff4aabfbaaba880e85d394603b9e75d32b0693152e16fa659a3064a85df7fce2" + "sha256:21aae53ab1490a52bf4e3bf520c10ce120987b047c494cacf4edad0ba0888da2", + "sha256:28bd8220f4d8f79d590db9e2f6a0674f75ddbc3847277dd44ac1f8d30684b828", + "sha256:2b14e77293380e475b4e3a7a368e14549288ed2931fce259a6f99978669e844f", + "sha256:576305393998b7bd6c46018f8104ea3a9cb3fa7908c21d8580e3274a3b04b691", + "sha256:5b813ef26db1015953daf476202585512afd6a6862a02cde63f3bafb53d0b2d4", + 
"sha256:7b792468e9804a204be221b14257566669d1db5c00d6bb335996e5cd7004ba80", + "sha256:853277dbd9675810c6826dad7a428d52a11760744508340e66bf46f8be9701d9", + "sha256:9fd67094e77efbea932e62b5d2483006154794040abb3a5072e659096415ae1e", + "sha256:b19fafe261bf741bca2764c14cbb4ee1819b67adb63ebc2db6401dcd652e3748", + "sha256:b73f873b5f52092e63ed540adefc3c36f1f803790ecf2590e1df8bf0a9f72cb8", + "sha256:bb8368cd45bba3f57bb29cbb8d64b4a33f8415d0149d2655c5c8539452ce7760", + "sha256:ccc7e0fc6e0cb3168443eeadb6445285abaae75142ee22b2b72c27d790ab60ba", + "sha256:dba53ed84ac19ae4bfb4ea4bf0172550a2285fa27fbb13e3746f04c80f7fa088", + "sha256:dd8800cbe0254e06b8fec585e97554047fb82c894973f7ff18558eee33d1cb88", + "sha256:e00163fb897d35523c70d71a46fbaa43bf7bf9af0f4534c53ea5b96b2e03397b", + "sha256:f3c54b538633482dc342e9b634d91168fe8cc56b30a4b4f99287f4e339103e88", + "sha256:fa993cfc9f0ff11187e82de874dfc3611df80852540331bc85c75809c93253a8", + "sha256:fd77877a4e43b3a98e5ef4715ba3862105e299af0c48942cc6d51ba3d97dc859" ], "index": "pypi", "markers": "python_version >= '3.7'", - "version": "==0.7.0" + "version": "==0.7.2" }, "setuptools": { "hashes": [ - "sha256:753bb6ebf1f465a1912e19ed1d41f403a79173a9acf66a42e7e6aec45c3c16ec", - "sha256:a7fcb66f68b4d9e8e66b42f9876150a3371558f98fa32222ffaa5bced76406f8" + "sha256:f2504966861356aa38616760c0f66568e535562374995367b4e69c7143cf6bcd", + "sha256:fba5dd4d766e97be1b1681d98712680ae8f2f26d7881245f2ce9e40714f1a686" ], "index": "pypi", "markers": "python_version >= '3.8'", - "version": "==75.2.0" + "version": "==75.3.0" }, "six": { "hashes": [ @@ -1664,11 +1625,11 @@ }, "virtualenv": { "hashes": [ - "sha256:2ca56a68ed615b8fe4326d11a0dca5dfbe8fd68510fb6c6349163bed3c15f2b2", - "sha256:44a72c29cceb0ee08f300b314848c86e57bf8d1f13107a5e671fb9274138d655" + "sha256:142c6be10212543b32c6c45d3d3893dff89112cc588b7d0879ae5a1ec03a47ba", + "sha256:f11f1b8a29525562925f745563bfd48b189450f61fb34c4f9cc79dd5aa32a1f4" ], "markers": "python_version >= '3.8'", - "version": "==20.27.0" + 
"version": "==20.27.1" }, "wcwidth": { "hashes": [ diff --git a/abdiff/core/calc_ab_diffs.py b/abdiff/core/calc_ab_diffs.py index e482f08..d5faf36 100644 --- a/abdiff/core/calc_ab_diffs.py +++ b/abdiff/core/calc_ab_diffs.py @@ -1,3 +1,4 @@ +import json import logging import time from collections.abc import Generator @@ -5,7 +6,7 @@ import pyarrow as pa import pyarrow.dataset as ds -from jsondiff import diff +from deepdiff import DeepDiff from abdiff.core.utils import update_or_create_run_json, write_to_dataset @@ -22,6 +23,7 @@ pa.field("record_a", pa.binary()), pa.field("record_b", pa.binary()), pa.field("ab_diff", pa.string()), + pa.field("modified_timdex_fields", pa.list_(pa.string())), pa.field("has_diff", pa.string()), ) ) @@ -61,23 +63,52 @@ def get_diffed_batches_iter( # convert batch to pandas dataframe and calc values for new columns df = batch.to_pandas() # noqa: PD901 - df["ab_diff"] = df.apply( + + # calculate all diffs and unpack into separate columns + diff_results = df.apply( lambda row: calc_record_diff(row["record_a"], row["record_b"]), axis=1 ) - df["has_diff"] = df["ab_diff"].apply(lambda diff_value: diff_value != "{}") + df["ab_diff"] = diff_results.apply(lambda x: x[0]) + df["modified_timdex_fields"] = diff_results.apply( + lambda x: list(x[1]) if x[1] else [] + ) + df["has_diff"] = diff_results.apply(lambda x: x[2]) yield pa.RecordBatch.from_pandas(df) # type: ignore[attr-defined] -def calc_record_diff(record_a: bytes | None, record_b: bytes | None) -> str | None: - """Calculate symmetric diff from two JSON strings.""" +def calc_record_diff( + record_a: str | bytes | dict | None, + record_b: str | bytes | dict | None, + *, + ignore_order: bool = True, + report_repetition: bool = True, +) -> tuple[str | None, list[str] | None, bool]: + """Calculate diff from two JSON byte strings. + + The DeepDiff library has the property 'affected_root_keys' on the produced diff object + that is very useful for our purposes. 
At this time, we simply want to know if + anything about a particular root level TIMDEX field (e.g. 'dates' or 'title') has + changed which this method provides explicitly. We also serialize the full diff to + JSON via the to_json() method for storage and possible further analysis. + + This method returns a tuple: + - ab_diff: [str] - full diff as JSON + - modified_timdex_fields: list[str] - list of modified root keys (TIMDEX fields) + - has_diff: bool - True/False if any diff present + """ if record_a is None or record_b is None: - return None - - return diff( - record_a.decode(), - record_b.decode(), - syntax="symmetric", - load=True, - dump=True, + return None, None, False + + diff = DeepDiff( + json.loads(record_a) if isinstance(record_a, str | bytes) else record_a, + json.loads(record_b) if isinstance(record_b, str | bytes) else record_b, + ignore_order=ignore_order, + report_repetition=report_repetition, ) + + ab_diff = diff.to_json() + modified_timdex_fields = diff.affected_root_keys + has_diff = bool(modified_timdex_fields) + + return ab_diff, modified_timdex_fields, has_diff diff --git a/abdiff/core/calc_ab_metrics.py b/abdiff/core/calc_ab_metrics.py index 2c12944..2e0a938 100644 --- a/abdiff/core/calc_ab_metrics.py +++ b/abdiff/core/calc_ab_metrics.py @@ -1,6 +1,5 @@ # ruff: noqa: S608 -import json import logging import os import time @@ -48,12 +47,11 @@ def create_record_diff_matrix_dataset( ) -> str: """Create a boolean sparse matrix of modified fields for all records. - This writes a single parquet file with rows for each record, and columns for each - TIMDEX field, and a value of integer 1 if that field has a diff and 0 if not. This - provides a handy way to calculate aggregate metrics for a given field or source in - later steps. The column "has_diff" is also carried over from the diffs dataset to - provide a single column to check if ANY of the field columns indicate a diff for a - record row. 
+ This writes a single parquet file with rows for each record, columns for each TIMDEX + field, and a value of integer 1 if that field has a diff and 0 if not. This provides + a handy way to calculate aggregate metrics for a given field or source in later steps. + The column "has_diff" is also carried over from the diffs dataset to provide a single + column to check if ANY of the field columns indicate a diff for a record row. This code momentarily creates a single dataframe in memory for all rows. This is safe given the nature of the dataframe: there may be 10m rows, and potentially 20-30 @@ -66,32 +64,31 @@ def create_record_diff_matrix_dataset( for i, batch in enumerate( diffs_ds.to_batches( batch_size=batch_size, - columns=["timdex_record_id", "source", "ab_diff", "has_diff"], + columns=["timdex_record_id", "source", "modified_timdex_fields", "has_diff"], ) ): start_time = time.time() batch_df = batch.to_pandas() - # parse diff JSON to dictionary for batch - batch_df["ab_diff"] = batch_df["ab_diff"].apply( - lambda diff_json: json.loads(diff_json) - ) - batch_metrics = [] for _, row in batch_df.iterrows(): record_metrics = { "timdex_record_id": row["timdex_record_id"], "source": row["source"], - "has_diff": 1 if row["has_diff"] == "true" else 0, + "has_diff": (1 if row["has_diff"] == "true" else 0), } - diff_data = row["ab_diff"] - record_metrics.update(generate_field_diff_bools_for_record(diff_data)) + + # for each modified field (root key in diff), set column and value = 1 (True) + if row["modified_timdex_fields"] is not None: + for field in row["modified_timdex_fields"]: + record_metrics[field] = 1 + batch_metrics.append(record_metrics) # build dataframe for batch batch_metrics_df = pd.DataFrame(batch_metrics) batch_metrics_dfs.append(batch_metrics_df) - logger.info(f"batch: {i+1}, elapsed: {time.time()-start_time}") + logger.info(f"batch: {i + 1}, elapsed: {time.time() - start_time}") # concatenate all dataframes into single dataframe for writing and 
replace None with 0 metrics_df = pd.concat(batch_metrics_dfs) diff --git a/pyproject.toml b/pyproject.toml index 73ed296..4bfb68e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ disallow_untyped_defs = true exclude = ["tests/", "output/"] [[tool.mypy.overrides]] -module = ["docker", "docker.models.containers", "duckdb", "duckdb.duckdb", "jsondiff", "pandas", "ijson"] +module = ["docker", "docker.models.containers", "duckdb", "duckdb.duckdb", "deepdiff", "pandas", "ijson"] ignore_missing_imports = true [tool.pytest.ini_options] diff --git a/tests/test_calc_ab_diffs.py b/tests/test_calc_ab_diffs.py index 5a0ed38..6997723 100644 --- a/tests/test_calc_ab_diffs.py +++ b/tests/test_calc_ab_diffs.py @@ -17,22 +17,66 @@ def test_calc_record_diff_has_diff(): a = {"color": "green"} b = {"color": "red"} - assert calc_record_diff(json.dumps(a).encode(), json.dumps(b).encode()) == json.dumps( - {"color": ["green", "red"]} + ab_diff, modified_timdex_fields, has_diff = calc_record_diff(a, b) + assert ab_diff == json.dumps( + {"values_changed": {"root['color']": {"new_value": "red", "old_value": "green"}}} ) + assert modified_timdex_fields == {"color"} + assert has_diff def test_calc_record_diff_no_diff(): a = {"color": "green"} b = a - assert calc_record_diff(json.dumps(a).encode(), json.dumps(b).encode()) == json.dumps( - {} # no diff - ) + ab_diff, modified_timdex_fields, has_diff = calc_record_diff(a, b) + assert ab_diff == json.dumps({}) # no diff + assert not modified_timdex_fields + assert not has_diff def test_calc_record_diff_one_input_is_none(): a = {"color": "green"} - assert calc_record_diff(json.dumps(a).encode(), None) is None + assert calc_record_diff(a, None) == (None, None, False) + + +def test_calc_record_diff_array_by_default_order_not_a_diff(): + """Arrays with the same values, but differently ordered, not considered a diff.""" + a = {"colors": ["green", "red"]} + b = {"colors": ["red", "green"]} + ab_diff, modified_timdex_fields, has_diff = 
calc_record_diff(a, b) + assert ab_diff == json.dumps({}) # no diff + assert not modified_timdex_fields + assert not has_diff + + +def test_calc_record_diff_array_set_flag_order_is_a_diff(): + """Differently ordered arrays with the same values are a diff when order matters.""" + a = {"colors": ["green", "red"]} + b = {"colors": ["red", "green"]} + _, _, has_diff = calc_record_diff(a, b, ignore_order=False) + assert has_diff + + +def test_calc_record_diff_array_repetition_is_reported_when_diff(): + """Same array values, but different in repetition, is considered a diff.""" + a = {"colors": ["red", "green"]} + b = {"colors": ["red", "green", "green"]} + ab_diff, modified_timdex_fields, has_diff = calc_record_diff(a, b) + assert ab_diff == json.dumps( + { + "repetition_change": { + "root['colors'][1]": { + "old_repeat": 1, + "new_repeat": 2, + "old_indexes": [1], + "new_indexes": [1, 2], + "value": "green", + } + } + } + ) + assert modified_timdex_fields == {"colors"} + assert has_diff def test_diffed_batches_yields_pyarrow_record_batch(collated_dataset): @@ -46,7 +90,9 @@ def test_diffed_batches_first_batch_has_diff(collated_dataset): batch_one = next(batch_iter).to_pandas() row = batch_one.iloc[0] - assert row.ab_diff == json.dumps({"color": ["green", "red"]}) + assert row.ab_diff == json.dumps( + {"values_changed": {"root['color']": {"new_value": "red", "old_value": "green"}}} + ) assert row.has_diff @@ -73,6 +119,7 @@ def test_calc_ab_diffs_writes_dataset(caplog, run_directory, collated_dataset_di pa.field("record_a", pa.binary()), pa.field("record_b", pa.binary()), pa.field("ab_diff", pa.string()), + pa.field("modified_timdex_fields", pa.list_(pa.string())), pa.field("has_diff", pa.string()), ) ) From edc5f80dda71028adfc35bd2e658751054102634 Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Mon, 4 Nov 2024 09:46:18 -0500 Subject: [PATCH 2/2] Remove unneeded jsondiff output parsing Why these changes are being introduced: Now that the DeepDiff library provides an
explicit list of modified "root" fields -- i.e. TIMDEX fields -- as a built-in property, we no longer need any additional logic to parse the diff and surface what fields were modified. How this addresses that need: * This removes the helper function generate_field_diff_bools_for_record() and any tests related to it. Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-373 --- abdiff/core/calc_ab_metrics.py | 26 -------------------------- tests/test_calc_ab_metrics.py | 22 ---------------------- 2 files changed, 48 deletions(-) diff --git a/abdiff/core/calc_ab_metrics.py b/abdiff/core/calc_ab_metrics.py index 2e0a938..4607e4d 100644 --- a/abdiff/core/calc_ab_metrics.py +++ b/abdiff/core/calc_ab_metrics.py @@ -104,32 +104,6 @@ def create_record_diff_matrix_dataset( return metrics_dataset -def generate_field_diff_bools_for_record(diff_data: dict) -> dict: - """Function to return dictionary of fields that have a diff. - - Determining if a field had a diff is as straight-forward as looking to see if it shows - up in the parsed diff JSON. The fields may be at the root of the diff, or they could - be nested under "$insert" or "$delete" nodes in the diff. - - If a field from the original A/B records are not in the diff at all, then they did not - have changes, and therefore will not receive a 1 here to indicate a diff. 
- """ - fields_with_diffs = {} - - for key in diff_data: - - # identify modified fields nested in $insert or $delete blocks - if key in ("$insert", "$delete"): - for subfield in diff_data[key]: - fields_with_diffs[subfield] = 1 - - # identified modified fields at root of diff - else: - fields_with_diffs[key] = 1 - - return fields_with_diffs - - def calculate_metrics_data(field_matrix_parquet: str) -> dict: """Create a dictionary of metrics via DuckDB queries.""" summary: dict = {} diff --git a/tests/test_calc_ab_metrics.py b/tests/test_calc_ab_metrics.py index 3662693..49faadc 100644 --- a/tests/test_calc_ab_metrics.py +++ b/tests/test_calc_ab_metrics.py @@ -12,32 +12,10 @@ calc_ab_metrics, calculate_metrics_data, create_record_diff_matrix_dataset, - generate_field_diff_bools_for_record, ) from abdiff.core.utils import load_dataset, read_run_json -def test_record_field_diffs_no_diffs(): - diff_data = {} - assert generate_field_diff_bools_for_record(diff_data) == {} - - -def test_record_field_diffs_one_diff(): - diff_data = {"color": "green"} - assert generate_field_diff_bools_for_record(diff_data) == {"color": 1} - - -def test_record_field_diffs_diff_from_inserts_and_deletes_counted_only_once(): - diff_data = { - "$insert": {"fruits": "strawberry"}, - "$delete": {"vegetables": "onion"}, - } - assert generate_field_diff_bools_for_record(diff_data) == { - "fruits": 1, - "vegetables": 1, - } - - def test_sparse_matrix_dataset_created_success(run_directory, diffs_dataset_directory): diff_matrix_dataset_filepath = create_record_diff_matrix_dataset( run_directory, diffs_dataset_directory