In [1]:
import zstandard
import os
import json
import sys
from datetime import datetime
import logging.handlers


# Notebook-wide logger: DEBUG level, messages written through a StreamHandler.
log = logging.getLogger("bot")
log.setLevel(logging.DEBUG)
# getLogger("bot") returns the same logger object on every call, so re-running
# this cell would otherwise stack one more StreamHandler each time and every
# message would be printed once per handler. Attach a handler only if none is
# present yet.
if not log.handlers:
	log.addHandler(logging.StreamHandler())


def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
	"""Read from ``reader`` until the accumulated bytes decode as text.

	Each pass reads ``chunk_size`` bytes, prepends whatever has not decoded
	yet, and tries to decode the result. A decode failure triggers another
	read, until the running byte budget passes ``max_window_size``.

	Args:
		reader: object exposing ``read(n)`` that returns bytes.
		chunk_size: number of bytes requested per read; also the amount
			added to the running total on every pass, regardless of how
			many bytes the read actually returned.
		max_window_size: once the running total exceeds this, a decode
			failure is treated as fatal.
		previous_chunk: bytes carried over from an earlier failed attempt,
			or None.
		bytes_read: running total carried over from an earlier attempt.

	Returns:
		The decoded string (empty when the reader has nothing left and no
		bytes were carried over).

	Raises:
		UnicodeError: the data still does not decode after the running
			total exceeded ``max_window_size``.
	"""
	pending = previous_chunk
	while True:
		data = reader.read(chunk_size)
		bytes_read += chunk_size
		if pending is not None:
			data = pending + data
		try:
			return data.decode()
		except UnicodeDecodeError:
			if bytes_read > max_window_size:
				raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
			log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk")
			# Keep everything read so far and extend it on the next pass.
			pending = data


def read_lines_zst(file_name):
	"""Yield the lines of a zstandard-compressed text file one at a time.

	The file is decompressed as a stream and decoded chunk by chunk via
	``read_and_decode``; decoded text is split on ``"\\n"`` and the trailing
	partial line is carried over to the next chunk.

	Args:
		file_name: path of the ``.zst`` file to read.

	Yields:
		``(line, position)`` tuples, where ``line`` is the text without its
		newline and ``position`` is ``file_handle.tell()`` on the underlying
		compressed file at the time of the yield (usable as a progress
		indicator against the file's size on disk).

	Raises:
		UnicodeError: propagated from ``read_and_decode`` when a chunk cannot
			be decoded.
	"""
	with open(file_name, 'rb') as file_handle:
		reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
		# try/finally so the reader is closed even when the consumer stops
		# iterating early (GeneratorExit) or decoding raises.
		try:
			buffer = ''
			while True:
				chunk = read_and_decode(reader, 2**27, (2**29) * 2)

				if not chunk:
					break
				lines = (buffer + chunk).split("\n")

				# Everything but the last element is a complete line.
				for line in lines[:-1]:
					yield line, file_handle.tell()

				# The last element is either '' or the start of a line that
				# continues in the next chunk.
				buffer = lines[-1]

			# A final line with no trailing newline would otherwise be lost.
			if buffer:
				yield buffer, file_handle.tell()
		finally:
			reader.close()


In [2]:
# Scan one submissions dump line by line, collect every record whose author is
# in target_authors, and log progress every 100,000 lines.
file_path = "/data/reddit/submissions/RS_2022-03.zst"
file_size = os.stat(file_path).st_size
file_lines = 0
file_bytes_processed = 0
created = None
bad_lines = 0
selected_lines = []
target_authors = {"miaou_dubois", "black-rose-petal", "goddessmoneta", "faegoddess333", "sexyninja_", "Keruimin"}

try:
	for line, file_bytes_processed in read_lines_zst(file_path):
		try:
			obj = json.loads(line)
			created = datetime.utcfromtimestamp(int(obj['created_utc']))
			if obj["author"] in target_authors:
				print(obj)
				selected_lines.append(obj)
		except (KeyError, TypeError, ValueError, OverflowError):
			# ValueError covers json.JSONDecodeError and a non-numeric
			# created_utc; TypeError covers a null created_utc or a record
			# that is not a JSON object; OverflowError covers an out-of-range
			# timestamp. One malformed record must not abort the whole scan.
			bad_lines += 1
		file_lines += 1
		if file_lines % 100000 == 0:
			# created stays None until the first record parses successfully.
			created_text = created.strftime('%Y-%m-%d %H:%M:%S') if created is not None else "unknown"
			log.info(f"{created_text} : {file_lines:,} : {bad_lines:,} : {file_bytes_processed:,}:{(file_bytes_processed / file_size) * 100:.0f}%")
except Exception:
	# Keep the partial results gathered so far, but record the full traceback
	# instead of only the exception message.
	log.exception(f"Scan of {file_path} aborted after {file_lines:,} lines")

log.info(f"Complete : {file_lines:,} : {bad_lines:,}")


2022-03-01 02:06:03 : 100,000 : 0 : 30,540,475:0%
2022-03-01 04:14:30 : 200,000 : 0 : 60,294,500:1%
2022-03-01 06:42:19 : 300,000 : 0 : 89,655,300:1%
2022-03-01 09:48:15 : 400,000 : 0 : 119,016,100:1%
2022-03-01 12:39:02 : 500,000 : 0 : 147,066,150:2%
2022-03-01 14:51:33 : 600,000 : 0 : 175,509,425:2%
2022-03-01 16:42:47 : 700,000 : 0 : 205,001,300:2%
2022-03-01 18:31:23 : 800,000 : 0 : 234,493,175:2%
2022-03-01 20:20:08 : 900,000 : 0 : 264,116,125:3%
2022-03-01 22:10:08 : 1,000,000 : 0 : 293,345,850:3%
2022-03-02 00:05:31 : 1,100,000 : 0 : 321,789,125:3%
2022-03-02 02:10:42 : 1,200,000 : 0 : 350,363,475:4%
2022-03-02 04:21:24 : 1,300,000 : 0 : 379,068,900:4%
2022-03-02 06:49:50 : 1,400,000 : 0 : 398,336,925:4%
2022-03-02 09:55:25 : 1,500,000 : 0 : 426,780,200:4%
2022-03-02 12:45:02 : 1,600,000 : 0 : 455,878,850:5%
2022-03-02 14:56:38 : 1,700,000 : 0 : 484,584,275:5%
2022-03-02 16:48:56 : 1,800,000 : 0 : 514,076,150:5%
2022-03-02 18:36:29 : 1,900,000 : 0 : 543,568,025:6%
2022-03-02 20:

{'all_awardings': [], 'allow_live_comments': False, 'archived': False, 'author': 'goddessmoneta', 'author_created_utc': 1614864021, 'author_flair_background_color': None, 'author_flair_css_class': None, 'author_flair_richtext': [], 'author_flair_template_id': None, 'author_flair_text': None, 'author_flair_text_color': None, 'author_flair_type': 'text', 'author_fullname': 't2_9a9wtfzm', 'author_patreon_flair': False, 'author_premium': False, 'awarders': [], 'banned_by': None, 'can_gild': True, 'can_mod_post': False, 'category': None, 'content_categories': None, 'contest_mode': False, 'created_utc': 1646605866, 'discussion_type': None, 'distinguished': None, 'domain': 'self.TwoXChromosomes', 'edited': 1646691687.0, 'gilded': 0, 'gildings': {}, 'hidden': False, 'hide_score': False, 'id': 't8a9ne', 'is_created_from_ads_ui': False, 'is_crosspostable': True, 'is_meta': False, 'is_original_content': False, 'is_reddit_media_domain': False, 'is_robot_indexable': True, 'is_self': True, 'is_video

2022-03-07 00:02:38 : 6,300,000 : 0 : 1,809,490,375:19%
2022-03-07 02:16:02 : 6,400,000 : 0 : 1,838,851,175:19%
2022-03-07 04:37:40 : 6,500,000 : 0 : 1,868,080,900:20%
2022-03-07 07:21:51 : 6,600,000 : 0 : 1,897,179,550:20%
2022-03-07 10:31:41 : 6,700,000 : 0 : 1,925,622,825:20%
2022-03-07 13:14:28 : 6,800,000 : 0 : 1,954,197,175:20%
2022-03-07 15:21:06 : 6,900,000 : 0 : 1,983,426,900:21%
2022-03-07 17:14:24 : 7,000,000 : 0 : 2,013,574,150:21%
2022-03-07 19:03:25 : 7,100,000 : 0 : 2,033,497,550:21%
2022-03-07 20:51:49 : 7,200,000 : 0 : 2,063,120,500:22%
2022-03-07 22:40:48 : 7,300,000 : 0 : 2,092,481,300:22%
2022-03-08 00:43:29 : 7,400,000 : 0 : 2,121,842,100:22%
2022-03-08 02:51:13 : 7,500,000 : 0 : 2,150,678,600:23%
2022-03-08 05:09:23 : 7,600,000 : 0 : 2,180,301,550:23%
2022-03-08 07:55:38 : 7,700,000 : 0 : 2,209,400,200:23%
2022-03-08 11:07:30 : 7,800,000 : 0 : 2,237,581,325:23%
2022-03-08 13:42:40 : 7,900,000 : 0 : 2,266,155,675:24%
2022-03-08 15:44:53 : 8,000,000 : 0 : 2,296,171,

{'all_awardings': [], 'allow_live_comments': False, 'archived': False, 'author': 'black-rose-petal', 'author_created_utc': 1626716381, 'author_flair_background_color': None, 'author_flair_css_class': None, 'author_flair_richtext': [], 'author_flair_template_id': None, 'author_flair_text': None, 'author_flair_text_color': None, 'author_flair_type': 'text', 'author_fullname': 't2_apau21g3', 'author_patreon_flair': False, 'author_premium': False, 'awarders': [], 'banned_by': None, 'can_gild': True, 'can_mod_post': False, 'category': None, 'content_categories': None, 'contest_mode': False, 'created_utc': 1647394222, 'discussion_type': None, 'distinguished': None, 'domain': 'ibb.co', 'edited': False, 'gilded': 0, 'gildings': {}, 'hidden': False, 'hide_score': False, 'id': 'tf5z43', 'is_created_from_ads_ui': False, 'is_crosspostable': False, 'is_meta': False, 'is_original_content': False, 'is_reddit_media_domain': False, 'is_robot_indexable': False, 'is_self': False, 'is_video': False, 'link

2022-03-16 02:07:10 : 15,900,000 : 0 : 4,568,094,825:48%
2022-03-16 04:24:49 : 16,000,000 : 0 : 4,597,979,925:48%
2022-03-16 07:13:04 : 16,100,000 : 0 : 4,627,340,725:49%
2022-03-16 10:21:55 : 16,200,000 : 0 : 4,646,739,825:49%
2022-03-16 12:56:14 : 16,300,000 : 0 : 4,676,100,625:49%
2022-03-16 15:00:06 : 16,400,000 : 0 : 4,706,510,025:49%
2022-03-16 16:51:57 : 16,500,000 : 0 : 4,737,312,650:50%
2022-03-16 18:41:54 : 16,600,000 : 0 : 4,767,722,050:50%
2022-03-16 20:33:31 : 16,700,000 : 0 : 4,798,262,525:50%
2022-03-16 22:27:08 : 16,800,000 : 0 : 4,828,409,775:51%
2022-03-17 00:26:45 : 16,900,000 : 0 : 4,858,163,800:51%
2022-03-17 02:36:26 : 17,000,000 : 0 : 4,888,048,900:51%
2022-03-17 05:00:00 : 17,100,000 : 0 : 4,917,802,925:52%
2022-03-17 07:55:33 : 17,200,000 : 0 : 4,947,163,725:52%
2022-03-17 11:04:19 : 17,300,000 : 0 : 4,976,262,375:52%


{'all_awardings': [], 'allow_live_comments': False, 'archived': False, 'author': 'miaou_dubois', 'author_created_utc': 1607173814, 'author_flair_background_color': None, 'author_flair_css_class': None, 'author_flair_richtext': [], 'author_flair_template_id': None, 'author_flair_text': None, 'author_flair_text_color': None, 'author_flair_type': 'text', 'author_fullname': 't2_96frs5jt', 'author_patreon_flair': False, 'author_premium': False, 'awarders': [], 'banned_by': None, 'can_gild': True, 'can_mod_post': False, 'category': None, 'content_categories': None, 'contest_mode': False, 'created_utc': 1647522262, 'discussion_type': None, 'distinguished': None, 'domain': 'self.EstrangedAdultChild', 'edited': False, 'gilded': 0, 'gildings': {}, 'hidden': False, 'hide_score': False, 'id': 'tg9s22', 'is_created_from_ads_ui': False, 'is_crosspostable': True, 'is_meta': False, 'is_original_content': False, 'is_reddit_media_domain': False, 'is_robot_indexable': True, 'is_self': True, 'is_video': F

2022-03-17 13:29:35 : 17,400,000 : 0 : 5,005,623,175:52%
2022-03-17 15:28:45 : 17,500,000 : 0 : 5,036,032,575:53%
2022-03-17 17:19:47 : 17,600,000 : 0 : 5,066,310,900:53%
2022-03-17 19:13:25 : 17,700,000 : 0 : 5,096,589,225:53%
2022-03-17 21:04:52 : 17,800,000 : 0 : 5,116,512,625:54%
2022-03-17 23:00:39 : 17,900,000 : 0 : 5,146,922,025:54%
2022-03-18 01:03:35 : 18,000,000 : 0 : 5,176,020,675:54%
2022-03-18 03:14:44 : 18,100,000 : 0 : 5,205,774,700:55%
2022-03-18 05:51:59 : 18,200,000 : 0 : 5,235,135,500:55%
2022-03-18 09:06:26 : 18,300,000 : 0 : 5,264,496,300:55%
2022-03-18 12:05:11 : 18,400,000 : 0 : 5,293,070,650:56%
2022-03-18 14:15:56 : 18,500,000 : 0 : 5,322,693,600:56%
2022-03-18 16:10:20 : 18,600,000 : 0 : 5,352,840,850:56%
2022-03-18 18:01:09 : 18,700,000 : 0 : 5,383,643,475:56%
2022-03-18 19:55:35 : 18,800,000 : 0 : 5,414,708,250:57%
2022-03-18 21:52:05 : 18,900,000 : 0 : 5,445,248,725:57%
2022-03-18 23:56:04 : 19,000,000 : 0 : 5,464,909,975:57%
2022-03-19 02:10:09 : 19,100,00

KeyboardInterrupt: 