In [1]:
import zstandard
import os
import json
import sys
from datetime import datetime
import logging.handlers


# Shared logger for the notebook: DEBUG and above are emitted through a
# StreamHandler (which writes to stderr by default).
log = logging.getLogger("bot")
log.setLevel(logging.DEBUG)
# logging.getLogger returns the same logger object for a given name, so
# re-running this cell would otherwise attach another StreamHandler and
# every message would be printed once per attached handler.
if not log.handlers:
	log.addHandler(logging.StreamHandler())


def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
	"""Read from ``reader`` until the accumulated bytes decode as text.

	A fixed-size read can end in the middle of a multi-byte character. When
	decoding fails, another chunk is read and appended, and decoding is retried
	on the combined bytes.

	Args:
		reader: Object with a ``read(size)`` method returning ``bytes``.
		chunk_size: Number of bytes requested from ``reader`` per read.
		max_window_size: Once more than this many bytes have been read without
			a successful decode, give up.
		previous_chunk: Optional undecoded bytes to prepend to the first read.
		bytes_read: Byte count already attributed to ``previous_chunk``.

	Returns:
		The decoded string; empty when the reader is exhausted and nothing is
		pending.

	Raises:
		UnicodeError: If the data still cannot be decoded after more than
			``max_window_size`` bytes, or if the reader is exhausted while
			undecodable bytes are pending.
	"""
	pending = previous_chunk
	while True:
		chunk = reader.read(chunk_size)
		# Count what the reader actually returned; a short read near the end
		# of the stream must not be charged as a full chunk.
		bytes_read += len(chunk)
		data = chunk if pending is None else pending + chunk
		try:
			return data.decode()
		except UnicodeDecodeError:
			if not chunk:
				# Nothing more to read, so retrying can never succeed.
				raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes; stream ended with undecodable data")
			if bytes_read > max_window_size:
				raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
			log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk")
			pending = data


def read_lines_zst(file_name):
	"""Stream the lines of a zstandard-compressed text file.

	The file is decompressed and decoded chunk by chunk, so the whole content
	is never held in memory at once.

	Args:
		file_name: Path of the compressed file to read.

	Yields:
		``(line, position)`` tuples: ``line`` is one decoded line without its
		trailing newline, and ``position`` is ``file_handle.tell()`` on the
		compressed file at the time the line is yielded (useful as a progress
		indicator against the compressed file size).

	Raises:
		UnicodeError: Propagated from ``read_and_decode`` when a chunk cannot
			be decoded.
	"""
	chunk_size = 2**27  # bytes of decompressed data requested per read
	max_decode_bytes = (2**29) * 2  # retry budget handed to read_and_decode
	with open(file_name, 'rb') as file_handle:
		decompressor = zstandard.ZstdDecompressor(max_window_size=2**31)
		# The context manager closes the reader even when the consumer stops
		# iterating early or an exception propagates out of the generator.
		with decompressor.stream_reader(file_handle) as reader:
			buffer = ''
			while True:
				chunk = read_and_decode(reader, chunk_size, max_decode_bytes)

				if not chunk:
					break
				lines = (buffer + chunk).split("\n")

				for line in lines[:-1]:
					yield line, file_handle.tell()

				# The last piece may be an incomplete line; carry it over so
				# the next chunk can complete it.
				buffer = lines[-1]

			# A final line that is not newline-terminated would otherwise be
			# dropped silently.
			if buffer:
				yield buffer, file_handle.tell()


In [2]:
# Scan one monthly submissions dump and collect every submission written by
# one of the target authors. Progress is logged every 100,000 lines.
file_path = "/data/reddit/submissions/RS_2022-01.zst"
file_size = os.stat(file_path).st_size
file_lines = 0
file_bytes_processed = 0
created = None
bad_lines = 0
selected_lines = []
# Authors whose submissions are kept in selected_lines.
target_authors = {"miaou_dubois", "black-rose-petal", "goddessmoneta", "faegoddess333", "sexyninja_", "keruimin"}

try:
	for line, file_bytes_processed in read_lines_zst(file_path):
		try:
			obj = json.loads(line)
			created = datetime.utcfromtimestamp(int(obj['created_utc']))
			if obj["author"] in target_authors:
				print(obj)
				selected_lines.append(obj)
		except (KeyError, TypeError, ValueError):
			# json.JSONDecodeError is a ValueError. TypeError/ValueError also
			# cover records whose fields have an unexpected type (for example
			# a null created_utc), so one malformed record is counted as a
			# bad line instead of ending the whole scan.
			bad_lines += 1
		file_lines += 1
		if file_lines % 100000 == 0:
			# created is still None if no line has parsed successfully yet.
			created_text = created.strftime('%Y-%m-%d %H:%M:%S') if created is not None else "unknown"
			log.info(f"{created_text} : {file_lines:,} : {bad_lines:,} : {file_bytes_processed:,}:{(file_bytes_processed / file_size) * 100:.0f}%")
except Exception:
	# Keep the partial results, but record the full traceback so an aborted
	# scan is not mistaken for a finished one.
	log.exception(f"Scan of {file_path} aborted after {file_lines:,} lines")

log.info(f"Complete : {file_lines:,} : {bad_lines:,}")


2022-01-01 02:37:59 : 100,000 : 0 : 30,671,550:0%
2022-01-01 05:33:45 : 200,000 : 0 : 59,901,275:1%
2022-01-01 08:58:23 : 300,000 : 0 : 88,606,700:1%
2022-01-01 12:39:09 : 400,000 : 0 : 116,263,525:1%
2022-01-01 15:31:37 : 500,000 : 0 : 144,182,500:2%
2022-01-01 17:46:42 : 600,000 : 0 : 173,019,000:2%
2022-01-01 19:53:34 : 700,000 : 0 : 202,379,800:2%
2022-01-01 21:58:19 : 800,000 : 0 : 221,909,975:3%
2022-01-02 00:05:10 : 900,000 : 0 : 250,222,175:3%
2022-01-02 02:22:56 : 1,000,000 : 0 : 279,058,675:3%
2022-01-02 04:50:33 : 1,100,000 : 0 : 307,501,950:3%
2022-01-02 07:45:24 : 1,200,000 : 0 : 335,552,000:4%
2022-01-02 11:18:14 : 1,300,000 : 0 : 363,208,825:4%
2022-01-02 14:19:25 : 1,400,000 : 0 : 381,297,175:4%
2022-01-02 16:35:26 : 1,500,000 : 0 : 409,085,075:5%
2022-01-02 18:36:54 : 1,600,000 : 0 : 437,790,500:5%
2022-01-02 20:35:57 : 1,700,000 : 0 : 466,495,925:5%
2022-01-02 22:35:39 : 1,800,000 : 0 : 494,545,975:6%
2022-01-03 00:37:37 : 1,900,000 : 0 : 522,858,175:6%
2022-01-03 02:

{'all_awardings': [], 'allow_live_comments': False, 'archived': False, 'author': 'miaou_dubois', 'author_created_utc': 1607173814, 'author_flair_background_color': None, 'author_flair_css_class': None, 'author_flair_richtext': [], 'author_flair_template_id': None, 'author_flair_text': None, 'author_flair_text_color': None, 'author_flair_type': 'text', 'author_fullname': 't2_96frs5jt', 'author_patreon_flair': False, 'author_premium': False, 'awarders': [], 'banned_by': None, 'can_gild': True, 'can_mod_post': False, 'category': None, 'content_categories': None, 'contest_mode': False, 'created_utc': 1641759916, 'discussion_type': None, 'distinguished': None, 'domain': 'imgur.com', 'edited': False, 'gilded': 0, 'gildings': {}, 'hidden': False, 'hide_score': False, 'id': 's00mmu', 'is_created_from_ads_ui': False, 'is_crosspostable': False, 'is_meta': False, 'is_original_content': False, 'is_reddit_media_domain': False, 'is_robot_indexable': False, 'is_self': False, 'is_video': False, 'link_

2022-01-09 21:26:28 : 9,000,000 : 0 : 2,458,311,625:28%
2022-01-09 23:23:11 : 9,100,000 : 0 : 2,486,623,825:28%
2022-01-10 01:28:48 : 9,200,000 : 0 : 2,515,329,250:28%
2022-01-10 03:42:36 : 9,300,000 : 0 : 2,534,335,125:29%
2022-01-10 06:10:35 : 9,400,000 : 0 : 2,562,778,400:29%
2022-01-10 09:11:00 : 9,500,000 : 0 : 2,590,173,075:29%
2022-01-10 12:17:28 : 9,600,000 : 0 : 2,617,829,900:30%
2022-01-10 14:38:53 : 9,700,000 : 0 : 2,646,404,250:30%
2022-01-10 16:37:47 : 9,800,000 : 0 : 2,675,633,975:30%
2022-01-10 18:28:00 : 9,900,000 : 0 : 2,704,601,550:31%
2022-01-10 20:19:27 : 10,000,000 : 0 : 2,733,438,050:31%
2022-01-10 22:09:39 : 10,100,000 : 0 : 2,752,575,000:31%


{'all_awardings': [], 'allow_live_comments': False, 'archived': False, 'author': 'miaou_dubois', 'author_created_utc': 1607173814, 'author_flair_background_color': None, 'author_flair_css_class': None, 'author_flair_richtext': [], 'author_flair_template_id': None, 'author_flair_text': None, 'author_flair_text_color': None, 'author_flair_type': 'text', 'author_fullname': 't2_96frs5jt', 'author_patreon_flair': False, 'author_premium': False, 'awarders': [], 'banned_by': None, 'can_gild': True, 'can_mod_post': False, 'category': None, 'content_categories': None, 'contest_mode': False, 'created_utc': 1641856238, 'crosspost_parent': 't3_s0lj8q', 'crosspost_parent_list': [{'all_awardings': [{'award_sub_type': 'GLOBAL', 'award_type': 'global', 'awardings_required_to_grant_benefits': None, 'coin_price': 100, 'coin_reward': 0, 'count': 2, 'days_of_drip_extension': None, 'days_of_premium': None, 'description': "Shows the Silver Award... and that's it.", 'end_date': None, 'giver_coin_reward': Non

2022-01-11 00:05:44 : 10,200,000 : 0 : 2,781,018,275:31%
2022-01-11 02:11:41 : 10,300,000 : 0 : 2,809,723,700:32%
2022-01-11 04:24:24 : 10,400,000 : 0 : 2,838,429,125:32%
2022-01-11 07:02:08 : 10,500,000 : 0 : 2,867,003,475:32%
2022-01-11 10:18:17 : 10,600,000 : 0 : 2,894,791,375:33%
2022-01-11 13:11:41 : 10,700,000 : 0 : 2,922,710,350:33%
2022-01-11 15:22:00 : 10,800,000 : 0 : 2,951,284,700:33%
2022-01-11 17:15:53 : 10,900,000 : 0 : 2,980,121,200:34%
2022-01-11 19:05:37 : 11,000,000 : 0 : 3,008,957,700:34%
2022-01-11 20:57:58 : 11,100,000 : 0 : 3,028,225,725:34%
2022-01-11 22:49:44 : 11,200,000 : 0 : 3,056,406,850:35%
2022-01-12 00:50:46 : 11,300,000 : 0 : 3,084,719,050:35%
2022-01-12 03:00:20 : 11,400,000 : 0 : 3,112,900,175:35%
2022-01-12 05:21:46 : 11,500,000 : 0 : 3,141,605,600:36%
2022-01-12 08:15:20 : 11,600,000 : 0 : 3,169,524,575:36%
2022-01-12 11:29:21 : 11,700,000 : 0 : 3,197,181,400:36%
2022-01-12 14:05:05 : 11,800,000 : 0 : 3,224,969,300:37%
2022-01-12 16:04:24 : 11,900,00

{'all_awardings': [], 'allow_live_comments': False, 'archived': False, 'author': 'black-rose-petal', 'author_created_utc': 1626716381, 'author_flair_background_color': None, 'author_flair_css_class': None, 'author_flair_richtext': [], 'author_flair_template_id': None, 'author_flair_text': None, 'author_flair_text_color': None, 'author_flair_type': 'text', 'author_fullname': 't2_apau21g3', 'author_patreon_flair': False, 'author_premium': False, 'awarders': [], 'banned_by': None, 'can_gild': True, 'can_mod_post': False, 'category': None, 'content_categories': None, 'contest_mode': False, 'created_utc': 1642314629, 'crosspost_parent': 't3_s52xkh', 'crosspost_parent_list': [{'all_awardings': [{'award_sub_type': 'GLOBAL', 'award_type': 'global', 'awardings_required_to_grant_benefits': None, 'coin_price': 100, 'coin_reward': 0, 'count': 3, 'days_of_drip_extension': None, 'days_of_premium': None, 'description': "Shows the Silver Award... and that's it.", 'end_date': None, 'giver_coin_reward':

2022-01-16 07:33:23 : 15,700,000 : 0 : 4,284,186,375:49%
2022-01-16 10:58:25 : 15,800,000 : 0 : 4,311,843,200:49%
2022-01-16 13:57:03 : 15,900,000 : 0 : 4,339,631,100:49%
2022-01-16 16:11:23 : 16,000,000 : 0 : 4,368,205,450:49%
2022-01-16 18:07:38 : 16,100,000 : 0 : 4,397,566,250:50%
2022-01-16 20:02:25 : 16,200,000 : 0 : 4,427,320,275:50%


{'all_awardings': [], 'allow_live_comments': True, 'archived': False, 'author': 'goddessmoneta', 'author_created_utc': 1614864021, 'author_flair_background_color': None, 'author_flair_css_class': None, 'author_flair_richtext': [], 'author_flair_template_id': None, 'author_flair_text': None, 'author_flair_text_color': None, 'author_flair_type': 'text', 'author_fullname': 't2_9a9wtfzm', 'author_patreon_flair': False, 'author_premium': False, 'awarders': [], 'banned_by': None, 'can_gild': True, 'can_mod_post': False, 'category': None, 'content_categories': None, 'contest_mode': False, 'created_utc': 1642369391, 'discussion_type': None, 'distinguished': None, 'domain': 'self.Korean', 'edited': False, 'gilded': 0, 'gildings': {}, 'hidden': False, 'hide_score': False, 'id': 's5mfdo', 'is_created_from_ads_ui': False, 'is_crosspostable': False, 'is_meta': False, 'is_original_content': False, 'is_reddit_media_domain': False, 'is_robot_indexable': False, 'is_self': True, 'is_video': False, 'link

2022-01-16 21:56:30 : 16,300,000 : 0 : 4,456,681,075:50%
2022-01-16 23:56:15 : 16,400,000 : 0 : 4,485,779,725:51%
2022-01-17 02:06:48 : 16,500,000 : 0 : 4,504,785,600:51%
2022-01-17 04:25:53 : 16,600,000 : 0 : 4,533,884,250:51%
2022-01-17 07:02:00 : 16,700,000 : 0 : 4,562,982,900:52%
2022-01-17 10:09:39 : 16,800,000 : 0 : 4,591,032,950:52%
2022-01-17 13:03:09 : 16,900,000 : 0 : 4,619,083,000:52%
2022-01-17 15:18:15 : 17,000,000 : 0 : 4,647,788,425:53%
2022-01-17 17:11:33 : 17,100,000 : 0 : 4,677,411,375:53%
2022-01-17 18:57:09 : 17,200,000 : 0 : 4,696,810,475:53%
2022-01-17 20:44:17 : 17,300,000 : 0 : 4,725,909,125:54%
2022-01-17 22:33:31 : 17,400,000 : 0 : 4,754,745,625:54%
2022-01-18 00:32:52 : 17,500,000 : 0 : 4,783,319,975:54%
2022-01-18 02:40:33 : 17,600,000 : 0 : 4,812,025,400:54%
2022-01-18 04:57:30 : 17,700,000 : 0 : 4,840,599,750:55%
2022-01-18 07:40:21 : 17,800,000 : 0 : 4,869,174,100:55%
2022-01-18 10:56:30 : 17,900,000 : 0 : 4,897,224,150:55%
2022-01-18 13:38:52 : 18,000,00

In [3]:
# Display the collected submissions: each entry is a parsed JSON object whose
# "author" matched one of the target authors during the scan.
selected_lines

[{'all_awardings': [],
  'allow_live_comments': False,
  'archived': False,
  'author': 'miaou_dubois',
  'author_created_utc': 1607173814,
  'author_flair_background_color': None,
  'author_flair_css_class': None,
  'author_flair_richtext': [],
  'author_flair_template_id': None,
  'author_flair_text': None,
  'author_flair_text_color': None,
  'author_flair_type': 'text',
  'author_fullname': 't2_96frs5jt',
  'author_patreon_flair': False,
  'author_premium': False,
  'awarders': [],
  'banned_by': None,
  'can_gild': True,
  'can_mod_post': False,
  'category': None,
  'content_categories': None,
  'contest_mode': False,
  'created_utc': 1641759916,
  'discussion_type': None,
  'distinguished': None,
  'domain': 'imgur.com',
  'edited': False,
  'gilded': 0,
  'gildings': {},
  'hidden': False,
  'hide_score': False,
  'id': 's00mmu',
  'is_created_from_ads_ui': False,
  'is_crosspostable': False,
  'is_meta': False,
  'is_original_content': False,
  'is_reddit_media_domain': False,