In [1]:
import zstandard
import os
import json
import sys
from datetime import datetime
import logging.handlers


# Notebook-wide logger. DEBUG level so the INFO progress messages are emitted.
log = logging.getLogger("bot")
log.setLevel(logging.DEBUG)
# logging.getLogger returns the same object for the same name, so re-running this
# cell in a live kernel would otherwise stack one more StreamHandler per run and
# print every message several times. Only attach a handler if none is present.
if not any(type(handler) is logging.StreamHandler for handler in log.handlers):
	log.addHandler(logging.StreamHandler())


def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
	"""Read from ``reader`` until the accumulated bytes decode as text, then return the text.

	A fixed-size read can end in the middle of a multi-byte character, which makes
	``bytes.decode()`` fail. When that happens another chunk is read, appended to
	the bytes collected so far, and the decode is attempted again.

	Args:
		reader: Object with a ``read(size)`` method returning ``bytes``.
		chunk_size: Number of bytes requested from ``reader`` per read.
		max_window_size: Give up once more than this many bytes have been read
			without a successful decode.
		previous_chunk: Bytes carried over from an earlier attempt; they are
			prepended to whatever is read. ``None`` means nothing is carried over.
		bytes_read: Byte count already accumulated by an earlier attempt.

	Returns:
		The decoded string (``bytes.decode()`` with its defaults). An empty
		string is returned when there is nothing left to read.

	Raises:
		UnicodeError: If the data is still undecodable after more than
			``max_window_size`` bytes, or if the reader runs dry while the
			collected bytes are still undecodable.
	"""
	pending = previous_chunk
	while True:
		chunk = reader.read(chunk_size)
		# Count what was actually returned; a read may be shorter than requested.
		bytes_read += len(chunk)
		data = chunk if pending is None else pending + chunk
		try:
			return data.decode()
		except UnicodeDecodeError:
			if not chunk:
				# End of stream: no further read can complete the broken character,
				# so fail now instead of re-decoding the same bytes repeatedly.
				raise UnicodeError(f"Unable to decode frame, stream ended after {bytes_read:,} bytes")
			if bytes_read > max_window_size:
				raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
			log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk")
			pending = data


def read_lines_zst(file_name, chunk_size=2**27, max_window_size=2**30):
	"""Yield the lines of a zstandard-compressed text file, one at a time.

	The file is decompressed as a stream and decoded chunk by chunk with
	``read_and_decode``; text after the last newline of a chunk is held back and
	joined with the start of the next chunk so lines are never split.

	Args:
		file_name: Path of the ``.zst`` file to read.
		chunk_size: Bytes of decompressed data requested per read.
		max_window_size: Upper bound passed to ``read_and_decode`` for how much
			data may be read while trying to obtain decodable text.

	Yields:
		``(line, position)`` tuples. ``line`` is the text without its trailing
		newline. ``position`` is ``file_handle.tell()`` on the compressed file at
		the moment the line is yielded, so it tracks how far the compressed
		input has been consumed rather than where the line itself is stored.
	"""
	with open(file_name, 'rb') as file_handle:
		decompressor = zstandard.ZstdDecompressor(max_window_size=2**31)
		# The context manager closes the reader even when the consumer stops
		# iterating early or decoding raises.
		with decompressor.stream_reader(file_handle) as reader:
			buffer = ''
			while True:
				chunk = read_and_decode(reader, chunk_size, max_window_size)

				if not chunk:
					break
				lines = (buffer + chunk).split("\n")

				for line in lines[:-1]:
					yield line, file_handle.tell()

				# Possibly incomplete last line; finished by the next chunk.
				buffer = lines[-1]

			# A file that does not end with a newline leaves its final line here;
			# emit it instead of dropping it.
			if buffer:
				yield buffer, file_handle.tell()


In [5]:
# Scan one monthly submissions dump and collect every record whose author is tracked.
file_path = "/data/reddit/submissions/RS_2022-02.zst"
tracked_authors = {"miaou_dubois", "black-rose-petal", "goddessmoneta", "faegoddess333", "sexyninja_"}
file_size = os.stat(file_path).st_size
file_lines = 0
file_bytes_processed = 0
created = None
bad_lines = 0
selected_lines = []

try:
	for line, file_bytes_processed in read_lines_zst(file_path):
		try:
			obj = json.loads(line)
			created = datetime.utcfromtimestamp(int(obj['created_utc']))
			if obj["author"] in tracked_authors:
				print(obj)
				selected_lines.append(obj)
		except (KeyError, TypeError, ValueError):
			# json.JSONDecodeError is a ValueError subclass. TypeError/ValueError also
			# cover a non-object record or a non-numeric 'created_utc', so one odd
			# record is counted as bad instead of aborting the whole scan.
			bad_lines += 1
		file_lines += 1
		if file_lines % 100000 == 0:
			# `created` stays None until the first record parses successfully.
			created_text = created.strftime('%Y-%m-%d %H:%M:%S') if created is not None else "unknown"
			log.info(f"{created_text} : {file_lines:,} : {bad_lines:,} : {file_bytes_processed:,}:{(file_bytes_processed / file_size) * 100:.0f}%")
except Exception:
	# Keep the partial results, but record the traceback so the failure can be diagnosed.
	log.exception(f"Scan of {file_path} aborted after {file_lines:,} lines")

log.info(f"Complete : {file_lines:,} : {bad_lines:,}")


2022-02-01 02:07:05 : 100,000 : 0 : 30,409,400:0%
2022-02-01 04:20:36 : 200,000 : 0 : 59,639,125:1%
2022-02-01 06:59:09 : 300,000 : 0 : 88,868,850:1%
2022-02-01 10:13:28 : 400,000 : 0 : 117,049,975:1%
2022-02-01 13:05:07 : 500,000 : 0 : 145,624,325:2%
2022-02-01 15:13:28 : 600,000 : 0 : 174,854,050:2%
2022-02-01 17:06:17 : 700,000 : 0 : 204,214,850:2%
2022-02-01 18:55:32 : 800,000 : 0 : 233,706,725:3%
2022-02-01 20:45:56 : 900,000 : 0 : 262,936,450:3%
2022-02-01 22:35:26 : 1,000,000 : 0 : 291,641,875:3%
2022-02-02 00:34:14 : 1,100,000 : 0 : 310,778,825:4%
2022-02-02 02:42:32 : 1,200,000 : 0 : 339,615,325:4%
2022-02-02 04:59:30 : 1,300,000 : 0 : 368,320,750:4%
2022-02-02 07:46:42 : 1,400,000 : 0 : 396,764,025:5%
2022-02-02 11:02:48 : 1,500,000 : 0 : 425,076,225:5%
2022-02-02 13:39:05 : 1,600,000 : 0 : 453,388,425:5%
2022-02-02 15:40:14 : 1,700,000 : 0 : 482,356,000:6%
2022-02-02 17:28:42 : 1,800,000 : 0 : 511,585,725:6%
2022-02-02 19:15:09 : 1,900,000 : 0 : 540,815,450:6%
2022-02-02 21:

{'all_awardings': [], 'allow_live_comments': False, 'archived': False, 'author': 'miaou_dubois', 'author_created_utc': 1607173814, 'author_flair_background_color': None, 'author_flair_css_class': None, 'author_flair_richtext': [], 'author_flair_template_id': None, 'author_flair_text': None, 'author_flair_text_color': None, 'author_flair_type': 'text', 'author_fullname': 't2_96frs5jt', 'author_patreon_flair': False, 'author_premium': False, 'awarders': [], 'banned_by': None, 'can_gild': True, 'can_mod_post': False, 'category': None, 'content_categories': None, 'contest_mode': False, 'created_utc': 1644787188, 'discussion_type': None, 'distinguished': None, 'domain': 'i.redd.it', 'edited': False, 'gilded': 0, 'gildings': {}, 'hidden': False, 'hide_score': False, 'id': 'srtb1e', 'is_created_from_ads_ui': False, 'is_crosspostable': False, 'is_meta': False, 'is_original_content': True, 'is_reddit_media_domain': True, 'is_robot_indexable': False, 'is_self': False, 'is_video': False, 'link_fl

2022-02-13 22:18:04 : 13,600,000 : 0 : 3,806,286,925:46%
2022-02-14 00:28:10 : 13,700,000 : 0 : 3,836,040,950:46%
2022-02-14 02:52:43 : 13,800,000 : 0 : 3,865,532,825:46%
2022-02-14 05:14:20 : 13,900,000 : 0 : 3,884,800,850:47%
2022-02-14 08:00:21 : 14,000,000 : 0 : 3,913,506,275:47%
2022-02-14 11:06:51 : 14,100,000 : 0 : 3,942,342,775:47%
2022-02-14 13:38:57 : 14,200,000 : 0 : 3,980,354,525:48%
2022-02-14 15:39:05 : 14,300,000 : 0 : 4,009,977,475:48%
2022-02-14 17:27:38 : 14,400,000 : 0 : 4,029,900,875:48%
2022-02-14 19:14:49 : 14,500,000 : 0 : 4,059,654,900:49%
2022-02-14 21:02:58 : 14,600,000 : 0 : 4,089,802,150:49%
2022-02-14 23:16:55 : 14,700,000 : 0 : 4,119,949,400:49%
2022-02-15 01:18:03 : 14,800,000 : 0 : 4,149,310,200:50%
2022-02-15 03:30:26 : 14,900,000 : 0 : 4,178,802,075:50%
2022-02-15 05:56:37 : 15,000,000 : 0 : 4,207,900,725:50%
2022-02-15 08:56:20 : 15,100,000 : 0 : 4,236,868,300:51%
2022-02-15 12:00:35 : 15,200,000 : 0 : 4,265,835,875:51%
2022-02-15 14:22:03 : 15,300,00

{'all_awardings': [], 'allow_live_comments': False, 'archived': False, 'author': 'miaou_dubois', 'author_created_utc': 1607173814, 'author_flair_background_color': None, 'author_flair_css_class': None, 'author_flair_richtext': [], 'author_flair_template_id': None, 'author_flair_text': None, 'author_flair_text_color': None, 'author_flair_type': 'text', 'author_fullname': 't2_96frs5jt', 'author_patreon_flair': False, 'author_premium': False, 'awarders': [], 'banned_by': None, 'can_gild': True, 'can_mod_post': False, 'category': None, 'content_categories': None, 'contest_mode': False, 'created_utc': 1645738635, 'discussion_type': None, 'distinguished': None, 'domain': 'henrygeorge.org', 'edited': False, 'gilded': 0, 'gildings': {}, 'hidden': False, 'hide_score': False, 'id': 't0m4ft', 'is_created_from_ads_ui': False, 'is_crosspostable': True, 'is_meta': False, 'is_original_content': False, 'is_reddit_media_domain': False, 'is_robot_indexable': True, 'is_self': False, 'is_video': False, 'l

2022-02-24 22:10:20 : 25,500,000 : 0 : 7,154,991,025:86%
2022-02-25 00:06:14 : 25,600,000 : 0 : 7,182,254,625:86%
2022-02-25 02:10:21 : 25,700,000 : 0 : 7,209,649,300:86%
2022-02-25 04:20:28 : 25,800,000 : 0 : 7,236,912,900:87%
2022-02-25 06:47:53 : 25,900,000 : 0 : 7,264,176,500:87%
2022-02-25 09:50:52 : 26,000,000 : 0 : 7,291,440,100:87%
2022-02-25 12:43:25 : 26,100,000 : 0 : 7,319,228,000:88%
2022-02-25 14:53:08 : 26,200,000 : 0 : 7,347,146,975:88%
2022-02-25 16:46:22 : 26,300,000 : 0 : 7,375,459,175:88%
2022-02-25 18:34:39 : 26,400,000 : 0 : 7,404,033,525:89%
2022-02-25 20:27:04 : 26,500,000 : 0 : 7,432,345,725:89%
2022-02-25 22:20:56 : 26,600,000 : 0 : 7,460,789,000:89%
2022-02-26 00:21:25 : 26,700,000 : 0 : 7,488,707,975:90%
2022-02-26 02:29:02 : 26,800,000 : 0 : 7,516,495,875:90%
2022-02-26 04:44:37 : 26,900,000 : 0 : 7,544,545,925:90%
2022-02-26 07:20:30 : 27,000,000 : 0 : 7,571,547,375:91%
2022-02-26 10:20:50 : 27,100,000 : 0 : 7,598,679,900:91%
2022-02-26 13:13:28 : 27,200,00

In [6]:
# Display the records gathered into `selected_lines` by the RS_2022-02 scan cell.
selected_lines

[{'all_awardings': [],
  'allow_live_comments': False,
  'archived': False,
  'author': 'miaou_dubois',
  'author_created_utc': 1607173814,
  'author_flair_background_color': None,
  'author_flair_css_class': None,
  'author_flair_richtext': [],
  'author_flair_template_id': None,
  'author_flair_text': None,
  'author_flair_text_color': None,
  'author_flair_type': 'text',
  'author_fullname': 't2_96frs5jt',
  'author_patreon_flair': False,
  'author_premium': False,
  'awarders': [],
  'banned_by': None,
  'can_gild': True,
  'can_mod_post': False,
  'category': None,
  'content_categories': None,
  'contest_mode': False,
  'created_utc': 1644787188,
  'discussion_type': None,
  'distinguished': None,
  'domain': 'i.redd.it',
  'edited': False,
  'gilded': 0,
  'gildings': {},
  'hidden': False,
  'hide_score': False,
  'id': 'srtb1e',
  'is_created_from_ads_ui': False,
  'is_crosspostable': False,
  'is_meta': False,
  'is_original_content': True,
  'is_reddit_media_domain': True,
 

In [8]:
# Scan one monthly submissions dump and collect every record whose author is tracked.
file_path = "/data/reddit/submissions/RS_2021-12.zst"
tracked_authors = {"miaou_dubois", "black-rose-petal", "goddessmoneta", "faegoddess333", "sexyninja_", "keruimin"}
file_size = os.stat(file_path).st_size
file_lines = 0
file_bytes_processed = 0
created = None
bad_lines = 0
selected_lines = []

try:
	for line, file_bytes_processed in read_lines_zst(file_path):
		try:
			obj = json.loads(line)
			created = datetime.utcfromtimestamp(int(obj['created_utc']))
			if obj["author"] in tracked_authors:
				print(obj)
				selected_lines.append(obj)
		except (KeyError, TypeError, ValueError):
			# json.JSONDecodeError is a ValueError subclass. TypeError/ValueError also
			# cover a non-object record or a non-numeric 'created_utc', so one odd
			# record is counted as bad instead of aborting the whole scan.
			bad_lines += 1
		file_lines += 1
		if file_lines % 100000 == 0:
			# `created` stays None until the first record parses successfully.
			created_text = created.strftime('%Y-%m-%d %H:%M:%S') if created is not None else "unknown"
			log.info(f"{created_text} : {file_lines:,} : {bad_lines:,} : {file_bytes_processed:,}:{(file_bytes_processed / file_size) * 100:.0f}%")
except Exception:
	# Keep the partial results, but record the traceback so the failure can be diagnosed.
	log.exception(f"Scan of {file_path} aborted after {file_lines:,} lines")

log.info(f"Complete : {file_lines:,} : {bad_lines:,}")


2021-12-01 02:14:45 : 100,000 : 0 : 29,885,100:0%
2021-12-01 05:39:32 : 200,000 : 0 : 58,721,600:1%
2021-12-01 07:15:43 : 300,000 : 0 : 86,771,650:1%
2021-12-01 10:32:03 : 400,000 : 0 : 114,559,550:1%
2021-12-01 13:23:41 : 500,000 : 0 : 133,041,125:2%
2021-12-01 15:31:35 : 600,000 : 0 : 160,960,100:2%
2021-12-01 17:26:36 : 700,000 : 0 : 189,534,450:2%
2021-12-01 19:17:33 : 800,000 : 0 : 217,846,650:3%
2021-12-01 21:10:14 : 900,000 : 0 : 246,158,850:3%
2021-12-01 23:05:28 : 1,000,000 : 0 : 264,771,500:3%
2021-12-02 01:10:52 : 1,100,000 : 0 : 292,297,250:4%
2021-12-02 03:22:39 : 1,200,000 : 0 : 320,216,225:4%
2021-12-02 05:48:12 : 1,300,000 : 0 : 348,135,200:4%
2021-12-02 08:50:27 : 1,400,000 : 0 : 375,529,875:5%
2021-12-02 12:01:50 : 1,500,000 : 0 : 393,618,225:5%
2021-12-02 14:30:54 : 1,600,000 : 0 : 421,143,975:5%
2021-12-02 16:33:58 : 1,700,000 : 0 : 449,849,400:6%
2021-12-02 18:31:32 : 1,800,000 : 0 : 478,554,825:6%
2021-12-02 20:28:20 : 1,900,000 : 0 : 506,998,100:6%
2021-12-02 22:

{'all_awardings': [], 'allow_live_comments': False, 'archived': False, 'author': 'black-rose-petal', 'author_created_utc': 1626716381, 'author_flair_background_color': None, 'author_flair_css_class': None, 'author_flair_richtext': [], 'author_flair_template_id': None, 'author_flair_text': None, 'author_flair_text_color': None, 'author_flair_type': 'text', 'author_fullname': 't2_apau21g3', 'author_patreon_flair': False, 'author_premium': False, 'awarders': [], 'banned_by': None, 'can_gild': True, 'can_mod_post': False, 'category': None, 'content_categories': None, 'contest_mode': False, 'created_utc': 1640474253, 'discussion_type': None, 'distinguished': None, 'domain': 'self.AskNYC', 'edited': False, 'gilded': 0, 'gildings': {}, 'hidden': False, 'hide_score': False, 'id': 'rojsnl', 'is_created_from_ads_ui': False, 'is_crosspostable': True, 'is_meta': False, 'is_original_content': False, 'is_reddit_media_domain': False, 'is_robot_indexable': True, 'is_self': True, 'is_video': False, 'li

2021-12-26 00:12:14 : 24,000,000 : 0 : 6,416,514,475:80%
2021-12-26 02:54:21 : 24,100,000 : 0 : 6,444,695,600:81%
2021-12-26 05:47:05 : 24,200,000 : 0 : 6,472,614,575:81%
2021-12-26 09:20:37 : 24,300,000 : 0 : 6,499,878,175:81%
2021-12-26 13:08:17 : 24,400,000 : 0 : 6,527,403,925:82%
2021-12-26 15:52:48 : 24,500,000 : 0 : 6,546,016,575:82%
2021-12-26 18:09:30 : 24,600,000 : 0 : 6,574,197,700:82%
2021-12-26 20:32:38 : 24,700,000 : 0 : 6,602,772,050:83%
2021-12-26 22:48:30 : 24,800,000 : 0 : 6,630,691,025:83%
2021-12-27 01:06:06 : 24,900,000 : 0 : 6,658,741,075:83%
2021-12-27 03:33:44 : 25,000,000 : 0 : 6,677,746,950:84%
2021-12-27 06:13:27 : 25,100,000 : 0 : 6,705,928,075:84%
2021-12-27 09:31:29 : 25,200,000 : 0 : 6,734,109,200:84%
2021-12-27 12:48:37 : 25,300,000 : 0 : 6,761,372,800:85%
2021-12-27 15:16:43 : 25,400,000 : 0 : 6,788,636,400:85%
2021-12-27 17:21:25 : 25,500,000 : 0 : 6,816,817,525:85%
2021-12-27 19:17:27 : 25,600,000 : 0 : 6,844,867,575:86%
2021-12-27 21:16:07 : 25,700,00

{'all_awardings': [], 'allow_live_comments': False, 'archived': False, 'author': 'black-rose-petal', 'author_created_utc': 1626716381, 'author_flair_background_color': None, 'author_flair_css_class': None, 'author_flair_richtext': [], 'author_flair_template_id': None, 'author_flair_text': None, 'author_flair_text_color': None, 'author_flair_type': 'text', 'author_fullname': 't2_apau21g3', 'author_patreon_flair': False, 'author_premium': False, 'awarders': [], 'banned_by': None, 'can_gild': True, 'can_mod_post': False, 'category': None, 'content_categories': None, 'contest_mode': False, 'created_utc': 1640802039, 'discussion_type': None, 'distinguished': None, 'domain': 'self.NYCFriends', 'edited': False, 'gilded': 0, 'gildings': {}, 'hidden': False, 'hide_score': False, 'id': 'rreqyq', 'is_created_from_ads_ui': False, 'is_crosspostable': False, 'is_meta': False, 'is_original_content': False, 'is_reddit_media_domain': False, 'is_robot_indexable': False, 'is_self': True, 'is_video': Fals

2021-12-29 18:25:29 : 27,600,000 : 0 : 7,373,099,825:92%
2021-12-29 20:19:49 : 27,700,000 : 0 : 7,401,805,250:93%
Decoding error with 134,217,728 bytes, reading another chunk
2021-12-29 22:15:30 : 27,800,000 : 0 : 7,430,379,600:93%
2021-12-30 00:16:14 : 27,900,000 : 0 : 7,458,560,725:93%
2021-12-30 02:26:11 : 28,000,000 : 0 : 7,486,741,850:94%
2021-12-30 04:43:54 : 28,100,000 : 0 : 7,514,791,900:94%
2021-12-30 07:23:08 : 28,200,000 : 0 : 7,533,928,850:94%
Decoding error with 134,217,728 bytes, reading another chunk
2021-12-30 10:38:02 : 28,300,000 : 0 : 7,561,978,900:95%
2021-12-30 13:33:14 : 28,400,000 : 0 : 7,589,373,575:95%
2021-12-30 15:48:16 : 28,500,000 : 0 : 7,617,554,700:95%
2021-12-30 17:46:43 : 28,600,000 : 0 : 7,646,129,050:96%
2021-12-30 19:41:41 : 28,700,000 : 0 : 7,674,703,400:96%
2021-12-30 21:39:11 : 28,800,000 : 0 : 7,703,539,900:96%
Decoding error with 134,217,728 bytes, reading another chunk
2021-12-30 23:38:07 : 28,900,000 : 0 : 7,741,158,425:97%
2021-12-31 01:47:00

In [9]:
# Display the records gathered into `selected_lines` by the RS_2021-12 scan cell.
selected_lines

[{'all_awardings': [],
  'allow_live_comments': False,
  'archived': False,
  'author': 'black-rose-petal',
  'author_created_utc': 1626716381,
  'author_flair_background_color': None,
  'author_flair_css_class': None,
  'author_flair_richtext': [],
  'author_flair_template_id': None,
  'author_flair_text': None,
  'author_flair_text_color': None,
  'author_flair_type': 'text',
  'author_fullname': 't2_apau21g3',
  'author_patreon_flair': False,
  'author_premium': False,
  'awarders': [],
  'banned_by': None,
  'can_gild': True,
  'can_mod_post': False,
  'category': None,
  'content_categories': None,
  'contest_mode': False,
  'created_utc': 1640474253,
  'discussion_type': None,
  'distinguished': None,
  'domain': 'self.AskNYC',
  'edited': False,
  'gilded': 0,
  'gildings': {},
  'hidden': False,
  'hide_score': False,
  'id': 'rojsnl',
  'is_created_from_ads_ui': False,
  'is_crosspostable': True,
  'is_meta': False,
  'is_original_content': False,
  'is_reddit_media_domain': F