In [4]:
import zstandard as zstd
import json
import sys
import os

## Preprocessing
This notebook reads, filters, cleans, and combines my Reddit data (r/climbharder posts and comments) so that it can be chunked and ingested for RAG.

In [10]:
# Submissions: compressed dump (local path) and the text file produced from it.
submissions_zst_path = "../Data/climbharder_submissions.zst"
output_path_submissions = "../Data/climbharder_posts.txt"

# Comments: compressed dump (local path) and the text file produced from it.
comments_zst_path = "../Data/climbharder_comments.zst"
output_path_comments = "../Data/climbharder_comments.txt"


### Processing zst-compressed NDJSON files
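For this step, I stream-decompress the zst file into a buffer about 1 MiB at a time and split the buffer into lines, carrying any incomplete last line over to the next read. For each line, I load the post (or comment) as JSON and keep it only if it belongs to the target subreddit, its score is above the karma threshold, and its text is long enough. Then, I write the text to the output file, terminated by a newline. My goal is to create one large newline-separated txt file of all post/comment content.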

In [25]:

def _iter_ndjson_lines(reader, chunk_size=2**20):
    """Yield complete newline-delimited lines (as bytes) from a binary reader.

    The reader is consumed ``chunk_size`` bytes at a time.  A line that
    straddles two chunks is held back until its terminating newline arrives,
    and whatever is left after the final chunk is yielded as well, so a last
    record without a trailing newline is not lost.
    """
    pending = b''
    for chunk in iter(lambda: reader.read(chunk_size), b''):
        pending += chunk
        # Everything before the last b'\n' is complete; the tail may be partial.
        *complete, pending = pending.split(b'\n')
        yield from complete
    if pending:
        yield pending


def _extract_text(post, subreddit_filter, score_threshold, min_length):
    """Return the text to keep for one decoded record, or None to skip it.

    A record is kept when it is a JSON object whose 'subreddit' equals
    ``subreddit_filter``, whose 'score' is strictly greater than
    ``score_threshold``, and whose text is at least ``min_length`` characters.
    Records with a 'title' key are treated as submissions (title + selftext);
    all others are treated as comments (body).
    """
    if not isinstance(post, dict):
        return None
    if post.get('subreddit') != subreddit_filter:
        return None

    score = post.get('score')
    if score is None:  # missing key or JSON null: treat as 0, like the old default
        score = 0
    if not isinstance(score, (int, float)) or score <= score_threshold:
        return None

    def as_text(value):
        # JSON null / non-string fields count as empty text instead of crashing.
        return value.strip() if isinstance(value, str) else ''

    if 'title' in post:
        # For submissions
        full_text = f"{as_text(post.get('title'))} {as_text(post.get('selftext'))}".strip()
    else:
        # For comments
        full_text = as_text(post.get('body'))

    if len(full_text) < min_length:
        return None

    # Collapse line breaks (and other whitespace runs) so that each record
    # occupies exactly one line of the newline-separated output file.
    return ' '.join(full_text.split())


def process_pushshift_zst(input_path, output_path, subreddit_filter, score_threshold,
                          min_length=120):
    """Filter a zst-compressed NDJSON dump into a one-record-per-line text file.

    Args:
        input_path: Path to the .zst file containing one JSON object per line.
        output_path: Path of the UTF-8 text file to (over)write.
        subreddit_filter: Only records whose 'subreddit' equals this are kept.
        score_threshold: Only records with 'score' strictly greater than this
            are kept.
        min_length: Minimum length (in characters) of the stripped text for a
            record to be kept.  Defaults to 120.

    Returns:
        None.  If ``input_path`` is not an existing file, an error is printed
        and nothing is written.

    Lines that are blank, are not valid JSON, or are not JSON objects are
    skipped rather than aborting the run.
    """
    if not os.path.isfile(input_path):
        print(f"Error: File '{input_path}' not found.")
        return

    with open(input_path, 'rb') as f, open(output_path, 'w', encoding='utf-8') as out_file:
        # Raise the decoder's window limit: Pushshift dumps are typically
        # compressed with a long window that the default limit rejects.
        dctx = zstd.ZstdDecompressor(max_window_size=2**31)
        stream_reader = dctx.stream_reader(f)

        for line in _iter_ndjson_lines(stream_reader):
            if not line.strip():
                continue
            try:
                post = json.loads(line)
            except (json.JSONDecodeError, UnicodeDecodeError):
                # Malformed or undecodable line: skip it and keep going.
                continue

            text = _extract_text(post, subreddit_filter, score_threshold, min_length)
            if text is not None:
                out_file.write(text + '\n')


In [15]:
# Run the same filter (subreddit + score threshold) over both dumps;
# only the input and output paths differ between the two passes.
for zst_path, txt_path in (
    (submissions_zst_path, output_path_submissions),
    (comments_zst_path, output_path_comments),
):
    process_pushshift_zst(
        input_path=zst_path,
        output_path=txt_path,
        subreddit_filter='climbharder',
        score_threshold=20,
    )

### Concatenation
Since it's easier (and safer) to process these two massive zst files into separate txt files using the function above, I then need to combine them into one file.

In [21]:
def concatenate_two_txt_files(source_path1, source_path2, destination_path):
    """Write the contents of two UTF-8 text files, in order, to a third file.

    Args:
        source_path1: Path of the file whose lines come first.
        source_path2: Path of the file whose lines come second.
        destination_path: Path of the combined file to (over)write.

    Returns:
        None.  Success and failure are reported with ``print``.

    Both sources are opened before the destination, so a missing source does
    not leave behind an empty or truncated destination file.  If the first
    file does not end with a newline, one is inserted so its last line does
    not fuse with the first line of the second file.
    """
    # Opening the destination in 'w' mode would wipe a source that shares its path.
    target = os.path.abspath(destination_path)
    if any(os.path.abspath(path) == target for path in (source_path1, source_path2)):
        print(f"Error: destination '{destination_path}' is also a source file.")
        return

    try:
        with open(source_path1, 'r', encoding='utf-8') as first, \
             open(source_path2, 'r', encoding='utf-8') as second, \
             open(destination_path, 'w', encoding='utf-8') as dest_file:
            ends_with_newline = True
            for source_file in (first, second):
                for line in source_file:
                    # Only ever false at a file boundary (previous file's last line).
                    if not ends_with_newline:
                        dest_file.write('\n')
                    dest_file.write(line)
                    ends_with_newline = line.endswith('\n')
        print(f"Successfully wrote combined contents to '{destination_path}'.")
    except FileNotFoundError as e:
        print(f"Error: {e}")

In [23]:
# Merge the filtered posts and comments into one combined text file.
output_posts_and_comments = "../Data/climbharder_posts_and_comments.txt"
concatenate_two_txt_files(
    source_path1=output_path_submissions,
    source_path2=output_path_comments,
    destination_path=output_posts_and_comments,
)

Successfully wrote combined contents to '../Data/climbharder_posts_and_comments.txt'.
