# In [20]:  (Jupyter cell prompt left over from the notebook export)
import zstandard as zstd
import json

# Path to the Pushshift .zst dump to read (e.g., 'RS_2024-01.zst').
# This is a relative path, so it is resolved against the current working
# directory at the moment the file is opened, not against this script's
# location.
ZST_FILE_PATH = '../Data/climbharder_submissions.zst'

def _iter_decompressed_lines(stream_reader, chunk_size=2**20):
    """Yield each newline-delimited byte line read from *stream_reader*.

    Data is pulled in *chunk_size*-byte reads. A line that straddles two
    reads is reassembled before being yielded, and a final line that is not
    terminated by a newline is yielded as well.
    """
    pending = b''
    while True:
        chunk = stream_reader.read(chunk_size)
        if not chunk:
            break
        pending += chunk
        # Everything before the last b'\n' is complete; the remainder is a
        # partial line that must wait for the next chunk.
        *complete, pending = pending.split(b'\n')
        yield from complete
    if pending:
        # The stream ended without a trailing newline: the leftover bytes
        # are still a full record and must not be dropped.
        yield pending


def _select_post(line, subreddit_filter, score_threshold):
    """Return the output record for *line*, or None if it should be skipped.

    A line is skipped when it is blank, is not decodable/valid JSON, is not
    a JSON object, belongs to another subreddit, or does not score strictly
    above *score_threshold*.
    """
    if not line.strip():
        return None
    try:
        post = json.loads(line)
    except ValueError:
        # ValueError covers both json.JSONDecodeError and the
        # UnicodeDecodeError raised for byte lines that are not valid text.
        return None
    if not isinstance(post, dict):
        return None
    if post.get('subreddit') != subreddit_filter:
        return None

    score = post.get('score')
    # A missing or null score is compared as 0, but reported unchanged.
    if not ((0 if score is None else score) > score_threshold):
        return None

    # `or ''` guards against fields that are present but null in the dump.
    return {
        'score': score,
        'title': (post.get('title') or '').strip(),
        'body': (post.get('selftext') or '').strip(),
        'timestamp': post.get('created_utc'),
    }


def process_pushshift_zst(file_path, subreddit_filter, score_threshold):
    """Stream a zstd-compressed NDJSON dump and print the matching posts.

    The file is decompressed incrementally and split on newlines; each line
    is parsed as one JSON value. For every JSON object whose ``subreddit``
    equals *subreddit_filter* and whose ``score`` is strictly greater than
    *score_threshold*, one JSON line with the keys ``score``, ``title``,
    ``body`` and ``timestamp`` is printed to stdout.

    Blank lines, lines that are not valid JSON, and JSON values that are not
    objects are skipped silently.

    Args:
        file_path: Path of the ``.zst`` file to read.
        subreddit_filter: Exact value the post's ``subreddit`` must have.
        score_threshold: Posts must score strictly above this value.

    Returns:
        None. Matching posts are written to stdout.

    Raises:
        OSError: If *file_path* cannot be opened.
    """
    # Pushshift dumps are typically compressed with a long-distance window
    # that exceeds the decompressor's default limit; raise the limit so such
    # frames are accepted instead of failing part-way through.
    dctx = zstd.ZstdDecompressor(max_window_size=2**31)
    with open(file_path, 'rb') as f, dctx.stream_reader(f) as stream_reader:
        for line in _iter_decompressed_lines(stream_reader):
            record = _select_post(line, subreddit_filter, score_threshold)
            if record is not None:
                print(json.dumps(record))

if __name__ == "__main__":
    # Script entry point: scan the configured dump and print every
    # 'climbharder' submission whose score is strictly greater than 20.
    process_pushshift_zst(
        ZST_FILE_PATH, subreddit_filter='climbharder', score_threshold=20
    )