In [None]:
import pandas as pd
import datetime
import json
import zstandard
import os
import sys
import csv

In [None]:
subreddit_name = 'PUT_NAME_HERE'

In [None]:
input_file = f"subreddits24\{subreddit_name}_submissions.zst" 
output_file = f"subreddits24\{subreddit_name}_submissions_filtered" 

output_format = "csv" 

single_field = None 
write_bad_lines = False 

from_date = datetime.strptime("2024-03-13", "%Y-%m-%d") 
to_date = datetime.strptime("2024-12-13", "%Y-%m-%d") 

field = None
values = []
values_file = None
exact_match = False

def write_headers_csv(writer, is_submission):
    if is_submission:
        headers = ['Score', 'Date', 'Title', 'Author', 'URL', 'Selftext', 'External URL','ID']
    else:
        headers = ['Score', 'Date', 'Comment ID', 'Author', 'URL', 'Body']
    writer.writerow(headers)

def write_line_csv(writer, obj, is_submission):
    output_list = []
    output_list.append(str(obj['score']))
    output_list.append(datetime.fromtimestamp(int(obj['created_utc'])).strftime("%Y-%m-%d"))
    
    if is_submission:
        output_list.append(obj['title'])
    else:
        output_list.append(obj['id'])
        
    output_list.append(f"u/{obj['author']}")
    
    if 'permalink' in obj:
        output_list.append(f"https://www.reddit.com{obj['permalink']}")
    else:
        output_list.append(f"https://www.reddit.com/r/{obj['subreddit']}/comments/{obj['link_id'][3:]}/_/{obj['id']}")
    
    if is_submission:
        # Add selftext as its own column
        if 'selftext' in obj:
            output_list.append(obj['selftext'])
        else:
            output_list.append("")
        
        # Add external URL as its own column
        if 'url' in obj:
            output_list.append(obj['url'])
        else:
            output_list.append("")

        if 'id' in obj:
            output_list.append(obj['id'])
        else:
            output_list.append("")

    else:
        output_list.append(obj['body'])
        
    writer.writerow(output_list)

def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
    chunk = reader.read(chunk_size)
    bytes_read += chunk_size
    if previous_chunk is not None:
        chunk = previous_chunk + chunk
    try:
        return chunk.decode()
    except UnicodeDecodeError:
        if bytes_read > max_window_size:
            raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
        return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read)

def read_lines_zst(file_name):
    with open(file_name, 'rb') as file_handle:
        buffer = ''
        reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
        while True:
            chunk = read_and_decode(reader, 2**27, (2**29) * 2)

            if not chunk:
                break
            lines = (buffer + chunk).split("\n")

            for line in lines[:-1]:
                yield line.strip(), file_handle.tell()

            buffer = lines[-1]

        reader.close()

def process_file(input_file, output_file, output_format, field, values, from_date, to_date, single_field, exact_match):
    output_path = f"{output_file}.{output_format}"
    is_submission = "submission" in input_file
    print(f"Processing {input_file} to {output_path}")
    handle = open(output_path, 'w', encoding='UTF-8', newline='')
    writer = csv.writer(handle)
    write_headers_csv(writer, is_submission)

    file_size = os.stat(input_file).st_size
    matched_lines = 0
    total_lines = 0
    for line, file_bytes_processed in read_lines_zst(input_file):
        total_lines += 1
        if total_lines % 1000000 == 0:
            print(f"Progress: {total_lines:,} lines, {matched_lines:,} matches, {(file_bytes_processed / file_size) * 100:.0f}%")

        try:
            obj = json.loads(line)
            created = datetime.utcfromtimestamp(int(obj['created_utc']))

            if created < from_date or created > to_date:
                continue

            if field is not None:
                try:
                    field_value = obj[field].lower()
                    matched = False
                    for value in values:
                        if (exact_match and value == field_value) or (not exact_match and value in field_value):
                            matched = True
                            break
                    if not matched:
                        continue
                except KeyError:
                    continue

            matched_lines += 1
            write_line_csv(writer, obj, is_submission)
        except (KeyError, json.JSONDecodeError):
            continue

    handle.close()
    print(f"Complete: {total_lines:,} lines processed, {matched_lines:,} matches")

if __name__ == "__main__":
    if single_field is not None:
        output_format = "txt"

    if values_file is not None:
        values = []
        with open(values_file, 'r') as values_handle:
            for value in values_handle:
                values.append(value.strip().lower())
    else:
        values = [value.lower() for value in values]

    input_files = []
    if os.path.isdir(input_file):
        if not os.path.exists(output_file):
            os.makedirs(output_file)
        for file in os.listdir(input_file):
            if not os.path.isdir(file) and file.endswith(".zst"):
                input_name = os.path.splitext(os.path.splitext(os.path.basename(file))[0])[0]
                input_files.append((os.path.join(input_file, file), os.path.join(output_file, input_name)))
    else:
        input_files.append((input_file, output_file))
    
    print(f"Processing {len(input_files)} files")
    for file_in, file_out in input_files:
        try:
            process_file(file_in, file_out, output_format, field, values, from_date, to_date, single_field, exact_match)
        except Exception as err:
            print(f"Error processing {file_in}: {err}")

  input_file = f"subreddits24\{subreddit_name}_submissions.zst"
  output_file = f"subreddits24\{subreddit_name}_submissions_filtered"


Processing 1 files
Processing subreddits24\AskThe_Donald_submissions.zst to subreddits24\AskThe_Donald_submissions_filtered.csv


  created = datetime.utcfromtimestamp(int(obj['created_utc']))


Complete: 106,580 lines processed, 3,942 matches


In [8]:
file_path = f'subreddits24/{subreddit_name}_submissions_filtered.csv'
df = pd.read_csv(file_path)

In [None]:
def filter_elon_musk_posts(df):

    def contains_elon_musk(row):
        for col in ['Title', 'Selftext', 'External URL']:
            if pd.notna(row[col]) and (
                'elon' in str(row[col]).lower() or 'musk' in str(row[col]).lower()
            ):
                return True
        return False

    filtered_df = df[df.apply(contains_elon_musk, axis=1)].copy()
    return filtered_df

elon_musk_df = filter_elon_musk_posts(df)

In [192]:
# Output to CSV
output_filepath = f'reddit_data/{subreddit_name}_submissions_filtered.csv' 
elon_musk_df.to_csv(output_filepath, index=False)