In [17]:
!pip install requests




You should consider upgrading via the 'C:\Users\jgber\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [18]:
import requests
import csv
import time

def fetch_reddit_posts(query, after=None, limit=100):
    """
    Fetch one page of Reddit posts matching the query from Reddit's public
    JSON search endpoint.

    Failures are reported with a printed message and signalled to the caller
    as an empty page, so this function does not raise for network errors,
    non-200 responses, undecodable bodies, or unexpectedly shaped payloads.

    Args:
        query (str): The search query string.
        after (str): The "after" token for pagination, or None for the first page.
        limit (int): Number of posts to return (maximum allowed is typically 100).

    Returns:
        tuple: (posts, after). `posts` is always a list (possibly empty) of the
        listing's "children" entries; `after` is the token for the next page,
        or None when there is no next page or the fetch failed.
    """
    url = "https://www.reddit.com/search.json"
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; RedditScraper/1.0)"
    }
    params = {
        "q": query,
        "sort": "new",    # Fetch newest posts first
        "limit": limit,
        "after": after,   # requests leaves out parameters whose value is None
    }
    try:
        response = requests.get(url, headers=headers, params=params, timeout=10)
        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            return [], None
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection/timeout problems; ValueError
        # covers a body that is not valid JSON.
        print("Exception during fetch:", e)
        return [], None

    # Validate the shape explicitly instead of relying on a blanket except.
    data = payload.get("data") if isinstance(payload, dict) else None
    if not isinstance(data, dict):
        print("Error: Unexpected response structure (no 'data' object)")
        return [], None

    posts = data.get("children")
    if not isinstance(posts, list):
        # e.g. "children": null -- honour the documented list return type.
        posts = []
    return posts, data.get("after")

def scrape_reddit(query="terminal illness", max_posts=500):
    """
    Scrape historical Reddit posts containing the given query.

    Pages through the search results via fetch_reddit_posts until max_posts
    posts have been collected, a page comes back empty (which is also how a
    failed fetch is reported), or there is no further page token.

    Args:
        query (str): The search query (default is "terminal illness").
        max_posts (int): The maximum number of posts to fetch.

    Returns:
        list: A list of post data dictionaries (at most max_posts entries).
    """
    page_size = 100  # largest page requested in a single call
    all_posts = []
    after = None  # Token for pagination

    while len(all_posts) < max_posts:
        # Ask only for what is still needed on the final page.
        remaining = max_posts - len(all_posts)
        posts_batch, after = fetch_reddit_posts(
            query, after=after, limit=min(page_size, remaining)
        )
        if not posts_batch:
            print("No more posts returned by the API.")
            break

        for post in posts_batch:
            # Skip malformed children rather than raising and losing
            # everything collected so far.
            post_data = post.get("data") if isinstance(post, dict) else None
            if not isinstance(post_data, dict):
                continue
            all_posts.append(post_data)
            if len(all_posts) >= max_posts:
                break

        print(f"Fetched {len(all_posts)} posts so far...")
        if after is None or len(all_posts) >= max_posts:
            # No more pages available, or the quota is met: no need to wait.
            break

        time.sleep(1)  # Pause to respect rate limits

    return all_posts

def save_posts_to_csv(posts, filename="reddit_terminal_illness_posts.csv"):
    """
    Write a fixed set of fields from each Reddit post to a CSV file.

    Fields missing from a post are written as empty cells; keys outside the
    fixed column set are ignored. Nothing is written when `posts` is empty.
    Errors while writing are printed rather than raised.

    Args:
        posts (list): List of post data dictionaries.
        filename (str): The output CSV file name.
    """
    # Column order of the output file; adjust fields as needed.
    columns = (
        "id", "title", "selftext", "subreddit", "author",
        "created_utc", "url", "score", "num_comments",
    )

    if posts:
        try:
            with open(filename, "w", newline="", encoding="utf-8") as out_file:
                sheet = csv.writer(out_file)
                sheet.writerow(columns)  # header row
                for entry in posts:
                    sheet.writerow([entry.get(name, "") for name in columns])
            print(f"Saved {len(posts)} posts to {filename}")
        except Exception as err:
            print("Error saving CSV:", err)
    else:
        print("No posts to save.")

# Driver: runs the scrape-and-save pipeline when this code is executed directly
# (in a notebook cell __name__ is "__main__", so the body runs on cell execution).
if __name__ == "__main__":
    # Define the search query (space will be URL-encoded automatically by requests).
    query = "terminal illness"
    print(f"Scraping Reddit posts for query: {query}")
    # Collect up to 500 matching posts, then write them to the default CSV file name.
    posts = scrape_reddit(query=query, max_posts=500)
    save_posts_to_csv(posts)


Scraping Reddit posts for query: terminal illness
Fetched 100 posts so far...
Fetched 200 posts so far...
Fetched 249 posts so far...
Saved 249 posts to reddit_terminal_illness_posts.csv
