# Notebook for scraping reddit posts

### The notebook is split up into the following sections:
1. Defining API
2. Scrape posts from r/depression
3. Scrape posts from r/CasualConversation
4. Merge datasets, label them and remove possible duplications



In [2]:
# Import packages
import praw
import pandas as pd
import os
from datetime import datetime, timezone
import time
from IPython.display import display


# 1. Defining API

In [3]:
# Reddit API credentials are read from environment variables so that the
# secret is never stored in the notebook or in version control.
# Required: REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET (a missing one raises KeyError).
# Optional: REDDIT_USER_AGENT (falls back to the default below).
# NOTE(review): any credentials that were ever committed with this notebook
# should be treated as leaked and regenerated in Reddit's app settings.

reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent=os.environ.get("REDDIT_USER_AGENT", "WebScraper/Own-Biscotti6249"),
)

# Sanity check: prints whether the instance is in read-only mode
# (this notebook only reads listings and search results)
print(reddit.read_only)

True


# 2. Scrape posts from r/depression

### Defining function to scrape 1000 posts from r/depression

In [15]:
# Number of non-empty posts the baseline scrape tries to collect
TARGET_COUNT = 1000

# Pause in seconds (passed to time.sleep) after each stored post,
# intended to stay below Reddit's rate limits
SLEEP = 0.4

# CSV file the baseline scrape is written to
OUTPUT_BASE = "depression_baseline_1000.csv"

# ---------------------------------------------

# Handle for r/depression, used by the scraping functions below
sub = reddit.subreddit("depression")

def collect_baseline_depression():
    """Collect the newest posts from r/depression and save them to CSV.

    Walks the "new" listing of the module-level ``sub`` until
    ``TARGET_COUNT`` non-empty posts have been gathered or the listing is
    exhausted, pausing ``SLEEP`` seconds after each stored post. The result
    is written to ``OUTPUT_BASE``.

    Returns:
        A tuple ``(df, seen_ids)``: the DataFrame of collected posts and the
        set of every submission id encountered (including posts skipped for
        having no text), so later scrapes can avoid duplicates.
    """
    posts = []
    seen_ids = set()

    print("Scraping baseline posts from r/depression...")

    for submission in sub.new(limit=None):
        post_id = submission.id

        # Each submission is handled at most once
        if post_id in seen_ids:
            continue
        seen_ids.add(post_id)

        # Missing title/body are normalised to empty strings
        title = submission.title or ""
        body = submission.selftext or ""

        # Single text field (title, blank line, body) for later NLP use
        full_text = "\n\n".join((title, body)).strip()
        if not full_text:
            continue

        # ISO-8601 rendering of the Unix creation timestamp, in UTC
        created_dt = datetime.fromtimestamp(
            submission.created_utc, tz=timezone.utc
        ).isoformat()

        record = {
            "id": post_id,
            "subreddit": submission.subreddit.display_name,
            "author": str(submission.author) if submission.author else "[deleted]",
            "created_utc": submission.created_utc,
            "created_dt": created_dt,
            "title": title,
            "selftext": body,
            "full_text": full_text,
            "score": submission.score,
            "num_comments": submission.num_comments,
            "url": submission.url,
            "link_flair_text": submission.link_flair_text,
            "source": "baseline",
        }
        posts.append(record)

        n_posts = len(posts)

        # Report progress at every multiple of 100
        if n_posts % 100 == 0:
            print(f"Collected {n_posts} baseline posts...")

        # Done as soon as the target is reached
        if n_posts >= TARGET_COUNT:
            break

        # Short pause after each stored post
        time.sleep(SLEEP)

    # Persist the scrape
    df = pd.DataFrame(posts)
    df.to_csv(OUTPUT_BASE, index=False)

    print(f"Saved baseline posts: {OUTPUT_BASE}")
    return df, seen_ids


### After reaching the maximum number of posts that Reddit's API returns for a single listing, I define a function that runs repeated keyword-based searches to collect more posts. To avoid collecting the same post multiple times, I keep track of already seen post IDs.

In [21]:
# Output file for the keyword-based scrape
OUTPUT_KEYWORD = "depression_expanded_keywords.csv"

def keyword_expand_depression(df_existing, seen_ids, keywords, max_per_keyword=1000):
    """Extend an existing r/depression scrape with keyword search results.

    For every keyword, searches the module-level ``sub`` subreddit (newest
    first, all time) and appends each non-empty post whose id is not yet in
    ``seen_ids``. The combined rows are written to ``OUTPUT_KEYWORD``.

    Args:
        df_existing: DataFrame of already collected posts; its rows are
            carried over unchanged into the result.
        seen_ids: Set of submission ids that must not be collected again.
            It is updated in place with every newly seen id.
        keywords: Iterable of search terms.
        max_per_keyword: Upper bound on search results requested per keyword.

    Returns:
        A tuple ``(df, seen_ids)`` with the expanded DataFrame and the
        updated id set. Rows carried over from ``df_existing`` have no
        ``matched_keyword`` value unless they already had one.
    """
    # Start from the already collected baseline rows
    rows = df_existing.to_dict("records")

    print("\nStarting keyword expansion...")

    for kw in keywords:
        print(f"Searching keyword: {kw!r}")

        try:
            # Query Reddit for posts matching the keyword
            for submission in sub.search(
                query=kw,
                sort="new",
                time_filter="all",
                limit=max_per_keyword,
            ):
                sid = submission.id

                # Skip posts already seen (in the baseline or an earlier keyword)
                if sid in seen_ids:
                    continue
                seen_ids.add(sid)

                # Extract title and body; missing values become empty strings
                title = submission.title or ""
                body = submission.selftext or ""
                full_text = (title + "\n\n" + body).strip()

                # Skip empty posts
                if not full_text:
                    continue

                # Convert Unix timestamp to an ISO-8601 UTC datetime string
                created_dt = datetime.fromtimestamp(
                    submission.created_utc, tz=timezone.utc
                ).isoformat()

                # Store the post together with the keyword that found it
                rows.append({
                    "id": sid,
                    "subreddit": submission.subreddit.display_name,
                    "author": str(submission.author) if submission.author else "[deleted]",
                    "created_utc": submission.created_utc,
                    "created_dt": created_dt,
                    "title": title,
                    "selftext": body,
                    "full_text": full_text,
                    "score": submission.score,
                    "num_comments": submission.num_comments,
                    "url": submission.url,
                    "link_flair_text": submission.link_flair_text,
                    "matched_keyword": kw,
                    "source": f"keyword:{kw}",
                })

        # Best effort: a failure for one keyword (API or connection error)
        # must not discard what has been collected so far, so it is reported
        # and the loop moves on to the next keyword.
        except Exception as e:
            print(f"Error for keyword {kw}: {e}")

        # Report the number of rows actually collected; seen_ids may also
        # contain ids of posts that were skipped for having no text.
        print(f"Total unique collected so far: {len(rows)}")

        # Pause between keyword searches, using the notebook-wide delay
        time.sleep(SLEEP)

    # Save expanded dataset
    df = pd.DataFrame(rows)
    df.to_csv(OUTPUT_KEYWORD, index=False)

    print(f"\nSaved: {OUTPUT_KEYWORD}")
    return df, seen_ids

### The keywords selected to be used for scraping

In [22]:
# Search terms used to find additional r/depression posts (24 in total)
DEPRESSION_KEYWORDS = [
    # low mood
    "sad",
    "sadness",
    "depressed",
    "depression",
    "hopeless",
    "helpless",
    # fatigue and emptiness
    "tired",
    "exhausted",
    "empty",
    "numb",
    # anxiety and loneliness
    "anxiety",
    "anxious",
    "lonely",
    "alone",
    # self-worth, guilt and shame
    "worthless",
    "guilt",
    "guilty",
    "ashamed",
    "shame",
    # overload and fear
    "overwhelmed",
    "panic",
    "fear",
    "struggling",
    "can't cope",
]

### Scraping 

In [23]:
# Step 1: baseline scrape of the newest posts (up to TARGET_COUNT)
df_base, seen_ids = collect_baseline_depression()

# Step 2: extend the baseline with keyword searches, skipping ids already seen
df_full, seen_ids = keyword_expand_depression(
    df_existing=df_base,
    seen_ids=seen_ids,
    keywords=DEPRESSION_KEYWORDS,
)


Scraping baseline posts from r/depression...
Collected 100 baseline posts...
Collected 200 baseline posts...
Collected 300 baseline posts...
Collected 400 baseline posts...
Collected 500 baseline posts...
Collected 600 baseline posts...
Collected 700 baseline posts...
Collected 800 baseline posts...
Collected 900 baseline posts...
Saved baseline posts: depression_baseline_1000.csv

Starting keyword expansion...
Searching keyword: 'sad'
Total unique collected so far: 1114
Searching keyword: 'sadness'
Total unique collected so far: 1114
Searching keyword: 'depressed'
Total unique collected so far: 1114
Searching keyword: 'depression'
Total unique collected so far: 1114
Searching keyword: 'hopeless'
Total unique collected so far: 1313
Searching keyword: 'helpless'
Total unique collected so far: 1533
Searching keyword: 'tired'
Total unique collected so far: 1629
Searching keyword: 'exhausted'
Total unique collected so far: 1785
Searching keyword: 'empty'
Total unique collected so far: 1932

# 3. Scrape posts from r/CasualConversation

### The previous setup for scraping r/depression is reused for r/CasualConversation, but with different keywords

In [24]:
# Number of non-empty posts the baseline scrape tries to collect
TARGET_COUNT = 1000
# Pause in seconds (passed to time.sleep) after each stored post
SLEEP = 0.4

# CSV files written by the baseline scrape and by the keyword expansion
OUTPUT_BASE_CC = "casualconversation_baseline_1000.csv"
OUTPUT_KEYWORD_CC = "casualconversation_expanded_keywords.csv"
# -------------------------

# Handle for r/CasualConversation, used by the scraping functions below
sub_cc = reddit.subreddit("CasualConversation")

# Search terms used to find additional r/CasualConversation posts (26 in total)
CASUAL_KEYWORDS = [
    # time of day / week
    "morning",
    "evening",
    "today",
    "yesterday",
    "weekend",
    "weekday",
    # work and education
    "work",
    "school",
    "job",
    "college",
    # people
    "friend",
    "friends",
    "family",
    "coworker",
    # leisure
    "hobby",
    "hobbies",
    "gaming",
    "game",
    "music",
    "movie",
    # everyday life
    "food",
    "cooking",
    "travel",
    "weather",
    # generic conversation
    "discussion",
    "story",
]

def collect_baseline_casual():
    """Collect the newest posts from r/CasualConversation and save them to CSV.

    Walks the "new" listing of the module-level ``sub_cc`` until
    ``TARGET_COUNT`` non-empty posts have been gathered or the listing is
    exhausted, pausing ``SLEEP`` seconds after each stored post. The result
    is written to ``OUTPUT_BASE_CC``.

    Returns:
        A tuple ``(df_base, seen_ids)``: the DataFrame of collected posts and
        the set of every submission id encountered (including posts skipped
        for having no text).
    """
    posts = []
    seen_ids = set()

    print("Scraping baseline posts from r/CasualConversation...")

    for submission in sub_cc.new(limit=None):
        post_id = submission.id

        # Each submission is handled at most once
        if post_id in seen_ids:
            continue
        seen_ids.add(post_id)

        # Missing title/body are normalised to empty strings
        title = submission.title or ""
        body = submission.selftext or ""

        # Single text field: title, blank line, body
        full_text = "\n\n".join((title, body)).strip()
        if not full_text:
            continue

        # ISO-8601 rendering of the Unix creation timestamp, in UTC
        created_dt = datetime.fromtimestamp(
            submission.created_utc, tz=timezone.utc
        ).isoformat()

        record = {
            "id": post_id,
            "subreddit": submission.subreddit.display_name,
            "author": str(submission.author) if submission.author else "[deleted]",
            "created_utc": submission.created_utc,
            "created_dt": created_dt,
            "title": title,
            "selftext": body,
            "full_text": full_text,
            "score": submission.score,
            "num_comments": submission.num_comments,
            "url": submission.url,
            "link_flair_text": submission.link_flair_text,
            "source": "baseline",
        }
        posts.append(record)

        n_posts = len(posts)

        # Report progress at every multiple of 100
        if n_posts % 100 == 0:
            print(f"Collected {n_posts} baseline posts...")

        # Done as soon as the target is reached
        if n_posts >= TARGET_COUNT:
            break

        # Short pause after each stored post
        time.sleep(SLEEP)

    # Persist the scrape
    df_base = pd.DataFrame(posts)
    df_base.to_csv(OUTPUT_BASE_CC, index=False)
    print(f"Saved baseline to {OUTPUT_BASE_CC}")

    return df_base, seen_ids


def keyword_expand_casual(df_existing, seen_ids, keywords, max_per_keyword=1000):
    """Extend an existing r/CasualConversation scrape with keyword search results.

    For every keyword, searches the module-level ``sub_cc`` subreddit (newest
    first, all time) and appends each non-empty post whose id is not yet in
    ``seen_ids``. The combined rows are written to ``OUTPUT_KEYWORD_CC``.

    Args:
        df_existing: DataFrame of already collected posts; its rows are
            carried over unchanged into the result.
        seen_ids: Set of submission ids that must not be collected again.
            It is updated in place with every newly seen id.
        keywords: Iterable of search terms.
        max_per_keyword: Upper bound on search results requested per keyword.

    Returns:
        A tuple ``(df_full, seen_ids)`` with the expanded DataFrame and the
        updated id set. Rows carried over from ``df_existing`` have no
        ``matched_keyword`` value unless they already had one.
    """
    # Start from the already collected baseline rows
    rows = df_existing.to_dict("records")

    print("\nStarting keyword expansion for r/CasualConversation...")

    for kw in keywords:
        print(f"\nSearching keyword: {kw!r}")

        try:
            for submission in sub_cc.search(
                query=kw,
                sort="new",
                time_filter="all",
                limit=max_per_keyword,
            ):
                sid = submission.id

                # Skip posts already seen (in the baseline or an earlier keyword)
                if sid in seen_ids:
                    continue
                seen_ids.add(sid)

                # Missing title/body are normalised to empty strings
                title = submission.title or ""
                body = submission.selftext or ""
                full_text = (title + "\n\n" + body).strip()

                # Skip empty posts
                if not full_text:
                    continue

                # Convert Unix timestamp to an ISO-8601 UTC datetime string
                created_dt = datetime.fromtimestamp(
                    submission.created_utc, tz=timezone.utc
                ).isoformat()

                # Store the post together with the keyword that found it
                rows.append({
                    "id": sid,
                    "subreddit": submission.subreddit.display_name,
                    "author": str(submission.author) if submission.author else "[deleted]",
                    "created_utc": submission.created_utc,
                    "created_dt": created_dt,
                    "title": title,
                    "selftext": body,
                    "full_text": full_text,
                    "score": submission.score,
                    "num_comments": submission.num_comments,
                    "url": submission.url,
                    "link_flair_text": submission.link_flair_text,
                    "matched_keyword": kw,
                    "source": f"keyword:{kw}",
                })

        # Best effort: a failure for one keyword (API or connection error)
        # must not discard what has been collected so far, so it is reported
        # and the loop moves on to the next keyword.
        except Exception as e:
            print(f"Error for keyword {kw}: {e}")

        # Report the number of rows actually collected; seen_ids may also
        # contain ids of posts that were skipped for having no text.
        print(f"Total unique posts so far: {len(rows)}")

        # Pause between keyword searches, using the notebook-wide delay
        time.sleep(SLEEP)

    # Save expanded dataset (baseline rows plus keyword hits)
    df_full = pd.DataFrame(rows)
    df_full.to_csv(OUTPUT_KEYWORD_CC, index=False)
    print(f"\nSaved full CC keyword+baseline dataset to {OUTPUT_KEYWORD_CC}")

    return df_full, seen_ids


# Step 1: baseline scrape of the newest r/CasualConversation posts
df_cc_base, cc_seen_ids = collect_baseline_casual()

# Step 2: extend the baseline with keyword searches, skipping ids already seen
df_cc_full, cc_seen_ids = keyword_expand_casual(
    df_existing=df_cc_base,
    seen_ids=cc_seen_ids,
    keywords=CASUAL_KEYWORDS,
)

Scraping baseline posts from r/CasualConversation...
Collected 100 baseline posts...
Collected 200 baseline posts...
Collected 300 baseline posts...
Collected 400 baseline posts...
Collected 500 baseline posts...
Collected 600 baseline posts...
Collected 700 baseline posts...
Collected 800 baseline posts...
Collected 900 baseline posts...
Saved baseline to casualconversation_baseline_1000.csv

Starting keyword expansion for r/CasualConversation...

Searching keyword: 'morning'
Total unique posts so far: 1169

Searching keyword: 'evening'
Total unique posts so far: 1187

Searching keyword: 'today'
Total unique posts so far: 1297

Searching keyword: 'yesterday'
Total unique posts so far: 1496

Searching keyword: 'weekend'
Total unique posts so far: 1681

Searching keyword: 'weekday'
Total unique posts so far: 1867

Searching keyword: 'work'
Total unique posts so far: 1920

Searching keyword: 'school'
Total unique posts so far: 2082

Searching keyword: 'job'
Total unique posts so far: 221

# 4. Merge datasets, label them and remove possible duplications

In [60]:
# Read both r/depression CSVs back from disk
df_dep_base = pd.read_csv("depression_baseline_1000.csv")
df_dep_kw = pd.read_csv("depression_expanded_keywords.csv")

# Stack the two files and keep the first occurrence of every post id.
# The keyword file is built on top of the baseline rows, so the overlap
# between the two files is collapsed here.
df_dep = (
    pd.concat([df_dep_base, df_dep_kw], ignore_index=True)
    .drop_duplicates(subset="id")
    .reset_index(drop=True)
)

# Class label for r/depression posts
df_dep["label"] = 1

print("Depression dataset:", df_dep.shape)

Depression dataset: (3872, 15)


In [61]:
# Read both r/CasualConversation CSVs back from disk
df_cc_base = pd.read_csv("casualconversation_baseline_1000.csv")
df_cc_kw = pd.read_csv("casualconversation_expanded_keywords.csv")

# Stack the two files and keep the first occurrence of every post id
df_cc = (
    pd.concat([df_cc_base, df_cc_kw], ignore_index=True)
    .drop_duplicates(subset="id")
    .reset_index(drop=True)
)

# Class label for r/CasualConversation posts
df_cc["label"] = 0

print("CasualConversation dataset:", df_cc.shape)

CasualConversation dataset: (4051, 15)


In [62]:
# Combine both labeled datasets
df_final = pd.concat([df_dep, df_cc], ignore_index=True)
print("Combined rows (before global ID dedupe):", len(df_final))

# 4.1 Global ID dedupe (just in case the same ID slipped into both sets)
mask_dup_id = df_final.duplicated(subset="id", keep="first")
dup_by_id = df_final[mask_dup_id].copy()  # kept for inspection
df_final = df_final[~mask_dup_id].reset_index(drop=True)

print("Removed due to duplicate ID:", len(dup_by_id))
print("Rows after global ID dedupe:", len(df_final))

# 4.2 Remove moderator/bot posts
bad_authors = ["AutoModerator", "moderator", "ModTeam", "CasualMods"]

mask_bad_authors = df_final["author"].isin(bad_authors)
removed_authors = df_final[mask_bad_authors].copy()  # kept for inspection
df_final = df_final[~mask_bad_authors].reset_index(drop=True)

print("Removed due to bad authors:", len(removed_authors))
print("Rows after author filter:", len(df_final))

# 4.3 Remove posts whose text is only a placeholder: [deleted], [removed]
# full_text is "title + blank line + body", so a post whose body is a
# placeholder never matches on full_text alone. The body column (selftext)
# is therefore checked as well.
placeholder_texts = ["[deleted]", "[removed]"]

full_text_clean = df_final["full_text"].astype(str).str.strip()
selftext_clean = df_final["selftext"].astype(str).str.strip()
mask_placeholders = (
    full_text_clean.isin(placeholder_texts)
    | selftext_clean.isin(placeholder_texts)
)
removed_placeholders = df_final[mask_placeholders].copy()  # kept for inspection
df_final = df_final[~mask_placeholders].reset_index(drop=True)

print("Removed placeholder texts:", len(removed_placeholders))
print("Rows after placeholder filter:", len(df_final))

# 4.4 Remove textual duplicates (same full_text appearing multiple times)
mask_dup_text = df_final.duplicated(subset="full_text", keep="first")
dup_by_text = df_final[mask_dup_text].copy()  # these are the extra copies
df_final_clean = df_final[~mask_dup_text].reset_index(drop=True)

print("Removed due to duplicate full_text:", len(dup_by_text))
print("Final cleaned rows:", len(df_final_clean))

# Save final cleaned dataset
df_final_clean.to_csv("clean_reddit.csv", index=False)
print("Saved final cleaned file: clean_reddit.csv")



Combined rows (before global ID dedupe): 7923
Removed due to duplicate ID: 0
Rows after global ID dedupe: 7923
Removed due to bad authors: 3
Rows after author filter: 7920
Removed placeholder texts: 0
Rows after placeholder filter: 7920
Removed due to duplicate full_text: 12
Final cleaned rows: 7908
Saved final cleaned file: clean_reddit.csv


### Let's check exactly what was removed

In [63]:
# Rows dropped by the author filter (posts written by moderator/bot accounts)
print("\n=== Moderator comments removed ===")
display(removed_authors)

# Rows dropped by the full_text dedupe; the first occurrence of each text was kept
print("\n=== removed duplicate texts ===") # only the extra copies are listed here
# Last expression of the cell: shown as the cell's output (at most 12 rows)
dup_by_text.head(12)



=== Moderator comments removed ===


Unnamed: 0,id,subreddit,author,created_utc,created_dt,title,selftext,full_text,score,num_comments,url,link_flair_text,source,matched_keyword,label
4204,1p6eml9,CasualConversation,CasualMods,1764083000.0,2025-11-25T15:00:35+00:00,"r/CasualConversation resources, rules, etiquet...",We have a lot to offer in terms of reading ma...,"r/CasualConversation resources, rules, etiquet...",2,1,https://www.reddit.com/r/CasualConversation/co...,:cc: Meta,baseline,,0
4699,1p0e5ge,CasualConversation,CasualMods,1763478000.0,2025-11-18T15:00:29+00:00,Join r/CasualConversation on Twitter and IRC!,Our subreddit has many ways to keep in touch w...,Join r/CasualConversation on Twitter and IRC!\...,1,0,https://www.reddit.com/r/CasualConversation/co...,:cc: Meta,baseline,,0
5780,1oubmj0,CasualConversation,CasualMods,1762873000.0,2025-11-11T15:00:32+00:00,"r/CasualConversation resources, rules, etiquet...",We have a lot to offer in terms of reading ma...,"r/CasualConversation resources, rules, etiquet...",1,0,https://www.reddit.com/r/CasualConversation/co...,:cc: Meta,keyword:work,work,0



=== removed duplicate texts ===


Unnamed: 0,id,subreddit,author,created_utc,created_dt,title,selftext,full_text,score,num_comments,url,link_flair_text,source,matched_keyword,label
534,1p7fkd8,depression,Dry_Lock1015,1764182000.0,2025-11-26T18:33:48+00:00,i hate me,"Right now I’m at the movies with friends, yes ...",i hate me\n\nRight now I’m at the movies with ...,2,1,https://www.reddit.com/r/depression/comments/1...,,baseline,,1
1636,1p48yb6,depression,United-Ingenuity8372,1763858000.0,2025-11-23T00:37:46+00:00,My 23m gf 21f antidepressants or depression ca...,I am dating the most amazing girl I’ve ever me...,My 23m gf 21f antidepressants or depression ca...,1,1,https://www.reddit.com/r/depression/comments/1...,,keyword:exhausted,exhausted,1
1729,1otr8au,depression,United-Ingenuity8372,1762812000.0,2025-11-10T21:53:15+00:00,My 23m gf 21f antidepressants or depression ca...,I am dating the most amazing girl I’ve ever me...,My 23m gf 21f antidepressants or depression ca...,0,3,https://www.reddit.com/r/depression/comments/1...,,keyword:exhausted,exhausted,1
2366,1p0u78y,depression,SageTheLynx,1763516000.0,2025-11-19T01:25:13+00:00,My friend tried to kill herself again and I ju...,I know she feels worse. I know how awful she f...,My friend tried to kill herself again and I ju...,2,0,https://www.reddit.com/r/depression/comments/1...,,keyword:lonely,lonely,1
2746,1o5crk8,depression,DegreeLogical1248,1760339000.0,2025-10-13T07:00:55+00:00,"I hurt my dad, betrayed his trust, and it’s te...",I don’t even know where to start. I feel compl...,"I hurt my dad, betrayed his trust, and it’s te...",1,0,https://www.reddit.com/r/depression/comments/1...,,keyword:guilt,guilt,1
2964,1nqb50i,depression,_iwtd,1758817000.0,2025-09-25T16:18:15+00:00,I understand but I don’t understand,When someone is suicidal everyone tells him th...,I understand but I don’t understand\n\nWhen so...,2,1,https://www.reddit.com/r/depression/comments/1...,,keyword:guilty,guilty,1
3699,1ojgd0q,depression,FriendlyHighway1363,1761771000.0,2025-10-29T20:55:51+00:00,Living for other people. But why?,Skip To QUESTION If you want. It's the 2nd las...,Living for other people. But why?\n\nSkip To Q...,1,0,https://www.reddit.com/r/depression/comments/1...,,keyword:can't cope,can't cope,1
3714,1o449jf,depression,No-Advice2384,1760210000.0,2025-10-11T19:20:13+00:00,I experience a lot of Mood swings recently. Do...,Some context:\nIm 17.5 years old female.\nI ha...,I experience a lot of Mood swings recently. Do...,1,0,https://www.reddit.com/r/depression/comments/1...,,keyword:can't cope,can't cope,1
6594,1onu8yl,CasualConversation,Weirdo_the_dino,1762219000.0,2025-11-04T01:20:13+00:00,How to help perfectionism?,I've become such a perfectionist that i find n...,How to help perfectionism?\n\nI've become such...,0,1,https://www.reddit.com/r/CasualConversation/co...,,keyword:hobby,hobby,0
6832,1nkphvw,CasualConversation,JealousMap3475,1758243000.0,2025-09-19T00:47:32+00:00,"17m, bored, numbers game anyone?","17m, bored, numbers game anyone?\n\nAround my ...","17m, bored, numbers game anyone?\n\n17m, bored...",0,4,https://www.reddit.com/r/CasualConversation/co...,:chat: Just Chatting,keyword:gaming,gaming,0
