# Notebook for scraping reddit posts

In [5]:
# Import package
import praw
import pandas as pd
import datetime
import os
from datetime import timezone
import time

# This is my API

In [8]:
# The weird letters are the cresidentials from my API

reddit = praw.Reddit(client_id ='hQafbgnPnl6Jl_wabPofuA',
                     client_secret ='OZ0EsYse7J-JiYE7kPk3mddQgMZ2nA',
                     user_agent ='WebScraper/Own-Biscotti6249')
                     
# to verify whether the instance is authorized instance or not
print(reddit.read_only)

True


# r/depression

In [15]:
# 1000 POSTS

TARGET_COUNT = 1000
SLEEP = 0.4
OUTPUT_BASE = "depression_baseline_1000.csv"
# -------------------------

sub = reddit.subreddit("depression")


def collect_baseline_depression():
    collected = []
    seen_ids = set()

    print("Scraping baseline posts from r/depression...")

    for submission in sub.new(limit=None):

        sid = submission.id
        if sid in seen_ids:
            continue
        seen_ids.add(sid)

        title = submission.title or ""
        body = submission.selftext or ""
        full_text = (title + "\n\n" + body).strip()

        if full_text == "":
            continue

        created_dt = datetime.fromtimestamp(
            submission.created_utc, tz=timezone.utc
        ).isoformat()

        collected.append({
            "id": sid,
            "subreddit": submission.subreddit.display_name,
            "author": str(submission.author) if submission.author else "[deleted]",
            "created_utc": submission.created_utc,
            "created_dt": created_dt,
            "title": title,
            "selftext": body,
            "full_text": full_text,
            "score": submission.score,
            "num_comments": submission.num_comments,
            "url": submission.url,
            "link_flair_text": submission.link_flair_text,
            "source": "baseline"
        })

        if len(collected) % 100 == 0:
            print(f"Collected {len(collected)} baseline posts...")

        if len(collected) >= TARGET_COUNT:
            break

        time.sleep(SLEEP)

    df = pd.DataFrame(collected)
    df.to_csv(OUTPUT_BASE, index=False)

    print(f"Saved baseline posts: {OUTPUT_BASE}")
    return df, seen_ids


In [21]:
#KEYWORD
OUTPUT_KEYWORD = "depression_expanded_keywords.csv"

def keyword_expand_depression(df_existing, seen_ids, keywords, max_per_keyword=1000):
    rows = df_existing.to_dict("records")

    print("\nStarting keyword expansion...")

    for kw in keywords:
        print(f"Searching keyword: {kw!r}")
        try:
            for submission in sub.search(
                query=kw,
                sort="new",
                time_filter="all",
                limit=max_per_keyword,
            ):
                sid = submission.id
                if sid in seen_ids:
                    continue
                seen_ids.add(sid)

                title = submission.title or ""
                body = submission.selftext or ""
                full_text = (title + "\n\n" + body).strip()
                if full_text == "":
                    continue

                created_dt = datetime.fromtimestamp(
                    submission.created_utc, tz=timezone.utc
                ).isoformat()

                rows.append({
                    "id": sid,
                    "subreddit": submission.subreddit.display_name,
                    "author": str(submission.author) if submission.author else "[deleted]",
                    "created_utc": submission.created_utc,
                    "created_dt": created_dt,
                    "title": title,
                    "selftext": body,
                    "full_text": full_text,
                    "score": submission.score,
                    "num_comments": submission.num_comments,
                    "url": submission.url,
                    "link_flair_text": submission.link_flair_text,
                    "matched_keyword": kw,
                    "source": f"keyword:{kw}"
                })
        except Exception as e:
            print(f"Error for keyword {kw}: {e}")

        print(f"Total unique collected so far: {len(seen_ids)}")
        time.sleep(0.4)

    df = pd.DataFrame(rows)
    df.to_csv(OUTPUT_KEYWORD, index=False)
    print(f"\nSaved: {OUTPUT_KEYWORD}")

    return df, seen_ids


In [22]:
# 24 keywords
DEPRESSION_KEYWORDS = [
    "sad", "sadness",
    "depressed", "depression",
    "hopeless", "helpless",
    "tired", "exhausted",
    "empty", "numb",
    "anxiety", "anxious",
    "lonely", "alone",
    "worthless",
    "guilt", "guilty",
    "ashamed", "shame",
    "overwhelmed",
    "panic", "fear",
    "struggling",
    "can't cope",
]

In [23]:
# Step 1 - baseline
df_base, seen_ids = collect_baseline_depression()

# Step 2 - keyword expansion
df_full, seen_ids = keyword_expand_depression(df_base, seen_ids, DEPRESSION_KEYWORDS)


Scraping baseline posts from r/depression...
Collected 100 baseline posts...
Collected 200 baseline posts...
Collected 300 baseline posts...
Collected 400 baseline posts...
Collected 500 baseline posts...
Collected 600 baseline posts...
Collected 700 baseline posts...
Collected 800 baseline posts...
Collected 900 baseline posts...
Saved baseline posts: depression_baseline_1000.csv

Starting keyword expansion...
Searching keyword: 'sad'
Total unique collected so far: 1114
Searching keyword: 'sadness'
Total unique collected so far: 1114
Searching keyword: 'depressed'
Total unique collected so far: 1114
Searching keyword: 'depression'
Total unique collected so far: 1114
Searching keyword: 'hopeless'
Total unique collected so far: 1313
Searching keyword: 'helpless'
Total unique collected so far: 1533
Searching keyword: 'tired'
Total unique collected so far: 1629
Searching keyword: 'exhausted'
Total unique collected so far: 1785
Searching keyword: 'empty'
Total unique collected so far: 1932

# r/casualconversation

In [24]:
import time
import pandas as pd
from datetime import datetime, timezone
import praw

# -------------------------
# CONFIG
# -------------------------
TARGET_COUNT = 1000
SLEEP = 0.4

OUTPUT_BASE_CC = "casualconversation_baseline_1000.csv"
OUTPUT_KEYWORD_CC = "casualconversation_expanded_keywords.csv"
# -------------------------

sub_cc = reddit.subreddit("CasualConversation")

# -------------------------
# 24 keywords
# -------------------------
CASUAL_KEYWORDS = [
    "morning", "evening",
    "today", "yesterday",
    "weekend", "weekday",

    "work", "school",
    "job", "college",

    "friend", "friends",
    "family", "coworker",

    "hobby", "hobbies",
    "gaming", "game",
    "music", "movie",

    "food", "cooking",
    "travel", "weather",

    "discussion", "story",
]
# -------------------------


def collect_baseline_casual():
    collected = []
    seen_ids = set()

    print("Scraping baseline posts from r/CasualConversation...")

    for submission in sub_cc.new(limit=None):

        sid = submission.id
        if sid in seen_ids:
            continue
        seen_ids.add(sid)

        title = submission.title or ""
        body = submission.selftext or ""
        full_text = (title + "\n\n" + body).strip()
        if full_text == "":
            continue

        created_dt = datetime.fromtimestamp(
            submission.created_utc, tz=timezone.utc
        ).isoformat()

        collected.append({
            "id": sid,
            "subreddit": submission.subreddit.display_name,
            "author": str(submission.author) if submission.author else "[deleted]",
            "created_utc": submission.created_utc,
            "created_dt": created_dt,
            "title": title,
            "selftext": body,
            "full_text": full_text,
            "score": submission.score,
            "num_comments": submission.num_comments,
            "url": submission.url,
            "link_flair_text": submission.link_flair_text,
            "source": "baseline",
        })

        if len(collected) % 100 == 0:
            print(f"Collected {len(collected)} baseline posts...")

        if len(collected) >= TARGET_COUNT:
            break

        time.sleep(SLEEP)

    df_base = pd.DataFrame(collected)
    df_base.to_csv(OUTPUT_BASE_CC, index=False)
    print(f"Saved baseline to {OUTPUT_BASE_CC}")

    return df_base, seen_ids


def keyword_expand_casual(df_existing, seen_ids, keywords, max_per_keyword=1000):
    rows = df_existing.to_dict("records")

    print("\nStarting keyword expansion for r/CasualConversation...")

    for kw in keywords:
        print(f"\nSearching keyword: {kw!r}")

        try:
            for submission in sub_cc.search(
                query=kw,
                sort="new",
                time_filter="all",
                limit=max_per_keyword,
            ):
                sid = submission.id

                if sid in seen_ids:
                    continue
                seen_ids.add(sid)

                title = submission.title or ""
                body = submission.selftext or ""
                full_text = (title + "\n\n" + body).strip()
                if full_text == "":
                    continue

                created_dt = datetime.fromtimestamp(
                    submission.created_utc, tz=timezone.utc
                ).isoformat()

                rows.append({
                    "id": sid,
                    "subreddit": submission.subreddit.display_name,
                    "author": str(submission.author) if submission.author else "[deleted]",
                    "created_utc": submission.created_utc,
                    "created_dt": created_dt,
                    "title": title,
                    "selftext": body,
                    "full_text": full_text,
                    "score": submission.score,
                    "num_comments": submission.num_comments,
                    "url": submission.url,
                    "link_flair_text": submission.link_flair_text,
                    "matched_keyword": kw,
                    "source": f"keyword:{kw}",
                })

        except Exception as e:
            print(f"Error for keyword {kw}: {e}")

        print(f"Total unique posts so far: {len(seen_ids)}")
        time.sleep(0.4)

    df_full = pd.DataFrame(rows)
    df_full.to_csv(OUTPUT_KEYWORD_CC, index=False)
    print(f"\nSaved full CC keyword+baseline dataset to {OUTPUT_KEYWORD_CC}")

    return df_full, seen_ids


# -------------------------
# RUN THE PIPELINE
# -------------------------

df_cc_base, cc_seen_ids = collect_baseline_casual()

df_cc_full, cc_seen_ids = keyword_expand_casual(
    df_cc_base,
    cc_seen_ids,
    CASUAL_KEYWORDS,
)

Scraping baseline posts from r/CasualConversation...
Collected 100 baseline posts...
Collected 200 baseline posts...
Collected 300 baseline posts...
Collected 400 baseline posts...
Collected 500 baseline posts...
Collected 600 baseline posts...
Collected 700 baseline posts...
Collected 800 baseline posts...
Collected 900 baseline posts...
Saved baseline to casualconversation_baseline_1000.csv

Starting keyword expansion for r/CasualConversation...

Searching keyword: 'morning'
Total unique posts so far: 1169

Searching keyword: 'evening'
Total unique posts so far: 1187

Searching keyword: 'today'
Total unique posts so far: 1297

Searching keyword: 'yesterday'
Total unique posts so far: 1496

Searching keyword: 'weekend'
Total unique posts so far: 1681

Searching keyword: 'weekday'
Total unique posts so far: 1867

Searching keyword: 'work'
Total unique posts so far: 1920

Searching keyword: 'school'
Total unique posts so far: 2082

Searching keyword: 'job'
Total unique posts so far: 221

# merge depression datasets together

In [None]:
# merge the data sets together
df_dep_base = pd.read_csv("depression_baseline_1000.csv")
df_dep_kw   = pd.read_csv("depression_expanded_keywords.csv")

# merge 
df_dep = pd.concat([df_dep_base, df_dep_kw], ignore_index=True)

# drop duplicates by Reddit post ID
df_dep = df_dep.drop_duplicates(subset="id").reset_index(drop=True)

print(df_dep.shape)

df_dep.head(10)

(3872, 14)


Unnamed: 0,id,subreddit,author,created_utc,created_dt,title,selftext,full_text,score,num_comments,url,link_flair_text,source,matched_keyword
0,1p9nrd6,depression,FewNewspaper5365,1764419000.0,2025-11-29T12:27:20+00:00,I think suicide is my choice,I think it's wrong for a society to prevent pe...,I think suicide is my choice\n\nI think it's w...,1,0,https://www.reddit.com/r/depression/comments/1...,,baseline,
1,1p9nqc5,depression,pink_kitty574,1764419000.0,2025-11-29T12:25:41+00:00,Not sure what to do,\nI just took a lot of anxiety pills all at on...,Not sure what to do\n\n\nI just took a lot of ...,1,0,https://www.reddit.com/r/depression/comments/1...,,baseline,
2,1p9npoc,depression,NicyVicy,1764419000.0,2025-11-29T12:24:41+00:00,Severe depression + sertralin,I don’t know what to do anymore. I have been d...,Severe depression + sertralin\n\nI don’t know ...,1,0,https://www.reddit.com/r/depression/comments/1...,,baseline,
3,1p9nf4e,depression,tommyheavenIy,1764418000.0,2025-11-29T12:08:54+00:00,anger out of control,i'm not sure if this is the appropriate sub to...,anger out of control\n\ni'm not sure if this i...,1,0,https://www.reddit.com/r/depression/comments/1...,,baseline,
4,1p9ncyq,depression,FrostyAlarm6695,1764418000.0,2025-11-29T12:05:29+00:00,Carelessness,Lately I’ve been very aware and pressured abou...,Carelessness\n\nLately I’ve been very aware an...,1,0,https://www.reddit.com/r/depression/comments/1...,,baseline,
5,1p9naqi,depression,Darkrose808,1764418000.0,2025-11-29T12:01:58+00:00,These hands.,They cradle a bottle of pills. \n\nThe same wa...,These hands.\n\nThey cradle a bottle of pills....,1,0,https://www.reddit.com/r/depression/comments/1...,,baseline,
6,1p9n085,depression,Due_Scene_4782,1764417000.0,2025-11-29T11:45:09+00:00,I’m falling,For a while now I’ve been on the edge of manic...,I’m falling\n\nFor a while now I’ve been on th...,1,1,https://www.reddit.com/r/depression/comments/1...,,baseline,
7,1p9mx4c,depression,hootyowl8,1764416000.0,2025-11-29T11:39:54+00:00,Curating a life despite anhedonia,I have been struggling with depression and anh...,Curating a life despite anhedonia\n\nI have be...,1,0,https://www.reddit.com/r/depression/comments/1...,,baseline,
8,1p9mv3r,depression,Enough-Syrup-1577,1764416000.0,2025-11-29T11:36:27+00:00,I feel defective for having SI,"I have major anxiety, depression and SI. Its t...",I feel defective for having SI\n\nI have major...,1,0,https://www.reddit.com/r/depression/comments/1...,,baseline,
9,1p9mmr9,depression,Plenty_Answer5556,1764415000.0,2025-11-29T11:22:35+00:00,At this point I think I just hate having friends,Always feels easier without someone else with ...,At this point I think I just hate having frien...,1,0,https://www.reddit.com/r/depression/comments/1...,,baseline,


# Clean pipeline

In [40]:
### ------------------------------
### 1. Load & merge depression
### ------------------------------

df_dep_base = pd.read_csv("depression_baseline_1000.csv")
df_dep_kw   = pd.read_csv("depression_expanded_keywords.csv")

# Merge + remove ID duplicates
df_dep = pd.concat([df_dep_base, df_dep_kw], ignore_index=True)
df_dep = df_dep.drop_duplicates(subset="id").reset_index(drop=True)

# Label depression
df_dep["label"] = 1

print("Depression dataset:", df_dep.shape)


### ------------------------------
### 2. Load & merge CasualConversation
### ------------------------------

df_cc_base = pd.read_csv("casualconversation_baseline_1000.csv")
df_cc_kw   = pd.read_csv("casualconversation_expanded_keywords.csv")

df_cc = pd.concat([df_cc_base, df_cc_kw], ignore_index=True)
df_cc = df_cc.drop_duplicates(subset="id").reset_index(drop=True)

# Label casual
df_cc["label"] = 0

print("CasualConversation dataset:", df_cc.shape)


### ------------------------------
### 3. Combine, dedupe, final clean
### ------------------------------

df_final = pd.concat([df_dep, df_cc], ignore_index=True)
df_final = df_final.drop_duplicates(subset="id").reset_index(drop=True)

### Remove moderator/bot posts
bot_authors = ["AutoModerator", "[deleted]", "moderator", "ModTeam"]
df_final = df_final[~df_final["author"].str.lower().isin([a.lower() for a in bot_authors])]

### Optional: remove removal messages
removal_patterns = ["removed", "deleted", "sorry, this post was removed"]
df_final = df_final[~df_final["full_text"].str.lower().str.contains("|".join(removal_patterns))]

### 4. Remove textual duplicates
df_final_clean = df_final.drop_duplicates(subset="full_text").reset_index(drop=True)

df_final_clean.to_csv("clean_reddit.csv", index=False)


### ------------------------------
### 4. Remove textual duplicates
### ------------------------------

print("Original rows:", len(df))

# Drop duplicates by text
df_final_clean = df_final.drop_duplicates(subset="full_text").reset_index(drop=True)

print("Cleaned rows:", len(df_final_clean))
print("Duplicates removed:", len(df_final) - len(df_final_clean))

# Save final deduped dataset
df_final_clean.to_csv("clean_reddit.csv", index=False)

print("Saved final cleaned file: clean_reddit.csv")

Depression dataset: (3872, 15)
CasualConversation dataset: (4051, 15)
Original rows: 7923
Cleaned rows: 7831
Duplicates removed: 12
Saved final cleaned file: clean_reddit.csv
