<a href="https://colab.research.google.com/github/Johanl001/Social-Media-Crisis-Management/blob/main/Data_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
"""
Data pipeline: Reddit scraping (no API keys), cleaning, and save.

Outputs:
- mental_health_posts_with_classification.csv
- mental_health_posts_with_classification.json
"""

import os
import re
import json
import datetime
from typing import List, Dict, Any
from urllib.parse import quote as url_quote

import requests
import pandas as pd
from cleantext import clean


In [2]:
%pip install cleantext

Collecting cleantext
  Downloading cleantext-1.1.4-py3-none-any.whl.metadata (3.5 kB)
Downloading cleantext-1.1.4-py3-none-any.whl (4.9 kB)
Installing collected packages: cleantext
Successfully installed cleantext-1.1.4


In [4]:
# ------------ Configuration ------------

MENTAL_HEALTH_KEYWORDS: List[str] = [
    "depression", "depressed", "anxiety", "suicidal",
    "suicide", "addiction", "substance abuse", "overwhelmed",
    "hopeless", "self harm", "bipolar", "mental health",
    "therapy", "crisis", "panic attack"
]

OUTPUT_CSV = "mental_health_posts_with_classification.csv"
OUTPUT_JSON = "mental_health_posts_with_classification.json"

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0 Safari/537.36"
)
HTTP_TIMEOUT_SECS = 30
DEFAULT_LIMIT_PER_KEYWORD = 100


In [5]:
# ------------ Utilities ------------

def preprocess_text(text: str) -> str:
    """Clean text aggressively for downstream analysis."""
    if not isinstance(text, str) or text.strip() == "":
        return ""
    try:
        cleaned_text = clean(
            text,
            extra_spaces=True,
            lowercase=True,
            numbers=True,
            punct=True,
            stopwords=True,
            stp_lang="english",
            no_urls=True,
            no_emails=True,
            no_phone_numbers=True,
            no_currency_symbols=True,
            no_emoji=True,
        )
        # Collapse whitespace
        cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()
        return cleaned_text
    except Exception:
        # Fallback minimal cleaning
        text = re.sub(r"http[s]?://\S+", " ", text)
        text = re.sub(r"[^A-Za-z\s]", " ", text)
        text = re.sub(r"\s+", " ", text).strip().lower()
        return text



In [8]:
# ------------ Reddit Scraping (No API Keys) ------------

def get_reddit_posts(keyword: str, limit: int = DEFAULT_LIMIT_PER_KEYWORD) -> List[Dict[str, Any]]:
    """
    Scrape Reddit via public JSON (no keys) for a given keyword.
    Filters results by MENTAL_HEALTH_KEYWORDS.
    """
    results: List[Dict[str, Any]] = []
    headers = {"User-Agent": USER_AGENT}
    url = f"https://www.reddit.com/search.json?q={url_quote(keyword)}&sort=new&limit={int(limit)}"

    try:
        response = requests.get(url, headers=headers, timeout=HTTP_TIMEOUT_SECS)
    except Exception:
        return results

    if response.status_code != 200:
        return results

    try:
        data = response.json()
    except ValueError:
        return results

    for post in data.get("data", {}).get("children", []):
        d = post.get("data", {})
        title = (d.get("title") or "").strip()
        selftext = (d.get("selftext") or "").strip()
        body = f"{title}\n{selftext}".lower()

        # Additional filter to ensure relevance
        if not any(kw.lower() in body for kw in MENTAL_HEALTH_KEYWORDS):
            continue

        created_utc = d.get("created_utc", 0)
        timestamp = (
            datetime.datetime.fromtimestamp(created_utc).strftime("%Y-%m-%d %H:%M:%S")
            if created_utc else ""
        )

        results.append({
            "platform": "reddit",
            "post_id": d.get("id") or "",
            "timestamp": timestamp,
            "author": d.get("author") or "",
            "title": title,
            "content": selftext,
            "subgroup": d.get("subreddit", "reddit"),
            "likes": d.get("ups", 0) or 0,
            "comments": d.get("num_comments", 0) or 0,
            "url": f"https://www.reddit.com{d.get('permalink','')}",
        })

    return results

In [9]:
# ------------ Pipeline ------------

def collect_posts_across_keywords(
    keywords: List[str],
    per_keyword_limit: int = DEFAULT_LIMIT_PER_KEYWORD,
) -> pd.DataFrame:
    """Collect, deduplicate, clean, and return a DataFrame."""
    all_posts: List[Dict[str, Any]] = []

    for kw in keywords:
        print(f"Collecting posts for keyword: {kw}")
        try:
            all_posts.extend(get_reddit_posts(kw, limit=per_keyword_limit))
        except Exception:
            # Keep pipeline resilient; skip on scraping errors
            continue

    if not all_posts:
        return pd.DataFrame(columns=[
            "platform", "post_id", "timestamp", "author", "title", "content",
            "subgroup", "likes", "comments", "url",
            "cleaned_title", "cleaned_content",
        ])

    # Deduplicate by post_id primarily; fallback to URL if missin
    df = pd.DataFrame(all_posts)
    if "post_id" in df.columns:
        df = df.drop_duplicates(subset=["post_id"], keep="first")
    elif "url" in df.columns:
        df = df.drop_duplicates(subset=["url"], keep="first")
    else:
        df = df.drop_duplicates(subset=["platform", "author", "timestamp", "title"], keep="first")

    # Cleaning
    df["cleaned_title"] = df["title"].map(preprocess_text)
    df["cleaned_content"] = df["content"].map(preprocess_text)

    # Sort newest first if timestamps are present
    try:
        df["timestamp_dt"] = pd.to_datetime(df["timestamp"], errors="coerce")
        df = df.sort_values(by="timestamp_dt", ascending=False).drop(columns=["timestamp_dt"])
    except Exception:
        pass

    return df


def save_outputs(df: pd.DataFrame, csv_path: str = OUTPUT_CSV, json_path: str = OUTPUT_JSON) -> None:
    df.to_csv(csv_path, index=False, encoding="utf-8")
    df.to_json(json_path, orient="records", force_ascii=False)
    print(f"Saved CSV -> {os.path.abspath(csv_path)}")
    print(f"Saved JSON -> {os.path.abspath(json_path)}")


def main() -> None:
    df = collect_posts_across_keywords(MENTAL_HEALTH_KEYWORDS, per_keyword_limit=DEFAULT_LIMIT_PER_KEYWORD)
    print(f"Collected rows: {len(df)}")
    save_outputs(df, OUTPUT_CSV, OUTPUT_JSON)


if __name__ == "__main__":
    main()

Collecting posts for keyword: depression
Collecting posts for keyword: depressed
Collecting posts for keyword: anxiety
Collecting posts for keyword: suicidal
Collecting posts for keyword: suicide
Collecting posts for keyword: addiction
Collecting posts for keyword: substance abuse
Collecting posts for keyword: overwhelmed
Collecting posts for keyword: hopeless
Collecting posts for keyword: self harm
Collecting posts for keyword: bipolar
Collecting posts for keyword: mental health
Collecting posts for keyword: therapy
Collecting posts for keyword: crisis
Collecting posts for keyword: panic attack
Collected rows: 0
Saved CSV -> /content/mental_health_posts_with_classification.csv
Saved JSON -> /content/mental_health_posts_with_classification.json
