**Reddit**

In [None]:
import nest_asyncio
import asyncio
import asyncpraw
import asyncprawcore
import re
import pandas as pd

# Patch asyncio so that an already-running event loop can be re-entered.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop; the cell itself uses top-level `await`, so confirm whether this
# patch is still required.
nest_asyncio.apply()

# --- Patterns ---
# Free-text phrases submitted to the subreddit search, one query per phrase.
TBD_KEYWORDS = [
    "trunk based development",
    "trunk-based",
    "mainline development",
    "GitFlow vs trunk",
    "trunk strategy",
    "short-lived branches",
]

# Regexes (applied case-insensitively by is_tbd_related) that decide whether a
# piece of text actually talks about trunk-based development.
TBD_PATTERNS = [
    r"\btrunk[-\s]?based\sdevelopment\b",
    r"\btrunk[-\s]?based\b",
    r"\bmainline\sdevelopment\b",
    r"\bshort[-\s]?lived\sbranches?\b",
    r"\bGitFlow\s+vs\s+trunk\b",
    r"\btrunk\sstrategy\b",
    r"\btrunk vs\b",
]

# Regexes (applied case-insensitively by is_oss) that flag open-source context.
# NOTE(review): the hyphenated spelling "open-source" is not matched.
OSS_PATTERNS = [
    r"\bopen source\b",
    r"\bOSS\b",
    r"\bFOSS\b",
    r"\bMIT license\b",
    r"\bpublic repo\b",
]

# Subreddits that are searched for every keyword.
SUBREDDITS = [
    "devops",
    "git",
    "softwaredevelopment",
    "programming",
    "cscareerquestions",
    "learnprogramming",
    "webdev",
    "softwareengineering",
]

# --- Helpers ---
def is_tbd_related(text):
    """Return True if `text` matches any regex in TBD_PATTERNS, ignoring case."""
    for pattern in TBD_PATTERNS:
        if re.search(pattern, text, re.IGNORECASE):
            return True
    return False

def is_oss(text):
    """Return True if `text` matches any regex in OSS_PATTERNS, ignoring case."""
    hits = (re.search(pattern, text, re.IGNORECASE) for pattern in OSS_PATTERNS)
    # The generator is lazy, so evaluation stops at the first matching pattern.
    return any(hits)

def clean_html(text):
    """Remove `<...>` tag-like spans from `text` and trim surrounding whitespace."""
    without_tags = re.sub('<[^<]+?>', '', text)
    return without_tags.strip()

def save_rows(rows, prefix):
    """Write `rows` to `<prefix>.csv` and `<prefix>.json` and report the count.

    Args:
        rows: records accepted by `pd.DataFrame` (here: a list of dicts).
        prefix: output path without extension.
    """
    frame = pd.DataFrame(rows)
    csv_path = f"{prefix}.csv"
    json_path = f"{prefix}.json"
    frame.to_csv(csv_path, index=False)
    frame.to_json(json_path, orient="records", indent=2)
    print(f"💾 Saved {len(rows)} rows to {prefix}.*")

# --- Main Function ---
async def fetch_reddit_threads_raw(client_id=None, client_secret=None, user_agent=None):
    """Search the configured subreddits for TBD discussions and save them to disk.

    Every keyword in TBD_KEYWORDS is searched in every subreddit in SUBREDDITS.
    A submission is kept only if its title or selftext matches a TBD pattern;
    for kept submissions the title, selftext and comments that match a TBD
    pattern are stored in one row. Each submission is processed at most once,
    even if several keyword/subreddit searches return it.

    Args:
        client_id: Reddit API client id. Defaults to the REDDIT_CLIENT_ID
            environment variable.
        client_secret: Reddit API client secret. Defaults to the
            REDDIT_CLIENT_SECRET environment variable.
        user_agent: User-Agent string sent to Reddit. Defaults to the
            REDDIT_USER_AGENT environment variable, then to the script's
            original user agent.

    Raises:
        RuntimeError: if no client id / secret is supplied or configured.

    Side effects:
        Writes `reddit_tbd_checkpoint.*` every 50 rows, `reddit_tbd_error_dump.*`
        if the crawl fails, and always `reddit_tbd_raw_full.*` (CSV and JSON).
        Errors during the crawl are reported and swallowed so that the rows
        collected so far are still saved.
    """
    # Function-scope import keeps this cell self-contained.
    import os

    # Credentials must never live in the notebook: anything committed here is
    # readable by everyone with access to the file (and its history).
    client_id = client_id or os.environ.get("REDDIT_CLIENT_ID")
    client_secret = client_secret or os.environ.get("REDDIT_CLIENT_SECRET")
    user_agent = user_agent or os.environ.get(
        "REDDIT_USER_AGENT", 'tbd-research-script by /u/Relevant-Egg9675'
    )
    if not client_id or not client_secret:
        raise RuntimeError(
            "Reddit credentials missing: set REDDIT_CLIENT_ID and "
            "REDDIT_CLIENT_SECRET or pass client_id/client_secret."
        )

    seen_ids = set()
    all_rows = []
    completed = False

    # `async with` closes the underlying HTTP session even when the crawl fails.
    async with asyncpraw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent,
    ) as reddit:
        try:
            for keyword in TBD_KEYWORDS:
                print(f"🔍 Searching Reddit for keyword: '{keyword}'")
                for subreddit_name in SUBREDDITS:
                    subreddit = await reddit.subreddit(subreddit_name)
                    async for submission in subreddit.search(keyword, sort='new', time_filter='all', limit=None):
                        if submission.id in seen_ids:
                            continue
                        seen_ids.add(submission.id)

                        # ✅ Only keep submission if title or selftext match TBD.
                        # Search results already carry title and selftext, so
                        # filter *before* load(): this avoids one extra API
                        # request for every irrelevant hit (fewer 429s).
                        title = submission.title or ""
                        selftext = submission.selftext or ""
                        if not (is_tbd_related(title) or is_tbd_related(selftext)):
                            continue

                        # Fetch the comment tree only for relevant submissions.
                        await submission.load()
                        # limit=0 drops all "load more comments" placeholders
                        # instead of resolving them with further requests.
                        await submission.comments.replace_more(limit=0)

                        discussion_items = [title.strip(), selftext.strip()]
                        for comment in submission.comments.list():
                            if isinstance(comment, asyncpraw.models.Comment):
                                discussion_items.append(comment.body.strip())

                        filtered_items = []
                        oss_flag = False

                        for item in discussion_items:
                            if not item.strip():
                                continue
                            text = clean_html(item)
                            # OSS flag considers every item, TBD-related or not.
                            if is_oss(text):
                                oss_flag = True
                            if is_tbd_related(text):
                                filtered_items.append(text)

                        if not filtered_items:
                            continue

                        all_rows.append({
                            "keyword": keyword,
                            "subreddit": subreddit_name,
                            "title": submission.title,
                            "url": submission.url,
                            "permalink": f"https://www.reddit.com{submission.permalink}",
                            "OSS-related?": oss_flag,
                            "discussion_items": filtered_items
                        })

                        # Optional checkpoint every 50
                        if len(all_rows) % 50 == 0:
                            save_rows(all_rows, "reddit_tbd_checkpoint")

            completed = True
        except Exception as e:
            # Best effort: keep whatever was collected before the failure.
            print(f"⚠️ Error: {e}")
            save_rows(all_rows, "reddit_tbd_error_dump")

    save_rows(all_rows, "reddit_tbd_raw_full")
    if completed:
        print("✅ Done.")
    else:
        # Do not claim success when the crawl stopped early.
        print("⚠️ Stopped early: saved results are partial.")


# --- Run in Notebook ---
# Top-level `await` is only valid inside the notebook/IPython; a plain Python
# script must use the asyncio.run(...) form shown below instead.
await fetch_reddit_threads_raw()
# For script version: asyncio.run(fetch_reddit_threads_raw())


🔍 Searching Reddit for keyword: 'trunk based development'
💾 Saved 50 rows to reddit_tbd_checkpoint.*
⚠️ Error: received 429 HTTP response
💾 Saved 78 rows to reddit_tbd_error_dump.*
💾 Saved 78 rows to reddit_tbd_raw_full.*
✅ Done.


**Stack Overflow**

In [None]:
import requests
import re
import pandas as pd
import time

# --- Patterns ---
# Phrases sent as the `q` parameter of the Stack Exchange search, one per query.
TBD_KEYWORDS = [
    "trunk based development",
    "trunk-based",
    "mainline development",
    "GitFlow vs trunk",
    "trunk strategy",
    "short-lived branches",
]

# Regexes (applied case-insensitively by is_tbd_related) that mark a text as
# being about trunk-based development.
TBD_PATTERNS = [
    r"\btrunk[-\s]?based\sdevelopment\b",
    r"\btrunk[-\s]?based\b",
    r"\bmainline\sdevelopment\b",
    r"\bshort[-\s]?lived\sbranches?\b",
    r"\bGitFlow\s+vs\s+trunk\b",
    r"\btrunk\sstrategy\b",
    r"\btrunk vs\b",
]

# Regexes (applied case-insensitively by is_oss) that flag open-source context.
OSS_PATTERNS = [
    r"\bopen source\b",
    r"\bOSS\b",
    r"\bFOSS\b",
    r"\bMIT license\b",
    r"\bpublic repo\b",
]

# Base URL of the Stack Exchange API; endpoint paths are appended to it.
STACK_API = "https://api.stackexchange.com/2.3"

def is_tbd_related(text):
    """Tell whether `text` matches at least one TBD_PATTERNS regex (case-insensitive)."""
    matches = (re.search(pattern, text, re.IGNORECASE) for pattern in TBD_PATTERNS)
    return any(matches)

def is_oss(text):
    """Tell whether `text` matches at least one OSS_PATTERNS regex (case-insensitive)."""
    for pattern in OSS_PATTERNS:
        if re.search(pattern, text, re.IGNORECASE):
            return True
    return False

def clean_html(raw_html):
    """Strip `<...>` tags from `raw_html` and trim leading/trailing whitespace."""
    tag_re = re.compile('<[^<]+?>')
    stripped = tag_re.sub('', raw_html)
    return stripped.strip()

def fetch_stackoverflow_discussions():
    """Collect Stack Overflow questions (plus answers) for every TBD keyword.

    For each keyword the `/search/advanced` endpoint is paged until
    `has_more` is false. For every question found, up to 100 answers are
    fetched. Title, body and answer bodies are HTML-stripped; each question is
    processed once even when several keywords return it.

    Side effects:
        Writes `stackoverflow_tbd_raw_full.csv` and `.json` to the working
        directory. The files are written even if the crawl is interrupted by
        an exception (which is then re-raised), so collected rows are not lost.
    """
    request_timeout = 30  # seconds; without it a stalled connection hangs forever
    seen_question_ids = set()
    all_rows = []
    completed = False

    try:
        for keyword in TBD_KEYWORDS:
            print(f"🔍 Querying: {keyword}")
            page = 1
            while True:
                url = f"{STACK_API}/search/advanced"
                params = {
                    "order": "desc",
                    "sort": "relevance",
                    "q": keyword,
                    "site": "stackoverflow",
                    "filter": "withbody",
                    "pagesize": 100,
                    "page": page
                }

                res = requests.get(url, params=params, timeout=request_timeout)
                if res.status_code != 200:
                    # Give up on this keyword only; later keywords are still tried.
                    print(f"⚠️ Failed request (HTTP {res.status_code})")
                    break

                data = res.json()
                questions = data.get("items", [])
                print(f"🔹 Page {page}: {len(questions)} results")

                for q in questions:
                    question_id = q.get("question_id")
                    # Overlapping keywords return the same questions; skipping
                    # them avoids duplicate rows and wasted answer requests.
                    if question_id in seen_question_ids:
                        continue
                    seen_question_ids.add(question_id)

                    title = q.get("title", "")
                    link = q.get("link")
                    body = q.get("body", "")

                    discussion_items = [title.strip(), body.strip()]

                    # Fetch answers
                    ans_url = f"{STACK_API}/questions/{question_id}/answers"
                    ans_params = {
                        "order": "desc",
                        "sort": "votes",
                        "site": "stackoverflow",
                        "filter": "withbody",
                        "pagesize": 100
                    }
                    ans_res = requests.get(ans_url, params=ans_params, timeout=request_timeout)
                    answer_pause = 0.2
                    if ans_res.status_code == 200:
                        ans_data = ans_res.json()
                        answers = ans_data.get("items", [])
                        # The API may ask clients to wait `backoff` seconds.
                        answer_pause = max(answer_pause, ans_data.get("backoff", 0))
                    else:
                        print(f"⚠️ Could not fetch answers for question {question_id} "
                              f"(HTTP {ans_res.status_code})")
                        answers = []
                    # Pause after every answers request, not only after kept rows.
                    time.sleep(answer_pause)

                    for a in answers:
                        answer_body = a.get("body", "")
                        if answer_body:
                            discussion_items.append(answer_body.strip())

                    filtered_items = []
                    oss_flag = False
                    for item in discussion_items:
                        clean = clean_html(item)
                        if not clean:
                            continue
                        if is_oss(clean):
                            oss_flag = True
                        if is_tbd_related(clean):
                            filtered_items.append(clean)
                        else:
                            # NOTE(review): non-matching items are kept and tagged
                            # " (new)" rather than dropped, so nothing is actually
                            # filtered here — confirm this is intended.
                            filtered_items.append(clean + " (new)")

                    if not filtered_items:
                        continue

                    all_rows.append({
                        "title": title,
                        "question_id": question_id,
                        "url": link,
                        "OSS-related?": oss_flag,
                        "discussion_items": filtered_items
                    })

                if not data.get("has_more"):
                    break
                page += 1
                # Be polite with the API, and honour an explicit backoff request.
                time.sleep(max(0.5, data.get("backoff", 0)))

        completed = True
    finally:
        # Persist whatever was collected, even when an exception is propagating.
        df = pd.DataFrame(all_rows)
        df.to_csv("stackoverflow_tbd_raw_full.csv", index=False)
        df.to_json("stackoverflow_tbd_raw_full.json", orient="records", indent=2)
        if completed:
            print("✅ Saved all Stack Overflow results across pages.")
        else:
            print(f"⚠️ Crawl interrupted; saved {len(all_rows)} partial rows.")

# --- Run ---
# Runs the whole crawl; results land in stackoverflow_tbd_raw_full.csv/.json.
# NOTE(review): the output stored under this cell ("Searching Reddit for ...",
# status 403) does not match the messages this code prints — it looks stale;
# re-run the cell to refresh it.
fetch_stackoverflow_discussions()


🔍 Searching Reddit for: 'trunk based development'
⚠️ Request failed with status 403
🔍 Searching Reddit for: 'trunk-based'
⚠️ Request failed with status 403
🔍 Searching Reddit for: 'mainline development'
⚠️ Request failed with status 403
🔍 Searching Reddit for: 'GitFlow vs trunk'
⚠️ Request failed with status 403
🔍 Searching Reddit for: 'trunk strategy'
⚠️ Request failed with status 403
🔍 Searching Reddit for: 'short-lived branches'
⚠️ Request failed with status 403
✅ Done! Saved 0 threads.


**HackerNews**

In [None]:
import requests
import re
import pandas as pd
import time

# --- Keywords and Patterns ---
# Phrases sent as the `query` of the Hacker News (Algolia) search, one per query.
TBD_KEYWORDS = [
    "trunk based development",
    "trunk-based development",
    "mainline development",
    "GitFlow vs trunk",
    "short-lived branches",
    "trunk strategy",
]

# Regexes (applied case-insensitively by is_tbd_related) that mark a text as
# being about trunk-based development.
TBD_PATTERNS = [
    r"\btrunk[-\s]?based\sdevelopment\b",
    r"\btrunk[-\s]?based\b",
    r"\bmainline\sdevelopment\b",
    r"\bshort[-\s]?lived\sbranches?\b",
    r"\bGitFlow\s+vs\s+trunk\b",
    r"\btrunk\sstrategy\b",
    r"\btrunk vs\b",
]

# Regexes (applied case-insensitively by is_oss) that flag open-source context.
OSS_PATTERNS = [
    r"\bopen source\b",
    r"\bOSS\b",
    r"\bFOSS\b",
    r"\bMIT license\b",
    r"\bpublic repo\b",
]

def is_tbd_related(text):
    """Report whether any TBD_PATTERNS regex is found in `text` (case-insensitive)."""
    first_hit = next(
        (True for pattern in TBD_PATTERNS if re.search(pattern, text, re.IGNORECASE)),
        False,
    )
    return first_hit

def is_oss(text):
    """Report whether any OSS_PATTERNS regex is found in `text` (case-insensitive)."""
    def _found(pattern):
        return re.search(pattern, text, re.IGNORECASE)

    # map() is lazy, so the scan still stops at the first matching pattern.
    return any(map(_found, OSS_PATTERNS))

def clean_html(raw_html):
    """Strip `<...>` tags from `raw_html` and trim surrounding whitespace.

    Args:
        raw_html: text that may contain HTML tags. `None` is accepted because
            API payloads can carry null fields (e.g. a missing title).

    Returns:
        The text without tags, or "" when the input is None or empty.
    """
    if not raw_html:
        # re.sub raises TypeError on None; treat missing text as empty.
        return ""
    return re.sub('<[^<]+?>', '', raw_html).strip()

def fetch_item(item_id, timeout=30):
    """Fetch one Hacker News item (story or comment) from the Firebase API.

    Args:
        item_id: numeric or string id of the item.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The decoded JSON payload (a dict, or None when the API answers `null`),
        or None when the request fails or the body is not valid JSON. Callers
        already treat a falsy result as "item unavailable", so one bad item
        does not abort an entire crawl.
    """
    url = f"https://hacker-news.firebaseio.com/v0/item/{item_id}.json"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers JSON decoding errors on older `requests` versions.
        print(f"⚠️ Could not fetch HN item {item_id}: {exc}")
        return None

def fetch_hn_raw_discussions():
    """Collect Hacker News stories (plus top-level comments) for every TBD keyword.

    Each keyword is searched through the Algolia HN search API, page by page,
    until a page comes back empty. For every story, up to 20 top-level
    comments are fetched via `fetch_item`. Title, story text and comments are
    HTML-stripped; each story is processed once even when several keywords
    return it.

    Side effects:
        Writes `hackernews_tbd_raw_full.csv` and `.json` to the working
        directory. The files are written even if the crawl is interrupted by
        an exception (which is then re-raised), so collected rows are not lost.
    """
    request_timeout = 30  # seconds; without it a stalled connection hangs forever
    seen_story_ids = set()
    all_rows = []
    completed = False

    try:
        for keyword in TBD_KEYWORDS:
            print(f"🔍 Searching HN for: {keyword}")
            page = 0
            while True:
                # Passing `params` lets requests URL-encode the keyword.
                response = requests.get(
                    "https://hn.algolia.com/api/v1/search",
                    params={
                        "query": keyword,
                        "tags": "story",
                        "page": page,
                        "hitsPerPage": 100,
                    },
                    timeout=request_timeout,
                )
                if response.status_code != 200:
                    # Give up on this keyword only; later keywords are still tried.
                    print(f"⚠️ Search failed (HTTP {response.status_code})")
                    break

                data = response.json()
                hits = data.get("hits", [])
                if not hits:
                    break  # No more results
                print(f"🔹 Page {page} -> {len(hits)} hits")

                for hit in hits:
                    item_id = hit.get("objectID")
                    # Near-identical keywords return mostly the same stories.
                    if item_id in seen_story_ids:
                        continue
                    seen_story_ids.add(item_id)

                    # Fields can be present with a null value, so `or` is needed:
                    # dict.get's default only applies when the key is absent.
                    title = hit.get("title") or ""
                    story_text = hit.get("story_text") or hit.get("comment_text") or ""
                    url = hit.get("url") or f"https://news.ycombinator.com/item?id={item_id}"

                    # Fetch top-level comments
                    discussion_items = []
                    full_item = fetch_item(item_id)
                    if full_item:
                        comment_ids = full_item.get("kids", [])[:20]  # Limit to 20 comments
                        for cid in comment_ids:
                            comment = fetch_item(cid)
                            # Items without a "text" field are skipped.
                            if comment and "text" in comment:
                                discussion_items.append(comment["text"])

                    raw_items = [title, story_text] + discussion_items
                    filtered_items = []
                    oss_flag = False

                    for item in raw_items:
                        clean = clean_html(item) if item else ""
                        if not clean:
                            continue
                        if is_oss(clean):
                            oss_flag = True
                        if is_tbd_related(clean):
                            filtered_items.append(clean)
                        else:
                            # NOTE(review): non-matching items are kept and tagged
                            # " (new)" rather than dropped, so nothing is actually
                            # filtered here — confirm this is intended.
                            filtered_items.append(clean + " (new)")

                    # Pause after every story, including ones that yield no row.
                    time.sleep(0.2)

                    if not filtered_items:
                        continue

                    all_rows.append({
                        "title": title,
                        "id": item_id,
                        "url": url,
                        "OSS-related?": oss_flag,
                        "discussion_items": filtered_items
                    })

                page += 1
                time.sleep(0.5)

        completed = True
    finally:
        # Save results — also when an exception is propagating.
        df = pd.DataFrame(all_rows)
        df.to_csv("hackernews_tbd_raw_full.csv", index=False)
        df.to_json("hackernews_tbd_raw_full.json", orient="records", indent=2)
        if completed:
            print("✅ Completed. Saved all paginated Hacker News results.")
        else:
            print(f"⚠️ Crawl interrupted; saved {len(all_rows)} partial rows.")

# --- Run ---
# Runs the whole crawl; results land in hackernews_tbd_raw_full.csv/.json.
fetch_hn_raw_discussions()


🔍 Searching HN for: trunk based development
🔹 Page 0 -> 84 hits
🔍 Searching HN for: trunk-based development
🔹 Page 0 -> 83 hits
🔍 Searching HN for: mainline development
🔹 Page 0 -> 9 hits
🔍 Searching HN for: GitFlow vs trunk
🔹 Page 0 -> 1 hits
🔍 Searching HN for: short-lived branches
🔹 Page 0 -> 3 hits
🔍 Searching HN for: trunk strategy
🔹 Page 0 -> 7 hits
✅ Completed. Saved all paginated Hacker News results.


**Gerrit OpenStack** (Return 0)