In [2]:
import requests
from bs4 import BeautifulSoup
import time
import hashlib
import os
import json
from urllib.parse import urljoin

# ==== CONFIGURATION ====
BASE_URL = "https://www.psychforums.com"
OUTPUT_FILE = "psychforums_dataset_2.jsonl"
PAGES_PER_SUBFORUM = 5
REQUEST_DELAY = 1  # seconds

# PLACEHOLDER — paste the exact sub-forum URLs you want to scrape here:
SUBFORUM_URLS = [
    "https://www.psychforums.com/food-addiction/",
    "https://www.psychforums.com/narcissistic-personality/",
    "https://www.psychforums.com/delusional-disorder/",
    "https://www.psychforums.com/alzheimer/",
    "https://www.psychforums.com/domestic-abuse/",
    
    # Add more here...
]

# ==== UTILITIES ====
def clean_text(text):
    return ' '.join(text.split())

def text_hash(text):
    return hashlib.sha256(text.encode("utf-8")).hexdigest()

def save_jsonl(data, filename):
    with open(filename, "a", encoding="utf-8") as f:
        f.write(json.dumps(data, ensure_ascii=False) + "\n")

def fetch(url):
    try:
        r = requests.get(url, timeout=15)
        r.raise_for_status()
        return r.text
    except Exception as e:
        print(f"[Error] Failed to fetch {url}: {e}")
        return None

# ==== SCRAPING ====
def scrape_thread(thread_url):
    html = fetch(thread_url)
    if not html:
        return None

    soup = BeautifulSoup(html, "html.parser")
    posts = []

    for post in soup.select(".postbody"):
        content = clean_text(post.get_text(separator=" ", strip=True))
        if content:
            posts.append(content)

    return posts

def scrape_subforum(subforum_url):
    for page in range(0, PAGES_PER_SUBFORUM):
        offset = page * 40
        url = subforum_url if page == 0 else urljoin(subforum_url, f"page{offset}.html")
        html = fetch(url)
        if not html:
            continue

        soup = BeautifulSoup(html, "html.parser")
        threads = soup.select("a.topictitle")

        if not threads:
            print(f"[{subforum_url}] No threads found on page {page+1}")
            break

        for t in threads:
            thread_href = t.get("href")
            thread_url = urljoin(BASE_URL, thread_href)
            print(f"  -> Scraping thread: {thread_url}")

            posts = scrape_thread(thread_url)
            if posts:
                for p in posts:
                    record = {
                        "url": thread_url,
                        "text": p,
                        "hash": text_hash(p)
                    }
                    save_jsonl(record, OUTPUT_FILE)

        time.sleep(REQUEST_DELAY)

# ==== MAIN ====
if __name__ == "__main__":
    if os.path.exists(OUTPUT_FILE):
        os.remove(OUTPUT_FILE)

    for subforum in SUBFORUM_URLS:
        print(f"Scraping sub-forum: {subforum}")
        scrape_subforum(subforum)

    print(f"Scraping complete. Data saved to {OUTPUT_FILE}")

Scraping sub-forum: https://www.psychforums.com/food-addiction/
  -> Scraping thread: https://www.psychforums.com:443/announces/topic43798.html
  -> Scraping thread: https://www.psychforums.com:443/food-addiction/topic126438.html
  -> Scraping thread: https://www.psychforums.com:443/food-addiction/topic198674.html
  -> Scraping thread: https://www.psychforums.com:443/food-addiction/topic156051.html
  -> Scraping thread: https://www.psychforums.com:443/food-addiction/topic217018.html
  -> Scraping thread: https://www.psychforums.com:443/food-addiction/topic215555.html
  -> Scraping thread: https://www.psychforums.com:443/food-addiction/topic215416.html
  -> Scraping thread: https://www.psychforums.com:443/food-addiction/topic205446.html
  -> Scraping thread: https://www.psychforums.com:443/food-addiction/topic90784.html
  -> Scraping thread: https://www.psychforums.com:443/food-addiction/topic210417.html
  -> Scraping thread: https://www.psychforums.com:443/food-addiction/topic207413.ht

In [1]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# ---------- CONFIG ----------
INPUT_FILE = "psychforums_dataset_cleaned.csv"     # CSV file with a 'text' column
OUTPUT_FILE = "psychforums_dataset_cleaned_1.csv"
SIMILARITY_THRESHOLD = 0.9     # For near-duplicate removal
MIN_WORDS = 5                  # Minimum words to keep a post

# Common boilerplate phrases to remove
BOILERPLATE_PATTERNS = [
    r"forum rules", r"active staff list", r"moderator", r"admins?",
    r"we do not delete posts", r"privacy policy", r"copyright notice",
    r"disclaimer", r"report posts", r"shadow.*post", r"member corne",
    r"index page", r"complaints", r"contact another mod", r"volunteers"
]

# Words that indicate personal expression (keeps human conversation)
HUMAN_KEYWORDS = [
    "i ", "i'm", "i’ve", "my ", "me ", "mine", "myself",
    "feel", "anxious", "anxiety", "worried", "happy", "sad",
    "angry", "upset", "excited", "love", "hate"
]

# ---------- LOAD DATA ----------
df = pd.read_csv(INPUT_FILE)
if 'text' not in df.columns:
    raise ValueError("CSV must have a 'text' column.")

# Lowercase and strip whitespace
df['text'] = df['text'].astype(str).str.strip().str.lower()

# ---------- REMOVE BOILERPLATE ----------
def contains_boilerplate(text):
    return any(re.search(pat, text) for pat in BOILERPLATE_PATTERNS)

df = df[~df['text'].apply(contains_boilerplate)]

# ---------- REMOVE SHORT POSTS ----------
df = df[df['text'].apply(lambda x: len(x.split()) >= MIN_WORDS)]

# ---------- KEEP HUMAN CONVERSATION ----------
def looks_human(text):
    return any(kw in text for kw in HUMAN_KEYWORDS)

df = df[df['text'].apply(looks_human)]

# ---------- REMOVE NEAR-DUPLICATES ----------
vectorizer = TfidfVectorizer().fit_transform(df['text'])
similarity_matrix = cosine_similarity(vectorizer)

# Mark duplicates
to_drop = set()
for i in range(len(similarity_matrix)):
    if i in to_drop:
        continue
    for j in range(i+1, len(similarity_matrix)):
        if similarity_matrix[i, j] > SIMILARITY_THRESHOLD:
            to_drop.add(j)

df_cleaned = df.drop(df.index[list(to_drop)]).reset_index(drop=True)

# ---------- SAVE CLEANED DATA ----------
df_cleaned.to_csv(OUTPUT_FILE, index=False)
print(f"Cleaned dataset saved to {OUTPUT_FILE} with {len(df_cleaned)} examples.")

Cleaned dataset saved to psychforums_dataset_cleaned_1.csv with 2600 examples.


In [3]:
#!/usr/bin/env python3
"""
clean_to_csv.py

Usage:
    python clean_to_csv.py --input scraped.jsonl --output cleaned.csv --min-words 50

Supports input JSONL (lines of JSON) or CSV. Produces a CSV with a single column "text".
"""

import re
import argparse
import pandas as pd
import hashlib
from tqdm import tqdm

INPUT_FILE = "psychforums_dataset_2.jsonl"   # Path to your scraped dataset (JSONL or CSV)
OUTPUT_FILE = "psychforums_dataset_cleaned_2.csv"    # Path to save cleaned CSV
MIN_WORDS = 50                 # Minimum words required to keep a post
REMOVE_DUPLICATES = True       # True to drop duplicate cleaned texts
SHOW_STATS = True      

# ---------------- Helpers ----------------
def load_input(path):
    return pd.read_json(path, lines=True)

def sha1(s: str) -> str:
    return hashlib.sha1(s.encode("utf-8")).hexdigest()

# Multi-pass cleaner targeting the common forum metadata patterns
# NOTE: tweak MIN_WORDS below if you want stricter/looser filtering.
DAY_NAMES = r'(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)'
MONTH_NAMES = r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)'

# pattern that matches: by <username> » Wed Dec 24, 2008 12:06 am (very common)
PAT_BY_DATE = re.compile(
    rf'\bby\s+[^\»\n]{{1,80}}\s*»\s*{DAY_NAMES}.*?\d{{4}}\s*(?:\d{{1,2}}:\d{{2}}\s*(?:am|pm)?)?',
    flags=re.IGNORECASE
)

# fallback: remove 'by <username> »' even if date not present
PAT_BY_SIMPLE = re.compile(r'\bby\s+[^\»\n]{1,80}\s*»\s*', flags=re.IGNORECASE)

# remove "Posted: Wed Dec 24, 2008 12:06 am" style
PAT_POSTED = re.compile(r'Posted\s*(?:on|:)?\s*.*?\d{4}.*?(?=\s[A-Z]|\n|$)', flags=re.IGNORECASE)

# remove signatures like "-- Name" or "— Name"
PAT_SIGN = re.compile(r'(^|\n)[\-\u2013\u2014]{2,}\s*\w.*', flags=re.MULTILINE)

# remove common boilerplate tokens
BOILERPLATE_LOWER = {"advertisement", "advertisements", "ad"}

def clean_post(text: str) -> str:
    if not isinstance(text, str):
        return ""

    s = text.strip()

    # quick drop for obvious boilerplate-only lines
    if s.strip().lower() in BOILERPLATE_LOWER:
        return ""

    # 1) remove explicit "by ... » <weekday> <month> <day>, <year> <time>" patterns
    s = PAT_BY_DATE.sub('', s)

    # 2) fallback: remove "by <username> »" if any remains
    s = PAT_BY_SIMPLE.sub('', s)

    # 3) remove Posted: ... patterns
    s = PAT_POSTED.sub('', s)

    # 4) remove signature separators and trailing signatures (naive)
    s = PAT_SIGN.sub('', s)

    # 5) collapse multiple whitespace/newlines into single space
    s = re.sub(r'\s+', ' ', s).strip()

    # final safety: if content reduced to short or empty, return empty
    return s

# ---------------- Main ----------------
def main():
    df = load_input(INPUT_FILE)

    if "text" not in df.columns:
        raise SystemExit("Input file must have a 'text' column")

    texts = df['text'].astype(str).tolist()
    cleaned = []
    for t in tqdm(texts, desc="Cleaning posts"):
        c = clean_post(t)
        if not c:
            continue
        # count words
        wc = len(c.split())
        if wc < MIN_WORDS:
            continue
        cleaned.append(c)

    if REMOVE_DUPLICATES:
        # preserve order but drop duplicates
        seen = set()
        uniq = []
        for s in cleaned:
            h = sha1(s)
            if h in seen:
                continue
            seen.add(h)
            uniq.append(s)
        cleaned = uniq

    # save as CSV with single column 'text'
    out_df = pd.DataFrame({"text": cleaned})
    out_df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8")

    if SHOW_STATS:
        print(f"Input rows : {len(texts)}")
        print(f"Kept rows  : {len(cleaned)}")
        if REMOVE_DUPLICATES:
            print("(duplicates removed)")

if __name__ == "__main__":
    main()

Cleaning posts: 100%|████████████████████████| 3235/3235 [00:00<00:00, 14175.16it/s]

Input rows : 3235
Kept rows  : 2072
(duplicates removed)





In [3]:
import re
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# ---------- CONFIG ----------
INPUT_FILE = "psychforums_dataset_cleaned_1.csv"   # CSV file with 'text' column
SIMILARITY_THRESHOLD = 0.9   # for near-duplicate detection
BOILERPLATE_PATTERNS = [
    r"forum rules", r"active staff list", r"moderator", r"admins?",
    r"we do not delete posts", r"privacy policy", r"copyright notice",
    r"disclaimer", r"report posts", r"shadow.*post", r"member corne",
    r"index page", r"complaints", r"contact another mod", r"volunteers"
]

# ---------- LOAD ----------
df = pd.read_csv(INPUT_FILE)
if 'text' not in df.columns:
    raise ValueError("CSV must have a 'text' column.")

texts = df['text'].astype(str).str.strip().str.lower()

# ---------- DUPLICATE CHECK ----------
# Exact duplicates
exact_dupes = texts.duplicated().sum()
exact_dupe_ratio = exact_dupes / len(texts) * 100

# Near duplicates
vectorizer = TfidfVectorizer().fit_transform(texts)
sim_matrix = cosine_similarity(vectorizer)
to_drop = set()
for i in range(len(sim_matrix)):
    if i in to_drop:
        continue
    for j in range(i+1, len(sim_matrix)):
        if sim_matrix[i, j] > SIMILARITY_THRESHOLD:
            to_drop.add(j)
near_dupes = len(to_drop)
near_dupe_ratio = near_dupes / len(texts) * 100

# ---------- BOILERPLATE CHECK ----------
def contains_boilerplate(text):
    return any(re.search(pat, text) for pat in BOILERPLATE_PATTERNS)

boilerplate_count = sum(contains_boilerplate(t) for t in texts)
boilerplate_ratio = boilerplate_count / len(texts) * 100

# ---------- LEXICAL DIVERSITY ----------
all_words = " ".join(texts).split()
unique_words = set(all_words)
lexical_diversity = len(unique_words) / len(all_words)

# ---------- WORD COUNT STATS ----------
word_counts = [len(t.split()) for t in texts]
avg_wc = np.mean(word_counts)
median_wc = np.median(word_counts)
short_posts = sum(wc < 5 for wc in word_counts)

# ---------- COMMON PHRASES ----------
def get_ngrams(tokens, n=3):
    return zip(*[tokens[i:] for i in range(n)])

all_tokens = " ".join(texts).split()
ngram_counts = Counter(get_ngrams(all_tokens, n=3))
common_phrases = [" ".join(k) for k, _ in ngram_counts.most_common(10)]

# ---------- REPORT ----------
print("\n===== DATA QUALITY REPORT =====")
print(f"Total examples: {len(texts)}")
print(f"Exact duplicates: {exact_dupes} ({exact_dupe_ratio:.2f}%)")
print(f"Near duplicates (> {SIMILARITY_THRESHOLD}): {near_dupes} ({near_dupe_ratio:.2f}%)")
print(f"Boilerplate matches: {boilerplate_count} ({boilerplate_ratio:.2f}%)")
print(f"Lexical diversity (type-token ratio): {lexical_diversity:.4f}")
print(f"Average word count: {avg_wc:.2f}")
print(f"Median word count: {median_wc}")
print(f"Posts with <5 words: {short_posts} ({short_posts/len(texts)*100:.2f}%)")
print("\nMost common 3-word phrases:")
for phrase in common_phrases:
    print("  -", phrase)
print("================================\n")


===== DATA QUALITY REPORT =====
Total examples: 2599
Exact duplicates: 0 (0.00%)
Near duplicates (> 0.9): 0 (0.00%)
Boilerplate matches: 0 (0.00%)
Lexical diversity (type-token ratio): 0.0613
Average word count: 247.01
Median word count: 161.0
Posts with <5 words: 0 (0.00%)

Most common 3-word phrases:
  - a lot of
  - i don't know
  - i have a
  - i have been
  - i want to
  - i am not
  - i feel like
  - be able to
  - when i was
  - i don't think



In [4]:
import pandas as pd

df = pd.read_csv('Datasets/journal_reddit_posts_multilabel.csv')

# df has 'text' and 'label' columns
common_phrases = ["i feel like", "i wanted to", "i don't know", "a lot of", "i have a", "i have been", "i want to", "i am not", "be able to", "when i was", "i don't think"]
for phrase in common_phrases:
    subset = df[df['text'].str.contains(phrase, case=False)]
    print(f"Phrase: {phrase}")
    print(subset['label_0'].value_counts(normalize=True))
    print(subset['label_1'].value_counts(normalize=True))
    print(subset['label_2'].value_counts(normalize=True))
    print(subset['label_3'].value_counts(normalize=True))
    print()

Phrase: i feel like
label_0
0    0.912186
1    0.087814
Name: proportion, dtype: float64
label_1
1    0.696325
0    0.303675
Name: proportion, dtype: float64
label_2
1    0.709091
0    0.290909
Name: proportion, dtype: float64
label_3
1    0.659188
0    0.340812
Name: proportion, dtype: float64

Phrase: i wanted to
label_0
0    0.785714
1    0.214286
Name: proportion, dtype: float64
label_1
1    0.588146
0    0.411854
Name: proportion, dtype: float64
label_2
1    0.62614
0    0.37386
Name: proportion, dtype: float64
label_3
1    0.583587
0    0.416413
Name: proportion, dtype: float64

Phrase: i don't know
label_0
0    0.90566
1    0.09434
Name: proportion, dtype: float64
label_1
1    0.702401
0    0.297599
Name: proportion, dtype: float64
label_2
1    0.710978
0    0.289022
Name: proportion, dtype: float64
label_3
1    0.655232
0    0.344768
Name: proportion, dtype: float64

Phrase: a lot of
label_0
0    0.777013
1    0.222987
Name: proportion, dtype: float64
label_1
1    0.55265
0    

In [None]:
# df has columns: 'text' (str), 'label' (int or str)
import pandas as pd

df = pd.read_csv('Datasets/journal_reddit_posts_multilabel.csv')

def phrase_lift(df, phrase, positive_label):
    has = df['text'].str.contains(phrase, case=False, na=False)
    p_label = (df['label'] == positive_label).mean()
    p_label_given = df.loc[has, 'label'].eq(positive_label).mean()
    return p_label, p_label_given, (p_label_given / (p_label + 1e-12))
    
for ph in common_phrases:
    overall, given, lift = phrase_lift(df, ph, positive_label=1)  # adapt positive label
    print(ph, "P(label)=%.3f P(label|phrase)=%.3f lift=%.2f" % (overall, given, lift))

In [5]:
import pandas as pd

# Read your CSV
df = pd.read_csv("neutral_journals_700.csv")

# Add the label columns
df["label_0"] = 1
df["label_1"] = 0
df["label_2"] = 0
df["label_3"] = 0

# Save back to CSV
df.to_csv("neutral_journals_700_final.csv", index=False)

print("✅ Added label columns successfully!")

✅ Added label columns successfully!


In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("resolved_psychforums_dataset.csv")

# Filter rows where label_0 == 1
label0_rows = df[df["label_0"] == 1]

# Randomly sample 300 rows from these
sampled = label0_rows.sample(n=300, random_state=42)  # random_state for reproducibility

# Remove the sampled rows from the original dataframe
filtered_df = df.drop(sampled.index)

# Save to a new CSV if needed
filtered_df.to_csv("filtered_psychforums_dataset.csv", index=False)

print(f"Original dataset size: {len(df)}")
print(f"Filtered dataset size: {len(filtered_df)}")

Original dataset size: 2599
Filtered dataset size: 2299


In [2]:
import pandas as pd

# Load the CSV
df = pd.read_csv("neutral_journals_700_final.csv")

# Randomly drop 222 rows
df_filtered = df.sample(frac=1, random_state=42).iloc[222:].reset_index(drop=True)

# Save the filtered dataset
df_filtered.to_csv("neutral_journals_700_filtered.csv", index=False)

print(f"Original dataset size: {len(df)}")
print(f"Filtered dataset size: {len(df_filtered)}")

Original dataset size: 700
Filtered dataset size: 478


In [3]:
import pandas as pd

# Load the CSV
df = pd.read_csv("Datasets/psychforums_multilabel.csv")

# Convert label columns from float to int
label_cols = ["label_0", "label_1", "label_2", "label_3"]
df[label_cols] = df[label_cols].astype(int)

# Save the updated dataset
df.to_csv("Datasets/psychforums_multilabel.csv", index=False)

print(df.head())
print(df.dtypes)

   id                                               text  label_0  label_1  \
0   1  how are you feeling today? i had a fight with ...        0        0   
1   2  how are you feeling today? snaga wrote: a stic...        1        0   
2   3  how are you feeling today? like there's really...        0        1   
3   4  how are you feeling today? not good, but not b...        0        0   
4   5  how are you feeling today? i've been strugglin...        0        1   

   label_2  label_3  
0        1        0  
1        0        0  
2        1        0  
3        1        1  
4        1        1  
id          int64
text       object
label_0     int64
label_1     int64
label_2     int64
label_3     int64
dtype: object
