In [5]:
import requests
from datetime import datetime, timedelta
import time
import pandas as pd
import numpy as np

In [6]:
# List of subreddits we want to collect data from
subreddits = ["MentalHealthSupport","mentalhealth","MentalHealthUK","MentalHealthIsland"]

In [7]:
def collect_subreddit_data(subreddit):
    print(f"Collecting data from r/{subreddit}...")
    
    # Create base API URL
    base_api_url = f'https://api.pullpush.io/reddit/search/submission/?subreddit={subreddit}&size=1000'
    
    # Set overall time period (September 2024 to present)
    start_date = datetime(2015, 1, 1)
    end_date = datetime.now()
    
    all_posts = []
    current_date = start_date
    
    # Collect data in 10-day intervals
    while current_date < end_date:
        # Calculate end of 10-day period
        next_date = min(current_date + timedelta(days=10), end_date)
        
        # Convert to timestamps
        start_time = int(current_date.timestamp())
        end_time = int(next_date.timestamp())
        
        # Create URL for this time period
        api_url = f"{base_api_url}&after={start_time}&before={end_time}"
        
        # Make API request
        try:
            response = requests.get(api_url)
            
            if response.status_code == 200:
                data = response.json()
                period_posts = data.get('data', [])
                
                # Add current_date timestamp to each post
                for post in period_posts:
                    post['timestamps'] = start_time
                
                all_posts.extend(period_posts)
                print(f"Found {len(period_posts)} posts from {current_date.strftime('%Y-%m-%d')} to {next_date.strftime('%Y-%m-%d')}")
            else:
                print(f"Error: Could not collect data from r/{subreddit} for period {current_date.strftime('%Y-%m-%d')} to {next_date.strftime('%Y-%m-%d')}")
        
        except Exception as e:
            print(f"Error during data collection: {str(e)}")
        
        # Move to next period
        current_date = next_date
        
        # Wait between requests to avoid rate limiting
        time.sleep(1)
    
    print(f"Total posts collected from r/{subreddit}: {len(all_posts)}")
    return all_posts

In [8]:
# Collect data from all subreddits
all_posts = []
for subreddit in subreddits:
    # Collect data
    posts = collect_subreddit_data(subreddit)
    
    # Add subreddit name to each post
    for post in posts:
        post['subreddit'] = subreddit
    
    # Add to our list of all posts
    all_posts.extend(posts)
    
    # Wait a bit before next request
    time.sleep(1)

Collecting data from r/MentalHealthSupport...
Found 0 posts from 2015-01-01 to 2015-01-11
Found 10 posts from 2015-01-11 to 2015-01-21
Found 5 posts from 2015-01-21 to 2015-01-31
Found 1 posts from 2015-01-31 to 2015-02-10
Found 1 posts from 2015-02-10 to 2015-02-20
Found 1 posts from 2015-02-20 to 2015-03-02
Found 0 posts from 2015-03-02 to 2015-03-12
Found 2 posts from 2015-03-12 to 2015-03-22
Found 4 posts from 2015-03-22 to 2015-04-01
Found 2 posts from 2015-04-01 to 2015-04-11
Found 3 posts from 2015-04-11 to 2015-04-21
Found 1 posts from 2015-04-21 to 2015-05-01
Found 0 posts from 2015-05-01 to 2015-05-11
Found 0 posts from 2015-05-11 to 2015-05-21
Found 1 posts from 2015-05-21 to 2015-05-31
Found 0 posts from 2015-05-31 to 2015-06-10
Found 0 posts from 2015-06-10 to 2015-06-20
Found 0 posts from 2015-06-20 to 2015-06-30
Found 0 posts from 2015-06-30 to 2015-07-10
Found 1 posts from 2015-07-10 to 2015-07-20
Found 1 posts from 2015-07-20 to 2015-07-30
Found 0 posts from 2015-07-30

In [9]:
# Create DataFrame
df = pd.DataFrame(all_posts)
# Keep only the columns we want
df = df[['title', 'selftext', 'timestamps','subreddit','link_flair_text']].copy()
df.head()

Unnamed: 0,title,selftext,timestamps,subreddit,link_flair_text
0,Something other than CODA?,"12 step groups really doesn't work for me, esp...",1420902000,MentalHealthSupport,
1,What's hard for you that seems to be easy for ...,I suppose everyone here is a little challenged...,1420902000,MentalHealthSupport,
2,An Experiment,I was wondering if anyone dealing with PTSD wo...,1420902000,MentalHealthSupport,
3,Resolving emptiness,I'm wondering if anyone has had success with t...,1420902000,MentalHealthSupport,
4,Imagine you get to tell your younger self one ...,"""Younger self"" can mean whatever you want it t...",1420902000,MentalHealthSupport,


In [10]:
print(f"Total Data: {len(df)}")

Total Data: 74407


In [11]:
df = df[~df['selftext'].isin(['[removed]', '[deleted]'])]  # Remove deleted posts
print(f"Remain: {len(df)}")
df.drop_duplicates(inplace=True)
print(f"Remain: {len(df)}")
df.dropna(inplace=True)
print(f"Remain: {len(df)}")

Remain: 50062
Remain: 49871
Remain: 21776


In [12]:
print(f"Total Data: {len(df)}")

Total Data: 21776


In [13]:
# Save to CSV
filename = f'mental_support_{datetime.now().strftime("%Y%m%d_%H%M")}.csv'
df.to_csv(f"./dataset/{filename}", index=False)
print(f"\nSaved {len(df)} posts to {filename}")


Saved 21776 posts to mental_support_20250114_1830.csv


In [14]:
df.head()

Unnamed: 0,title,selftext,timestamps,subreddit,link_flair_text
13795,Gravity - talk it out,"Hey you, yeah you! Take a moment... to realis...",1708614000,MentalHealthSupport,Resources
13796,Anger issues at just one person?,"I’ve always been a super chill person, and I’v...",1708614000,MentalHealthSupport,Question
13798,Help Needed,"Hello, my name is Pamela. Not sure if this all...",1708614000,MentalHealthSupport,Resources
13799,Need advice- Should/when should I tell my part...,"Hello, i'm a 24 y/o woman who's been dating a ...",1708614000,MentalHealthSupport,Question
13800,Need advice about project for a Uni course,Hi Redditors! \n\nI have a situation I need ad...,1708614000,MentalHealthSupport,Question


In [18]:
len(df["link_flair_text"].unique())

254

In [17]:
len(df[df["link_flair_text"]=="Human Resource Issue"])

1