In [1]:
# randomized delay between requests as a consideration to Reddit's servers and security staff 
# run this cell 

import requests 
import time 
import pandas as pd 
from random import randint 

In [None]:
url = "https://www.reddit.com/r/stress.json"

In [2]:
headers = {"User-agent" : "randuser"}

In [3]:
# functions to automate the data collection process 
def reddit_scrape(url_string, number_of_scrapes, output_list):
  after = None 
  for _  in range(number_of_scrapes):
    if _ == 0:
      print(f"SCRAPING {url_string}\n--------------------------------------------------")
      print("<<<SCRAPING COMMENCED>>>")
      print(f"Downloading Batch {1} of {number_of_scrapes}")
    elif (_+1) % 5 == 0:
      print(f"Downloading batch {(_ + 1)} of {number_of_scrapes}")

    if after == None:
      params = {} 
    else:
      # tells the scraper to get the next set after reddit's after code 
      params = {"after": after}
    res = requests.get(url_string, params=params, headers=headers)
    if res.status_code == 200:
      the_json = res.json() 
      output_list.extend(the_json["data"]["children"])
      after = the_json["data"]["after"]
    else:
      print(res.status_code)
      break 
    time.sleep(randint(1,6))

  print("<<<SCRAPING COMPLETED>>>")
  print(f"Number of posts downloaded: {len(output_list)}")
  print("Number of unique posts: {}".format(len(set([p["data"]["name"] for p in output_list]))))

In [4]:
# call the function 
stress_scraped = [] 
# 50 batches of 26
reddit_scrape("https://www.reddit.com/r/stress.json", 100, stress_scraped)

SCRAPING https://www.reddit.com/r/stress.json
--------------------------------------------------
<<<SCRAPING COMMENCED>>>
Downloading Batch 1 of 100
Downloading batch 5 of 100
Downloading batch 10 of 100
Downloading batch 15 of 100
Downloading batch 20 of 100
Downloading batch 25 of 100
Downloading batch 30 of 100
Downloading batch 35 of 100
Downloading batch 40 of 100
Downloading batch 45 of 100
Downloading batch 50 of 100
Downloading batch 55 of 100
Downloading batch 60 of 100
Downloading batch 65 of 100
Downloading batch 70 of 100
Downloading batch 75 of 100
Downloading batch 80 of 100
Downloading batch 85 of 100
Downloading batch 90 of 100
Downloading batch 95 of 100
Downloading batch 100 of 100
<<<SCRAPING COMPLETED>>>
Number of posts downloaded: 2480
Number of unique posts: 818


In [5]:
# function that scrapes the data 
def create_unique_list(original_scrape_list, new_list_name):
  data_name_list =[] 
  for i in range(len(original_scrape_list)):
    # name is the unique name assigned to each post 
    # if the name of the post is not already in data_name_list 
    # then add it to the new_list_name 
    if original_scrape_list[i]["data"]["name"] not in data_name_list:
      new_list_name.append(original_scrape_list[i]["data"])
      data_name_list.append(original_scrape_list[i]["data"]["name"])
  print(f"List now contains {len(new_list_name)} unique scraped posts")

In [6]:
stress_scraped_unique = [] 
create_unique_list(stress_scraped, stress_scraped_unique)

List now contains 818 unique scraped posts


In [7]:
# putting the data into a dataframe saving to csv 
stress = pd.DataFrame(stress_scraped_unique)
# naively assign stress label to every post 
stress["is_stress"] = 1 

In [None]:
# stress.head()

In [8]:
url_conspiracy = "https://www.reddit.com/r/conspiracy.json"

In [9]:
conspiracy_scraped = []
reddit_scrape(url_conspiracy, 100, conspiracy_scraped)

SCRAPING https://www.reddit.com/r/conspiracy.json
--------------------------------------------------
<<<SCRAPING COMMENCED>>>
Downloading Batch 1 of 100
Downloading batch 5 of 100
Downloading batch 10 of 100
Downloading batch 15 of 100
Downloading batch 20 of 100
Downloading batch 25 of 100
Downloading batch 30 of 100
Downloading batch 35 of 100
Downloading batch 40 of 100
Downloading batch 45 of 100
Downloading batch 50 of 100
Downloading batch 55 of 100
Downloading batch 60 of 100
Downloading batch 65 of 100
Downloading batch 70 of 100
Downloading batch 75 of 100
Downloading batch 80 of 100
Downloading batch 85 of 100
Downloading batch 90 of 100
Downloading batch 95 of 100
Downloading batch 100 of 100
<<<SCRAPING COMPLETED>>>
Number of posts downloaded: 2482
Number of unique posts: 943


In [10]:
consp_scraped_unique = []
create_unique_list(conspiracy_scraped, consp_scraped_unique)

List now contains 943 unique scraped posts


In [11]:
conspiracy = pd.DataFrame(consp_scraped_unique)
conspiracy["is_stress"] = 0

In [13]:
# stress, anxiety, homeless, assistance, food, casual, conspiracy, jokes, med 

stress_columns = stress[["title", "selftext", "author",  "num_comments", "is_stress","url"]]
consp_columns = conspiracy[["title", "selftext", "author",  "num_comments", "is_stress","url"]]

In [14]:
combined_data = pd.concat([stress_columns, consp_columns], axis=0, ignore_index=True)

In [15]:
combined_data["selftext"].fillna("emptypost",inplace=True)

In [16]:
combined_data.head()

Unnamed: 0,title,selftext,author,num_comments,is_stress,url
0,"Free Covid-19 Anxiety e-Workbook. Please, take...",The book is available [Here](https://thewellne...,Impudence,11,1,https://www.reddit.com/r/Stress/comments/fwes8...
1,Any tips to breathe deeply? I'm always short b...,,kind-sofa,5,1,https://www.reddit.com/r/Stress/comments/ubu69...
2,Just realized I’ve forgotten a lot of things f...,,yanshixo,2,1,https://www.reddit.com/r/Stress/comments/ublh6...
3,Self employed business stress. Slowly eating m...,I’m a 2021 graduate. I went straight into bein...,Ok-kitsunekitty,0,1,https://www.reddit.com/r/Stress/comments/ubhs0...
4,Stressed out,Honestly I just need to tell someone like bruh...,AppointmentNo2153,4,1,https://www.reddit.com/r/Stress/comments/ubaea...


In [17]:
combined_data.isnull().sum()

title           0
selftext        0
author          0
num_comments    0
is_stress       0
url             0
dtype: int64

In [18]:
# dataframe[dataframe['Percentage'] > 80]
len(combined_data[combined_data['is_stress'] == 1])

818

In [19]:
len(combined_data[combined_data['is_stress'] == 0])

943

In [20]:
combined_data.to_csv('combined.csv', index = False)

In [21]:
combined_data.iloc[[10]]["selftext"]

10    I really have no idea what my next step. 100% ...
Name: selftext, dtype: object