# Scraping reddit data 

In [1]:
# randomized delay between requests as a consideration to Reddit's servers and security staff 
# run this cell 

import requests 
import time 
import pandas as pd 
from random import randint 

In [2]:
url = "https://www.reddit.com/r/stress.json"

In [3]:
headers = {"User-agent" : "randuser"}
res = requests.get(url, headers=headers)
res.status_code 

200

In [None]:
stress_json = res.json()
stress_json

In [None]:
# keys of reddit data 
sorted(stress_json["data"].keys())

['after', 'before', 'children', 'dist', 'geo_filter', 'modhash']

In [None]:
# the after string is the query url that indicateswe want to see the next 25 posts after(predetermined number) 
stress_json["data"]["after"]

't3_tzs0l4'

In [None]:
[post["data"]["name"] for post in stress_json["data"]["children"]]

['t3_fwes89',
 't3_u6wml7',
 't3_u6kp9a',
 't3_u61dcm',
 't3_u5pdeg',
 't3_u5lh2q',
 't3_u5e2ry',
 't3_u4liwf',
 't3_u3zvjb',
 't3_u3b4w8',
 't3_u3ke4p',
 't3_u3iikk',
 't3_u3fx4t',
 't3_u344p5',
 't3_u2uhxi',
 't3_u2mzd2',
 't3_u20vdl',
 't3_u1lss6',
 't3_u19f0m',
 't3_u13s1b',
 't3_u0ksan',
 't3_u0zoub',
 't3_u0pcbs',
 't3_u0e83h',
 't3_u01tzi',
 't3_tzs0l4']

In [None]:
# number of posts in one page 
len(stress_json["data"]["children"])

26

In [None]:
pd.DataFrame(stress_json["data"]["children"])

In [None]:
stress_json["data"]["children"][0]["data"]["selftext"]

"The book is available [Here](https://thewellnesssociety.org/free-coronavirus-anxiety-workbook/) from The Wellness Society. Everyone right now needs a little extra help and hopefully, this e-book can assist some of you in uncovering the toolset you need during this abnormal time, or at least it might help with bridging the gap between now and when you may be able to seek more professional assistance. Obviously, it's not a solution to all problems, and some of you are going to be going through a lot more than others, but I hope many of you can find it useful.\nStay safe, stay healthy."

In [4]:
# functions to automate the data collection process 
def reddit_scrape(url_string, number_of_scrapes, output_list):
  after = None 
  for _  in range(number_of_scrapes):
    if _ == 0:
      print(f"SCRAPING {url_string}\n--------------------------------------------------")
      print("<<<SCRAPING COMMENCED>>>")
      print(f"Downloading Batch {1} of {number_of_scrapes}")
    elif (_+1) % 5 == 0:
      print(f"Downloading batch {(_ + 1)} of {number_of_scrapes}")

    if after == None:
      params = {} 
    else:
      # tells the scraper to get the next set after reddit's after code 
      params = {"after": after}
    res = requests.get(url_string, params=params, headers=headers)
    if res.status_code == 200:
      the_json = res.json() 
      output_list.extend(the_json["data"]["children"])
      after = the_json["data"]["after"]
    else:
      print(res.status_code)
      break 
    time.sleep(randint(1,6))

  print("<<<SCRAPING COMPLETED>>>")
  print(f"Number of posts downloaded: {len(output_list)}")
  print("Number of unique posts: {}".format(len(set([p["data"]["name"] for p in output_list]))))

In [9]:
# call the function 
stress_scraped = [] 

# 50 batches of 26
reddit_scrape("https://www.reddit.com/r/stress.json", 300, stress_scraped)

SCRAPING https://www.reddit.com/r/stress.json
--------------------------------------------------
<<<SCRAPING COMMENCED>>>
Downloading Batch 1 of 300
Downloading batch 5 of 300
Downloading batch 10 of 300
Downloading batch 15 of 300
Downloading batch 20 of 300
Downloading batch 25 of 300
Downloading batch 30 of 300
Downloading batch 35 of 300
Downloading batch 40 of 300
Downloading batch 45 of 300
Downloading batch 50 of 300
Downloading batch 55 of 300
Downloading batch 60 of 300
Downloading batch 65 of 300
Downloading batch 70 of 300
Downloading batch 75 of 300
Downloading batch 80 of 300
Downloading batch 85 of 300
Downloading batch 90 of 300
Downloading batch 95 of 300
Downloading batch 100 of 300
Downloading batch 105 of 300
Downloading batch 110 of 300
Downloading batch 115 of 300
Downloading batch 120 of 300
Downloading batch 125 of 300
Downloading batch 130 of 300
Downloading batch 135 of 300
Downloading batch 140 of 300
Downloading batch 145 of 300
Downloading batch 150 of 300
D

In [10]:
# function that scrapes the data 
def create_unique_list(original_scrape_list, new_list_name):
  data_name_list =[] 
  for i in range(len(original_scrape_list)):
    # name is the unique name assigned to each post 
    # if the name of the post is not already in data_name_list 
    # then add it to the new_list_name 
    if original_scrape_list[i]["data"]["name"] not in data_name_list:
      new_list_name.append(original_scrape_list[i]["data"])
      data_name_list.append(original_scrape_list[i]["data"]["name"])
  print(f"List now contains {len(new_list_name)} unique scraped posts")

In [11]:
stress_scraped_unique = [] 
create_unique_list(stress_scraped, stress_scraped_unique)

List now contains 818 unique scraped posts


In [75]:
# putting the data into a dataframe saving to csv 
stress = pd.DataFrame(stress_scraped_unique)
# naively assign stress label to every post 
stress["is_stress"] = 1 
stress.to_csv("stress.csv")

In [74]:
stress.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,num_crossposts,media,is_video,call_to_action,poll_data,author_cakeday,crosspost_parent_list,url_overridden_by_dest,crosspost_parent,is_stress
0,,Stress,The book is available [Here](https://thewellne...,t2_3adaf,False,,0,False,"Free Covid-19 Anxiety e-Workbook. Please, take...",[],...,1,,False,,,,,,,1
1,,Stress,I am currently at my wits end with my job. \n...,t2_4yp9irp9,False,,0,False,Stress leave from work,[],...,0,,False,,,,,,,1
2,,Stress,,t2_815ykpup,False,,0,False,Is it normal that I lose appetite when I’m so ...,[],...,0,,False,,,,,,,1
3,,Stress,"I feel very stressed, but since I live at my p...",t2_5mvda6wc,False,,0,False,Is there a way to measure how stressed I am ri...,[],...,0,,False,,,,,,,1
4,,Stress,i don’t know how to fix this. every time I fin...,t2_c8sfjcku,False,,0,False,does anyone else get a headache when they try ...,[],...,0,,False,,,,,,,1


In [29]:
url_anxiety = "https://www.reddit.com/r/anxiety.json"

In [30]:
anxiety_scraped = []
reddit_scrape(url_anxiety, 50, anxiety_scraped)

SCRAPING https://www.reddit.com/r/anxiety.json
--------------------------------------------------
<<<SCRAPING COMMENCED>>>
Downloading Batch 1 of 50
Downloading batch 5 of 50
Downloading batch 10 of 50
Downloading batch 15 of 50
Downloading batch 20 of 50
Downloading batch 25 of 50
Downloading batch 30 of 50
Downloading batch 35 of 50
Downloading batch 40 of 50
Downloading batch 45 of 50
Downloading batch 50 of 50
<<<SCRAPING COMPLETED>>>
Number of posts downloaded: 1243
Number of unique posts: 991


In [31]:
anxiety_scraped_unique = []
create_unique_list(anxiety_scraped, anxiety_scraped_unique)

List now contains 991 unique scraped posts


In [34]:
anxiety = pd.DataFrame(anxiety_scraped_unique)
anxiety["is_stress"] = 1
anxiety.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,subreddit_subscribers,created_utc,num_crossposts,media,is_video,call_to_action,post_hint,preview,author_cakeday,is_stress
0,,Anxiety,Happy Sunday /r/Anxiety! \n\nIt's everyone's f...,t2_6l4z3,False,,0,False,Set your intention,[],...,538319,1650193000.0,0,,False,,,,,1
1,,Anxiety,If you are suffering from anxiety related to t...,t2_9awrwk94,False,,0,False,Ukraine Anxiety,[],...,538319,1650061000.0,0,,False,,,,,1
2,,Anxiety,Any tips for getting sleep? I have these racin...,t2_iv9lz0v8,False,,0,False,HELP I've been struggling getting sleep. Any t...,[],...,538319,1650414000.0,0,,False,,,,,1
3,,Anxiety,I was speaking to a friend today about how I f...,t2_3swjjb1s,False,,0,False,I don’t think people truly realize how debilit...,[],...,538319,1650417000.0,0,,False,,,,,1
4,,Anxiety,"When I get upset with myself or anxious, I fee...",t2_2qjy6h69,False,,0,False,Does anyone else like to hide?,[],...,538319,1650376000.0,0,,False,,,,,1


In [76]:
anxiety.to_csv("stress.csv", mode='a')

In [35]:
url_homeless = "https://www.reddit.com/r/almosthomeless.json"

In [38]:
homeless_scraped = []
reddit_scrape(url_homeless, 50, homeless_scraped)

SCRAPING https://www.reddit.com/r/almosthomeless.json
--------------------------------------------------
<<<SCRAPING COMMENCED>>>
Downloading Batch 1 of 50
Downloading batch 5 of 50
Downloading batch 10 of 50
Downloading batch 15 of 50
Downloading batch 20 of 50
Downloading batch 25 of 50
Downloading batch 30 of 50
Downloading batch 35 of 50
Downloading batch 40 of 50
Downloading batch 45 of 50
Downloading batch 50 of 50
<<<SCRAPING COMPLETED>>>
Number of posts downloaded: 1246
Number of unique posts: 995


In [39]:
homeless_scraped_unique = []
create_unique_list(homeless_scraped, homeless_scraped_unique)

List now contains 995 unique scraped posts


In [78]:
homeless = pd.DataFrame(homeless_scraped_unique)
homeless["is_stress"] = 1
homeless.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,user_reports,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,call_to_action,link_flair_template_id,author_cakeday,is_stress
0,,almosthomeless,,[],False,,0,False,Don't give people money on here!,[],...,/r/homeless/comments/994p7j/dont_give_people_m...,40523,1579631000.0,0,,False,,,,1
1,,almosthomeless,My landlord has threatened eviction throughout...,[],False,,0,False,Disabled 55 yr old facing eviction,[],...,https://www.reddit.com/r/almosthomeless/commen...,40523,1650392000.0,1,,False,,,,1
2,,almosthomeless,So it started when I was wrongfully fired from...,[],False,,0,False,19f recently was fired for no real reason and ...,[],...,https://www.reddit.com/r/almosthomeless/commen...,40523,1650399000.0,0,,False,,bb935dde-20fe-11e5-a0b2-0e09b4299f63,,1
3,,almosthomeless,Me and my fiancé and our daughter live with my...,[],False,,0,False,What do I do?,[],...,https://www.reddit.com/r/almosthomeless/commen...,40523,1650387000.0,0,,False,,,,1
4,,almosthomeless,I lived with my ex-gf for 2+ years. During tha...,[],False,,0,False,I lost everything,[],...,https://www.reddit.com/r/almosthomeless/commen...,40523,1650334000.0,0,,False,,,,1


In [79]:
homeless.to_csv("stress.csv", mode='a')

In [41]:
url_assistance = "https://www.reddit.com/r/Assistance.json"

In [42]:
assistance_scraped = []
reddit_scrape(url_assistance, 50, assistance_scraped)

SCRAPING https://www.reddit.com/r/Assistance.json
--------------------------------------------------
<<<SCRAPING COMMENCED>>>
Downloading Batch 1 of 50
Downloading batch 5 of 50
Downloading batch 10 of 50
Downloading batch 15 of 50
Downloading batch 20 of 50
Downloading batch 25 of 50
Downloading batch 30 of 50
Downloading batch 35 of 50
Downloading batch 40 of 50
Downloading batch 45 of 50
Downloading batch 50 of 50
<<<SCRAPING COMPLETED>>>
Number of posts downloaded: 1208
Number of unique posts: 528


In [43]:
assistance_scraped_unique = []
create_unique_list(assistance_scraped, assistance_scraped_unique)

List now contains 528 unique scraped posts


In [44]:
assistance = pd.DataFrame(assistance_scraped_unique)
assistance["is_stress"] = 1 
assistance.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,call_to_action,author_cakeday,is_stress
0,,Assistance,"With a devastating war going on in Ukraine, ou...",t2_1c8b1t,False,,0,False,Assistance related to the war in Ukraine,[],...,True,https://www.reddit.com/r/Assistance/comments/t...,217869,1646596000.0,0,,False,,,1
1,,Assistance,Aspiring is the man behind the magic here. He ...,t2_izjlr,False,,1,False,Let's wish our genius mod u/AspiringInspirator...,[],...,True,https://www.reddit.com/r/Assistance/comments/t...,217869,1647100000.0,0,,False,,,1
2,,Assistance,I am the worst at asking for help so I have no...,t2_8i6ct1gn,False,,0,False,Swallowing my pride and asking for help with f...,[],...,False,https://www.reddit.com/r/Assistance/comments/u...,217869,1650395000.0,0,,False,,,1
3,,Assistance,"if I give you an award, please go ahead and DM...",t2_k6bbtni7,False,,0,False,"OFFER. I have a spare £30, around 40 dollars, ...",[],...,False,https://www.reddit.com/r/Assistance/comments/u...,217869,1650441000.0,0,,False,,,1
4,,Assistance,Times are really tough right now. In a month i...,t2_15kc3g,False,,0,False,19 year old hanging on by a thread.,[],...,False,https://www.reddit.com/r/Assistance/comments/u...,217869,1650433000.0,0,,False,,,1


In [80]:
assistance.to_csv("stress.csv", mode='a')

In [81]:
# Non-stress data 
# Lana start running from here 
url_casual = "https://www.reddit.com/r/CasualConversation/"

In [82]:
casual_scraped = []
reddit_scrape(url_casual, 50, casual_scraped)

SCRAPING https://www.reddit.com/r/CasualConversation/
--------------------------------------------------
<<<SCRAPING COMMENCED>>>
Downloading Batch 1 of 50


JSONDecodeError: ignored

In [69]:
url_food = "https://www.reddit.com/r/stress/"
# headers = {"User-agent" : "randuser"}
# res = requests.get(url, headers=headers)
# res.status_code 

In [60]:
# homeless_var = res.json()

KeyError: ignored

In [70]:
food_scraped = []
reddit_scrape(url_food, 50, food_scraped)

SCRAPING https://www.reddit.com/r/stress/
--------------------------------------------------
<<<SCRAPING COMMENCED>>>
Downloading Batch 1 of 50


JSONDecodeError: ignored

In [50]:
url_homeless1 = "https://www.reddit.com/r/homeless/"

In [52]:
homeless1 = []
reddit_scrape(url_homeless1, 50, homeless1)

SCRAPING https://www.reddit.com/r/homeless/
--------------------------------------------------
<<<SCRAPING COMMENCED>>>
Downloading Batch 1 of 50


JSONDecodeError: ignored