In [None]:
!pip install  praw

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting praw
  Downloading praw-7.6.0-py3-none-any.whl (188 kB)
[K     |████████████████████████████████| 188 kB 5.1 MB/s 
[?25hCollecting prawcore<3,>=2.1
  Downloading prawcore-2.3.0-py3-none-any.whl (16 kB)
Collecting websocket-client>=0.54.0
  Downloading websocket_client-1.4.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 3.6 MB/s 
[?25hCollecting update-checker>=0.18
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: websocket-client, update-checker, prawcore, praw
Successfully installed praw-7.6.0 prawcore-2.3.0 update-checker-0.18.0 websocket-client-1.4.1


In [None]:
from datetime import datetime, timezone

import pandas as pd 
import praw

#### All of the required fields below can be generated via: https://www.reddit.com/prefs/apps
> Tutorial, in case it is needed: https://www.geeksforgeeks.org/scraping-reddit-using-python/

In [None]:


#Read-only praw client used by every cell below; replace each "---" placeholder with the values of your own reddit app
reddit_read_only = praw.Reddit(client_id="---" ,        # your client id
                               client_secret="---",      # your client secret
                               user_agent="---",  # text that identifies this script to reddit (praw's docs suggest including your reddit user name in it)
                               check_for_async= False)  # NOTE(review): presumably silences praw's async-environment warning inside a notebook - confirm against the praw docs

#### Flairs are how a subreddit distinguishes between its posts. This can make our task easier if some flair coincides with our project: we can then simply store only the data for that particular flair.

#### Let us create a function to check all the flairs available 

In [None]:
def get_unique_flairs(reddit,sub,num_posts):
  """
  Collect the distinct flairs used by the top posts of a subreddit.

  reddit    : the praw instance (read-only is enough in our case)
  sub       : name of the subreddit to inspect
  num_posts : how many posts to look at (passed to praw as "limit")

  Returns the flairs as a list, in the order they were first seen.
  If none of the inspected posts carries a flair, prints "no flairs" and returns 0.

  Note: only flairs that occur in the inspected posts can be found. Per the original
  author, asking the subreddit for its full flair list (".flair()") answers 403 unless
  the user is a mod of that subreddit.
  """
  seen=[] #flairs found so far, first occurrence first

  #We are sorting by top, we can also do it by hot or new
  for submission in reddit.subreddit(sub).top(limit=num_posts):
    label=submission.link_flair_text

    #posts without a flair, and flairs we already have, are skipped
    if label is None or label in seen:
      continue

    seen.append(label)

  #some subs do not use flairs at all, so the list may still be empty here
  if not seen:
    print("no flairs")
    return 0

  return seen

In [None]:
#Collect the unique flairs seen in the top 100 posts of r/MentalHealthUk
flairs=get_unique_flairs(reddit_read_only,sub="MentalHealthUk",num_posts=100)

In [None]:
flairs ##Let us check our flairs

['Uplifting',
 'Meme',
 'Resources',
 'Vent',
 'Idea/project/petition',
 'Other',
 'Video',
 'Informative',
 'News ',
 'Discussion',
 'News',
 'Idea/project/petition/survey',
 'I need advice/support',
 'Blog post',
 'Introduction']

#### Now let us focus on a function that returns the scraped data split into well-defined, labelled columns
> Note: This is a basic function; one can add more conditions (say gender, date or flair) to keep only the relevant data

In [None]:
# Column names for the scraped posts, one per stored field and in the order the fields are stored.
# This can be modified per user needs.

labels = [
    "ID", "Title", "Flair", "Body",
    "Upvotes", "Number of comments", "URL", "Time",
]

In [None]:
from datetime import datetime

def Post_Dataframe(reddit,sub,num_posts):

  """
  Collect the current "hot" posts of a subreddit into a pandas dataframe.

  reddit    : the praw instance (read-only is enough)
  sub       : name of the subreddit to read from
  num_posts : maximum number of posts to fetch (passed to praw as "limit")

  Returns a dataframe with one row per post, which we can export as a csv if we want.
  The columns are named after the notebook-level "labels" list, so that list must hold
  exactly eight names, matching this order:
  id, title, flair, body, score, number of comments, url, creation time.
  """
  #One entry per post, we will convert this later into a dataframe
  rows=[]

  for post in reddit.subreddit(sub).hot(limit=num_posts):

    #".created_utc" is the unix timestamp praw documents for a submission.
    #Converting through an explicit UTC timezone gives the same text as "datetime.utcfromtimestamp",
    #which is deprecated since Python 3.12.
    created=datetime.fromtimestamp(int(post.created_utc),tz=timezone.utc)

    rows.append([
        #the id
        str(post.id),
        #the title
        str(post.title),
        #flair, a post without a flair is stored as the text "None"
        str(post.link_flair_text),
        #body of the post
        str(post.selftext),
        #Number of upvotes, note: reddits upvote system is inconsistent to prevent spam bots
        post.score,
        #number of comments under a post, we can also call the comment tree using "submission.comments"
        post.num_comments,
        #The url of the post incase we need to do more searching for a particular post
        str(post.url),
        #creation time in yyyy-mm-dd hh:mm:ss format (UTC)
        created.strftime("%Y-%m-%d %H:%M:%S"),
    ])

  #Finally converting the rows into a dataframe so we can export it later
  return pd.DataFrame(rows,columns=labels)

In [None]:
#Build a dataframe from at most 15 "hot" posts of r/MentalHealthUK
df=Post_Dataframe(reddit_read_only,sub="MentalHealthUK",num_posts=15)

In [None]:
df[:10]

Unnamed: 0,ID,Title,Flair,Body,Upvotes,Number of comments,URL,Time
0,bc3jtm,Mental Health UK master post,Resources,Here I will include a master post of UK mental...,11,25,https://www.reddit.com/r/MentalHealthUK/commen...,2019-04-11 18:46:07
1,xpl4mv,iPOF - Improving Peer Online Forums Study (Lan...,Research/Study,"Hi,\n\nMy name is Matt. I am a researcher from...",9,5,https://www.reddit.com/r/MentalHealthUK/commen...,2022-09-27 16:16:12
2,ya8xj2,has anyone had the same experience,Vent,I remember a while ago I tried to get help fro...,9,3,https://www.reddit.com/r/MentalHealthUK/commen...,2022-10-22 00:10:29
3,ya0icq,4 times.,I need advice/support,This is the 4th time this year that they didnt...,18,12,https://www.reddit.com/r/MentalHealthUK/commen...,2022-10-21 18:12:58
4,yaeiae,How do i help my friend,I need advice/support,my best friend lives in the u.k. and i live in...,1,2,https://www.reddit.com/r/MentalHealthUK/commen...,2022-10-22 04:55:05
5,y9z7vx,Anybody experienced weight gain from Mirtazapine?,Discussion,I started taking 15mg in February for sleep is...,8,17,https://www.reddit.com/r/MentalHealthUK/commen...,2022-10-21 17:19:08
6,ya5qhc,Has anybody experienced side effects from tape...,Discussion,I take 150mg per day of clomipramine to treat ...,3,3,https://www.reddit.com/r/MentalHealthUK/commen...,2022-10-21 21:47:24
7,ya2wel,After an intense mashup of emotions / experien...,I need advice/support,So a few weeks ago I posted about how I’d been...,3,4,https://www.reddit.com/r/MentalHealthUK/commen...,2022-10-21 19:50:23
8,yaa98i,Mental health work or benefits poll,Poll,Mental health conditions can often make workin...,1,4,https://www.reddit.com/r/MentalHealthUK/commen...,2022-10-22 01:14:57
9,y9yh9s,Coercive control accusation (carer),I need advice/support,I'm hoping somebody can help me with this.\n\n...,4,5,https://www.reddit.com/r/MentalHealthUK/commen...,2022-10-21 16:47:18


#### From here on, we can simply plug in a subreddit and the number of posts needed, and add the required conditions (flairs, gender, etc.) to get the desired data

## A query specific function, The main function to be used

In [None]:
from datetime import datetime

def Post_Dataframe2(reddit,sub,num_posts,query):

  """
  Search a subreddit and collect the matching posts into a pandas dataframe.

  reddit    : the praw instance (read-only is enough)
  sub       : name of the subreddit to search in
  num_posts : maximum number of results to fetch (passed to praw as "limit")
  query     : the search text, basically works like the search bar of reddit

  Returns a dataframe with one row per post, which we can export as a csv if we want.
  The columns are named after the notebook-level "labels" list, so that list must hold
  exactly eight names, matching this order:
  id, title, flair, body, score, number of comments, url, creation time.
  """
  #One entry per post, we will convert this later into a dataframe
  rows=[]

  #THE ONLY ADDITION:instead of sorting by hot,top,etc we are now sorting via our query
  for post in reddit.subreddit(sub).search(query,limit=num_posts):

    #".created_utc" is the unix timestamp praw documents for a submission.
    #Converting through an explicit UTC timezone gives the same text as "datetime.utcfromtimestamp",
    #which is deprecated since Python 3.12.
    created=datetime.fromtimestamp(int(post.created_utc),tz=timezone.utc)

    rows.append([
        #the id
        str(post.id),
        #the title
        str(post.title),
        #flair, a post without a flair is stored as the text "None"
        str(post.link_flair_text),
        #body of the post
        str(post.selftext),
        #Number of upvotes, note: reddits upvote system is inconsistent to prevent spam bots
        post.score,
        #number of comments under a post, we can also call the comment tree using "submission.comments"
        post.num_comments,
        #The url of the post incase we need to do more searching for a particular post
        str(post.url),
        #creation time in yyyy-mm-dd hh:mm:ss format (UTC)
        created.strftime("%Y-%m-%d %H:%M:%S"),
    ])

  #Finally converting the rows into a dataframe so we can export it later
  return pd.DataFrame(rows,columns=labels)


    

In [None]:
#Search r/MentalHealthUk for "23F"; num_posts=None is handed to praw as limit=None, so we set no cap ourselves
x=Post_Dataframe2(reddit_read_only,sub="MentalHealthUk",num_posts=None,query="23F")

In [None]:
len(x)

4

In [None]:
x.head()

Unnamed: 0,ID,Title,Flair,Body,Upvotes,Number of comments,URL,Time
0,wrqnxo,fluoxetine,Discussion,"23F, UK\n\nLife has been hell, i've quit my jo...",6,7,https://www.reddit.com/r/MentalHealthUK/commen...,2022-08-18 18:17:06
1,spgmuf,I don't know what to do,I need advice/support,"For context, I (23F) have struggled with depre...",2,3,https://www.reddit.com/r/MentalHealthUK/commen...,2022-02-10 20:46:43
2,klz6ic,How to stop feeling disgusted and guilty with ...,I need advice/support,I’m 23F and for years I’ve been haunted by thi...,20,15,https://www.reddit.com/r/MentalHealthUK/commen...,2020-12-28 21:14:13
3,me1b5u,I’m really struggling with my mental health an...,Vent,I’m 23F and have had depression and OCD for ab...,3,3,https://www.reddit.com/r/MentalHealthUK/commen...,2021-03-26 23:27:36


In [None]:
#Same "23F" search in the r/mentalhealth sub, to compare how many results each sub gives
y=Post_Dataframe2(reddit_read_only,sub="mentalhealth",num_posts=None,query="23F")

In [None]:
len(y)

235

In [None]:
y.head()

Unnamed: 0,ID,Title,Flair,Body,Upvotes,Number of comments,URL,Time
0,xjif5z,"Men, do you express your vulnerability?",Opinion / Thoughts,I(23F) watched a YT video today of a guy discu...,32,84,https://www.reddit.com/r/mentalhealth/comments...,2022-09-20 19:41:52
1,xyawhk,looking for someone to chat and listen to,,Hey 23f and struggle with my mental health at ...,10,29,https://www.reddit.com/r/mentalhealth/comments...,2022-10-07 21:33:23
2,wy1264,I just saw my boyfriend texting another girl a...,Need Support,I was at the bar and I saw my boyfriend textin...,42,63,https://www.reddit.com/r/mentalhealth/comments...,2022-08-26 05:56:14
3,y9uiic,"S******* because of anxiety, should I change a...",Need Support,Hello I’m 23f I’ve had severe anxiety for 13 y...,1,4,https://www.reddit.com/r/mentalhealth/comments...,2022-10-21 14:01:23
4,y6kleb,Antidepressants,Opinion / Thoughts,I (23F) started taking antidepressants last ye...,3,3,https://www.reddit.com/r/mentalhealth/comments...,2022-10-17 19:46:04


#### Note: If I am not wrong, some subs have an additional layer of permission checkboxes; for example, EDAnonymous shows a trigger warning when you enter the sub. These subs will give you a 401. I got this error even after joining the sub.

#### Praw returns 1k results per call; the number also depends on how relevant your query is (how many matching datapoints that particular sub has), as shown in the example above (4 results in MentalHealthUk versus 235 in mentalhealth).

#### The above should be enough, but in case we need more datapoints later, there is another wrapper available called pushshift (although I don't know much about it right now)


---

# Now let us start calling our function and scraping

In [1]:
#The subs we are scraping right now; they are visited in exactly this order
Subreddits=[
    "mentalhealth", "depressed", "lonely", "adhd_anxiety",
    "ADHD", "Anxiety", "OCD", "mentalillness",
    "depression", "depression_help", "SuicideWatch", "EatingDisorders",
    "eating_disorders", "EDAnonymous", "MentalHealthUK", "bingeeating",
    "healthateverysize", "BodyDysmorphia", "Anxietyhelp", "bulimia",
    "fuckeatingdisorders", "AnorexiaNervosa", "BingeEatingDisorder",
]


In [None]:
%%time
#"DataFrame.append" was removed in pandas 2.0 and re-copies the whole frame on every call,
#so we collect the pieces in lists and join them once with "pd.concat".
sub_frames=[] #One dataframe per subreddit, each with its "Subreddit" column filled in


for i in Subreddits:#Calling each subreddit

  #One search per age, e.g. "10 F" ... "23 F"
  age_frames=[Post_Dataframe2(reddit_read_only,sub=i,num_posts=None,query=f"{j} F")
              for j in range(10,24)] #Age range

  cs = pd.concat(age_frames)#A df to use if you want to store a csv for every subreddit (no "Subreddit" column)

  print(len(cs),f'for sub:{i}')#Checking how many posts we get per sub

  #".assign" returns a copy, so cs itself stays without the extra column
  sub_frames.append(cs.assign(Subreddit=i))

#Our dataframe where we store everything; "pd.concat" rejects an empty list, hence the fallback
cd=pd.concat(sub_frames) if sub_frames else pd.DataFrame()
  

2312 for sub:mentalhealth
185 for sub:depressed
580 for sub:lonely
46 for sub:adhd_anxiety
2321 for sub:ADHD
1715 for sub:Anxiety
548 for sub:OCD
247 for sub:mentalillness
2909 for sub:depression
215 for sub:depression_help
2117 for sub:SuicideWatch
188 for sub:EatingDisorders
58 for sub:eating_disorders
273 for sub:EDAnonymous
43 for sub:MentalHealthUK
2 for sub:bingeeating
0 for sub:healthateverysize
107 for sub:BodyDysmorphia
151 for sub:Anxietyhelp
132 for sub:bulimia
142 for sub:fuckeatingdisorders
57 for sub:AnorexiaNervosa
148 for sub:BingeEatingDisorder
CPU times: user 12.5 s, sys: 771 ms, total: 13.2 s
Wall time: 16min 22s


In [None]:
#NOTE(review): "Path" is not assigned in any visible cell - set it to the output file location before running this
cd.to_csv(Path) #Storing the csv

In [None]:
%%time 
#Doing the same thing as above, but querying by "** female". Kept as a separate loop for better visualisation
#"DataFrame.append" was removed in pandas 2.0 and re-copies the whole frame on every call,
#so we collect the pieces in lists and join them once with "pd.concat".
sub_frames2=[] #One dataframe per subreddit, each with its "Subreddit" column filled in
for i in Subreddits:
  #One search per age, e.g. "10 female" ... "23 female"
  age_frames2=[Post_Dataframe2(reddit_read_only,sub=i,num_posts=None,query=f"{j} female")
               for j in range(10,24)]
  cs2 = pd.concat(age_frames2)
  print(len(cs2),f'for sub:{i}')
  #".assign" returns a copy, so cs2 itself stays without the extra column
  sub_frames2.append(cs2.assign(Subreddit=i))
#"pd.concat" rejects an empty list, hence the fallback
cd2=pd.concat(sub_frames2) if sub_frames2 else pd.DataFrame()
  

2622 for sub:mentalhealth
156 for sub:depressed
651 for sub:lonely
30 for sub:adhd_anxiety
1790 for sub:ADHD
1812 for sub:Anxiety
658 for sub:OCD
338 for sub:mentalillness
3071 for sub:depression
343 for sub:depression_help
2641 for sub:SuicideWatch
246 for sub:EatingDisorders
95 for sub:eating_disorders
251 for sub:EDAnonymous
98 for sub:MentalHealthUK
3 for sub:bingeeating
12 for sub:healthateverysize
125 for sub:BodyDysmorphia
147 for sub:Anxietyhelp
126 for sub:bulimia
139 for sub:fuckeatingdisorders
59 for sub:AnorexiaNervosa
168 for sub:BingeEatingDisorder
CPU times: user 11.8 s, sys: 636 ms, total: 12.4 s
Wall time: 11min 50s


In [None]:
len(cd2),len(cd) #Checking how many posts we have

(15581, 14496)

In [None]:
cd3=pd.concat([cd2,cd]) #Adding the two dataframes ("DataFrame.append" was removed in pandas 2.0)

In [None]:
len(cd3)

21781

In [None]:
#removing duplicates and renumbering the rows from 0
#"drop=True" discards the old row numbers instead of keeping them as an extra "index" column,
#which also makes this cell safe to run more than once
#NOTE(review): rows are compared on every column, so a post scraped twice with a different
#score or comment count is kept twice - consider drop_duplicates(subset="ID") if that is unwanted
cd3=cd3.drop_duplicates().reset_index(drop=True)

In [None]:
#NOTE(review): "Path" is not assigned in any visible cell; it is also the name used for the earlier export of cd,
#so point it at a different file first if that csv should be kept
cd3.to_csv(Path)

In [4]:
cd3.sample(10)

Unnamed: 0.1,Unnamed: 0,ID,Title,Flair,Body,Upvotes,Number of comments,URL,Time,Subreddit
5305,40,9xqnw8,I'm 22f and just found I've been living with O...,,I've been in an out of counseling since I was ...,6,7,https://www.reddit.com/r/OCD/comments/9xqnw8/i...,2018-11-16 21:57:32,OCD
17481,2,u4fd5s,F (13) I don't think this is home..,,I have been hospitalized for 8 months due to s...,4,1,https://www.reddit.com/r/depression/comments/u...,2022-04-15 19:09:13,depression
16029,137,6owd4r,Waking up in a sweat. Anxiety related?,,For as long as I can remember I've had anxiety...,6,12,https://www.reddit.com/r/Anxiety/comments/6owd...,2017-07-22 17:54:34,Anxiety
18892,21,wulexi,i went from making 25k a month to broke and de...,,"\nhi everyone, 22 F. About 2.5 years ago I sta...",38,36,https://www.reddit.com/r/depression/comments/w...,2022-08-22 05:56:06,depression
16213,5,cf30i8,I [22/f] always feel like I am in trouble or d...,,Thank you in advance for reading this! I'll t...,6,2,https://www.reddit.com/r/Anxiety/comments/cf30...,2019-07-19 04:28:31,Anxiety
11427,64,g2r0i3,I've recently noticed I've been feeling empty ...,,"I'm a first time redditor, I don't want to tel...",2,0,https://www.reddit.com/r/mentalhealth/comments...,2020-04-17 00:08:16,mentalhealth
9910,43,kg0y6p,I don’t think I’m going to make it,,I’m so sad. Everyday is a struggle to get thro...,1,4,https://www.reddit.com/r/SuicideWatch/comments...,2020-12-19 04:12:16,SuicideWatch
3802,83,ezeyaz,Anxiety about teenage girls.,,I have been so Anxious about teenage girls rec...,0,3,https://www.reddit.com/r/Anxiety/comments/ezey...,2020-02-05 19:32:36,Anxiety
5025,59,bcvfcs,[23F] Admitted to ER while traveling abroad- d...,Advice Needed,"Sorry this is a little long, but I just want t...",1,29,https://www.reddit.com/r/Anxiety/comments/bcvf...,2019-04-13 21:21:49,Anxiety
18137,138,azbdix,I think I am a sociopath,,**Disclaimer:- English is my second language s...,1,5,https://www.reddit.com/r/depression/comments/a...,2019-03-10 02:56:37,depression
