In [2]:
!pip install  praw

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting praw
  Downloading praw-7.6.0-py3-none-any.whl (188 kB)
[K     |████████████████████████████████| 188 kB 24.5 MB/s 
[?25hCollecting websocket-client>=0.54.0
  Downloading websocket_client-1.4.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 3.8 MB/s 
[?25hCollecting prawcore<3,>=2.1
  Downloading prawcore-2.3.0-py3-none-any.whl (16 kB)
Collecting update-checker>=0.18
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: websocket-client, update-checker, prawcore, praw
Successfully installed praw-7.6.0 prawcore-2.3.0 update-checker-0.18.0 websocket-client-1.4.1


In [3]:
from datetime import datetime, timezone

import pandas as pd
import praw

#### All of the required fields below can be generated via: https://www.reddit.com/prefs/apps
> Tutorial, in case it is needed: https://www.geeksforgeeks.org/scraping-reddit-using-python/

In [53]:


# Read-only PRAW client. Replace each "---" placeholder with the values issued
# for your app at https://www.reddit.com/prefs/apps (do not commit real secrets).
reddit_read_only = praw.Reddit(
    client_id="---",        # client id of your Reddit app
    client_secret="---",    # client secret of your Reddit app
    user_agent="---",       # string identifying this script to Reddit (the original note suggests your user name)
    check_for_async=False,  # NOTE(review): presumably turns off PRAW's async-environment check - confirm in the PRAW docs
)

#### Flairs are how a subreddit distinguishes between its posts. This can make our task easier if some flair coincides with our project: we can then store only the data for that particular flair.

#### Let us create a function to check all the flairs available 

In [81]:
def get_unique_flairs(reddit,sub,num_posts):
  """
  Collect the distinct link flairs seen among a subreddit's top posts.

  Args:
    reddit: PRAW-style client exposing ``subreddit(name)`` (read-only is enough).
    sub: name of the subreddit to inspect.
    num_posts: how many top posts to scan (forwarded as ``limit``).

  Returns:
    A list of flair texts in first-seen order, or ``0`` (after printing
    "no flairs") when none of the scanned posts carries a flair.

  Note: only flairs that appear on the scanned posts are reported. Per the
  original author's note, listing every flair via ".flair()" answers 403
  unless the user moderates the subreddit.
  """
  # Sorted by "top"; "hot" or "new" listings would work the same way.
  top_posts = reddit.subreddit(sub).top(limit=num_posts)

  unique_flairs = []
  for submission in top_posts:
    flair_text = submission.link_flair_text
    # Skip posts without a flair and flairs that were already recorded.
    if flair_text is None or flair_text in unique_flairs:
      continue
    unique_flairs.append(flair_text)

  # Some subreddits use no flairs at all, so the list may still be empty here.
  if not unique_flairs:
    print("no flairs")
    return 0
  return unique_flairs

In [119]:
# Scan the top 100 posts of r/MentalHealthUk and collect the flairs they use.
flairs = get_unique_flairs(
    reddit_read_only,
    sub="MentalHealthUk",
    num_posts=100,
)

In [120]:
flairs ##Let us check our flairs

['Uplifting',
 'Meme',
 'Resources',
 'Vent',
 'Idea/project/petition',
 'Other',
 'Video',
 'Informative',
 'News ',
 'Discussion',
 'News',
 'Idea/project/petition/survey',
 'I need advice/support',
 'Blog post',
 'Introduction']

#### Now let us focus on a function that returns the scraped data as a table with clearly labelled columns
> Note: This is a basic function; one can add further conditions (e.g. gender, date, flair) to keep only the relevant data

In [125]:
# Column headers for the scraped-posts table; adjust to your needs.
# The order must line up with the per-post row assembled in Post_Dataframe,
# so change both together when adding or removing a field.
labels = [
    "ID", "Title", "Flair", "Body",
    "Upvotes", "Number of comments", "URL", "Time",
]

In [122]:
from datetime import datetime

def Post_Dataframe(reddit,sub,num_posts,columns=None):
  """
  Scrape the current "hot" posts of a subreddit into a pandas DataFrame.

  Args:
    reddit: PRAW-style client exposing ``subreddit(name)`` (a read-only
      instance is sufficient).
    sub: name of the subreddit to read from.
    num_posts: maximum number of posts to fetch (forwarded as ``limit``).
    columns: optional list of eight column names, in the order ID, title,
      flair, body, score, comment count, URL, creation time. When omitted,
      the notebook-level ``labels`` list is used, as before.

  Returns:
    A DataFrame with one row per post that can be exported (e.g. to CSV).
    Posts without a flair keep a missing value in the flair column instead
    of the literal string "None".

  Raises:
    ValueError: raised by pandas when the number of column names does not
      match the eight fields stored per post.
  """
  if columns is None:
    # Fall back to the notebook-wide column list from the "labels" cell.
    columns = labels

  # One list per post; converted to a DataFrame in a single step at the end.
  rows = []
  for post in reddit.subreddit(sub).hot(limit=num_posts):
    # Prefer the UTC epoch so it pairs correctly with the UTC conversion
    # below; fall back to ``created`` for objects exposing only that field.
    created = getattr(post, "created_utc", None)
    if created is None:
      created = post.created

    # A post may have no flair; keep that as a missing value.
    flair = post.link_flair_text

    rows.append([
      str(post.id),
      str(post.title),
      None if flair is None else str(flair),
      str(post.selftext),
      # Reddit deliberately fuzzes scores to deter spam bots.
      post.score,
      post.num_comments,
      # Link target of the post, useful for looking a post up later.
      str(post.url),
      # Unix timestamp -> "YYYY-MM-DD HH:MM:SS" in UTC.
      datetime.fromtimestamp(int(created), tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
    ])

  return pd.DataFrame(rows, columns=columns)

In [126]:

# Pull the 15 current "hot" posts of r/MentalHealthUk into a DataFrame.
df = Post_Dataframe(
    reddit_read_only,
    sub="MentalHealthUk",
    num_posts=15,
)

In [127]:
df[:10]

Unnamed: 0,ID,Title,Flair,Body,Upvotes,Number of comments,URL,Time
0,bc3jtm,Mental Health UK master post,Resources,Here I will include a master post of UK mental...,11,25,https://www.reddit.com/r/MentalHealthUK/commen...,2019-04-11 18:46:07
1,xpl4mv,iPOF - Improving Peer Online Forums Study (Lan...,Research/Study,"Hi,\n\nMy name is Matt. I am a researcher from...",6,5,https://www.reddit.com/r/MentalHealthUK/commen...,2022-09-27 16:16:12
2,y7v4aj,The cost of living crisis risks becoming a sui...,External blog/vlog post,,5,3,https://gal-dem.com/cost-of-living-crisis-suic...,2022-10-19 06:36:12
3,y7zemu,Mens Psychology Research,Research/Study,"Hi guys,\n\nI hope you are well.\n\nI am a doc...",2,1,https://www.reddit.com/r/MentalHealthUK/commen...,2022-10-19 10:48:04
4,y7laa5,Do you ever feel like work and everything that...,Vent,I have worked every day since I was 16 and I’v...,14,4,https://www.reddit.com/r/MentalHealthUK/commen...,2022-10-18 22:41:52
5,y77n69,When you spend the day speaking to hundreds of...,Uplifting,,74,9,https://www.reddit.com/gallery/y77n69,2022-10-18 13:51:14
6,y7elgv,CMHT Experience,I need advice/support,"Hello, friends.\n\nI am under a Community Ment...",5,7,https://www.reddit.com/r/MentalHealthUK/commen...,2022-10-18 18:20:18
7,y7k02v,Trying to unpack a traumatic event.,I need advice/support,"Hello,\n\nFor context, I am 17 but this all ha...",3,3,https://www.reddit.com/r/MentalHealthUK/commen...,2022-10-18 21:49:51
8,y73ltg,Still burnt out from previous job - bailed on ...,I need advice/support,"Hi Everyone,\n\nI just wanted to get something...",23,14,https://www.reddit.com/r/MentalHealthUK/commen...,2022-10-18 10:46:51
9,y7idcf,"How common are psychosis symptoms in ""just"" de...",I need advice/support,So I recently gained access to my medical hist...,3,7,https://www.reddit.com/r/MentalHealthUK/commen...,2022-10-18 20:47:03


#### From here on, we can simply plug in a subreddit and the number of posts needed, and add any required conditions (flair, gender, etc.) to get the desired data