# Subreddit scraper
## Author: Karina Lopez
### Last updated: 06/09/2021

**Purpose:** Scrape a specified subreddit for a set of search terms and return datasets containing the matching posts and their comments.

In [6]:
import praw
import pandas as pd
import os

## Get your token key

Follow these instructions to get your token key: https://towardsdatascience.com/scraping-reddit-data-1c0af3040768

Create your app and look up its credentials at this site: https://www.reddit.com/prefs/apps

- **client id:** (personal use script)
- **secret:** 
- **user_agent:** (name)
- **redirect URI:** http://localhost:8080



In [7]:
# Shared Reddit client used by every scraping function below.
# Paste your own app's values between the quotes before running,
# and avoid committing real credentials with the notebook.
reddit = praw.Reddit(client_id='', client_secret='', user_agent='')

## Enter your variables


### Variables for scraping the first n posts of a subreddit

In [8]:
# ---------------------------------------------------------------
# Settings passed to top_n_posts()
# ---------------------------------------------------------------

# Project root; folder_dir_ is appended to this string to build the
# output folder, so keep the trailing slash.
BASE_DIR = "/Users/karinalopez/Desktop/ds_projects/nlp/"

# Output folder, relative to BASE_DIR, where the CSV is stored.
folder_dir_ = "data/raw/ffa_sustainability/"

# File name of the CSV that will be created.
fname_ = "top10000_ffa_subreddit_posts.csv"

# Subreddit to scrape.
subreddit_name_ = "femalefashionadvice"

# Number of posts to request from the subreddit.
num_posts_ = 10_000

### Variables for the keyword search within a subreddit

In [None]:
# ---------------------------------------------------------------
# Settings passed to scrape_comments()
# ---------------------------------------------------------------

# Folder that receives every CSV produced by the keyword search.
DIRECTORY_ = "/Users/karinalopez/Desktop/ds_projects/nlp/data/raw/ffa_sustainability/sustainable"

# Search terms; each one is searched separately in every subreddit of sub_.
query_ = [
    "ethics",
    "sustainable",
    "activism",
    "ethical consumption",
    "microplastics",
    "ethical",
    "greenwashing",
    "pure",
    "natural",
    "earth-friendly",
    "eco-friendly",
    "organic",
    "reduced emissions",
    "sustainable development",
    "carbon-neutral",
    "plant-based",
]

# Subreddits to search.
sub_ = ["femalefashionadvice"]

# File name for the combined posts CSV.
posts_fname = "MEGA_ffa_posts_sustainability.csv"

# File name for the combined comments CSV.
comments_fname = "MEGA_ffa_comments_sustainability.csv"


## Scrape top n posts on specified subreddit

In [10]:
def top_n_posts(subreddit_name, num_posts, fname, folder_dir):
    """Save up to ``num_posts`` posts from a subreddit's "hot" listing to a CSV.

    Despite the function name, the posts come from the subreddit's *hot*
    listing (``.hot()``), not its *top* listing.

    Uses the module-level ``reddit`` client and ``BASE_DIR`` string.

    Args:
        subreddit_name: Name of the subreddit to scrape,
            e.g. ``'femalefashionadvice'``.
        num_posts: Maximum number of posts to request from the listing.
            NOTE(review): Reddit listings are reportedly capped at roughly
            1000 items, so a larger value probably returns fewer rows --
            confirm against the PRAW documentation.
        fname: File name of the CSV to write.
        folder_dir: Folder that receives the CSV. It is appended to
            ``BASE_DIR`` by plain string concatenation, so ``BASE_DIR``
            must end with a path separator.

    Returns:
        None. The result is the CSV written to
        ``BASE_DIR + folder_dir`` / ``fname`` with the columns
        title, score, id, subreddit, url, num_comments, body, created.
    """
    columns = ['title', 'score', 'id', 'subreddit', 'url',
               'num_comments', 'body', 'created']

    listing = reddit.subreddit(subreddit_name).hot(limit=num_posts)

    rows = [
        [post.title, post.score, post.id, post.subreddit, post.url,
         post.num_comments, post.selftext, post.created]
        for post in listing
    ]
    posts = pd.DataFrame(rows, columns=columns)

    # Write to an explicit path instead of os.chdir(): changing the working
    # directory is a process-wide side effect that silently affects every
    # later relative path in the notebook.
    posts.to_csv(os.path.join(BASE_DIR + folder_dir, fname), index=False)

In [12]:
# Run the scrape with the settings defined in the variables cell.
top_n_posts(
    subreddit_name=subreddit_name_,
    num_posts=num_posts_,
    fname=fname_,
    folder_dir=folder_dir_,
)

## Keyword comment scraper

In [15]:
def scrape_comments(DIRECTORY, query, sub=('femalefashionadvice',), *,
                    sort='top', limit=100):
    """Search subreddits by keyword and collect matching posts and comments.

    For every subreddit in ``sub`` and every keyword in ``query`` the
    subreddit is searched, each matching submission is recorded, and all of
    its comments are fetched. Two CSV files are written to ``DIRECTORY`` per
    (subreddit, keyword) pair as soon as that pair is finished:
    ``comments_<subreddit>_<keyword>.csv`` and
    ``posts_<subreddit>_<keyword>.csv``.

    Posts and comments are stored in separate tables; both tables carry ids
    (``id`` for posts; ``comment_id``, ``comment_parent_id`` and
    ``comment_link_id`` for comments) so they can be joined afterwards.

    Uses the module-level ``reddit`` client. Request errors are not caught:
    a failure (such as a 503 ``ServerError``) aborts the run, but the CSV
    files already written for earlier keywords stay on disk.

    Args:
        DIRECTORY: Folder in which the per-keyword CSV files are written.
        query: Iterable of keywords; each is searched separately.
        sub: Iterable of subreddit names to search.
        sort: Sort order passed to the subreddit search (default ``'top'``).
        limit: Maximum number of submissions requested per keyword
            (default ``100``).

    Returns:
        A tuple ``(posts_list, comments_list)``. Each element is a list
        holding one ``pandas.DataFrame`` per (subreddit, keyword) pair, in
        iteration order, with an added ``keyword`` column.
    """
    post_columns = ['title', 'score', 'id', 'url',
                    'comms_num', 'created', 'body']
    comment_columns = ['comment_id', 'comment_parent_id',
                       'comment_body', 'comment_link_id']

    posts_list = []
    comments_list = []

    for s in sub:
        subreddit = reddit.subreddit(s)

        for item in query:
            post_rows = []
            comment_rows = []

            for submission in subreddit.search(item, sort=sort, limit=limit):
                post_rows.append([
                    submission.title,
                    submission.score,
                    submission.id,
                    submission.url,
                    submission.num_comments,
                    submission.created_utc,
                    # The post text lives in `selftext`; `name` is only the
                    # submission's fullname id and is not a description.
                    submission.selftext,
                ])

                # limit=None expands every "more comments" placeholder so
                # that comments.list() below yields the full comment tree.
                submission.comments.replace_more(limit=None)

                comment_rows.extend(
                    [comment.id, comment.parent_id,
                     comment.body, comment.link_id]
                    for comment in submission.comments.list()
                )

            # Comments found for this keyword.
            post_comments = pd.DataFrame(comment_rows, columns=comment_columns)
            post_comments['keyword'] = item
            comments_list.append(post_comments)
            # Explicit path instead of os.chdir(), which would change the
            # working directory for the whole process.
            post_comments.to_csv(
                os.path.join(DIRECTORY, f"comments_{s}_{item}.csv"),
                index=False,
            )

            # Posts found for this keyword.
            post_data = pd.DataFrame(post_rows, columns=post_columns)
            post_data['keyword'] = item
            posts_list.append(post_data)
            post_data.to_csv(
                os.path.join(DIRECTORY, f"posts_{s}_{item}.csv"),
                index=False,
            )

    return posts_list, comments_list



In [17]:
# Run the keyword search; each result is a list of per-keyword DataFrames.
posts, comments = scrape_comments(
    DIRECTORY=DIRECTORY_,
    query=query_,
    sub=sub_,
)

ServerError: received 503 HTTP response

In [None]:
# scrape_comments() returns *lists* of DataFrames (one per subreddit/keyword
# pair), and a list has no .to_csv(). Stack each list into a single frame
# before writing the combined "MEGA" files.
# pd.concat() raises on an empty list, so skip the write when nothing was
# scraped.
if posts:
    pd.concat(posts, ignore_index=True).to_csv(
        os.path.join(DIRECTORY_, posts_fname), index=False
    )
if comments:
    pd.concat(comments, ignore_index=True).to_csv(
        os.path.join(DIRECTORY_, comments_fname), index=False
    )

In [None]:
# Visible marker that the notebook ran through to the last cell.
print("Done!")