### Reddit PRAW testing

In [None]:
import praw
import pandas as pd
from pydantic_settings import BaseSettings, SettingsConfigDict
from loguru import logger
from typing import List


In [3]:
class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_prefix="reddit_")
    api_id : str
    api_secret: str
    username: str
    password: str

In [5]:
cfg=Settings()

In [6]:
reddit = praw.Reddit(user_agent=True,
                        client_id=cfg.api_id,
                        client_secret=cfg.api_secret,
                        username=cfg.username,
                        password=cfg.password)

Search by URL

In [42]:
#url="https://www.reddit.com/r/migraine/comments/12gvvx3/what_headachemigraine_meds_is_most_favorable_in/"
url = "https://i.redd.it/2j1ents09gnd1.jpeg"
submission=reddit.submission(url=url)
submission.comments.replace_more(limit=0)

post_data = {
    "title": submission.title,
    "score": submission.score,
    "url": submission.url,
    "subreddit": str(submission.subreddit),
    "author": str(submission.author),
    "created_utc": submission.created_utc,
    "text": submission.selftext,
    "comments": [{
        "author": str(comment.author),
        "body": comment.body,
        "score": comment.score,
        "created_utc": comment.created_utc
    } for comment in submission.comments]
}

InvalidURL: Invalid URL: https://i.redd.it/2j1ents09gnd1.jpeg

In [20]:
len(post_data["comments"])

76

Search by query

In [45]:
query = "Migraine relief" 
all_subreddits = reddit.subreddit('all')
search = all_subreddits.search(query, sort='relevance', time_filter='all', limit=10)
for post in search:
    print(post.title)
    print(post.url)
    print(post.id)
    # print(post.selftext)
    # print(f"score: {post.score}, upvote ratio: {post.upvote_ratio}")
    print([comment.score for comment in post.comments])
    # print("")

What combination of migraine relief do you use when you have a migraine?
https://www.reddit.com/r/migraine/comments/19brueh/what_combination_of_migraine_relief_do_you_use/
19brueh
[73, 22, 16, 16, 13, 10, 8, 8, 7, 5, 5, 5, 5, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
What’s your go-to migraine relief in 2024?
https://www.reddit.com/r/migraine/comments/1hr5cfz/whats_your_goto_migraine_relief_in_2024/
1hr5cfz
[18, 10, 8, 6, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1]
Migraine sufferers - What is your solution for relief?
https://www.reddit.com/r/AskReddit/comments/z4coy3/migraine_sufferers_what_is_your_solution_for/
z4coy3
[16, 12, 9, 5, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 0, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

AttributeError: 'MoreComments' object has no attribute 'score'

In [23]:
search

<praw.models.listing.generator.ListingGenerator at 0x240da6d1430>

Creating a class for scraping reddit posts 

In [None]:
cfg = Settings()

class Reply():
    def __init__(self,
                 parent_id: str,
                 id : str,
                 content: str,
                 upvotes: int,
                 created_utc):
        
        self.parent_id =parent_id
        self.id = id
        self.content = content,
        self.upvotes = upvotes,
        self.created_utc = created_utc,
        self.replies = []

class Scraper():
    def __init__(self,
                client_id: str = cfg.api_id,
                client_secret: str = cfg.api_secret,
                username: str = cfg.username,
                password: str = cfg.password,
                max_posts: int = 10
                ):
        self.reddit = praw.Reddit(user_agent=True,
                                    client_id=client_id,
                                    client_secret=client_secret,
                                    username=username,
                                    password=password)
        self.max_posts = max_posts
        logger.info("Reddit API successfully initialized.")
    
    def _get_post_ids(self, 
                       query: str,
                       ) -> List[str]:
        search = self.subreddit('all').search(query, sort='relevance', time_filter='all', limit=self.max_posts)
        return [str(post.id) for post in search]

    def _scrape_reddit_post(self, post_id: str):
        submission = self.reddit.submission(id=post_id)
        submission.comments.replace_more(limit=None)

        post_str = ""
        comment_data, replies_data = [{} for i in range(3)]

        #Get post data
        post_data = {
            "post_id": str(submission.id),
            "title": str(submission.title),
            "author": str(submission.author),
            "subreddit": str(submission.subreddit),
            "content": str(submission.selftext),
            "upvotes": int(submission.score),
            "downvotes": int((1.0 - submission.upvote_ratio)*submission.score),
            "created_utc":submission.created_utc
        }

        #Get comment and replies data
        comment_data = {
            "post_id": str(submission.id),
            "comment_id": [],
            "author": [],
            "content": [],
            "upvotes": [],
            "created_utc": [],
            "replies": []
        }

        def get_replies(reply) -> Reply:
            rep = Reply(parent_id=reply.parent_id,
                          id=reply.id,
                          content=reply.body,
                          upvotes=reply.score,
                          created_utc=reply.created_utc)
            
            if reply.replies:
                for nested_reply in reply.replies:
                    rep.replies.append(get_replies(nested_reply))

            return rep

        for comment in submission.comments:
            comment_data["comment_id"].append(str(comment.id))
            comment_data["author"].append(str(comment.author))
            comment_data["content"].append(str(comment.body))
            comment_data["upvotes"].append(int(comment.score))
            comment_data["created_utc"].append(int(comment.created_utc))

            for reply in comment.replies:
                comment_data["replies"].append(get_replies(reply))


        
        
        return post_data, comment_data
