### Reddit PRAW testing

In [130]:
import praw
import pandas as pd
from pydantic_settings import BaseSettings, SettingsConfigDict
from loguru import logger
from typing import List
from concurrent.futures import ThreadPoolExecutor, as_completed


In [3]:
class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_prefix="reddit_")
    api_id : str
    api_secret: str
    username: str
    password: str

In [5]:
cfg=Settings()

In [6]:
reddit = praw.Reddit(user_agent=True,
                        client_id=cfg.api_id,
                        client_secret=cfg.api_secret,
                        username=cfg.username,
                        password=cfg.password)

Search by URL

In [42]:
#url="https://www.reddit.com/r/migraine/comments/12gvvx3/what_headachemigraine_meds_is_most_favorable_in/"
url = "https://i.redd.it/2j1ents09gnd1.jpeg"
submission=reddit.submission(url=url)
submission.comments.replace_more(limit=0)

post_data = {
    "title": submission.title,
    "score": submission.score,
    "url": submission.url,
    "subreddit": str(submission.subreddit),
    "author": str(submission.author),
    "created_utc": submission.created_utc,
    "text": submission.selftext,
    "comments": [{
        "author": str(comment.author),
        "body": comment.body,
        "score": comment.score,
        "created_utc": comment.created_utc
    } for comment in submission.comments]
}

InvalidURL: Invalid URL: https://i.redd.it/2j1ents09gnd1.jpeg

In [20]:
len(post_data["comments"])

76

Search by query

In [45]:
query = "Migraine relief" 
all_subreddits = reddit.subreddit('all')
search = all_subreddits.search(query, sort='relevance', time_filter='all', limit=10)
for post in search:
    print(post.title)
    print(post.url)
    print(post.id)
    # print(post.selftext)
    # print(f"score: {post.score}, upvote ratio: {post.upvote_ratio}")
    print([comment.score for comment in post.comments])
    # print("")

What combination of migraine relief do you use when you have a migraine?
https://www.reddit.com/r/migraine/comments/19brueh/what_combination_of_migraine_relief_do_you_use/
19brueh
[73, 22, 16, 16, 13, 10, 8, 8, 7, 5, 5, 5, 5, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
What’s your go-to migraine relief in 2024?
https://www.reddit.com/r/migraine/comments/1hr5cfz/whats_your_goto_migraine_relief_in_2024/
1hr5cfz
[18, 10, 8, 6, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1]
Migraine sufferers - What is your solution for relief?
https://www.reddit.com/r/AskReddit/comments/z4coy3/migraine_sufferers_what_is_your_solution_for/
z4coy3
[16, 12, 9, 5, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 0, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

AttributeError: 'MoreComments' object has no attribute 'score'

In [23]:
search

<praw.models.listing.generator.ListingGenerator at 0x240da6d1430>

In [120]:
indentation = '   '

post_tmpl = \
"""<POST_SUBREDDIT>
{post_subreddit}
<POST_TITLE>
{post_title}
<POST_AUTHOR>
{post_author}
<UPVOTES vs DOWNVOTES>
{post_upvotes} - {post_downvotes}
<POST_CONTENT>
{post_content}
"""

comment_tmpl = \
"""
<COMMENT>
{comment_content}\n
"""

reply_tmpl = \
"""
<REPLY>
{reply_content}\n

"""

def indent_text(text: str, 
                indentation: str, 
                level: int):
    return '\n'.join(f"{indentation*level}{line}" for line in text.splitlines())

Creating a class for scraping reddit posts 

In [205]:
cfg = Settings()

class Reply():
    def __init__(self,
                 parent_id: str,
                 id : str,
                 content: str,
                 upvotes: int,
                 created_utc):
        
        self.parent_id =parent_id
        self.id = id
        self.content = content,
        self.upvotes = upvotes,
        self.created_utc = created_utc,
        self.replies = []

class Scraper():
    def __init__(self,
                client_id: str = cfg.api_id,
                client_secret: str = cfg.api_secret,
                username: str = cfg.username,
                password: str = cfg.password,
                max_posts: int = 10
                ):
        self.reddit = praw.Reddit(user_agent=True,
                                    client_id=client_id,
                                    client_secret=client_secret,
                                    username=username,
                                    password=password)
        self.max_posts = max_posts
        logger.info("Reddit API successfully initialized.")
    
    def _get_post_ids(self, 
                       query: str,
                       ) -> List[str]:
        search = self.reddit.subreddit('all').search(query, sort='relevance', time_filter='all', limit=self.max_posts)
        logger.info("Post IDs retrieved.")
        return [str(post.id) for post in search]

    def _scrape_reddit_post(self, post_id: str):
        submission = self.reddit.submission(id=post_id)
        submission.comments.replace_more(limit=0)

        #Get post data
        post_data = {
            "post_id": str(submission.id),
            "title": str(submission.title),
            "author": str(submission.author),
            "subreddit": str(submission.subreddit),
            "content": str(submission.selftext),
            "upvotes": int(submission.score),
            "downvotes": int((1.0 - submission.upvote_ratio)*submission.score),
            "created_utc":submission.created_utc
        }

        post_str = post_tmpl.format(post_subreddit=str(submission.subreddit),
                                    post_author=str(submission.author),
                                    post_title=str(submission.title),
                                    post_content=str(submission.selftext),
                                    post_upvotes=int(submission.score),
                                    post_downvotes=int((1.0 - submission.upvote_ratio)*submission.score)
                                    )

        #Get comment and replies data
        comment_data = {
            "post_id": str(submission.id),
            "comment_id": [],
            "author": [],
            "content": [],
            "upvotes": [],
            "created_utc": [],
            "replies": []
        }

        def get_replies(reply, level) -> Reply:
            nonlocal post_str
            post_str += indent_text(reply_tmpl.format(reply_content=str(reply.body)),
                                    indentation=indentation,
                                    level=level
                                    )

            rep = Reply(parent_id=reply.parent_id,
                          id=reply.id,
                          content=reply.body,
                          upvotes=reply.score,
                          created_utc=reply.created_utc)
            
            if reply.replies:
                for nested_reply in reply.replies:
                    rep.replies.append(get_replies(nested_reply, level+1))

            return rep

        for comment in submission.comments:
            comment_data["comment_id"].append(str(comment.id))
            comment_data["author"].append(str(comment.author))
            comment_data["content"].append(str(comment.body))
            comment_data["upvotes"].append(int(comment.score))
            comment_data["created_utc"].append(int(comment.created_utc))

            # appending to post txt
            post_str += indent_text(comment_tmpl.format(comment_content=str(comment.body)),
                                                        indentation=indentation,
                                                        level=1)                              

            replies = []
            for reply in comment.replies:
                replies.append(get_replies(reply, level=2))
                
            comment_data["replies"].append(replies)
        
        logger.info(f"Post ID: {post_id} - {submission.title} post & comment data of retrieved.")
        return post_data, comment_data, post_str

    def _scrape_multiple_posts(self, post_ids: List[str]):
        post_data_list, comment_data_list, post_str_list = [[] for i in range(3)]
        with ThreadPoolExecutor(max_workers=5) as executor:
            future_to_post_id = {executor.submit(self._scrape_reddit_post, post_id): post_id for post_id in post_ids}
            for future in as_completed(future_to_post_id):
                post_id = future_to_post_id[future]
                try:
                    post_data, comment_data, post_str = future.result()
                    post_data_list.append(post_data)
                    comment_data_list.append(comment_data)
                    post_str_list.append(post_str)
                except Exception as exc:
                    print(f'Post ID: {post_id} generated an exception: {exc}')
        
        logger.info(f"Processing of all posts complete.")
        return post_data_list, comment_data_list, post_str_list

    def _scrape_via_post_id_search(self, query: str):
        """
        1. Get post ids via reddit search
        2. Get post data in parallel via ThreadPoolExecuter
        """
        post_ids = self._get_post_ids(query=query)

        #Returns post_data_list, comment_data_list, post_str_list
        return self._scrape_multiple_posts(post_ids=post_ids)

    def _scrape_via_reddit_search(self, query: str):
        """
        Directly scrape from the search function, runs serially
        """
        post_data_list, comment_data_list, post_str_list = [[] for i in range(3)]
        search = self.reddit.subreddit('all').search(query, sort='relevance', time_filter='all', limit=self.max_posts)

        for post in search:
            post.comments.replace_more(limit=0)

            #Get post data
            post_data = {
                "post_id": str(post.id),
                "title": str(post.title),
                "author": str(post.author),
                "subreddit": str(post.subreddit),
                "content": str(post.selftext),
                "upvotes": int(post.score),
                "downvotes": int((1.0 - post.upvote_ratio)*post.score),
                "created_utc":post.created_utc
            }

            post_str = post_tmpl.format(post_subreddit=str(post.subreddit),
                                        post_author=str(post.author),
                                        post_title=str(post.title),
                                        post_content=str(post.selftext),
                                        post_upvotes=int(post.score),
                                        post_downvotes=int((1.0 - post.upvote_ratio)*post.score)
                                        )

            #Get comment and replies data
            comment_data = {
                "post_id": str(post.id),
                "comment_id": [],
                "author": [],
                "content": [],
                "upvotes": [],
                "created_utc": [],
                "replies": []
            }

            def get_replies(reply, level) -> Reply:
                nonlocal post_str
                post_str += indent_text(reply_tmpl.format(reply_content=str(reply.body)),
                                        indentation=indentation,
                                        level=level
                                        )

                rep = Reply(parent_id=reply.parent_id,
                            id=reply.id,
                            content=reply.body,
                            upvotes=reply.score,
                            created_utc=reply.created_utc)
                
                if reply.replies:
                    for nested_reply in reply.replies:
                        rep.replies.append(get_replies(nested_reply, level+1))

                return rep

            for comment in post.comments:
                comment_data["comment_id"].append(str(comment.id))
                comment_data["author"].append(str(comment.author))
                comment_data["content"].append(str(comment.body))
                comment_data["upvotes"].append(int(comment.score))
                comment_data["created_utc"].append(int(comment.created_utc))

                # appending to post txt
                post_str += indent_text(comment_tmpl.format(comment_content=str(comment.body)),
                                                            indentation=indentation,
                                                            level=1)                              

                replies = []
                for reply in comment.replies:
                    replies.append(get_replies(reply, level=2))
                    
                comment_data["replies"].append(replies)

            post_data_list.append(post_data)
            comment_data_list.append(comment_data)
            post_str_list.append(post_str)
            logger.info(f"Post ID: {post.id} - {post.title} post & comment data of retrieved.")

        logger.info(f"Processing of all posts complete.")
        return post_data_list, comment_data_list, post_str_list

    def scrape_posts(self, query: str, parallel: bool = True):
        if parallel:
            post_data, comment_data, post_str = self._scrape_via_post_id_search(query)
        else:
            post_data, comment_data, post_str = self._scrape_via_reddit_search(query)
        
        explodecols = ["comment_id", "author", "content", "upvotes", "created_utc","replies"]
        return pd.DataFrame(post_data), pd.DataFrame(comment_data).explode(column=explodecols).reset_index(drop=True), post_str

In [122]:
scraper = Scraper(client_id=cfg.api_id,
                  client_secret=cfg.api_secret,
                  username=cfg.username,
                  password=cfg.password,
                  max_posts=2)
post_data, comment_data, post_str = scraper._scrape_reddit_post(post_id="19brueh")
print(post_str)

[32m2025-01-20 12:08:21.351[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m32[0m - [1mReddit API successfully initialized.[0m


<POST_SUBREDDIT>
migraine
<POST_TITLE>
What combination of migraine relief do you use when you have a migraine?
<POST_AUTHOR>
surelyshirls
<UPVOTES vs DOWNVOTES>
66 - 3
<POST_CONTENT>
Fighting a migraine is like…finding anything that could help and bundle it up. For example, when I have a migraine I do anything to get rid of it and it ends up looking like: 

- tying my head with a tight cloth 
- putting a CBD roll on
- using a fever patch, migraine cap, or cold potato slices 
- making it dark 
- using earplugs 
- massaging the area between my thumb and index for the pressure point 
- listening to gentle singing bowls 

ALL THAT just to try and fight a migraine. What’s your routine?

EDIT: thank you everyone for sharing your routines! I’ve learned a few new things that I should/can add to my routine depending on severity 
   
   <COMMENT>
   My routine is: 600mg alleve (3 liquid capsuls), 1 Benadryl (liquid capsul), and then I slather Aspercreme with 4% lidocaine all over my forehead, e

In [151]:
ids = ["1e7uahp", "1hr5cfz", "z4coy3"]
scraper = Scraper(client_id=cfg.api_id,
                  client_secret=cfg.api_secret,
                  username=cfg.username,
                  password=cfg.password,
                  max_posts=2)
post_data, comment_data, post_strs = scraper._scrape_multiple_posts(post_ids=ids)

[32m2025-01-20 12:41:33.660[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m32[0m - [1mReddit API successfully initialized.[0m


In [207]:
scraper = Scraper(client_id=cfg.api_id,
                  client_secret=cfg.api_secret,
                  username=cfg.username,
                  password=cfg.password,
                  max_posts=10)
query="Migraine relief"
posts_df, comments_df, post_strs = scraper.scrape_posts(query, parallel=False)

[32m2025-01-20 14:27:33.648[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m32[0m - [1mReddit API successfully initialized.[0m
[32m2025-01-20 14:27:36.365[0m | [1mINFO    [0m | [36m__main__[0m:[36m_scrape_via_reddit_search[0m:[36m224[0m - [1mPost ID: 19brueh - What combination of migraine relief do you use when you have a migraine? post & comment data of retrieved.[0m
[32m2025-01-20 14:27:36.998[0m | [1mINFO    [0m | [36m__main__[0m:[36m_scrape_via_reddit_search[0m:[36m224[0m - [1mPost ID: 1hr5cfz - What’s your go-to migraine relief in 2024? post & comment data of retrieved.[0m
[32m2025-01-20 14:27:37.994[0m | [1mINFO    [0m | [36m__main__[0m:[36m_scrape_via_reddit_search[0m:[36m224[0m - [1mPost ID: z4coy3 - Migraine sufferers - What is your solution for relief? post & comment data of retrieved.[0m
[32m2025-01-20 14:27:39.197[0m | [1mINFO    [0m | [36m__main__[0m:[36m_scrape_via_reddit_search[0m:[36m224[0m - [1mPost I

In [208]:
scraper = Scraper(client_id=cfg.api_id,
                  client_secret=cfg.api_secret,
                  username=cfg.username,
                  password=cfg.password,
                  max_posts=10)
query="Migraine relief"
posts_df, comments_df, post_strs = scraper.scrape_posts(query, parallel=True)

[32m2025-01-20 14:27:56.152[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m32[0m - [1mReddit API successfully initialized.[0m
[32m2025-01-20 14:27:56.153[0m | [1mINFO    [0m | [36m__main__[0m:[36m_get_post_ids[0m:[36m38[0m - [1mPost IDs retrieved.[0m
[32m2025-01-20 14:27:58.370[0m | [1mINFO    [0m | [36m__main__[0m:[36m_scrape_reddit_post[0m:[36m113[0m - [1mPost ID: 1hr5cfz - What’s your go-to migraine relief in 2024? post & comment data of retrieved.[0m
[32m2025-01-20 14:27:58.382[0m | [1mINFO    [0m | [36m__main__[0m:[36m_scrape_reddit_post[0m:[36m113[0m - [1mPost ID: 1e7uahp - migraine relief? I'm desperate post & comment data of retrieved.[0m
[32m2025-01-20 14:27:58.832[0m | [1mINFO    [0m | [36m__main__[0m:[36m_scrape_reddit_post[0m:[36m113[0m - [1mPost ID: 1h69w7d - Natural Remedies for Migraine Relief: What Works for You? post & comment data of retrieved.[0m
[32m2025-01-20 14:27:58.917[0m | [1mINFO    [0m

In [201]:
comments_df

Unnamed: 0,post_id,comment_id,author,content,upvotes,created_utc,replies
0,1hr5cfz,m4vyllv,sharkeyes,Ice cold coke while taking a hot shower in a d...,17,1735758120,[<__main__.Reply object at 0x00000240DD0DD820>...
1,1hr5cfz,m4v1lip,Complete-Extension-8,Ubrelvy and wrapping my head tightly,10,1735747191,[<__main__.Reply object at 0x00000240DD0DD070>]
2,1hr5cfz,m4vfocc,molluscstar,Starbucks iced brown sugar oat shaken espresso...,7,1735752034,[<__main__.Reply object at 0x00000240DD0DD4C0>]
3,1hr5cfz,m4vgz7m,CampadLovesSpace,"Salt, ubrelvy, and a metric fuck ton of water",6,1735752467,[<__main__.Reply object at 0x00000240DC4879B0>]
4,1hr5cfz,m4vzyo9,Ok-Dot-9036,"Nurtec, but my insurance has decided that I ca...",4,1735758557,[<__main__.Reply object at 0x00000240DC4852B0>...
...,...,...,...,...,...,...,...
116,19brueh,kiyfcbn,elfsteel,- eletriptan \n- chug cold gatorade or water\n...,1,1705877176,[]
117,19brueh,kiyphao,CommanderNat,Excedrine or my triptan\nDark\nNo glasses\nCoo...,1,1705880837,[]
118,19brueh,kiyv68v,butterbean_11,"Two ibuprofen, one exedrin migraine (two round...",1,1705882961,[]
119,19brueh,kiywylw,mostcommonhauntings,"Ice pack, heated blanket, zofran, Benadryl, el...",1,1705883620,[]


In [202]:
posts_df

Unnamed: 0,post_id,title,author,subreddit,content,upvotes,downvotes,created_utc
0,1hr5cfz,What’s your go-to migraine relief in 2024?,SteepinAndBrewin,migraine,Hello \n\nMy mom has frequent migraine episode...,13,1,1735746000.0
1,19brueh,What combination of migraine relief do you use...,surelyshirls,migraine,Fighting a migraine is like…finding anything t...,70,2,1705801000.0


In [203]:
print(post_strs[1])

<POST_SUBREDDIT>
migraine
<POST_TITLE>
What combination of migraine relief do you use when you have a migraine?
<POST_AUTHOR>
surelyshirls
<UPVOTES vs DOWNVOTES>
70 - 2
<POST_CONTENT>
Fighting a migraine is like…finding anything that could help and bundle it up. For example, when I have a migraine I do anything to get rid of it and it ends up looking like: 

- tying my head with a tight cloth 
- putting a CBD roll on
- using a fever patch, migraine cap, or cold potato slices 
- making it dark 
- using earplugs 
- massaging the area between my thumb and index for the pressure point 
- listening to gentle singing bowls 

ALL THAT just to try and fight a migraine. What’s your routine?

EDIT: thank you everyone for sharing your routines! I’ve learned a few new things that I should/can add to my routine depending on severity 
   
   <COMMENT>
   My routine is: 600mg alleve (3 liquid capsuls), 1 Benadryl (liquid capsul), and then I slather Aspercreme with 4% lidocaine all over my forehead, e