# Passing PRAW output to LLM

In [1]:
from openai import AzureOpenAI
from pydantic_settings import BaseSettings, SettingsConfigDict
from loguru import logger
import json
from typing import List
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
class Settings(BaseSettings):
    """Azure OpenAI connection settings loaded through pydantic-settings.

    Values are looked up in environment variables that carry the
    ``azure_openai_`` prefix configured in ``model_config``. ``api_key``,
    ``endpoint`` and ``version`` declare no default; ``model`` and
    ``timeout_seconds`` fall back to the defaults below.
    """
    # Prefix prepended to each field name when resolving environment variables.
    model_config = SettingsConfigDict(env_prefix="azure_openai_")
    # Passed to AzureOpenAI(api_key=...) by Summarizer.
    api_key : str
    # Passed to AzureOpenAI(azure_endpoint=...) by Summarizer.
    endpoint : str
    # Passed to AzureOpenAI(api_version=...) by Summarizer.
    version : str
    # Used as the `model` argument of chat completion requests.
    model: str = "gpt-4o"
    # Per-request timeout handed to the chat completion call.
    timeout_seconds: int = 120

In [3]:
# Module-level settings instance; Summarizer.__init__ reads its fields as
# default argument values, so this cell must run before the class is defined.
cfg = Settings()

In [48]:
# Prompt template for summarizing ONE reddit post.
# Filled with str.format; placeholders: {query}, {example_output}, {reddit_post}.
# The reply is expected to wrap the summary in <<<<SUMMARY>>>> ... <<<</SUMMARY>>>>
# and the keyword list in <KEYWORDS> ... </KEYWORDS>; Summarizer._summarize_post
# extracts exactly those markers and parses the keyword list with json.loads.
post_summarize_prompt_tmpl = \
"""
You have been provided with the entire text of a reddit post under <<<<REDDIT POST>>>>.
This reddit post was retrieved when a user made the following google query: {query}
In the query, the user is seeking solutions to a problem they have or wants to get advice to make the best decision.
Given the context of the query, summarize the advice and solutions that people are saying in the thread.
OUTPUT THE SUMMARY BETWEEN <<<<SUMMARY>>>> and <<<</SUMMARY>>>>

After outputting the summary, output a python list of key words that can be verbs or nouns, which are solutions to the problem or advice provided in the thread.
For example, if the user has queried "Migraine relief", the list would be something like ["exedrin", "Magnesium Glycinate", "ibuprofen", "earplugs", "stretching", ...].
As another example, if the user has queried "Europe trip planning", the list would be something like ["Italy", "Budapest", "Barcelona", "Netherlands", ...]
YOU MUST ONLY USE WORDS THAT EXIST IN THE TEXT AS THEY ARE. OUTPUT THE LIST OF WORDS BETWEEN <KEYWORDS> and </KEYWORDS>

Follow the following chain of thought:
1. Identify the solutions or suggestions that are central to the discussion 
2. Determine the consensus about these solutions or suggestions. Is it good, or is it bad?
3. Identify unique solutions or suggestions that are not an integral part of the discussion, but could be worth trying out or exploring.
4. Provide a medium-length summary of the discussion, including the pros and cons of some of the discussed solutions or suggestions.
5. Output the key word list including the discussed solutions and suggestions.

Below are examples of the output that is expected:

{example_output}

Below is the reddit post for you to summarize:
<<<<REDDIT POST>>>>

{reddit_post}
"""

In [58]:
# Few-shot example of the per-post output format: a <<<<SUMMARY>>>> section
# followed by a <KEYWORDS> section holding a JSON-style list of strings.
# NOTE(review): the `all_keywords` output recorded further down in this notebook
# contains many entries of this list verbatim (e.g. 'Anarex', 'Elatriptan'),
# so the model may be echoing the example rather than the post - confirm.
migraine_summary_example = \
"""
OUTPUT FOR THE QUERY "Migraine relief":

<<<<SUMMARY>>>>
1. Central Suggestions
The central strategies revolve around combinations of over-the-counter (OTC) medications, prescription drugs, and physical remedies:

- Medications: Common drugs include triptans (e.g., Sumatriptan), NSAIDs (e.g., Ibuprofen, Naproxen), and acetaminophen (e.g., Tylenol). Other popular medications are Nurtec, Fioricet, and Benadryl for sedation or nausea relief.
- Topical Treatments: Products like Aspercreme with 4% lidocaine and CBD ointments are frequently recommended for numbing facial pain.
- Physical Remedies: Ice packs, heating pads, cold showers, and migraine caps are widely used for physical relief. Compression (tight headbands or scarves) and darkness are also key.
- Dietary and Hydration Support: Electrolytes, water, and caffeine (in moderation) are emphasized as essential aids.

2. Consensus
The community consensus is largely positive about these methods:

- Aspercreme is hailed as a "game-changer," with numerous users praising its immediate effectiveness and ease of application.
- Triptans and combination therapies (e.g., pairing NSAIDs with caffeine) receive widespread support for their efficacy.
- OTC remedies like Excedrin and strategies like using cold compresses or dark rooms are universally well-regarded.
- Unique remedies such as green light therapy and meditation also receive cautious optimism, often noted as subjective but potentially helpful.

3. Unique and Emerging Suggestions
Several lesser-discussed but intriguing approaches surfaced:

- Shockwave Therapy: A user reported significant relief after this treatment, suggesting its potential for muscle-tension-related migraines.
- Green Light Therapy: Some users experimented with green light filters or lamps, noting reduced migraine intensity, although effects were not universally confirmed.
- Dietary Additions: Specific items like fresh lemonade or magnesium supplements were mentioned as beneficial, with anecdotal evidence of effectiveness.
- Singing Bowls and ASMR: While polarizing, some users found these methods helpful for relaxation during migraines.
<<<</SUMMARY>>>>

<KEYWORDS>
["Sumatriptan", "Ibuprofen", "Naproxen", "Acetaminophen", "Tylenol",
"Excedrin", "Benadryl", "Nurtec", "Fioricet", "Ubrelvy", "Zofran",
"Ondansetron", "Promethazine", "Diclofenac", "Amitriptyline", 
"Rizatriptan", "Naratriptan", "Tramadol", "Aspirin", "Cambia",
"Propranolol", "Anarex", "Dexamethasone", "Meclizine", "Elatriptan",
"Flexeril", "Hydroxyzine", "Xanax", "Omeprazole", "BC Powder",
"Aspercreme", "CBD ointment", "Tiger Balm", "Salonpas", "Lidocaine patches",
"Migraine cap", "Ice packs", "Heating pads", "Compression headbands",
"Dark room", "Green light therapy", "Electrolytes", "Caffeine",
"Ginger chews", "Gua Sha", "Meditation", "ASMR", "Singing bowls",
"Fresh lemonade", "Magnesium supplements", "Vitamin D", "Pedialyte",
"Gatorade", "Cold showers", "Shockwave therapy", "Cefaly device",
"Avulux glasses", "Allay lamp", "McDonald's fries", "Rice sock",
"Motion sickness pills", "Energy drinks", "Sleep"]
</KEYWORDS>
"""

In [59]:
# Prompt template for condensing several per-post summaries into one.
# Filled with str.format; placeholders: {query}, {example_output},
# {reddit_post_summaries}. The reply is expected to wrap the result in
# <SUMMARY> ... </SUMMARY>, which Summarizer.summarize_summaries extracts.
summary_summarize_prompt_tmpl = \
"""
You have been provided with a list of reddit post summaries under <<<<REDDIT POST SUMMARIES>>>>
These reddit posts were retrieved when a user made the following google query: {query}
In the query, the user is seeking solutions to a problem they have or wants to get advice to make the best decision.
Please create a concise overarching summary of all the post summaries in short PARAGRAPH format (not bullet points)
OUTPUT THE SUMMARY BETWEEN the tags <SUMMARY> and </SUMMARY>.

Follow the following chain of thought when thinking about the summary:
1. Which solutions or suggestions frequently come up in the discussion?
2. Do people have good or bad opinions about these solutions or suggestions?
3. Are there any unique solutions or suggestions that could be worth trying out or exploring?
4. How can I best summarize all the info in a way that is holistic yet concise?

Below are examples of the output that is expected:
{example_output}

Below are the summaries of the reddit posts for you to summarize:
<<<<REDDIT POST SUMMARIES>>>>

{reddit_post_summaries}
"""

In [60]:
# Few-shot example of the summary-of-summaries output format: paragraphs
# wrapped in a single <SUMMARY> ... </SUMMARY> pair, as requested by
# summary_summarize_prompt_tmpl.
migraine_summary_summary_example = \
"""
<SUMMARY>Discussions on migraine relief focus on a combination of medications, physical remedies, and lifestyle changes. Users widely endorse triptans, NSAIDs, and combination therapies like Excedrin and caffeine for their effectiveness, often providing rapid and reliable relief. Prescription options such as Sumatriptan and Nurtec are frequently highlighted as particularly effective.
Physical aids such as ice packs, migraine caps, and dark, quiet spaces are highly praised for their ability to alleviate symptoms quickly. Additional remedies like cold showers and compression headbands are commonly used to enhance comfort during an episode.

Lifestyle adjustments, including staying hydrated, making dietary tweaks, and maintaining a regular sleep schedule, are emphasized as preventive strategies. Supplements like magnesium and B2 are often recommended to reduce the frequency and intensity of migraines over time.

Unique approaches, such as green light therapy, singing bowls, and the use of devices like the Cefaly, show promise but remain anecdotal. While some users find these methods transformative, others report minimal impact, highlighting the variability in effectiveness.

Overall, the consensus is that a holistic approach combining medications, physical remedies, and lifestyle adjustments offers the best results for managing migraines effectively.
</SUMMARY>
"""

In [63]:
# Example outputs in the per-post format (<<<<SUMMARY>>>> + <KEYWORDS>);
# joined into the {example_output} slot of a prompt template.
summary_examples = [migraine_summary_example]

In [65]:
# Example outputs in the summary-of-summaries format (single <SUMMARY> block),
# i.e. the format that summary_summarize_prompt_tmpl asks the model to produce.
summary_summary_examples = [migraine_summary_summary_example]

In [36]:
# System message sent with every chat completion request made by Summarizer.
system_prompt = \
"""
You are a reddit post summarization agent, with the objective of providing a well-rounded yet informative summary to the user
"""

In [62]:
def find_text_in_between_tags(text: str, start_tag: str, end_tag: str) -> str:
    """Return the text enclosed by ``start_tag`` and the next ``end_tag``.

    The first occurrence of ``start_tag`` is located, then the first
    occurrence of ``end_tag`` *after* it; everything in between is returned
    unchanged (surrounding whitespace is not stripped).

    Args:
        text: The string to search.
        start_tag: Marker that opens the section.
        end_tag: Marker that closes the section.

    Returns:
        The enclosed text, or ``""`` when either marker is missing.
    """
    start_pos = text.find(start_tag)
    if start_pos == -1:
        # Without this guard the -1 from str.find would be used as an index
        # and an arbitrary slice of the text would be returned.
        return ""

    content_start = start_pos + len(start_tag)
    # Search only after the opening marker so that an earlier stray closing
    # marker cannot produce an empty or reversed slice.
    end_pos = text.find(end_tag, content_start)
    if end_pos == -1:
        return ""

    return text[content_start:end_pos]


In [None]:
class Summarizer():
    """Summarize scraped reddit posts with an Azure OpenAI chat model.

    Two stages are offered: ``summarize_posts`` produces one summary and one
    keyword list per post (in parallel threads), and ``summarize_summaries``
    condenses those summaries into a single overall summary.
    """

    def __init__(self,
                 openai_api_key: str = cfg.api_key,
                 openai_endpoint: str = cfg.endpoint,
                 openai_model: str = cfg.model,
                 openai_version: str = cfg.version,
                 timeout_seconds: int = cfg.timeout_seconds,
                 temperature: float = 0.0,
                 retries: int = 1
                 ):
        """Create the Azure OpenAI client and store the request options.

        Args:
            openai_api_key: API key for the Azure OpenAI resource.
            openai_endpoint: Endpoint URL of the Azure OpenAI resource.
            openai_model: Value sent as ``model`` in chat requests.
            openai_version: API version string for the client.
            timeout_seconds: Per-request timeout.
            temperature: Sampling temperature for chat requests.
            retries: Number of extra attempts after the first failed one
                (``retries + 1`` attempts in total).
        """
        self.openai_client = AzureOpenAI(
            api_key=openai_api_key,
            azure_endpoint=openai_endpoint,
            api_version=openai_version
        )
        self.openai_model = openai_model
        self.temperature = temperature
        self.retries = retries
        self.timeout_seconds = timeout_seconds

    def _call_llm(self, llm_prompt: str) -> str:
        """Send one system + user chat request and return the reply text."""
        response = self.openai_client.chat.completions.create(
            model=self.openai_model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": llm_prompt}
            ],
            timeout=self.timeout_seconds,
            temperature=self.temperature
        )
        return response.choices[0].message.content

    def _summarize_post(self,
                        query: str,
                        post_str: str) -> dict:
        """Summarize one post and extract its keyword list.

        Args:
            query: The search query that led to the post.
            post_str: The flattened post text handed to the model.

        Returns:
            ``{"summary": str, "keywords": list[str]}`` on success, or
            ``{"error": "LLM call failed", "details": str}`` once every
            attempt has failed (request error or unparsable reply).
        """
        llm_prompt = post_summarize_prompt_tmpl.format(
            query=query,
            example_output="".join(summary_examples),
            reddit_post=post_str
        )

        # The name bound by `except ... as e` is deleted when the except
        # clause ends, so the last failure is kept in its own variable.
        last_error = None
        total_attempts = self.retries + 1

        for attempt in range(1, total_attempts + 1):
            try:
                llm_output = self._call_llm(llm_prompt)
                logger.info(f"LLM output: {llm_output}")

                summary = find_text_in_between_tags(llm_output, "<<<<SUMMARY>>>>", "<<<</SUMMARY>>>>")
                keywords = find_text_in_between_tags(llm_output, "<KEYWORDS>", "</KEYWORDS>")

                keywords_list = json.loads(keywords)
                # The caller feeds this into set.update(); anything other
                # than a list of strings would silently corrupt the set.
                if not (isinstance(keywords_list, list)
                        and all(isinstance(k, str) for k in keywords_list)):
                    raise ValueError("keywords block is not a JSON list of strings")

                # Computed outside the f-string: nesting the same quote type
                # inside an f-string only parses on Python 3.12+.
                post_id = find_text_in_between_tags(
                    post_str, "<POST_ID>", "<POST_SUBREDDIT>"
                ).replace("\n", "")
                logger.info(f"Post ID - {post_id}: Summary generated and keywords extracted")
                return {"summary": summary, "keywords": keywords_list}

            except Exception as e:
                last_error = e
                logger.error(f"LLM call failed (attempt {attempt}/{total_attempts}): {e}")

        return {"error": "LLM call failed", "details": str(last_error)}

    def summarize_posts(self, query: str,
                        post_strings: List[str],
                        max_workers: int = 5) -> tuple:
        """Summarize several posts concurrently.

        Args:
            query: The search query that led to the posts.
            post_strings: One flattened text per post.
            max_workers: Size of the thread pool.

        Returns:
            ``(all_summaries, all_keywords)``: a list with one summary per
            successfully processed post (in completion order, not input
            order) and the set union of all extracted keywords. Posts whose
            summarization failed are logged and left out.
        """
        all_summaries, all_keywords = [], set()

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(self._summarize_post, query, post)
                for post in post_strings
            ]

            # Collect results as they are completed
            for future in as_completed(futures):
                try:
                    llm_output = future.result()
                except Exception as e:
                    logger.error(f"Error processing post: {e}")
                    continue

                if ("summary" in llm_output) and ("keywords" in llm_output):
                    all_summaries.append(llm_output["summary"])
                    all_keywords.update(llm_output["keywords"])
                else:
                    # _summarize_post returned its error dict; make the
                    # omission visible instead of dropping the post silently.
                    logger.warning(f"Skipping post without a usable summary: {llm_output}")

        return all_summaries, all_keywords

    def summarize_summaries(self, query: str,
                            summaries: List[str]
                            ) -> str:
        """Condense per-post summaries into one overall summary.

        Args:
            query: The search query that led to the posts.
            summaries: Per-post summaries, e.g. from ``summarize_posts``.

        Returns:
            The text the model placed between ``<SUMMARY>`` and
            ``</SUMMARY>``.

        Raises:
            RuntimeError: When every attempt to call the model failed; the
                last underlying error is attached as ``__cause__``.
        """
        # Each summary is labelled and put on its own line so consecutive
        # summaries cannot run together.
        numbered_summaries = "\n".join(
            f"SUMMARY {i}: {summary}" for i, summary in enumerate(summaries, start=1)
        )
        llm_prompt = summary_summarize_prompt_tmpl.format(
            query=query,
            # Use the examples written in the <SUMMARY> format this prompt
            # asks for, not the per-post examples.
            example_output="".join(summary_summary_examples),
            reddit_post_summaries=numbered_summaries
        )

        last_error = None
        total_attempts = self.retries + 1

        for attempt in range(1, total_attempts + 1):
            try:
                llm_output = self._call_llm(llm_prompt)
                overall_summary = find_text_in_between_tags(llm_output, "<SUMMARY>", "</SUMMARY>")
                logger.info(f"Overall summary: {overall_summary}")
                logger.info("Summarization of reddit summaries complete")

                return overall_summary

            except Exception as e:
                last_error = e
                logger.error(f"LLM call failed (attempt {attempt}/{total_attempts}): {e}")

        raise RuntimeError(
            f"LLM call failed after last retry. Error: {last_error}"
        ) from last_error


Test run

In [11]:
import sys
import os

# Absolute path of the directory one level above the current working directory
parent_dir = os.path.normpath(os.path.join(os.getcwd(), os.pardir))

# Make that directory importable, without adding a duplicate entry
if sys.path.count(parent_dir) == 0:
    sys.path += [parent_dir]

In [12]:
from redditsearch.reddit.scrape import scraper
# Search query; reused below as the `query` argument of the summarizer.
query="Migraine relief"
# scrape_posts returns three objects: a posts table, a comments table and a
# list with one flattened text per post (see the cell outputs that follow).
posts_df, comments_df, post_strs = scraper.scrape_posts(query, parallel=True, max_posts=3)

[32m2025-01-22 10:44:07.204[0m | [1mINFO    [0m | [36mredditsearch.reddit.scrape[0m:[36m__init__[0m:[36m54[0m - [1mReddit API successfully initialized.[0m
[32m2025-01-22 10:44:07.209[0m | [1mINFO    [0m | [36mredditsearch.reddit.scrape[0m:[36m_get_post_ids[0m:[36m61[0m - [1mPost IDs retrieved.[0m
[32m2025-01-22 10:44:09.266[0m | [1mINFO    [0m | [36mredditsearch.reddit.scrape[0m:[36m_scrape_reddit_post[0m:[36m137[0m - [1mPost ID: 1hr5cfz - What’s your go-to migraine relief in 2024? post & comment data of retrieved.[0m
[32m2025-01-22 10:44:09.495[0m | [1mINFO    [0m | [36mredditsearch.reddit.scrape[0m:[36m_scrape_reddit_post[0m:[36m137[0m - [1mPost ID: z4coy3 - Migraine sufferers - What is your solution for relief? post & comment data of retrieved.[0m
[32m2025-01-22 10:44:09.884[0m | [1mINFO    [0m | [36mredditsearch.reddit.scrape[0m:[36m_scrape_reddit_post[0m:[36m137[0m - [1mPost ID: 19brueh - What combination of migraine relie

In [13]:
posts_df

Unnamed: 0,post_id,title,author,subreddit,content,upvotes,downvotes,created_utc
0,1hr5cfz,What’s your go-to migraine relief in 2024?,SteepinAndBrewin,migraine,Hello \n\nMy mom has frequent migraine episode...,12,1,1735746000.0
1,z4coy3,Migraine sufferers - What is your solution for...,josepapiblanco,AskReddit,,18,3,1669381000.0
2,19brueh,What combination of migraine relief do you use...,surelyshirls,migraine,Fighting a migraine is like…finding anything t...,71,2,1705801000.0


In [44]:
comments_df

Unnamed: 0,post_id,comment_id,author,content,upvotes,created_utc,replies
0,1hr5cfz,m4vyllv,sharkeyes,Ice cold coke while taking a hot shower in a d...,17,1735758120,[<redditsearch.reddit.scrape.Reply object at 0...
1,1hr5cfz,m4v1lip,Complete-Extension-8,Ubrelvy and wrapping my head tightly,10,1735747191,[<redditsearch.reddit.scrape.Reply object at 0...
2,1hr5cfz,m4vfocc,molluscstar,Starbucks iced brown sugar oat shaken espresso...,8,1735752034,[<redditsearch.reddit.scrape.Reply object at 0...
3,1hr5cfz,m4vgz7m,CampadLovesSpace,"Salt, ubrelvy, and a metric fuck ton of water",7,1735752467,[<redditsearch.reddit.scrape.Reply object at 0...
4,1hr5cfz,m4vzyo9,Ok-Dot-9036,"Nurtec, but my insurance has decided that I ca...",5,1735758557,[<redditsearch.reddit.scrape.Reply object at 0...
...,...,...,...,...,...,...,...
173,19brueh,kiyfcbn,elfsteel,- eletriptan \n- chug cold gatorade or water\n...,1,1705877176,[]
174,19brueh,kiyphao,CommanderNat,Excedrine or my triptan\nDark\nNo glasses\nCoo...,1,1705880837,[]
175,19brueh,kiyv68v,butterbean_11,"Two ibuprofen, one exedrin migraine (two round...",1,1705882961,[]
176,19brueh,kiywylw,mostcommonhauntings,"Ice pack, heated blanket, zofran, Benadryl, el...",1,1705883620,[]


In [14]:
post_strs

["\n<POST_ID>\n1hr5cfz\n<POST_SUBREDDIT>\nmigraine\n<POST_TITLE>\nWhat’s your go-to migraine relief in 2024?\n<POST_AUTHOR>\nSteepinAndBrewin\n<UPVOTES vs DOWNVOTES>\n12 - 1\n<POST_CONTENT>\nHello \n\nMy mom has frequent migraine episodes, and we can’t predict the trigger ahead of time. The last time, we were at a place with a large screen that even made me nauseated. \n\nWhat's your go-to migraine relief? Are there any products you’d recommend, especially ones that can fit in a handbag for these unpredictable situations? What products do you use to prevent or manage migraines at home? \n\nI'd appreciate your help & Happy New Year. \n   \n   <COMMENT>\n   Ice cold coke while taking a hot shower in a dark bathroom.\n         \n      <REPLY>\n      Omg this sounds delightful even without a migraine\n      \n            \n      <REPLY>\n      Add two Advil and a Benadryl and it's a party\n      \n            \n      <REPLY>\n      Oh man that sounds wonderful\n      \n         \n   <COMME

In [41]:
# No arguments: connection settings and model come from the `cfg` defaults.
summarizer = Summarizer()
# Returns the list of per-post summaries and the set of all extracted keywords.
summaries, all_keywords = summarizer.summarize_posts(query=query,
                                                     post_strings=post_strs)

[32m2025-01-22 11:11:51.857[0m | [1mINFO    [0m | [36m__main__[0m:[36m_summarize_post[0m:[36m45[0m - [1mLLM output: <<<<SUMMARY>>>>
1. Central Suggestions
The central strategies revolve around a mix of medications, physical remedies, and lifestyle adjustments:

- Medications: Commonly mentioned drugs include triptans (e.g., Sumatriptan, Naratriptan, Eletriptan), NSAIDs (e.g., Ibuprofen, Naproxen), and other prescription medications like Ubrelvy, Nurtec, and Zofran. Over-the-counter options like Excedrin and Advil are also popular.
- Physical Remedies: Ice packs, heating pads, cold showers, and migraine caps are frequently used for physical relief. Compression (wrapping the head tightly) and darkness are also key.
- Dietary and Hydration Support: Caffeine (e.g., coffee, Coke), electrolytes, and water are emphasized as essential aids. Some users also mention specific foods like McDonald's fries and sugary drinks.

2. Consensus
The community consensus is largely positive about 

In [42]:
all_keywords

{'ASMR',
 'Acetaminophen',
 'Advil',
 'Allay lamp',
 'Amitriptyline',
 'Anarex',
 'Aspercreme',
 'Aspirin',
 'Avulux glasses',
 'B2',
 'BC Powder',
 'Benadryl',
 'CBD balm',
 'CBD oil',
 'CBD ointment',
 'Caffeine',
 'Cambia',
 'Cefaly',
 'Cefaly device',
 'CoQ10',
 'Cold showers',
 'Compression',
 'Compression headbands',
 'Dark room',
 'Darkness',
 'Dexamethasone',
 'Diclofenac',
 'Elatriptan',
 'Electrolytes',
 'Eletriptan',
 'Energy drinks',
 'Excedrin',
 'Fioricet',
 'Flexeril',
 'Fresh lemonade',
 'Gatorade',
 'Ginger chews',
 'Green light therapy',
 'Gua Sha',
 'Heating pads',
 'Hydroxyzine',
 'Ibuprofen',
 'Ice packs',
 'Lidocaine patches',
 'Magnesium',
 'Magnesium supplements',
 'Maxalt',
 "McDonald's fries",
 'Meclizine',
 'Meditation',
 'Migraine cap',
 'Migraine caps',
 'Motion sickness pills',
 'NTI-tss Plus',
 'Naproxen',
 'Naratriptan',
 'Nurtec',
 'Omega-3',
 'Omeprazole',
 'Ondansetron',
 'Pedialyte',
 'Peppermint oil',
 'Promethazine',
 'Propranolol',
 'Rice sock',
 

In [43]:
summaries

["\n1. Central Suggestions\nThe central strategies revolve around a mix of medications, physical remedies, and lifestyle adjustments:\n\n- Medications: Commonly mentioned drugs include triptans (e.g., Sumatriptan, Naratriptan, Eletriptan), NSAIDs (e.g., Ibuprofen, Naproxen), and other prescription medications like Ubrelvy, Nurtec, and Zofran. Over-the-counter options like Excedrin and Advil are also popular.\n- Physical Remedies: Ice packs, heating pads, cold showers, and migraine caps are frequently used for physical relief. Compression (wrapping the head tightly) and darkness are also key.\n- Dietary and Hydration Support: Caffeine (e.g., coffee, Coke), electrolytes, and water are emphasized as essential aids. Some users also mention specific foods like McDonald's fries and sugary drinks.\n\n2. Consensus\nThe community consensus is largely positive about these methods:\n\n- Triptans and combination therapies (e.g., pairing NSAIDs with caffeine) receive widespread support for their ef

In [56]:
# Scratch cell: print the per-post summaries back to back, each prefixed with
# a "SUMMARY n:" label, to inspect the combined text.
#x = [f"\nSummary {i+1}: {summary}" for i, summary in enumerate(summaries)]
print("".join([f"SUMMARY {i+1}: {summary}" for i, summary in enumerate(summaries)]))

SUMMARY 1: 
1. Central Suggestions
The central strategies revolve around a mix of medications, physical remedies, and lifestyle adjustments:

- Medications: Commonly mentioned drugs include triptans (e.g., Sumatriptan, Naratriptan, Eletriptan), NSAIDs (e.g., Ibuprofen, Naproxen), and other prescription medications like Ubrelvy, Nurtec, and Zofran. Over-the-counter options like Excedrin and Advil are also popular.
- Physical Remedies: Ice packs, heating pads, cold showers, and migraine caps are frequently used for physical relief. Compression (wrapping the head tightly) and darkness are also key.
- Dietary and Hydration Support: Caffeine (e.g., coffee, Coke), electrolytes, and water are emphasized as essential aids. Some users also mention specific foods like McDonald's fries and sugary drinks.

2. Consensus
The community consensus is largely positive about these methods:

- Triptans and combination therapies (e.g., pairing NSAIDs with caffeine) receive widespread support for their effi

Creating a one-hot encoding of the extracted keywords

In [45]:
import re

# Work on a copy so the scraped comments table itself gets no extra columns.
comments_df_copy = comments_df.copy()
for keyword in all_keywords:
    # re.escape: keywords come from the LLM and may contain regex
    # metacharacters ("+", "(", "."), which must be matched literally.
    # na=False: comments without text count as "no mention" instead of
    # producing NaN, which astype(int) cannot convert.
    comments_df_copy[keyword] = comments_df_copy['content'].str.contains(
        rf'\b{re.escape(keyword)}\b', case=False, na=False
    ).astype(int)

In [46]:
comments_df_copy

Unnamed: 0,post_id,comment_id,author,content,upvotes,created_utc,replies,dark room,Tramadol,Ibuprofen,...,Caffeine,Magnesium supplements,Propranolol,Acetaminophen,Amitriptyline,CBD balm,Promethazine,cranial sacral massage,Peppermint oil,ice packs
0,1hr5cfz,m4vyllv,sharkeyes,Ice cold coke while taking a hot shower in a d...,17,1735758120,[<redditsearch.reddit.scrape.Reply object at 0...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1hr5cfz,m4v1lip,Complete-Extension-8,Ubrelvy and wrapping my head tightly,10,1735747191,[<redditsearch.reddit.scrape.Reply object at 0...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1hr5cfz,m4vfocc,molluscstar,Starbucks iced brown sugar oat shaken espresso...,8,1735752034,[<redditsearch.reddit.scrape.Reply object at 0...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1hr5cfz,m4vgz7m,CampadLovesSpace,"Salt, ubrelvy, and a metric fuck ton of water",7,1735752467,[<redditsearch.reddit.scrape.Reply object at 0...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1hr5cfz,m4vzyo9,Ok-Dot-9036,"Nurtec, but my insurance has decided that I ca...",5,1735758557,[<redditsearch.reddit.scrape.Reply object at 0...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,19brueh,kiyfcbn,elfsteel,- eletriptan \n- chug cold gatorade or water\n...,1,1705877176,[],1,0,0,...,1,0,0,0,0,0,0,0,0,0
174,19brueh,kiyphao,CommanderNat,Excedrine or my triptan\nDark\nNo glasses\nCoo...,1,1705880837,[],0,0,0,...,0,0,0,0,0,0,0,0,0,0
175,19brueh,kiyv68v,butterbean_11,"Two ibuprofen, one exedrin migraine (two round...",1,1705882961,[],0,0,1,...,0,0,0,0,0,0,0,0,0,0
176,19brueh,kiywylw,mostcommonhauntings,"Ice pack, heated blanket, zofran, Benadryl, el...",1,1705883620,[],0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [76]:
import pandas as pd

def onehotencode(df: pd.DataFrame,
                 keywords: set,
                 content_key: str = 'content'
                 ) -> pd.DataFrame:
    """Add one 0/1 column per keyword marking the rows that mention it.

    A row counts as mentioning a keyword when the keyword occurs in the
    ``content_key`` column, case-insensitively, as a whole word or phrase.
    Keywords are matched literally, so regex metacharacters in them are safe.
    Rows whose content is missing get 0.

    Args:
        df: Table holding the text column. It is modified in place.
        keywords: Keywords to encode; each becomes a column name.
        content_key: Name of the text column to search.

    Returns:
        The same ``df`` object, with the keyword columns added.
    """
    import re  # local import: `re` is not imported at file level

    # Read the text column once, so a keyword that happens to equal
    # ``content_key`` cannot overwrite the text being searched.
    text = df[content_key]

    for keyword in keywords:
        if not keyword:
            # An empty pattern would match every row; treat it as no mention.
            df[keyword] = 0
            continue

        # "\b" only makes sense next to a word character: for a keyword such
        # as "C++" a trailing "\b" would demand a word character right after
        # the "+", so the boundary is applied only where it is meaningful.
        prefix = r'\b' if re.match(r'\w', keyword[0]) else ''
        suffix = r'\b' if re.match(r'\w', keyword[-1]) else ''
        pattern = f'{prefix}{re.escape(keyword)}{suffix}'

        df[keyword] = text.str.contains(
            pattern, case=False, regex=True, na=False
        ).astype(int)

    return df

def get_keyword_counts(df : pd.DataFrame,
                       keywords : set
                       ):
    """Rank keywords by how many rows mention them.

    Sums the column of every keyword in ``df`` and keeps the keywords whose
    total is greater than zero, ordered from most to fewest mentions.

    Returns:
        A dict with two parallel lists: ``"keywords"`` (column names) and
        ``"mentions"`` (their totals).
    """
    totals = df[list(keywords)].sum()

    # Drop keywords nobody mentioned, then rank the rest, highest total first
    mentioned = totals[totals > 0]
    ranked = mentioned.sort_values(ascending=False)

    return dict(
        keywords=ranked.index.tolist(),
        mentions=ranked.values.tolist(),
    )

In [77]:
# onehotencode adds the keyword columns to the frame it is given and returns
# that same frame, so `x` and `comments_df` refer to one DataFrame here.
x = onehotencode(comments_df, all_keywords)
x

Unnamed: 0,post_id,comment_id,author,content,upvotes,created_utc,replies,dark room,Tramadol,Ibuprofen,...,Caffeine,Magnesium supplements,Propranolol,Acetaminophen,Amitriptyline,CBD balm,Promethazine,cranial sacral massage,Peppermint oil,ice packs
0,1hr5cfz,m4vyllv,sharkeyes,Ice cold coke while taking a hot shower in a d...,17,1735758120,[<redditsearch.reddit.scrape.Reply object at 0...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1hr5cfz,m4v1lip,Complete-Extension-8,Ubrelvy and wrapping my head tightly,10,1735747191,[<redditsearch.reddit.scrape.Reply object at 0...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1hr5cfz,m4vfocc,molluscstar,Starbucks iced brown sugar oat shaken espresso...,8,1735752034,[<redditsearch.reddit.scrape.Reply object at 0...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1hr5cfz,m4vgz7m,CampadLovesSpace,"Salt, ubrelvy, and a metric fuck ton of water",7,1735752467,[<redditsearch.reddit.scrape.Reply object at 0...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1hr5cfz,m4vzyo9,Ok-Dot-9036,"Nurtec, but my insurance has decided that I ca...",5,1735758557,[<redditsearch.reddit.scrape.Reply object at 0...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,19brueh,kiyfcbn,elfsteel,- eletriptan \n- chug cold gatorade or water\n...,1,1705877176,[],1,0,0,...,1,0,0,0,0,0,0,0,0,0
174,19brueh,kiyphao,CommanderNat,Excedrine or my triptan\nDark\nNo glasses\nCoo...,1,1705880837,[],0,0,0,...,0,0,0,0,0,0,0,0,0,0
175,19brueh,kiyv68v,butterbean_11,"Two ibuprofen, one exedrin migraine (two round...",1,1705882961,[],0,0,1,...,0,0,0,0,0,0,0,0,0,0
176,19brueh,kiywylw,mostcommonhauntings,"Ice pack, heated blanket, zofran, Benadryl, el...",1,1705883620,[],0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [78]:
get_keyword_counts(x, all_keywords)

{'keywords': ['Sleep',
  'sleep',
  'Water',
  'Ibuprofen',
  'Caffeine',
  'Dark room',
  'Sumatriptan',
  'dark room',
  'Benadryl',
  'Nurtec',
  'Excedrin',
  'Ubrelvy',
  'Tylenol',
  'Magnesium',
  'Advil',
  'Electrolytes',
  'magnesium',
  'Zofran',
  'Naproxen',
  'Darkness',
  'Cefaly',
  'Naratriptan',
  'Aspirin',
  'paracetamol',
  'Fioricet',
  'Meditation',
  'Tramadol',
  'Ice packs',
  'peppermint oil',
  'Acetaminophen',
  'Gatorade',
  'Eletriptan',
  'ice packs',
  'Peppermint oil',
  'Salonpas',
  'Cambia',
  'Maxalt',
  'B2',
  'Amitriptyline',
  'Propranolol',
  'propranolol',
  'Ondansetron',
  'Rizatriptan',
  'Xanax',
  'Migraine cap',
  'Tiger Balm',
  'Omeprazole',
  'Meclizine',
  'codeine',
  'migraine cap',
  'BC Powder',
  'Fresh lemonade',
  'ASMR',
  'Omega-3',
  'Anarex',
  'Shockwave therapy',
  'CBD oil',
  'Allay lamp',
  'Vitamin D',
  'Diclofenac',
  'Aspercreme',
  'marijuana',
  'Singing bowls',
  'Pedialyte',
  'Avulux glasses',
  'Rice sock',