In [15]:
import asyncio
import asyncpraw
from datetime import datetime
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import List, Optional
import json
import os

# Your Reddit API credentials
reddit_client_id = os.getenv("REDDIT_CLIENT_ID")
reddit_client_secret = os.getenv("REDDIT_CLIENT_SECRET")
reddit_user_agent = "Ideagen"

# Your NVIDIA API key
nvidia_api_key = os.getenv("NVIDIA_API_KEY")

# Subreddits to scrape
subreddits = [
    "r/ecommerce"
]

In [3]:
async def scrape_subreddit(reddit, subreddit_name, limit=10):
    subreddit = await reddit.subreddit(subreddit_name.replace('r/', ''))
    posts = []

    async for submission in subreddit.hot(limit=limit):
        posts.append({
            'subreddit': subreddit_name,
            'title': submission.title,
            'body': submission.selftext,
            'score': submission.score,
            'url': submission.url,
            'created': datetime.fromtimestamp(submission.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
        })

    return posts

async def scrape_reddit():
    reddit = asyncpraw.Reddit(
        client_id=reddit_client_id,
        client_secret=reddit_client_secret,
        user_agent=reddit_user_agent
    )

    all_posts = []
    tasks = []

    for subreddit in subreddits:
        task = asyncio.create_task(scrape_subreddit(reddit, subreddit))
        tasks.append(task)

    results = await asyncio.gather(*tasks)
    for result in results:
        all_posts.extend(result)

    await reddit.close()
    return all_posts


In [4]:
"""
Extract pain points from posts
"""
class RedditPost(BaseModel):
    subreddit: str
    title: str
    body: str
    score: int
    url: str
    created: str
    tags: List[str] = Field(default_factory=list)
    pain_point: Optional[str] = Field(description="The problem encountered from the post")
    audience_affected: Optional[str] = Field(description="The audience affected by this problem")
    problem_why: Optional[str] = Field(description="Why the problem exists / root cause")
    cost_impact: Optional[str] = Field(description="Cost or impact of the problem (time, money)")
    detailed_examples: Optional[str] = Field(description="Detailed examples / user story (if possible)")
    stats_fact: Optional[str] = Field(description="Insightful facts about the problem / any insightful numbers or facts used in the post")
    combined_insights: Optional[str] = Field(description="A long analysis paragraph combining all points from pain point, audience, why it happens, its impact, its examples and stats")


In [5]:
client = OpenAI()

In [6]:
def filter_posts(posts):
    filtered_posts = []
    for post in posts:
        content = f"Title: {post['title']}\n\nBody: {post['body']}"
        
        # Step 1: Classification prompt
        classification_prompt = f"""Classify the following Reddit post into one or more of these categories: 
        [solution_request, pain_and_anger, money_talks, ideas, opportunities, advice_request], reply "Yes" if it does.
        If the post doesn't fit any of these categories, respond with "None".

        Post:
        {content[:500]}

        Classification:"""

        classification = client.chat.completions.create(
            # model="nvidia/llama-3.1-nemotron-70b-instruct",
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": classification_prompt}],
        )

        tags = classification.choices[0].message.content.strip()

        # If the post doesn't fit any category, skip it
        if tags == "None":
            continue

        filtered_posts.append(post)
    
    return filtered_posts

In [18]:
from pydantic import ValidationError


def extract_pain_points(posts):
    
    refined_posts = []
    for post in posts:
        content = f"Title: {post['title']}\n\nBody: {post['body']}"
        
        analysis_prompt = f"""Analyze the following Reddit post and provide insights:

        1. Tag the post with one or more of these tags: [solution_request, pain_and_anger, money_talks, ideas, opportunities, advice_request]
        2. If the post discusses a business-related pain point, provide a high-level summary of the pain point in one sentence.
        3. If a pain point is identified, provide detailed insights:
           - Audience affected
           - Why the problem exists / root cause
           - Cost or impact of the problem (time, money)
           - A user story describing the situation
           - Detailed examples / user story (if possible)
           - Relevant statistics (if possible)
        4. Provide a combined paragraph that includes all the above insights in a coherent narrative.

        Post:
        {content}

        Response format:
        {RedditPost.model_json_schema()}
        
        Insights:"""

        print("Calling model for " + post["title"])
        completion = client.chat.completions.create(
            # model="nvidia/llama-3.1-nemotron-70b-instruct",
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": analysis_prompt}],
        )

        response = completion.choices[0].message.content.strip()
        print("Got response")

        try:
            # First, try to parse the response as JSON
            response_dict = json.loads(response)
        except json.JSONDecodeError:
            # If it's not valid JSON, try to parse it manually
            response_dict = {}
            for line in response.split('\n'):
                if ':' in line:
                    key, value = line.split(':', 1)
                    response_dict[key.strip()] = value.strip()

        # Add missing fields from the original post
        response_dict['subreddit'] = post['subreddit']
        response_dict['title'] = post['title']
        response_dict['body'] = post['body']
        response_dict['score'] = post['score']
        response_dict['url'] = post['url']
        response_dict['created'] = post['created']
        
        try:
            refined_post = RedditPost(**response_dict)
            print("Refined post")
        except ValidationError as e:
            print(f"Validation error: {e}")
            refined_post = None

        refined_posts.append(refined_post)

    return refined_posts


In [9]:
posts = await scrape_reddit()
print("Fetched posts")
print("Calling llms")
posts = filter_posts(posts)
print("Filtered posts", posts)
post = posts[:3]
print(posts)

Fetched posts
Calling llms
Filtered posts [{'subreddit': 'r/ecommerce', 'title': 'Now what?', 'body': 'Hey guys!\n\nI’m currently in China to visit the Canton Fair.\nMy plan was to come here, visit the fair as well as the infamous Huaqiangbei electronic markets in Shenzhen and find a product to sell online.\n\nNow my problem is that I just found too many things. The selection of products that could potentially sell is so extremely huge that I’m in a kind of analysis paralysis. \n\nThere are a million different kinds of drones, smart watches, headphones, powerbanks, CPUs, GPUs… you name it. \n\nWhat would you do in such kind of a situation? \nHow would you now choose the right product? Which filters would you use to come closer to a product that might be a good seller? ', 'score': 16, 'url': 'https://www.reddit.com/r/ecommerce/comments/1g8s891/now_what/', 'created': '2024-10-21 16:20:03'}, {'subreddit': 'r/ecommerce', 'title': 'Learn coding/html vs hiring web developers ', 'body': 'Want

In [10]:
len(post)

3

In [19]:
res = await extract_pain_points(post)
res

Calling model for Now what?
Got response
Validation error: 7 validation errors for RedditPost
pain_point
  Field required [type=missing, input_value={'"properties"': '{', '"s...: '2024-10-21 16:20:03'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
audience_affected
  Field required [type=missing, input_value={'"properties"': '{', '"s...: '2024-10-21 16:20:03'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
problem_why
  Field required [type=missing, input_value={'"properties"': '{', '"s...: '2024-10-21 16:20:03'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
cost_impact
  Field required [type=missing, input_value={'"properties"': '{', '"s...: '2024-10-21 16:20:03'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
detailed_examples
  Field required [type=missing, input_value={'"properties"': '{', '"s..

TypeError: object list can't be used in 'await' expression

In [None]:

def cluster_pain_points(pain_points):
    client = OpenAI(
        base_url="https://integrate.api.nvidia.com/v1",
        api_key=nvidia_api_key
    )

    pain_point_list = "\n".join([f"- {p['pain_point']}" for p in pain_points])
    
    prompt = f"""Given the following list of business pain points, cluster them into 5-7 groups based on similarity. 
    Provide a label for each cluster and list the pain points that belong to it.

    Pain points:
    {pain_point_list}

    Clustered results:"""

    completion = client.chat.completions.create(
        model="nvidia/llama-3.1-nemotron-70b-instruct",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.5,
        max_tokens=1000
    )

    return completion.choices[0].message.content.strip()


In [None]:

def generate_report(clustered_results, pain_points):
    client = OpenAI(
        base_url="https://integrate.api.nvidia.com/v1",
        api_key=nvidia_api_key
    )

    prompt = f"""Based on the following clustered results of business pain points, generate a comprehensive market research report. 
    For each cluster, provide:
    1. A summary of the main issue
    2. Potential business opportunities addressing this pain point
    3. Recommendations for further investigation

    Clustered results:
    {clustered_results}

    Market Research Report:"""

    completion = client.chat.completions.create(
        model="nvidia/llama-3.1-nemotron-70b-instruct",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
        max_tokens=2000
    )

    report = completion.choices[0].message.content.strip()

    # Add links to original posts
    for cluster in clustered_results.split("\n\n"):
        cluster_name = cluster.split("\n")[0]
        report += f"\n\nRelevant posts for {cluster_name}:\n"
        for pain_point in pain_points:
            if pain_point['pain_point'] in cluster:
                report += f"- [{pain_point['pain_point']}]({pain_point['url']}) (Score: {pain_point['score']})\n"

    return report

In [3]:
async def main():
    # Scrape Reddit
    posts = await scrape_reddit()
    print(f"Scraped {len(posts)} posts from Reddit")

    # Extract pain points
    pain_points = extract_pain_points(posts)
    print(f"Extracted {len(pain_points)} pain points")

    # Cluster pain points
    clustered_results = cluster_pain_points(pain_points)
    print("Clustered pain points")

    # Generate report
    report = generate_report(clustered_results, pain_points)
    print("Generated market research report")

    # Save report to file
    with open("market_research_report.md", "w") as f:
        f.write(report)

    print("Report saved to market_research_report.md")

In [4]:
await main()

Scraped 40 posts from Reddit
Extracted 40 pain points
Clustered pain points
Generated market research report
Report saved to market_research_report.md
