In [1]:
import pandas as pd
import re
from datetime import datetime
import numpy as np

df = pd.read_csv('reddit_posts_per_sub.csv')
df

Unnamed: 0,title,score,url,created_utc,created_time,author,comments_count,subreddit,text,top_comments
0,"Notion, we need to talk: without an offline mo...",5548,https://www.reddit.com/r/Notion/comments/kqlfo...,1.609804e+09,2021-01-04 23:53:58,JoJokerer,226,Notion,"An open letter to Notion,\n\nI've evaluated so...",This is the angriest upvote I’ve ever given. |...
1,I'm just going to leave this here...,4627,https://i.redd.it/p7kakryw7y5e1.png,1.733805e+09,2024-12-10 04:27:32,carlpadonosk,32,Notion,,Me scouring the internet for a custom icon to ...
2,ouch this one hurts :|,3719,https://i.imgur.com/K2MYOEf.png,1.606835e+09,2020-12-01 15:09:09,whoibrar,65,Notion,,"While getting into Notion, I was also sorting ..."
3,So true,3707,https://i.redd.it/e29f98hl0qo61.png,1.616480e+09,2021-03-23 06:20:13,Cam223,60,Notion,,"Weirdly enough, I hate using Notion as a note-..."
4,What do you mean this is not work,3584,https://i.imgur.com/4FUEZWH.jpg,1.623255e+09,2021-06-09 16:14:24,jerrygoyal,34,Notion,,Damn! I created new notion templates for proce...
...,...,...,...,...,...,...,...,...,...,...
3983,Physique Phriday,200,https://www.reddit.com/r/Fitness/comments/1i3d...,1.737108e+09,2025-01-17 10:00:42,AutoModerator,169,Fitness,**Welcome to the Physique Phriday thread**\n\n...,Been cutting hard and lifting since May of las...
3984,Victory Sunday,190,https://www.reddit.com/r/Fitness/comments/1i4v...,1.737281e+09,2025-01-19 10:00:42,AutoModerator,183,Fitness,**Welcome to the Victory Sunday Thread**\n\nIt...,Currently at the gym and just caught a glimpse...
3985,Victory Sunday,179,https://www.reddit.com/r/Fitness/comments/1hzj...,1.736676e+09,2025-01-12 10:00:40,AutoModerator,164,Fitness,**Welcome to the Victory Sunday Thread**\n\nIt...,I can finally curl 30lbs and squat 120lbs. I k...
3986,Gym Story Saturday,160,https://www.reddit.com/r/Fitness/comments/1i9h...,1.737789e+09,2025-01-25 07:09:30,FGC_Valhalla,193,Fitness,Hi! Welcome to your weekly thread where you ca...,This week the front desk people didn't show up...


In [4]:
def clean_text(text):
    """Clean Reddit post text for analysis"""
    if not isinstance(text, str):
        return ""
    
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^\w\s.,!?]', '', text)
    text = text.lower()
    text = ' '.join(text.split())
    return text

In [12]:
def filter_posts(df):
    """Filter out low-quality posts"""

    df = df[df['text'].apply(lambda x: len(str(x)) > 20)]
    df = df[~df['text'].str.contains(r'\[deleted\]|\[removed\]', na=False, regex=True)]
    df = df[df['score'] >= 1]
    if 'comments_count' in df.columns:
        df = df[df['comments_count'] >= 1]  
    return df

In [6]:
KEYWORD_DICT = {
    'base': {
        'recommend': 3, 'looking for': 3, 'alternative': 3, 'alternative to': 3,
        'pricing': 2, 'expensive': 2, 'trial': 2, 'free version': 2,
        'switch': 3, 'migrate': 3, 'replace': 3, 'better than': 2,
        'any app': 2, 'what tool': 3, 'need a tool': 3, 'suggestions': 2
    },
    'Artificial': {
        'ai': 4, 'automate': 3, 'llm': 2, 'agent': 3,
        'workflow': 2, 'gpt': 2, 'assistant': 2, 'automatic': 2,
        'integration': 2, 'robot': 2, 'model': 2, 'prompt': 1
    },
    'Notion': {
        'template': 2, 'database': 2, 'integrate': 2, 'learning curve': 2,
        'too complex': 2, 'customize': 1, 'plugin': 2, 'extension': 2,
        'mobile app': 2, 'offline': 2, 'sync': 2, 'backup': 1
    },
    'SmallBusiness': {
        'crm': 4, 'invoice': 3, 'client': 2, 'team': 2,
        'scale': 2, 'manage': 2, 'business': 2, 'software': 2,
        'solution': 2, 'platform': 2, 'system': 2, 'operation': 1
    },
    'Fitness': {
        'track': 3, 'habit': 2, 'journal': 2, 'reminder': 1,
        'routine': 2, 'progress': 2, 'goal': 2, 'workout': 2,
        'plan': 2, 'schedule': 2, 'monitor': 2, 'log': 2
    }
}

In [13]:
def detect_keywords(text, subreddit):
    """Detect relevant keywords in text with subreddit-specific weights"""
    if not isinstance(text, str):
        return [], 0
    text = text.lower()
    found_keywords = []
    total_score = 0
 
    for kw, weight in KEYWORD_DICT['base'].items():
        if kw in text:
            found_keywords.append(kw)
            total_score += weight
    if subreddit in KEYWORD_DICT:
        for kw, weight in KEYWORD_DICT[subreddit].items():
            if kw in text:
                found_keywords.append(kw)
                total_score += weight
    return list(set(found_keywords)), total_score

In [14]:
def calculate_recency_score(created_utc):
    """Calculate score based on post age (newer = higher score)"""
    try:
        post_date = datetime.fromtimestamp(created_utc)
    except:
        post_date = datetime.now()
    days_old = (datetime.now() - post_date).days
    recency_score = max(0, (30 - days_old) / 30 * 5)  # Max 5 points
    return round(recency_score, 2)



In [15]:
def calculate_engagement_score(score, comments_count):
    """Calculate score based on post engagement"""
    
    upvote_score = np.log1p(score) * 1.5  
    comment_score = np.log1p(comments_count) * 1.0 if pd.notna(comments_count) else 0
    return min(5, round(upvote_score + comment_score, 2))


In [10]:
def calculate_intent_score(row):
    """Calculate composite intent score (0-10 scale)"""
    keyword_score = row['keyword_score']
    recency_score = row['recency_score']
    engagement_score = row['engagement_score']
    

    composite_score = (keyword_score * 0.5) + (recency_score * 0.3) + (engagement_score * 0.2)
    normalized_score = min(10, max(0, composite_score * 2))
    
    return round(normalized_score, 1)

In [16]:
def process_posts(df):
    """Process raw posts through cleaning, filtering, and scoring"""

    df['text'] = df['text'].apply(clean_text)
    df['title'] = df['title'].apply(clean_text)
    

    df['full_text'] = df['title'] + ' ' + df['text']

    df = filter_posts(df)

    keyword_results = df.apply(
        lambda row: detect_keywords(row['full_text'], row['subreddit']), 
        axis=1
    )
    df['matched_keywords'] = keyword_results.apply(lambda x: x[0])
    df['keyword_score'] = keyword_results.apply(lambda x: x[1])
    
    df = df[df['keyword_score'] > 0]

    if 'created_utc' in df.columns:
        df['recency_score'] = df['created_utc'].apply(calculate_recency_score)
    else:
        df['recency_score'] = 2.5
    
    df['engagement_score'] = df.apply(
        lambda row: calculate_engagement_score(row['score'], row.get('comments_count', 0)),
        axis=1
    )

    df['intent_score'] = df.apply(
        lambda row: (row['keyword_score'] * 0.5 + 
                    row['recency_score'] * 0.3 + 
                    row['engagement_score'] * 0.2) * 2,
        axis=1
    )
    df['intent_score'] = df['intent_score'].clip(0, 10).round(1)

    df = df.sort_values('intent_score', ascending=False)
  
    final_columns = [
        'author', 'subreddit', 'title', 'url',
        'created_utc', 'intent_score', 'matched_keywords',
        'score', 'comments_count'
    ]
    final_columns = [col for col in final_columns if col in df.columns]
    
    return df[final_columns]
    
processed_df = process_posts(df)
processed_df.to_csv('reddit_intent_leads_scored_filtered.csv', index=False)

print(f"Processed {len(processed_df)} high-intent posts")
print("Top 5 posts by intent score:")
print(processed_df.head(5)[['subreddit', 'title', 'intent_score']])

Processed 1729 high-intent posts
Top 5 posts by intent score:
          subreddit                                              title  \
3980        Fitness  the tom platz experience pain, pleasure, and h...   
2626  SmallBusiness  a client wants me to quit my job and work for ...   
2597  SmallBusiness  own small flooring company, client ghosted me ...   
3527        Fitness  my guidereview for the most effective muscle g...   
2599  SmallBusiness  silicon valley bank to donate all fees from pp...   

      intent_score  
3980          10.0  
2626          10.0  
2597          10.0  
3527          10.0  
2599          10.0  
