In [5]:
import json
import time
import os
import praw
import re
from datetime import datetime
from dotenv import load_dotenv

# Populate the process environment from the credentials file before reading it.
load_dotenv("../config/cred.env")

# Reddit API credentials; each value is None when its variable is not set.
CLIENT_ID = os.environ.get("CLIENT_ID")
CLIENT_SECRET = os.environ.get("CLIENT_SECRET")
USERNAME = os.environ.get("REDDIT_USERNAME")
PASSWORD = os.environ.get("REDDIT_PASSWORD")
USER_AGENT = os.environ.get("USER_AGENT")

# Shared PRAW client used by get_author_history().
reddit = praw.Reddit(
    user_agent=USER_AGENT,
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    username=USERNAME,
    password=PASSWORD,
)

# Disaster-related subreddits, written in the "r/<name>" form that
# calculate_credibility compares against a post's subreddit_name_prefixed.
relevant_subreddits = [
    "r/aircrashinvestigation",
    "r/StormComing",
    "r/NaturalDisastersToday",
    "r/naturesfury",
    "r/DisasterUpdate",
    "r/hurricane",
    "r/Earthquakes",
    "r/tornado",
    "r/caraccidents",
]

def get_author_history(author_name):
    """Gather account statistics for an author, for use in credibility scoring.

    Looks the author up through the module-level ``reddit`` client and
    inspects up to 10 of their newest submissions.

    Returns:
        A dict with keys ``account_age_days`` (float), ``num_recent_posts``
        (int) and ``posts_in_multiple_subreddits`` (True when the inspected
        submissions span more than 5 distinct subreddits), or ``None`` when
        any step raises; the error is printed rather than propagated.
    """
    seconds_per_day = 60 * 60 * 24
    try:
        user = reddit.redditor(author_name)
        age_in_days = (time.time() - user.created_utc) / seconds_per_day
        latest_submissions = list(user.submissions.new(limit=10))
        distinct_subreddits = {
            submission.subreddit.display_name
            for submission in latest_submissions
        }
        history = {
            "account_age_days": age_in_days,
            "num_recent_posts": len(latest_submissions),
            "posts_in_multiple_subreddits": len(distinct_subreddits) > 5,
        }
    except Exception as e:
        print(f"Error fetching author history: {e}")
        return None
    return history

def classify_post_intent(post):
    """Classify a post's intent from keywords in its text or from its flair.

    The title and selftext are joined, lower-cased and stripped of URLs.
    Intents are tried in the order they are declared below; the first intent
    whose flair matches exactly, or one of whose keywords occurs in the text,
    wins.

    Keywords are matched at the start of a word, so "how" no longer fires
    inside "show" and "what" no longer fires inside "somewhat", while
    inflected forms such as "flooding" or "donated" still match. Keywords
    that start with punctuation (the "?") are matched anywhere.

    Args:
        post: Mapping with optional ``title``, ``selftext`` and
            ``link_flair_text`` entries. Missing or ``None`` values are
            treated as empty strings.

    Returns:
        The matching intent name, or ``"unknown"`` when nothing matches.
    """
    keywords = {
        "question": ["who", "what", "where", "when", "why", "how", "?"],
        "scam": ["donate", "fundraiser", "help fund", "support victims"],
        "political": ["government failure", "blame", "policy", "should have prevented"],
        "emotional": ["thoughts and prayers", "stay strong", "heartbreaking"],
        "informational": ["guide", "analysis", "report", "information"],
        "damage_report": ["casualties", "damage", "affected area"],
        "prevention": ["prepare", "emergency kit", "evacuation", "safety measures"],
        "awareness": ["earthquake", "flood", "hurricane", "disaster", "storm", "alert"],
    }

    # JSON nulls arrive as None; `or ""` keeps the concatenation from raising.
    title = post.get("title") or ""
    body = post.get("selftext") or ""
    text = f"{title} {body}".lower()
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    flair = (post.get("link_flair_text") or "").lower()

    def mentions(keyword):
        # Anchor alphanumeric keywords at a word start; leave the end open so
        # suffixes ("flood" -> "flooding") are still recognised.
        anchor = r"\b" if keyword[0].isalnum() else ""
        return re.search(anchor + re.escape(keyword), text) is not None

    for intent, words in keywords.items():
        if flair == intent or any(mentions(word) for word in words):
            return intent

    return "unknown"

def calculate_credibility(post):
    """Compute a credibility score for a Reddit post, printing each adjustment.

    The score starts at 0 and is adjusted by:
      * author history from get_author_history (account older than 180 days,
        activity not spread over many subreddits, more than 5 recent posts),
      * the author's premium flag,
      * the subreddit (listed in relevant_subreddits, or otherwise more than
        10000 subscribers),
      * the intent returned by classify_post_intent,
      * engagement (comment count, upvote ratio together with upvotes),
      * penalties for edited, reported, or removed/banned posts.

    Numeric fields that are missing or ``None`` (JSON null) count as 0
    instead of raising on comparison.

    Args:
        post: Mapping of post fields as loaded from JSON.

    Returns:
        The integer credibility score.
    """
    score = 0
    print(f"\nProcessing post: {post.get('title', 'No Title')}")

    author_data = get_author_history(post["author"]) if post.get("author") else None
    intent = classify_post_intent(post)

    if author_data:
        if author_data["account_age_days"] > 180:
            score += 1
            print(f"  +1: Account age > 180 days ({int(author_data['account_age_days'])} days)")
        if not author_data["posts_in_multiple_subreddits"]:
            score += 1
            print("  +1: Author posts in limited subreddits")
        if author_data["num_recent_posts"] > 5:
            score += 1
            print(f"  +1: Author has {author_data['num_recent_posts']} recent posts")

    if post.get("author_premium"):
        score += 1
        print("  +1: Author has premium membership")

    # `or 0` covers both a missing key and an explicit null value.
    subscribers = post.get("subreddit_subscribers") or 0
    if post.get("subreddit_name_prefixed") in relevant_subreddits:
        score += 1
        print(f"  +1: Post in relevant subreddit {post['subreddit_name_prefixed']}")
    elif subscribers > 10000:
        score += 1
        print(f"  +1: Subreddit has {subscribers} subscribers")

    intent_scores = {
        "question": 2, "scam": -3, "political": -2, "emotional": 0,
        "informational": 3, "damage_report": 5, "prevention": 2, "awareness": 3
    }

    # Intents without an entry (e.g. "unknown") leave the score unchanged.
    intent_score = intent_scores.get(intent, 0)
    score += intent_score
    print(f"  {intent_score:+d}: Intent classified as '{intent}'")

    num_comments = post.get("num_comments") or 0
    upvote_ratio = post.get("upvote_ratio") or 0
    ups = post.get("ups") or 0

    if num_comments > 5:
        score += 1
        print(f"  +1: Post has {num_comments} comments")
    if upvote_ratio > 0.7 and ups > 50:
        score += 2
        print(f"  +2: Upvote ratio {upvote_ratio}, {ups} upvotes")
    if post.get("edited"):
        score -= 1
        print("  -1: Post has been edited")
    if post.get("mod_reports") or post.get("user_reports"):
        score -= 2
        print("  -2: Post has been reported by users or moderators")
    if post.get("removed_by") or post.get("banned_by"):
        score -= 3
        print("  -3: Post has been removed or banned")

    print(f"  Final Score: {score}\n")
    return score


def process_credible_posts(threshold=6):
    """Score the classified posts, save the credible ones, and update the master file.

    Steps:
      1. Load ``classified_reddit_posts.json``. If it is missing or does not
         parse as JSON (for example because it is still being written), print
         a warning and return without touching any output file.
      2. Store ``credibility_score`` on every post and keep those scoring at
         least ``threshold``.
      3. Overwrite ``credible_reddit_posts.json`` with the kept posts.
      4. Append to ``all_processed_posts.json`` every kept post whose ``url``
         is not already present, including duplicates within the same batch.

    Args:
        threshold: Minimum score for a post to count as credible.

    Returns:
        None.

    Raises:
        KeyError: If a credible or previously stored post has no ``url``.
        json.JSONDecodeError: If the master file exists but is not valid
            JSON; it is deliberately not overwritten in that case.
    """
    try:
        with open("classified_reddit_posts.json", "r", encoding="utf-8") as file:
            reddit_posts = json.load(file)
    except FileNotFoundError:
        print("⚠️ classified_reddit_posts.json not found. Waiting for new data...")
        return
    except json.JSONDecodeError as e:
        # A partially written file would otherwise crash the caller's polling loop.
        print(f"⚠️ classified_reddit_posts.json is not valid JSON ({e}). Skipping this run...")
        return

    credible_posts = []

    for post in reddit_posts:
        post["credibility_score"] = calculate_credibility(post)
        if post["credibility_score"] >= threshold:
            credible_posts.append(post)

    with open("credible_reddit_posts.json", "w", encoding="utf-8") as outfile:
        json.dump(credible_posts, outfile, indent=4)

    print(f"✅ Processed {len(reddit_posts)} posts. {len(credible_posts)} are credible.")

    # Check if the master file exists
    master_file = "all_processed_posts.json"

    if os.path.exists(master_file):
        with open(master_file, "r", encoding="utf-8") as file:
            all_posts = json.load(file)
    else:
        all_posts = []

    # Convert existing post URLs to a set to prevent duplicates
    existing_post_urls = {post["url"] for post in all_posts}

    # Add only new posts that aren't already in the master file. The set is
    # updated as we go so a URL repeated within this batch is stored once.
    for post in credible_posts:
        if post["url"] not in existing_post_urls:
            all_posts.append(post)
            existing_post_urls.add(post["url"])

    # Save back to the master file
    with open(master_file, "w", encoding="utf-8") as file:
        json.dump(all_posts, file, indent=4)

    print("\n✅ All posts are now stored permanently in all_processed_posts.json")


# Poll classified_reddit_posts.json every 10 seconds and re-run the
# credibility check whenever its modification time differs from the one
# recorded after the previous run.
last_mod_time = 0  # nothing processed yet

while True:
    try:
        current_mod_time = os.path.getmtime("classified_reddit_posts.json")

        if current_mod_time == last_mod_time:
            print("⏳ No changes detected. Waiting...")
        else:
            print("🔄 Detected change in classified_reddit_posts.json! Re-running credibility check...")
            process_credible_posts()
            # Recorded only after processing returns, so a run that raises is retried.
            last_mod_time = current_mod_time

    except FileNotFoundError:
        print("⚠️ classified_reddit_posts.json not found. Retrying...")

    time.sleep(10)

🔄 Detected change in classified_reddit_posts.json! Re-running credibility check...
✅ Processed 0 posts. 0 are credible.

✅ All posts are now stored permanently in all_processed_posts.json
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...
⏳ No changes detected. Waiting...
🔄 Detected change in classified_reddit_posts.json! Re-running credibility check...

Processing post: Severe Flooding in Mumbai!
  +1: Author posts in limited subreddits
  +1: Author has 7 recent posts
  +3: Intent classified as 'awareness'
  Final Score: 5

✅ Processed 1 posts. 0 are credible.

✅ All posts are now stored permanently in all_processed_posts.json
🔄 Detected change in classified_reddit_posts.json! Re-running credibility check...

Processing post: Severe Flooding in Mumbai!
  +1: Author posts in limited subreddits
  +1: Author has 7 recent posts
  +3: Intent classified as 'awareness'
  Final Score: 5


Processing post: Kansas Hit by Massive Tornado!
  +1: Author posts in limited subreddits

KeyboardInterrupt: 