# Reddit Real-Time Analytics - Producer

**Team Members:** 
- Alina Insam
- Sumedh Bamane
- Rafael Machado Da Rocha
- Kaan Ak

**Project Description:** Real-time analysis of Germany-related discussions on Reddit using Spark Streaming. This producer streams comments from multiple subreddits, filters for Germany-related content, and sends structured data via socket to a Spark consumer for real-time analytics including TF-IDF analysis, sentiment analysis, and reference extraction.

**Topic Focus:** German Politics and Social Issues
**Subreddits:** europe, worldnews, politics, germany, de, ask_europe
**Keywords:** German politics, immigration, elections, social issues, economy, historical events

**Architecture Overview:**
1. **Data Source**: Reddit API via PRAW
2. **Filtering**: Germany-related keyword matching
3. **Processing**: Sentiment analysis, reference extraction
4. **Transmission**: Socket streaming to Spark consumer
5. **Analytics**: Real-time metrics and statistics

In [None]:
# Load the Reddit API credentials from the .env file
import os
from dotenv import load_dotenv

load_dotenv()

# Read all three credentials in one pass; os.getenv yields None for a
# variable that is not set.
CLIENT_ID, SECRET_TOKEN, USER_AGENT = (
    os.getenv(var_name)
    for var_name in ('REDDIT_CLIENT_ID', 'REDDIT_SECRET_TOKEN', 'REDDIT_USER_AGENT')
)

# Abort early when any credential is unset or empty.
if not all((CLIENT_ID, SECRET_TOKEN, USER_AGENT)):
    raise ValueError("Reddit API anahtarları eksik! Lütfen .env dosyasını doldurun.")

In [None]:
# Install the third-party packages this notebook uses and download the
# TextBlob corpora. The leading "!" hands each line to the shell, so this cell
# only runs inside IPython/Jupyter, not as a plain Python script.
!pip install praw textblob vaderSentiment python-dotenv
!python -m textblob.download_corpora

# Summary of the packages requested above; these lines are printed after the
# shell commands have already finished.
print("📦 Installing required packages...")
print("✅ PRAW - Python Reddit API Wrapper")
print("✅ TextBlob - Natural Language Processing")
print("✅ VADER Sentiment - Sentiment Analysis")
print("✅ python-dotenv - Environment Variable Management")

[33mDEPRECATION: Loading egg at /opt/bitnami/python/lib/python3.11/site-packages/pip-23.3.2-py3.11.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Import all required libraries
import praw
import time
import datetime
import socket
import json
import re
import threading
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import logging
import os
from dotenv import load_dotenv

# Pull the variables declared in the .env file into the process environment
load_dotenv()

# Root logging at INFO level, plus the logger shared by the cells below
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Status lines confirming the setup steps above ran
for status_line in (
    "📚 All libraries imported successfully!",
    "🔧 Logger configured",
):
    print(status_line)

In [None]:
# Initialize the Reddit API client from the credentials loaded from .env
reddit = praw.Reddit(client_id=CLIENT_ID,
                     client_secret=SECRET_TOKEN,
                     user_agent=USER_AGENT)

print("✅ Reddit API initialized successfully!")
print(f"🔑 Connected as: {USER_AGENT}")

# Test Reddit connection.
# Only a client id/secret is supplied (no username/password), so PRAW operates
# in read-only mode. In that mode user.me() returns None instead of raising,
# which must not be reported as a verified connection.
try:
    authenticated_user = reddit.user.me()
except Exception as e:
    print("⚠️ Reddit API connection test failed - continuing with read-only access")
    print(f"   Error: {e}")
else:
    if authenticated_user is None:
        print("ℹ️ Reddit API running in read-only mode - credentials are checked on the first request")
    else:
        print("🌐 Reddit API connection verified!")

In [None]:
# Socket connection with retry logic
def connect_socket(host='localhost', port=9998, max_attempts=None, delay=None):
    """Open a TCP connection to the consumer, retrying on failure.

    Args:
        host: Consumer host name. Defaults to ``localhost`` (changed from
            ``host.docker.internal`` for local testing).
        port: Consumer TCP port.
        max_attempts: Number of connection attempts. When ``None`` the value
            of ``CONFIG['MAX_RECONNECT_ATTEMPTS']`` is used if available,
            otherwise 5. At least one attempt is always made.
        delay: Seconds to sleep between attempts. When ``None`` the value of
            ``CONFIG['RECONNECT_DELAY']`` is used if available, otherwise 5.

    Returns:
        A connected ``socket.socket``.

    Raises:
        OSError: The error of the last attempt when every attempt failed.
    """
    # CONFIG lives in another notebook cell and may not exist yet (or may have
    # been rebound without the retry keys), so fall back to built-in defaults.
    try:
        settings = CONFIG
    except NameError:
        settings = {}
    if max_attempts is None:
        max_attempts = settings.get('MAX_RECONNECT_ATTEMPTS', 5)
    if delay is None:
        delay = settings.get('RECONNECT_DELAY', 5)
    # Guarantee at least one attempt so the function never returns None.
    max_attempts = max(1, int(max_attempts))

    for attempt in range(1, max_attempts + 1):
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            sock.connect((host, port))
        except OSError as e:
            # Release the file descriptor of the failed attempt before retrying.
            sock.close()
            logger.error(f"Connection attempt {attempt} failed: {e}")
            if attempt == max_attempts:
                raise
            time.sleep(delay)
        else:
            logger.info(f"Connected to {host}:{port}")
            return sock

# Initial connection. `s` is the module-level socket that the streaming cells
# further down write to, and rebind when they reconnect.
s = connect_socket()
print("🔌 Socket connection established!")

In [None]:
# Germany-related search terms, grouped by theme. All terms are lowercase
# because they are compared against lowercased comment text.
_KEYWORD_GROUPS = {
    "country_and_cities": [
        "germany", "german", "deutschland", "berlin",
        "munich", "hamburg", "cologne", "frankfurt",
    ],
    "political_figures": [
        "scholz", "german chancellor", "merkel", "habeck",
        "lindner", "weidel", "höcke",
    ],
    "political_parties": [
        "spd", "afd", "cdu", "csu", "greens",
        "green party", "fdp", "die linke", "bsw",
    ],
    "political_terms": [
        "far-right", "right-wing", "left-wing",
        "german election", "bundestag", "bundesrat",
    ],
    "military_and_institutions": [
        "bundeswehr", "german army", "nato germany", "eu germany",
    ],
    "social_issues": [
        "refugees in germany", "immigration germany",
        "asylum germany", "integration germany",
    ],
    "economy": [
        "german economy", "german gdp", "german industry", "german exports",
    ],
    "historical": [
        "nazi", "holocaust", "east germany", "west germany", "reunification",
    ],
    "current_issues": [
        "german energy", "german climate", "german covid", "german healthcare",
    ],
}

# Flat list used for matching; groups are concatenated in declaration order.
GERMANY_KEYWORDS = [
    term for group_terms in _KEYWORD_GROUPS.values() for term in group_terms
]

# Configuration
# NOTE(review): 'ask_europe' may not be the real subreddit name (the community
# appears to be called AskEurope) - confirm before relying on it.
CONFIG = dict(
    SUBREDDITS='+'.join(
        ['europe', 'worldnews', 'politics', 'germany', 'de', 'ask_europe']
    ),
    BUFFER_SIZE=100,
    WINDOW_SECONDS=60,
    MAX_RECONNECT_ATTEMPTS=5,
    RECONNECT_DELAY=5,
)

print(f"🔍 Germany keywords loaded: {len(GERMANY_KEYWORDS)} terms")
print(f"📡 Target subreddits: {CONFIG['SUBREDDITS']}")

In [None]:
def is_about_germany(text):
    """Return True when *text* contains any entry of GERMANY_KEYWORDS.

    Matching is a case-insensitive substring test. Empty input and comments
    whose stripped length is below 10 characters are rejected up front.
    """
    if not text:
        return False
    if len(text.strip()) < 10:  # too short to be worth analysing
        return False

    haystack = text.lower()
    for term in GERMANY_KEYWORDS:
        if term in haystack:
            return True
    return False

def _trim_url(url):
    """Strip punctuation that the greedy URL pattern swallowed at the end."""
    while url:
        last = url[-1]
        if last in '.,;:!?\'"':
            url = url[:-1]
        # Closing brackets are dropped only when unbalanced, so URLs such as
        # .../wiki/Berlin_(disambiguation) stay intact.
        elif last == ')' and url.count(')') > url.count('('):
            url = url[:-1]
        elif last == ']' and url.count(']') > url.count('['):
            url = url[:-1]
        else:
            break
    return url


def extract_references(text):
    """Extract user mentions, subreddit references, and URLs from *text*.

    Args:
        text: Comment body; may be empty or ``None``.

    Returns:
        A tuple ``(user_mentions, subreddit_refs, urls)`` of three lists of
        strings. Mentions and subreddit names are returned without their
        ``u/`` / ``r/`` prefix. All three lists are empty for empty input.
    """
    if not text:
        return [], [], []

    # User mentions (u/username or /u/username). The look-behind rejects a
    # prefix glued to a preceding word character, so "menu/item" is not read
    # as a mention of user "item".
    user_mentions = re.findall(
        r'(?<![A-Za-z0-9_])/?u/([A-Za-z0-9_-]+)', text, re.IGNORECASE
    )

    # Subreddit references (r/subreddit or /r/subreddit), same boundary rule.
    # Path segments of links such as reddit.com/r/name still count.
    subreddit_refs = re.findall(
        r'(?<![A-Za-z0-9_])/?r/([A-Za-z0-9_-]+)', text, re.IGNORECASE
    )

    # URLs (http/https). Comment text is markdown, so a raw match often ends
    # with ")" or sentence punctuation; trim that and drop bare schemes.
    urls = []
    for raw_url in re.findall(r'https?://[^\s]+', text):
        cleaned = _trim_url(raw_url)
        if not cleaned.endswith('://'):
            urls.append(cleaned)

    return user_mentions, subreddit_refs, urls

# Shared VADER analyzer, created on first use so it is not rebuilt for every
# comment.
_SENTIMENT_ANALYZER = None


def get_sentiment(text):
    """Score *text* with VADER.

    Args:
        text: Comment body to analyse.

    Returns:
        A dict with the keys ``compound``, ``positive``, ``negative`` and
        ``neutral``. If the analysis raises, the error is logged and a fully
        neutral result (``neutral`` = 1, everything else 0) is returned.
    """
    global _SENTIMENT_ANALYZER
    try:
        # Construction stays inside the try so a failure here still yields the
        # neutral fallback rather than an exception.
        if _SENTIMENT_ANALYZER is None:
            _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
        scores = _SENTIMENT_ANALYZER.polarity_scores(text)
        return {
            'compound': scores['compound'],
            'positive': scores['pos'],
            'negative': scores['neg'],
            'neutral': scores['neu']
        }
    except Exception as e:
        logger.error(f"Sentiment analysis error: {e}")
        return {'compound': 0, 'positive': 0, 'negative': 0, 'neutral': 1}

def create_message(comment):
    """Build the JSON-ready payload for one Reddit comment.

    The payload combines the comment's own fields with the references found
    by extract_references(), the scores from get_sentiment(), simple length
    metrics, and the local creation time of the message. Returns None (after
    logging the error) when anything goes wrong while building it.
    """
    try:
        mentions, sub_refs, links = extract_references(comment.body)
        sentiment_scores = get_sentiment(comment.body)

        message = {
            "id": comment.id,
            "text": comment.body,
            "created_utc": comment.created_utc,
            # A missing author is reported with the "[deleted]" placeholder.
            "author": str(comment.author) if comment.author else "[deleted]",
            "subreddit": str(comment.subreddit),
            "score": comment.score,
            "link": f"https://www.reddit.com{comment.permalink}",
            "user_mentions": mentions,
            "subreddit_references": sub_refs,
            "urls": links,
            "sentiment": sentiment_scores,
            "word_count": len(comment.body.split()),
            "char_count": len(comment.body),
            "timestamp": datetime.datetime.now().isoformat()
        }
        return message
    except Exception as e:
        logger.error(f"Error creating message: {e}")
        return None

print("🔧 Processing functions defined successfully!")

In [None]:
# Build the subreddit handle that the streaming cells iterate over
target_subreddits = CONFIG['SUBREDDITS']
subreddit = reddit.subreddit(target_subreddits)

# Report the monitored subreddits both in the log and in the cell output
logger.info(f"Monitoring subreddits: {target_subreddits}")
print(f"📡 Monitoring subreddits: {target_subreddits}")
print("🔄 Stream setup complete - ready to start!")

In [None]:
import datetime
import json
import socket
import time
import praw
import logging

# --- Basic streaming loop (minimal payload) ---
# This cell relies on the objects built by the earlier cells: `logger`,
# `CONFIG`, the authenticated `reddit` client and the connected socket `s`.
# It deliberately does NOT rebind CONFIG or create another praw.Reddit
# instance: a reduced CONFIG would remove the keys that connect_socket() and
# the analytics cell read, and placeholder credentials would replace the
# client configured from the .env file.

# --- Setup Reddit subreddit stream ---
subreddit = reddit.subreddit(CONFIG['SUBREDDITS'])
logger.info(f"Monitoring subreddits: {CONFIG['SUBREDDITS']}")

# --- Stream and Print ---
print("📡 Streaming Germany-related comments to socket...")

# Length of one summary window in seconds (60 when CONFIG does not define it).
window_seconds = CONFIG.get('WINDOW_SECONDS', 60)

buffer = []  # messages sent successfully during the current window
window_start = time.time()

for comment in subreddit.stream.comments(skip_existing=True):
    if is_about_germany(comment.body):
        msg = {
            "text": comment.body,
            "created_utc": comment.created_utc,
            "link": f"https://www.reddit.com{comment.permalink}"
        }

        # Send to socket as one newline-terminated JSON document
        try:
            print(f"🔼 Sending comment to socket: {msg['text'][:80]}...")
            s.sendall((json.dumps(msg) + "\n").encode("utf-8"))
        except Exception as e:
            print(f"⚠️ Error sending data: {e}")
        else:
            # Record only delivered messages so the "Sent N" summary is accurate
            buffer.append(msg)

    # Once per window, print a summary. The check runs only when a comment
    # arrives, so a quiet stream delays the summary.
    if time.time() - window_start >= window_seconds:
        print(f"\n⏰ Window at {datetime.datetime.now().strftime('%H:%M:%S')}")
        print(f"✅ Sent {len(buffer)} Germany-related comments:")
        for post in buffer:
            print(f" - {post['text'][:100]}...")
        buffer = []
        window_start = time.time()

📡 Streaming Germany-related comments to socket...
🔼 Sending comment to socket: Germany in the 30s was a very different country with a very different government...

⏰ Window at 09:03:46
✅ Sent 1 Germany-related comments:
 - Germany in the 30s was a very different country with a very different governmental setup.

Do you un...
🔼 Sending comment to socket: Ok, you want us to be aware without being enslaved -- what exactly enslaves us? ...

⏰ Window at 09:05:02
✅ Sent 1 Germany-related comments:
 - Ok, you want us to be aware without being enslaved -- what exactly enslaves us? If anyone enslaves u...
🔼 Sending comment to socket: Right, we should've used understanding against Nazi Germany.

I understand the s...


KeyboardInterrupt: 

In [None]:
# Enhanced streaming with comprehensive analytics.
# Streams new comments, forwards the Germany-related ones to the consumer
# socket as newline-delimited JSON, and prints a statistics summary once per
# window. Stops on Ctrl-C or on a stream error and always prints final totals.
print("📡 Starting Germany-focused Reddit Analytics Stream...")
print(f"🎯 Monitoring: {CONFIG['SUBREDDITS']}")
print(f"🔍 Keywords: {len(GERMANY_KEYWORDS)} terms")
print("="*60)

# Summary window length; falls back to 60 s if CONFIG lacks the key (another
# cell may have rebound CONFIG without it).
window_seconds = CONFIG.get('WINDOW_SECONDS', 60)

# Statistics tracking
stats = {
    'total_comments': 0,
    'germany_comments': 0,
    'sent_successfully': 0,
    'errors': 0,
    'start_time': time.time()
}

buffer = []  # Germany-related messages seen during the current window
window_start = time.time()

try:
    for comment in subreddit.stream.comments(skip_existing=True):
        stats['total_comments'] += 1

        if is_about_germany(comment.body):
            stats['germany_comments'] += 1

            # Create structured message (None when building it failed)
            msg = create_message(comment)
            if msg:
                # Serialise before touching the socket so the retry path below
                # always resends exactly this message.
                try:
                    payload = (json.dumps(msg) + "\n").encode("utf-8")
                except (TypeError, ValueError) as e:
                    payload = None
                    stats['errors'] += 1
                    logger.error(f"Could not serialise message: {e}")

                if payload is not None:
                    try:
                        s.sendall(payload)
                    except OSError as e:
                        stats['errors'] += 1
                        logger.error(f"Socket error: {e}")

                        # Try to reconnect and resend once. Catch Exception,
                        # not everything, so Ctrl-C during the retry delay
                        # still stops the stream.
                        try:
                            s.close()
                            s = connect_socket()
                            s.sendall(payload)
                        except Exception as retry_error:
                            logger.error(f"Failed to reconnect and send: {retry_error}")
                        else:
                            stats['sent_successfully'] += 1
                    else:
                        stats['sent_successfully'] += 1

                        # Console output sits outside the socket try block so a
                        # print failure is never mistaken for a socket error.
                        print(f"✅ Sent: {msg['text'][:100]}...")
                        print(f"   📊 Score: {msg['score']}, Sentiment: {msg['sentiment']['compound']:.2f}")
                        print(f"   👥 Users: {len(msg['user_mentions'])}, 🔗 URLs: {len(msg['urls'])}")

                buffer.append(msg)

        # Print statistics every window (checked only when a comment arrives)
        if time.time() - window_start >= window_seconds:
            elapsed = time.time() - stats['start_time']
            rate = stats['germany_comments'] / elapsed * 60 if elapsed > 0 else 0

            print(f"\n⏰ === WINDOW SUMMARY at {datetime.datetime.now().strftime('%H:%M:%S')} ===")
            print(f"📈 Total comments processed: {stats['total_comments']}")
            print(f"🇩🇪 Germany-related: {stats['germany_comments']} ({stats['germany_comments']/max(stats['total_comments'],1)*100:.1f}%)")
            print(f"✅ Successfully sent: {stats['sent_successfully']}")
            print(f"❌ Errors: {stats['errors']}")
            print(f"📊 Rate: {rate:.1f} Germany comments/minute")
            print(f"📝 Buffer size: {len(buffer)}")

            if buffer:
                print("\n🔥 Recent Germany-related comments:")
                for i, post in enumerate(buffer[-5:]):  # Show last 5
                    sentiment_emoji = "😊" if post['sentiment']['compound'] > 0.1 else "😞" if post['sentiment']['compound'] < -0.1 else "😐"
                    print(f"   {i+1}. {sentiment_emoji} [{post['subreddit']}] {post['text'][:80]}...")

            buffer = []
            window_start = time.time()
            print("="*60)

except KeyboardInterrupt:
    print("\n🛑 Stream stopped by user")
except Exception as e:
    # logger.exception keeps the traceback, which the plain message would lose.
    logger.exception(f"Stream error: {e}")
finally:
    # Best-effort cleanup: a failure to close must not hide the final report.
    try:
        s.close()
        print("🔌 Socket connection closed")
    except Exception as close_error:
        logger.warning(f"Could not close socket cleanly: {close_error}")

    # Final statistics
    elapsed = time.time() - stats['start_time']
    print(f"\n📋 FINAL STATISTICS:")
    print(f"⏱️  Runtime: {elapsed/60:.1f} minutes")
    print(f"📊 Total processed: {stats['total_comments']} comments")
    print(f"🇩🇪 Germany-related: {stats['germany_comments']} comments")
    print(f"✅ Successfully sent: {stats['sent_successfully']} messages")
    print(f"❌ Errors encountered: {stats['errors']}")
    if elapsed > 0:
        print(f"📈 Average rate: {stats['germany_comments']/elapsed*60:.1f} Germany comments/minute")