In [None]:
import logging
import os
import re
import time
from datetime import datetime, timedelta, timezone

import mysql.connector
import pandas as pd
import praw
import requests
import torch
from textblob import TextBlob
from transformers import AutoTokenizer, AutoModelForSequenceClassification

print("✓ Imports loaded")
print("✓ Loading FinBERT model... (this takes 30 seconds first time)")

# Tokenizer and classifier are loaded from the same Hugging Face checkpoint,
# so the name is spelled once.
_FINBERT_CHECKPOINT = "ProsusAI/finbert"

finbert_tokenizer = AutoTokenizer.from_pretrained(_FINBERT_CHECKPOINT)
finbert_model = AutoModelForSequenceClassification.from_pretrained(_FINBERT_CHECKPOINT)
# The model is only used for inference in this notebook: switch to eval mode.
finbert_model.eval()

print("✓ FinBERT model loaded!")

In [None]:
def analyze_sentiment_textblob(text):
    """Score ``text`` with TextBlob (the notebook's original method).

    Args:
        text: Text to score. ``None``, empty and whitespace-only input is
            reported as neutral without invoking TextBlob.

    Returns:
        tuple[float, str]: ``(polarity, label)``. ``polarity`` is TextBlob's
        polarity (-1 to +1); ``label`` is ``'positive'`` above 0.1,
        ``'negative'`` below -0.1 and ``'neutral'`` otherwise.
    """
    # Guard clause: nothing to analyse.
    if not text or not text.strip():
        return 0.0, 'neutral'

    polarity = TextBlob(text).sentiment.polarity  # -1 to +1

    # Start from neutral and only move away from it outside the ±0.1 band.
    label = 'neutral'
    if polarity > 0.1:
        label = 'positive'
    elif polarity < -0.1:
        label = 'negative'

    return float(polarity), label


def analyze_sentiment_finbert(text):
    """FinBERT sentiment analysis (financial-specific).

    Args:
        text: Text to classify. ``None``, empty and whitespace-only input is
            reported as neutral without calling the model.

    Returns:
        tuple[float, str]: ``(score, label)`` where ``score`` is
        ``P(positive) - P(negative)`` (so it lies in [-1, 1]) and ``label`` is
        the most probable of ``'positive'``, ``'negative'`` and ``'neutral'``.
        On an exact tie the earlier label in that order wins.
    """
    if not text or not text.strip():
        return 0.0, 'neutral'

    # Cap the input at BERT's 512-token limit explicitly. With truncation=True
    # alone the cap depends on the tokenizer advertising a maximum length; if
    # it does not, nothing is truncated and long posts overflow the model.
    inputs = finbert_tokenizer(
        text, return_tensors="pt", truncation=True, max_length=512, padding=True
    )

    with torch.no_grad():
        logits = finbert_model(**inputs).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)[0].tolist()

    # Map each output index to its class name using the model's own id2label
    # table rather than assuming an order. If the table is missing or does not
    # name exactly these three classes, fall back to the order this notebook
    # has always assumed: [positive, negative, neutral].
    expected = ('positive', 'negative', 'neutral')
    id2label = getattr(getattr(finbert_model, 'config', None), 'id2label', None) or {}
    names = [str(id2label.get(idx, '')).lower() for idx in range(len(probabilities))]
    if sorted(names) != sorted(expected):
        names = list(expected)
    by_label = dict(zip(names, probabilities))

    # Calculate sentiment score (-1 to +1)
    sentiment_score = by_label['positive'] - by_label['negative']

    # max() returns the first maximal item, which keeps the
    # positive > negative > neutral tie-break.
    sentiment_label = max(expected, key=by_label.__getitem__)

    return float(sentiment_score), sentiment_label


# FinBERT is the default scorer (better for financial text)
def analyze_sentiment(text):
    """Main sentiment entry point; delegates to ``analyze_sentiment_finbert``.

    Returns the same ``(score, label)`` pair that function produces.
    """
    score, label = analyze_sentiment_finbert(text)
    return score, label


print("✓ Sentiment functions ready (using FinBERT by default)")


In [None]:
# Configurations
#
# Secrets are read from environment variables so they never live in the
# notebook (or its version history). Set these before starting the kernel:
#   REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, NEWS_API_KEY
# Optional overrides (defaults shown in the code below):
#   REDDIT_USER_AGENT, TRADING_DB_HOST, TRADING_DB_USER,
#   TRADING_DB_PASSWORD, TRADING_DB_NAME
#
# NOTE: credentials that were previously hard-coded in this cell should be
# treated as compromised and rotated with Reddit / NewsAPI.
reddit_config = {
    'client_id': os.environ.get('REDDIT_CLIENT_ID', ''),
    'client_secret': os.environ.get('REDDIT_CLIENT_SECRET', ''),
    'user_agent': os.environ.get(
        'REDDIT_USER_AGENT', 'Trading Sentiment Bot by Delicious_Divide6891'
    ),
}

news_api_key = os.environ.get('NEWS_API_KEY', '')

db_config = {
    'host': os.environ.get('TRADING_DB_HOST', '127.0.0.1'),
    'user': os.environ.get('TRADING_DB_USER', 'root'),
    'password': os.environ.get('TRADING_DB_PASSWORD', ''),
    'database': os.environ.get('TRADING_DB_NAME', 'trading_system'),
}

# Surface missing secrets here instead of as an opaque API error later on.
_missing_env = [
    name
    for name in ('REDDIT_CLIENT_ID', 'REDDIT_CLIENT_SECRET', 'NEWS_API_KEY')
    if not os.environ.get(name)
]
if _missing_env:
    print(f"⚠️ Missing environment variables: {', '.join(_missing_env)}")

print("✓ Configuration loaded")

In [None]:
class SentimentScraper:
    """Scrape Reddit and NewsAPI for stock symbols and store sentiment in MySQL.

    Every post/article is scored with both ``analyze_sentiment_textblob`` and
    ``analyze_sentiment_finbert``. Raw rows are written to ``reddit_sentiment``
    and ``news_sentiment``; per-day averages are written to ``daily_sentiment``.

    Args:
        reddit_config: dict with ``client_id``, ``client_secret`` and
            ``user_agent`` for ``praw.Reddit``.
        news_api_key: API key sent to NewsAPI.
        db_config: keyword arguments for ``mysql.connector.connect``.
    """

    # Subreddits searched when ``scrape_reddit`` is given no explicit list.
    DEFAULT_SUBREDDITS = (
        'stocks', 'investing', 'wallstreetbets', 'SecurityAnalysis',
        'StockMarket', 'options', 'Daytrading', 'ValueInvesting',
        'pennystocks', 'FinancialMarkets', 'AlgoTrading', 'dividends',
        'RobinHood', 'Bogleheads', 'ETFs', 'technicalanalysis',
    )

    NEWS_API_URL = 'https://newsapi.org/v2/everything'
    # Seconds to wait for NewsAPI; without a timeout a stalled connection
    # would hang the whole run.
    REQUEST_TIMEOUT = 30
    # Timestamp layout of NewsAPI's ``publishedAt`` field (UTC, "Z" suffix).
    NEWS_TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%SZ'

    _REDDIT_INSERT_SQL = """
        INSERT IGNORE INTO reddit_sentiment
        (symbol, subreddit, post_id, title, selftext, score, num_comments,
         created_utc, sentiment_textblob, sentiment_finbert, sentiment_label)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """

    _NEWS_INSERT_SQL = """
        INSERT IGNORE INTO news_sentiment
        (symbol, source, title, description, published_at,
         sentiment_textblob, sentiment_finbert, sentiment_label)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    """

    _REDDIT_DAILY_SQL = """
        SELECT
            AVG(sentiment_textblob) as avg_textblob,
            AVG(sentiment_finbert) as avg_finbert,
            COUNT(*) as post_count
        FROM reddit_sentiment
        WHERE symbol = %s AND DATE(created_utc) = %s
    """

    _NEWS_DAILY_SQL = """
        SELECT
            AVG(sentiment_textblob) as avg_textblob,
            AVG(sentiment_finbert) as avg_finbert,
            COUNT(*) as article_count
        FROM news_sentiment
        WHERE symbol = %s AND DATE(published_at) = %s
    """

    _DAILY_UPSERT_SQL = """
        INSERT INTO daily_sentiment
        (symbol, date, reddit_avg_textblob, reddit_avg_finbert, reddit_post_count,
         news_avg_textblob, news_avg_finbert, news_article_count,
         combined_textblob, combined_finbert, total_mentions)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE
            reddit_avg_textblob = VALUES(reddit_avg_textblob),
            reddit_avg_finbert = VALUES(reddit_avg_finbert),
            reddit_post_count = VALUES(reddit_post_count),
            news_avg_textblob = VALUES(news_avg_textblob),
            news_avg_finbert = VALUES(news_avg_finbert),
            news_article_count = VALUES(news_article_count),
            combined_textblob = VALUES(combined_textblob),
            combined_finbert = VALUES(combined_finbert),
            total_mentions = VALUES(total_mentions)
    """

    def __init__(self, reddit_config, news_api_key, db_config):
        self.reddit_config = reddit_config
        self.news_api_key = news_api_key
        self.db_config = db_config
        # Created lazily by init_reddit() on the first scrape_reddit() call.
        self.reddit = None

    def connect_db(self):
        """Open a new MySQL connection, or return ``None`` (after printing the
        error) when the connection fails."""
        try:
            return mysql.connector.connect(**self.db_config)
        except mysql.connector.Error as e:
            print(f"Database error: {e}")
            return None

    def init_reddit(self):
        """Create the ``praw.Reddit`` client. Returns ``True`` on success and
        ``False`` (after printing the error) on failure."""
        try:
            self.reddit = praw.Reddit(
                client_id=self.reddit_config['client_id'],
                client_secret=self.reddit_config['client_secret'],
                user_agent=self.reddit_config['user_agent']
            )
            return True
        except Exception as e:
            print(f"Reddit API error: {e}")
            return False

    def scrape_reddit(self, symbols, subreddits=None, limit=100, time_filter='month'):
        """Scrape Reddit with BOTH sentiment analyzers.

        Args:
            symbols: Ticker symbols to search for.
            subreddits: Subreddit names to search; defaults to
                ``DEFAULT_SUBREDDITS``.
            limit: Maximum number of search results per symbol/subreddit pair.
            time_filter: Time window passed to the subreddit search.

        Returns:
            bool: ``False`` if the Reddit client or the database connection
            could not be set up, ``True`` otherwise. Failures of a single
            subreddit search are printed and skipped.
        """
        if subreddits is None:
            subreddits = self.DEFAULT_SUBREDDITS

        if not self.reddit and not self.init_reddit():
            return False

        conn = self.connect_db()
        if not conn:
            return False

        total_posts = 0
        cursor = conn.cursor()
        try:
            for idx, symbol in enumerate(symbols, 1):
                print(f"\n[{idx}/{len(symbols)}] Processing {symbol}...")
                for subreddit_name in subreddits:
                    try:
                        print(f"  → Searching r/{subreddit_name}...", end='')
                        subreddit = self.reddit.subreddit(subreddit_name)
                        search_query = f"${symbol} OR {symbol}"

                        post_count = 0
                        for post in subreddit.search(search_query, time_filter=time_filter, limit=limit):
                            post_count += 1
                            total_posts += self._store_reddit_post(
                                cursor, symbol, subreddit_name, post
                            )

                        print(f" {post_count} posts found")
                        time.sleep(2)  # pause between searches to stay polite to the API

                    except Exception as e:
                        print(f"Error scraping r/{subreddit_name}: {e}")

                # Commit per symbol so an interrupted run keeps what it has
                # already collected instead of losing the whole session.
                conn.commit()
        finally:
            # Always release the connection, even if the run is interrupted.
            cursor.close()
            conn.close()

        print(f"\n✓ Scraped {total_posts} Reddit posts")
        return True

    def _store_reddit_post(self, cursor, symbol, subreddit_name, post):
        """Score one Reddit post and insert it; return the rows actually added.

        ``INSERT IGNORE`` silently skips rows that violate a unique key, so the
        cursor's ``rowcount`` (not the number of attempts) is what gets counted.
        """
        selftext = post.selftext or ''
        full_text = f"{post.title} {selftext}"

        # Get BOTH sentiment scores
        textblob_score, _ = analyze_sentiment_textblob(full_text)
        finbert_score, finbert_label = analyze_sentiment_finbert(full_text)

        # The column is named created_utc and news timestamps are stored as
        # naive UTC, so store naive UTC here too rather than local time.
        created = datetime.fromtimestamp(post.created_utc, tz=timezone.utc).replace(tzinfo=None)

        values = (
            symbol, subreddit_name, post.id,
            post.title[:500], selftext[:1000],
            post.score, post.num_comments,
            created,
            textblob_score, finbert_score, finbert_label
        )

        try:
            cursor.execute(self._REDDIT_INSERT_SQL, values)
        except mysql.connector.Error as e:
            if "Duplicate entry" not in str(e):
                print(f"Error: {e}")
            return 0
        return max(cursor.rowcount, 0)

    def scrape_news(self, symbols, days_back=30):
        """Scrape news with BOTH sentiment analyzers.

        Args:
            symbols: Ticker symbols to query NewsAPI for.
            days_back: Size of the look-back window in days, ending now.

        Returns:
            bool: ``False`` if the database connection could not be opened,
            ``True`` otherwise. Per-symbol failures are printed and skipped;
            a 426/429 response stops the run early.
        """
        conn = self.connect_db()
        if not conn:
            return False

        total_articles = 0
        to_date = datetime.now()
        from_date = to_date - timedelta(days=days_back)

        cursor = conn.cursor()
        try:
            for idx, symbol in enumerate(symbols, 1):
                print(f"\n[{idx}/{len(symbols)}] Fetching news for {symbol}...")
                try:
                    params = {
                        'q': f"{symbol} stock OR {symbol} shares",
                        'from': from_date.strftime('%Y-%m-%d'),
                        'to': to_date.strftime('%Y-%m-%d'),
                        'language': 'en',
                        'sortBy': 'publishedAt',
                        'apiKey': self.news_api_key,
                        'pageSize': 100
                    }

                    response = requests.get(
                        self.NEWS_API_URL, params=params, timeout=self.REQUEST_TIMEOUT
                    )

                    if response.status_code == 200:
                        articles = response.json().get('articles') or []
                        for article in articles:
                            total_articles += self._store_news_article(cursor, symbol, article)
                        print(f"  ✓ Found {len(articles)} articles")
                        # Keep this symbol's rows even if a later symbol fails.
                        conn.commit()

                    elif response.status_code in (426, 429):
                        # 429 is the HTTP rate-limit status; 426 is kept from
                        # the original handling. Further requests would fail
                        # the same way, so stop here.
                        print("⚠️ NewsAPI rate limit reached")
                        break

                    else:
                        # Previously ignored silently, which made failed
                        # requests (e.g. a bad API key) look like "no news".
                        print(f"  ⚠️ NewsAPI returned HTTP {response.status_code} for {symbol}")

                    time.sleep(1.5)

                except Exception as e:
                    print(f"Error scraping news for {symbol}: {e}")

            conn.commit()
        finally:
            cursor.close()
            conn.close()

        print(f"\n✓ Scraped {total_articles} news articles")
        return True

    def _store_news_article(self, cursor, symbol, article):
        """Score one NewsAPI article and insert it; return the rows added.

        NewsAPI sends JSON ``null`` for missing fields, so ``dict.get(key, '')``
        still yields ``None``; every text field is therefore normalised with
        ``or`` before it is sliced or scored.
        """
        title = article.get('title') or ''
        description = article.get('description') or ''
        source_name = (article.get('source') or {}).get('name') or 'Unknown'

        text = f"{title} {description}"

        # Get BOTH sentiment scores
        textblob_score, _ = analyze_sentiment_textblob(text)
        finbert_score, finbert_label = analyze_sentiment_finbert(text)

        published = article.get('publishedAt') or ''
        if published:
            published_dt = datetime.strptime(published, self.NEWS_TIMESTAMP_FORMAT)
        else:
            published_dt = datetime.now()

        values = (
            symbol,
            source_name[:100],
            title[:500],
            description[:1000],
            published_dt,
            textblob_score, finbert_score, finbert_label
        )

        try:
            cursor.execute(self._NEWS_INSERT_SQL, values)
        except mysql.connector.Error as e:
            if "Duplicate entry" not in str(e):
                print(f"Error: {e}")
            return 0
        return max(cursor.rowcount, 0)

    def aggregate_daily_sentiment(self, date=None):
        """Aggregate BOTH sentiment types by date.

        For every active symbol in ``stocks`` this averages the Reddit and news
        scores recorded on ``date`` and upserts one ``daily_sentiment`` row.
        The combined scores are the averages weighted by post/article count.

        Args:
            date: Day to aggregate; defaults to today's local date.

        Returns:
            bool: ``False`` if the database connection could not be opened,
            ``True`` otherwise.
        """
        if date is None:
            date = datetime.now().date()

        conn = self.connect_db()
        if not conn:
            return False

        cursor = conn.cursor()
        try:
            cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE is_active = 1")
            symbols = [row[0] for row in cursor.fetchall()]

            for symbol in symbols:
                try:
                    reddit_textblob, reddit_finbert, reddit_count = self._daily_averages(
                        cursor, self._REDDIT_DAILY_SQL, symbol, date
                    )
                    news_textblob, news_finbert, news_count = self._daily_averages(
                        cursor, self._NEWS_DAILY_SQL, symbol, date
                    )

                    # Combined sentiment (weighted average)
                    total_count = reddit_count + news_count
                    if total_count > 0:
                        combined_textblob = ((reddit_textblob * reddit_count) +
                                             (news_textblob * news_count)) / total_count
                        combined_finbert = ((reddit_finbert * reddit_count) +
                                            (news_finbert * news_count)) / total_count
                    else:
                        combined_textblob = 0
                        combined_finbert = 0

                    # Insert/update
                    cursor.execute(self._DAILY_UPSERT_SQL, (
                        symbol, date, reddit_textblob, reddit_finbert, reddit_count,
                        news_textblob, news_finbert, news_count,
                        combined_textblob, combined_finbert, total_count
                    ))

                except mysql.connector.Error as e:
                    print(f"Error aggregating for {symbol}: {e}")

            conn.commit()
        finally:
            cursor.close()
            conn.close()

        print(f"✓ Aggregated sentiment for {len(symbols)} symbols on {date}")
        return True

    @staticmethod
    def _daily_averages(cursor, query, symbol, date):
        """Run one of the daily AVG/COUNT queries and return
        ``(avg_textblob, avg_finbert, count)`` with SQL NULLs replaced by 0."""
        cursor.execute(query, (symbol, date))
        avg_textblob, avg_finbert, count = cursor.fetchone()
        return avg_textblob or 0, avg_finbert or 0, count or 0

    def get_stock_symbols(self):
        """Return the symbols of all active rows in ``stocks`` (an empty list
        if the database connection cannot be opened)."""
        conn = self.connect_db()
        if not conn:
            return []

        cursor = conn.cursor()
        try:
            cursor.execute("SELECT symbol FROM stocks WHERE is_active = 1")
            return [row[0] for row in cursor.fetchall()]
        finally:
            cursor.close()
            conn.close()

In [None]:
# The complete scraper (scrape_reddit / scrape_news / aggregate_daily_sentiment)
# is defined in an earlier cell. This cell only adds table creation, so it
# extends that class instead of redefining it: redefining it with `pass`
# placeholders silently turned every scrape/aggregate call into a no-op.
try:
    _ScraperBase = SentimentScraper
except NameError:
    # Earlier cell not run: table creation still works, and the scraping
    # methods are simply absent (a loud AttributeError, not a silent no-op).
    _ScraperBase = object


class SentimentScraper(_ScraperBase):
    """``SentimentScraper`` extended with ``create_sentiment_tables``.

    Scraping and aggregation methods are inherited from the class of the same
    name defined earlier in the notebook.
    """

    def __init__(self, reddit_config, news_api_key, db_config):
        self.reddit_config = reddit_config
        self.news_api_key = news_api_key
        self.db_config = db_config
        self.reddit = None

    def connect_db(self):
        """Open a new MySQL connection, or return ``None`` (after printing the
        error) when the connection fails."""
        try:
            return mysql.connector.connect(**self.db_config)
        except mysql.connector.Error as e:
            print(f"Database error: {e}")
            return None

    def init_reddit(self):
        """Create the ``praw.Reddit`` client. Returns ``True`` on success and
        ``False`` (after printing the error) on failure."""
        try:
            self.reddit = praw.Reddit(
                client_id=self.reddit_config['client_id'],
                client_secret=self.reddit_config['client_secret'],
                user_agent=self.reddit_config['user_agent']
            )
            return True
        except Exception as e:
            print(f"Reddit API error: {e}")
            return False

    # --- New/Fixed Method for Table Creation ---
    def create_sentiment_tables(self, drop_existing=True):
        """Create tables that store BOTH TextBlob and FinBERT sentiment.

        Args:
            drop_existing: When ``True`` (the default, matching the original
                behaviour) the three sentiment tables are dropped first, which
                DELETES all previously collected rows. Pass ``False`` to keep
                existing tables and only create the ones that are missing.

        Returns:
            bool: ``True`` when the statements ran, ``False`` when no database
            connection could be opened.
        """
        conn = self.connect_db()  # Use self.connect_db() for consistency

        if not conn:
            print("Could not connect to database for table creation.")
            return False

        # Reddit sentiment - WITH BOTH TEXTBLOB AND FINBERT
        reddit_table = """
        CREATE TABLE IF NOT EXISTS reddit_sentiment (
            id INT AUTO_INCREMENT PRIMARY KEY,
            symbol VARCHAR(10),
            subreddit VARCHAR(50),
            post_id VARCHAR(20) UNIQUE,
            title TEXT,
            selftext TEXT,
            score INT,
            num_comments INT,
            created_utc TIMESTAMP,
            sentiment_textblob DECIMAL(5,4),
            sentiment_finbert DECIMAL(5,4),
            sentiment_label VARCHAR(20),
            scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            INDEX idx_symbol (symbol),
            INDEX idx_created (created_utc)
        )
        """

        # News sentiment - WITH BOTH TEXTBLOB AND FINBERT
        # NOTE(review): this table has no unique key, so an INSERT IGNORE into
        # it cannot de-duplicate articles across runs. Confirm whether that is
        # intended before adding one.
        news_table = """
        CREATE TABLE IF NOT EXISTS news_sentiment (
            id INT AUTO_INCREMENT PRIMARY KEY,
            symbol VARCHAR(10),
            source VARCHAR(100),
            title TEXT,
            description TEXT,
            published_at TIMESTAMP,
            sentiment_textblob DECIMAL(5,4),
            sentiment_finbert DECIMAL(5,4),
            sentiment_label VARCHAR(20),
            scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            INDEX idx_symbol (symbol),
            INDEX idx_published (published_at)
        )
        """

        # Daily sentiment - WITH BOTH METHODS
        daily_sentiment_table = """
        CREATE TABLE IF NOT EXISTS daily_sentiment (
            id INT AUTO_INCREMENT PRIMARY KEY,
            symbol VARCHAR(10),
            date DATE,
            reddit_avg_textblob DECIMAL(5,4),
            reddit_avg_finbert DECIMAL(5,4),
            reddit_post_count INT DEFAULT 0,
            news_avg_textblob DECIMAL(5,4),
            news_avg_finbert DECIMAL(5,4),
            news_article_count INT DEFAULT 0,
            combined_textblob DECIMAL(5,4),
            combined_finbert DECIMAL(5,4),
            total_mentions INT DEFAULT 0,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            UNIQUE KEY unique_symbol_date (symbol, date)
        )
        """

        cursor = conn.cursor()
        try:
            if drop_existing:
                # Drop and recreate tables to add new columns
                for table_name in ('reddit_sentiment', 'news_sentiment', 'daily_sentiment'):
                    cursor.execute(f"DROP TABLE IF EXISTS {table_name}")

            for ddl in (reddit_table, news_table, daily_sentiment_table):
                cursor.execute(ddl)

            conn.commit()
        finally:
            # Release the connection even when a statement fails.
            cursor.close()
            conn.close()

        print("✓ Enhanced sentiment tables created (with TextBlob + FinBERT)")
        return True

    def get_stock_symbols(self):
        """Return the symbols of all active rows in ``stocks`` (an empty list
        if the database connection cannot be opened)."""
        conn = self.connect_db()
        if not conn:
            return []

        cursor = conn.cursor()
        try:
            cursor.execute("SELECT symbol FROM stocks WHERE is_active = 1")
            return [row[0] for row in cursor.fetchall()]
        finally:
            cursor.close()
            conn.close()


print("✓ Enhanced SentimentScraper class ready!")

In [4]:
# Build the scraper from the configuration defined earlier in the notebook
scraper = SentimentScraper(
    reddit_config=reddit_config,
    news_api_key=news_api_key,
    db_config=db_config,
)

# Create sentiment tables.
# WARNING: this drops and recreates reddit_sentiment, news_sentiment and
# daily_sentiment, so re-running the cell discards previously collected rows.
scraper.create_sentiment_tables()

# Active ticker symbols, as stored in the database
symbols = scraper.get_stock_symbols()
print(f"✓ Setup complete! Tracking {len(symbols)} stocks: {symbols}")


2025-10-23 00:26:36,722 - INFO - package: mysql.connector.plugins
2025-10-23 00:26:36,724 - INFO - plugin_name: caching_sha2_password
2025-10-23 00:26:36,725 - INFO - AUTHENTICATION_PLUGIN_CLASS: MySQLCachingSHA2PasswordAuthPlugin
2025-10-23 00:26:36,756 - INFO - Sentiment tables created successfully


✓ Setup complete! Tracking 24 stocks: ['AAPL', 'AMD', 'AMZN', 'BLK', 'CHN', 'ERO', 'FXP', 'GOOGL', 'GXC', 'JPM', 'KR', 'MDT', 'META', 'MSFT', 'NFLX', 'NVDA', 'OXY', 'PGJ', 'RSP', 'SPY', 'TSLA', 'VGK', 'XPP', 'YINN']


In [5]:
# Run the Reddit scrape for the tracked symbols, passing a 50-result limit and
# a one-week time filter (overriding the method defaults of 100 / 'month').
print("Starting Reddit scraping...")
scraper.scrape_reddit(symbols, limit=50, time_filter='week')
print("\n✓ Reddit scraping complete!")

2025-10-23 00:26:40,031 - INFO - Reddit API initialized successfully


Starting Reddit scraping...

[1/24] Processing AAPL...
  → Searching r/stocks... 3 posts found
  → Searching r/investing... 1 posts found
  → Searching r/wallstreetbets... 0 posts found
  → Searching r/SecurityAnalysis... 0 posts found

[2/24] Processing AMD...
  → Searching r/stocks... 2 posts found
  → Searching r/investing... 2 posts found
  → Searching r/wallstreetbets... 13 posts found
  → Searching r/SecurityAnalysis... 0 posts found

[3/24] Processing AMZN...
  → Searching r/stocks... 4 posts found
  → Searching r/investing... 2 posts found
  → Searching r/wallstreetbets... 3 posts found
  → Searching r/SecurityAnalysis... 0 posts found

[4/24] Processing BLK...
  → Searching r/stocks... 0 posts found
  → Searching r/investing... 0 posts found
  → Searching r/wallstreetbets... 0 posts found
  → Searching r/SecurityAnalysis... 0 posts found

[5/24] Processing CHN...
  → Searching r/stocks... 0 posts found
  → Searching r/investing... 0 posts found
  → Searching r/wallstreetbets..

2025-10-23 00:30:30,224 - INFO - Scraped 155 Reddit posts



✓ Reddit scraping complete!


In [6]:
# Run the news scrape for the tracked symbols with a 7-day look-back
# (the method default is 30 days).
# NOTE(review): the log timestamps recorded with this notebook suggest the run
# took noticeably longer than the 1-2 minutes announced below - confirm.
print("Starting news scraping...")
print("This will take 1-2 minutes.\n")

scraper.scrape_news(symbols, days_back=7)

print("\n✓ News scraping complete!")

Starting news scraping...
This will take 1-2 minutes.


[1/24] Fetching news for AAPL...
  ✓ Found 39 articles

[2/24] Fetching news for AMD...
  ✓ Found 56 articles

[3/24] Fetching news for AMZN...
  ✓ Found 22 articles

[4/24] Fetching news for BLK...
  ✓ Found 4 articles

[5/24] Fetching news for CHN...
  ✓ Found 0 articles

[6/24] Fetching news for ERO...
  ✓ Found 2 articles

[7/24] Fetching news for FXP...
  ✓ Found 0 articles

[8/24] Fetching news for GOOGL...
  ✓ Found 16 articles

[9/24] Fetching news for GXC...
  ✓ Found 0 articles

[10/24] Fetching news for JPM...
  ✓ Found 19 articles

[11/24] Fetching news for KR...
  ✓ Found 4 articles

[12/24] Fetching news for MDT...
  ✓ Found 1 articles

[13/24] Fetching news for META...
  ✓ Found 93 articles

[14/24] Fetching news for MSFT...
  ✓ Found 14 articles

[15/24] Fetching news for NFLX...
  ✓ Found 25 articles

[16/24] Fetching news for NVDA...
  ✓ Found 46 articles

[17/24] Fetching news for OXY...
  ✓ Found 1 articles

[1

2025-10-23 00:45:24,050 - INFO - Scraped 397 news articles



✓ News scraping complete!


In [7]:
# Aggregate the stored scores into daily_sentiment. Called without a date, so
# the method's default date applies.
print("Aggregating daily sentiment scores...")

scraper.aggregate_daily_sentiment()

print("✓ Sentiment aggregation complete!")

Aggregating daily sentiment scores...


2025-10-23 00:45:57,308 - INFO - Aggregated sentiment data for 24 symbols on 2025-10-23


✓ Sentiment aggregation complete!


In [None]:
def _fetch_frame(connection, sql):
    """Run ``sql`` on a DB-API ``connection`` and return the rows as a DataFrame.

    ``pd.read_sql`` only supports SQLAlchemy connectables and sqlite3, and warns
    when handed a raw mysql-connector connection, so the rows are fetched
    through a cursor instead. Column names are taken from the cursor
    description.
    """
    cursor = connection.cursor()
    try:
        cursor.execute(sql)
        columns = [column[0] for column in cursor.description]
        return pd.DataFrame(cursor.fetchall(), columns=columns)
    finally:
        cursor.close()


conn = mysql.connector.connect(**db_config)
try:
    # Reddit posts count
    reddit_df = _fetch_frame(conn, "SELECT COUNT(*) as total FROM reddit_sentiment")
    print(f"Reddit posts collected: {reddit_df['total'][0]}")

    # News articles count
    news_df = _fetch_frame(conn, "SELECT COUNT(*) as total FROM news_sentiment")
    print(f"News articles collected: {news_df['total'][0]}")

    # Top stocks by sentiment mentions
    print("\n" + "="*60)
    print("TOP 10 STOCKS BY TOTAL MENTIONS:")
    print("="*60)
    sentiment_df = _fetch_frame(conn, """
        SELECT symbol, combined_finbert AS combined_sentiment, total_mentions,
               reddit_post_count, news_article_count
        FROM daily_sentiment
        WHERE date = CURDATE()
        ORDER BY total_mentions DESC
        LIMIT 10
    """)
finally:
    # Release the connection even if one of the queries fails.
    conn.close()

# DECIMAL columns come back as decimal.Decimal (NULL as None); convert to float
# so the column is numeric, as it was when loaded through pd.read_sql.
sentiment_df['combined_sentiment'] = sentiment_df['combined_sentiment'].astype(float)

for row in sentiment_df.itertuples(index=False):
    print(f"{row.symbol:6s} | Sentiment: {row.combined_sentiment:+.3f} | "
          f"Mentions: {row.total_mentions:3d} "
          f"(Reddit: {row.reddit_post_count}, News: {row.news_article_count})")

print("\n✓ All sentiment data collection complete!")
print("Check your database tables: reddit_sentiment, news_sentiment, daily_sentiment")

Reddit posts collected: 499
News articles collected: 4159

TOP 10 STOCKS BY TOTAL MENTIONS:
GOOGL  | Sentiment: +0.083 | Mentions:   3 (Reddit: 3, News: 0)
AMZN   | Sentiment: +0.012 | Mentions:   2 (Reddit: 2, News: 0)
SPY    | Sentiment: -0.208 | Mentions:   2 (Reddit: 2, News: 0)
NVDA   | Sentiment: +0.135 | Mentions:   1 (Reddit: 1, News: 0)
NFLX   | Sentiment: +0.000 | Mentions:   1 (Reddit: 1, News: 0)
YINN   | Sentiment: +0.000 | Mentions:   0 (Reddit: 0, News: 0)
AMD    | Sentiment: +0.000 | Mentions:   0 (Reddit: 0, News: 0)
CHN    | Sentiment: +0.000 | Mentions:   0 (Reddit: 0, News: 0)
ERO    | Sentiment: +0.000 | Mentions:   0 (Reddit: 0, News: 0)
FXP    | Sentiment: +0.000 | Mentions:   0 (Reddit: 0, News: 0)

✓ All sentiment data collection complete!
Check your database tables: reddit_sentiment, news_sentiment, daily_sentiment


  reddit_df = pd.read_sql("SELECT COUNT(*) as total FROM reddit_sentiment", conn)
  news_df = pd.read_sql("SELECT COUNT(*) as total FROM news_sentiment", conn)
  sentiment_df = pd.read_sql("""
