In [None]:
# --- CELL 1: Import Dependencies ---

import logging
import os
import re
import sys
import time
import warnings
from datetime import datetime, timedelta, timezone

import mysql.connector
import numpy as np
import pandas as pd
import praw
import requests
import torch
from textblob import TextBlob
from transformers import AutoTokenizer, AutoModelForSequenceClassification
warnings.filterwarnings('ignore')

# --- Logging Setup ---
# Logging is configured before anything else runs so that early failures are
# recorded. Records go both to a UTF-8 log file and to the console stream.
logging.basicConfig(
    handlers=[
        # The file handler is opened with an explicit UTF-8 encoding.
        logging.FileHandler(filename='enhanced_sentiment_scraper.log', encoding='utf-8'),
        # The console handler uses the stream's own encoding.
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- MODIFIED: safe_print function ---
def safe_print(*args, **kwargs):
    """Print like the built-in ``print``, degrading gracefully on encoding errors.

    The arguments are first passed to ``print`` unchanged. If the target stream
    cannot encode some character (``UnicodeEncodeError``), the message is
    rebuilt and every unencodable character is replaced with ``?`` using the
    stream's own encoding, then written to the same stream.

    Args:
        *args: Objects to print; converted with ``str`` in the fallback path.
        **kwargs: The keyword arguments accepted by ``print`` (``sep``, ``end``,
            ``file``, ``flush``). All of them are honoured in the fallback path.
    """
    try:
        print(*args, **kwargs)
    except UnicodeEncodeError:
        # print() treats None as "use the default" for sep/end/file.
        sep = kwargs.get('sep')
        if sep is None:
            sep = ' '
        end = kwargs.get('end')
        if end is None:
            end = '\n'
        stream = kwargs.get('file')
        if stream is None:
            stream = sys.stdout

        # Round-trip through the stream's encoding so the result is guaranteed
        # to be writable; fall back to ASCII when the stream does not say.
        encoding = getattr(stream, 'encoding', None) or 'ascii'
        message = sep.join(map(str, args)) + end
        printable = message.encode(encoding, errors='replace').decode(encoding, errors='replace')
        print(printable, end='', file=stream, flush=kwargs.get('flush', False))

safe_print("Dependencies loaded")
safe_print(f"PyTorch version: {torch.__version__}")
# The recorded output of this cell includes a CUDA availability line; print it
# so that the code and the saved output agree.
safe_print(f"CUDA available: {torch.cuda.is_available()}")
# --- END CELL 1 ---

Dependencies loaded
PyTorch version: 2.7.1+cpu
CUDA available: False


In [24]:
# --- CELL 2: Configurations ---

# Reddit API Configuration
# Secrets are read from environment variables so that they never live in the
# notebook source or its saved outputs. Any credentials that were previously
# committed in this cell should be treated as compromised and rotated.
reddit_config = {
    'client_id': os.environ.get('REDDIT_CLIENT_ID', ''),
    'client_secret': os.environ.get('REDDIT_CLIENT_SECRET', ''),
    'user_agent': os.environ.get('REDDIT_USER_AGENT', 'Trading Sentiment Bot by Delicious_Divide6891'),
}

# News API Key
news_api_key = os.environ.get('NEWS_API_KEY', '')

# Database Configuration
# Non-secret values keep their previous defaults; each can be overridden.
db_config = {
    'host': os.environ.get('DB_HOST', '127.0.0.1'),
    'user': os.environ.get('DB_USER', 'root'),
    'password': os.environ.get('DB_PASSWORD', ''),
    'database': os.environ.get('DB_NAME', 'trading_system'),
}

# Fail loudly (but do not crash the cell) when a required secret is absent.
for _env_name in ('REDDIT_CLIENT_ID', 'REDDIT_CLIENT_SECRET', 'NEWS_API_KEY'):
    if not os.environ.get(_env_name):
        safe_print(f"WARNING: environment variable {_env_name} is not set")

safe_print("Configurations loaded")

Configurations loaded


In [None]:
# --- CELL 3: Sentiment Scraper Class ---

class EnhancedSentimentScraper:
    def __init__(self, reddit_config, news_api_key, db_config):
        self.reddit_config = reddit_config
        self.news_api_key = news_api_key
        self.db_config = db_config
        # Logging is already set up globally
        self.logger = logging.getLogger(__name__)
        self.reddit = None
        self.finbert_model = None
        self.finbert_tokenizer = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Removed setup_logging method as it's now global

    def connect_db(self):
        """Open a new MySQL connection from ``self.db_config``.

        Returns:
            The connection object, or ``None`` when ``mysql.connector.Error`` is
            raised (the error is logged).
        """
        try:
            return mysql.connector.connect(**self.db_config)
        except mysql.connector.Error as exc:
            self.logger.error(f"Database connection error: {exc}")
        return None

    def load_finbert(self):
        """Load the ``ProsusAI/finbert`` tokenizer and model onto ``self.device``.

        The model is switched to evaluation mode after loading.

        Returns:
            bool: True when both tokenizer and model loaded, False when any
            exception was raised (the error is logged).
        """
        checkpoint = "ProsusAI/finbert"
        loaded = False
        try:
            self.logger.info("Loading FinBERT model...")
            self.finbert_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            self.finbert_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
            self.finbert_model.to(self.device)
            self.finbert_model.eval()
            self.logger.info(f"FinBERT loaded successfully on {self.device}")
            loaded = True
        except Exception as exc:
            self.logger.error(f"Error loading FinBERT: {exc}")
        return loaded

    # --- MODIFIED: create_sentiment_tables ---
    def create_sentiment_tables(self):
        """Create or update tables for dual sentiment analysis, compatible with older MySQL."""
        conn = self.connect_db()
        if not conn:
            return False

        cursor = conn.cursor()
        db_name = self.db_config['database'] # Get database name for information_schema query

        # 1. Create reddit_sentiment table (if not exists)
        reddit_table_create = """
        CREATE TABLE IF NOT EXISTS reddit_sentiment (
            id INT AUTO_INCREMENT PRIMARY KEY, symbol VARCHAR(10), subreddit VARCHAR(50), post_id VARCHAR(20) UNIQUE,
            title TEXT, selftext TEXT, score INT, num_comments INT, created_utc TIMESTAMP,
            textblob_score DECIMAL(5,4), textblob_label VARCHAR(20), finbert_score DECIMAL(5,4), finbert_label VARCHAR(20),
            finbert_confidence DECIMAL(5,4), combined_score DECIMAL(5,4), combined_label VARCHAR(20),
            scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, INDEX idx_symbol (symbol), INDEX idx_created (created_utc),
            FOREIGN KEY (symbol) REFERENCES stocks(symbol) ON DELETE CASCADE ON UPDATE CASCADE
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
        """

        # 2. Create news_sentiment table (if not exists)
        news_table_create = """
        CREATE TABLE IF NOT EXISTS news_sentiment (
            id INT AUTO_INCREMENT PRIMARY KEY, symbol VARCHAR(10), source VARCHAR(100), title TEXT, description TEXT,
            published_at TIMESTAMP NULL, textblob_score DECIMAL(5,4), textblob_label VARCHAR(20), finbert_score DECIMAL(5,4),
            finbert_label VARCHAR(20), finbert_confidence DECIMAL(5,4), combined_score DECIMAL(5,4), combined_label VARCHAR(20),
            scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, INDEX idx_symbol (symbol), INDEX idx_published (published_at),
             FOREIGN KEY (symbol) REFERENCES stocks(symbol) ON DELETE CASCADE ON UPDATE CASCADE
       ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
        """

        # 3. Create daily_sentiment table (if not exists) - Basic structure first
        daily_sentiment_create = """
        CREATE TABLE IF NOT EXISTS daily_sentiment (
            id INT AUTO_INCREMENT PRIMARY KEY,
            symbol VARCHAR(10),
            date DATE,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            UNIQUE KEY unique_symbol_date (symbol, date),
            FOREIGN KEY (symbol) REFERENCES stocks(symbol) ON DELETE CASCADE ON UPDATE CASCADE
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
        """

        # Columns to ensure exist in daily_sentiment
        required_columns = {
            "reddit_textblob_avg": "DECIMAL(5,4) NULL DEFAULT NULL",
            "reddit_finbert_avg": "DECIMAL(5,4) NULL DEFAULT NULL",
            "reddit_combined_avg": "DECIMAL(5,4) NULL DEFAULT NULL",
            "reddit_post_count": "INT DEFAULT 0",
            "news_textblob_avg": "DECIMAL(5,4) NULL DEFAULT NULL",
            "news_finbert_avg": "DECIMAL(5,4) NULL DEFAULT NULL",
            "news_combined_avg": "DECIMAL(5,4) NULL DEFAULT NULL",
            "news_article_count": "INT DEFAULT 0",
            "overall_textblob": "DECIMAL(5,4) NULL DEFAULT NULL",
            "overall_finbert": "DECIMAL(5,4) NULL DEFAULT NULL",
            "overall_combined": "DECIMAL(5,4) NULL DEFAULT NULL",
            "total_mentions": "INT DEFAULT 0"
        }

        # Columns to drop if they exist (old schema)
        columns_to_drop = [
            "reddit_avg_sentiment",
            "news_avg_sentiment",
            "combined_sentiment",
            "avg_sentiment_score",
            "source_count"
        ]

        tables_to_create = [
            ("reddit_sentiment", reddit_table_create),
            ("news_sentiment", news_table_create),
            ("daily_sentiment", daily_sentiment_create)
        ]

        try:
            # Create tables if they don't exist
            for name, query in tables_to_create:
                self.logger.info(f"Executing CREATE IF NOT EXISTS for {name}...")
                cursor.execute(query)
            self.logger.info("Initial table creation check complete.")
            conn.commit() # Commit creation before altering

            # --- Check and Add Columns for daily_sentiment ---
            self.logger.info("Checking/Updating daily_sentiment schema...")
            # Get existing columns
            cursor.execute(f"""
                SELECT COLUMN_NAME
                FROM information_schema.columns
                WHERE table_schema = '{db_name}' AND table_name = 'daily_sentiment';
            """)
            existing_columns = {row[0] for row in cursor.fetchall()}
            self.logger.info(f"Existing columns in daily_sentiment: {existing_columns}")

            # Add missing required columns
            for col_name, col_definition in required_columns.items():
                if col_name not in existing_columns:
                    try:
                        alter_query = f"ALTER TABLE daily_sentiment ADD COLUMN {col_name} {col_definition}"
                        self.logger.info(f"Adding missing column: {col_name}")
                        cursor.execute(alter_query)
                        conn.commit() # Commit after each successful alter add
                    except mysql.connector.Error as e:
                        self.logger.error(f"Error adding column {col_name}: {e}")
                        conn.rollback() # Rollback if alter fails
                #else:
                #    self.logger.info(f"Column '{col_name}' already exists.")

            # Drop old columns
            for col_name in columns_to_drop:
                if col_name in existing_columns:
                    try:
                        drop_query = f"ALTER TABLE daily_sentiment DROP COLUMN {col_name}"
                        self.logger.info(f"Dropping old column: {col_name}")
                        cursor.execute(drop_query)
                        conn.commit() # Commit after each successful alter drop
                    except mysql.connector.Error as e:
                        self.logger.error(f"Error dropping column {col_name}: {e}")
                        conn.rollback() # Rollback if alter fails

            self.logger.info("daily_sentiment schema check/update complete.")
            self.logger.info("Enhanced sentiment tables created/updated successfully")
            return True

        except mysql.connector.Error as e:
            self.logger.error(f"Error during table setup: {e}")
            conn.rollback()
            return False
        finally:
            cursor.close()
            conn.close()
    # --- END MODIFIED: create_sentiment_tables ---

    def init_reddit(self):
        """Initialize Reddit API connection"""
        try:
            self.reddit = praw.Reddit(
                client_id=self.reddit_config['client_id'],
                client_secret=self.reddit_config['client_secret'],
                user_agent=self.reddit_config['user_agent']
            )
            self.logger.info("Reddit API initialized successfully")
            return True
        except Exception as e:
            self.logger.error(f"Error initializing Reddit API: {e}")
            return False

    def analyze_textblob(self, text):
        """
        Analyze sentiment using TextBlob (general-purpose)
        Returns: (score, label)
        """
        if not text or len(text.strip()) == 0:
            return 0.0, 'neutral'

        try:
            blob = TextBlob(text)
            polarity = blob.sentiment.polarity

            if polarity > 0.1:
                label = 'positive'
            elif polarity < -0.1:
                label = 'negative'
            else:
                label = 'neutral'

            return round(polarity, 4), label
        except Exception as e:
            self.logger.error(f"Error in TextBlob analysis: {e}")
            return 0.0, 'neutral'

    def analyze_finbert(self, text):
        """
        Analyze sentiment using FinBERT (finance-specific)
        Returns: (score, label, confidence)
        """
        if not text or len(text.strip()) == 0:
            return 0.0, 'neutral', 0.0

        if self.finbert_model is None:
            self.logger.warning("FinBERT not loaded, using neutral sentiment")
            return 0.0, 'neutral', 0.0

        try:
            # Truncate text to FinBERT's max length (512 tokens)
            text_slice = text[:1024] # Slice potentially longer text first

            # Tokenize
            inputs = self.finbert_tokenizer(
                text_slice,
                return_tensors="pt",
                truncation=True,
                max_length=512, # Hard limit for the model
                padding=True
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Get predictions
            with torch.no_grad():
                outputs = self.finbert_model(**inputs)
                predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

            # FinBERT outputs: [positive, negative, neutral]
            probs = predictions[0].cpu().numpy()

            # Get label and confidence
            labels = ['positive', 'negative', 'neutral']
            max_idx = np.argmax(probs)
            label = labels[max_idx]
            confidence = float(probs[max_idx])

            # Calculate sentiment score: positive - negative
            score = float(probs[0] - probs[1])

            return round(score, 4), label, round(confidence, 4)

        except Exception as e:
            self.logger.error(f"Error in FinBERT analysis processing text snippet '{text[:50]}...': {e}")
            return 0.0, 'neutral', 0.0


    def combine_sentiments(self, textblob_score, finbert_score, finbert_conf):
        """
        Combine TextBlob and FinBERT scores
        Weight FinBERT more for financial text (70% FinBERT, 30% TextBlob)
        """
        finbert_weight = 0.7
        textblob_weight = 0.3

        combined = (finbert_score * finbert_weight) + (textblob_score * textblob_weight)

        if combined > 0.1:
            label = 'positive'
        elif combined < -0.1:
            label = 'negative'
        else:
            label = 'neutral'

        return round(combined, 4), label

    # Uses the expanded default list of subreddits
    def scrape_reddit(self, symbols,
                     subreddits=['stocks', 'investing', 'wallstreetbets', 'SecurityAnalysis',
                                 'StockMarket', 'finance', 'options', 'daytrading'],
                     limit=100, time_filter='week'):
        """Scrape Reddit with dual sentiment analysis"""
        if not self.reddit:
            if not self.init_reddit():
                return False

        conn = self.connect_db()
        if not conn:
            return False

        cursor = conn.cursor()
        total_posts_saved = 0

        self.logger.info(f"Scraping {len(subreddits)} subreddits: {subreddits}")

        for idx, symbol in enumerate(symbols, 1):
            safe_print(f"\n[{idx}/{len(symbols)}] Processing {symbol}...")
            symbol_posts = 0
            for subreddit_name in subreddits:
                try:
                    safe_print(f"  -> Searching r/{subreddit_name}... ", end='')
                    subreddit = self.reddit.subreddit(subreddit_name)
                    search_query = f"'{symbol}' OR ${symbol} OR {symbol}" # Broader query

                    post_count = 0
                    processed_posts = 0
                    for post in subreddit.search(search_query, time_filter=time_filter, limit=limit, sort='new'):
                        processed_posts += 1
                        # Combine title and selftext
                        full_text = f"{post.title} {post.selftext}"

                        # Dual sentiment analysis
                        tb_score, tb_label = self.analyze_textblob(full_text)
                        fb_score, fb_label, fb_conf = self.analyze_finbert(full_text)
                        combined_score, combined_label = self.combine_sentiments(tb_score, fb_score, fb_conf)

                        try:
                            query = """
                            INSERT IGNORE INTO reddit_sentiment
                            (symbol, subreddit, post_id, title, selftext, score, num_comments,
                             created_utc, textblob_score, textblob_label, finbert_score,
                             finbert_label, finbert_confidence, combined_score, combined_label)
                            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                            """
                            values = (
                                symbol, subreddit_name, post.id,
                                post.title[:500], post.selftext[:1000] if post.selftext else '',
                                post.score, post.num_comments, datetime.fromtimestamp(post.created_utc),
                                tb_score, tb_label, fb_score, fb_label, fb_conf,
                                combined_score, combined_label
                            )
                            cursor.execute(query, values)
                            if cursor.rowcount > 0: # Check if insert actually happened
                                total_posts_saved += 1
                                symbol_posts += 1
                            post_count += 1 # Count posts processed, not just saved

                        except mysql.connector.Error as e:
                            # Log only if it's not a duplicate entry error
                            if "Duplicate entry" not in str(e):
                                self.logger.error(f"DB Error inserting Reddit post {post.id}: {e}")
                        except Exception as inner_e:
                             self.logger.error(f"Unexpected error processing post {post.id}: {inner_e}")


                    safe_print(f"{processed_posts} posts processed.")
                    time.sleep(2)  # Adhere to Reddit API rate limits

                except praw.exceptions.PRAWException as e:
                    self.logger.error(f"PRAW Error scraping r/{subreddit_name} for {symbol}: {e}")
                    safe_print(f" Error scraping r/{subreddit_name}.")
                    time.sleep(5) # Longer sleep on API error
                except Exception as e:
                    self.logger.error(f"General Error scraping r/{subreddit_name} for {symbol}: {e}")
                    safe_print(f" Error scraping r/{subreddit_name}.")
                    time.sleep(5) # Longer sleep on error
                    continue # Try next subreddit
            safe_print(f"  Saved {symbol_posts} new posts for {symbol}.")

        try:
            conn.commit()
        except mysql.connector.Error as e:
            self.logger.error(f"Error committing Reddit data: {e}")
        finally:
            cursor.close()
            conn.close()

        self.logger.info(f"Finished Reddit scrape. Saved {total_posts_saved} new posts overall.")
        return True


    def scrape_news(self, symbols, days_back=7):
        """Scrape news with dual sentiment analysis"""
        conn = self.connect_db()
        if not conn: return False
        cursor = conn.cursor()
        total_articles_saved = 0
        to_date = datetime.now()
        from_date = to_date - timedelta(days=days_back)

        for idx, symbol in enumerate(symbols, 1):
            safe_print(f"\n[{idx}/{len(symbols)}] Fetching news for {symbol}...")
            symbol_articles = 0
            retries = 3
            while retries > 0:
                try:
                    url = 'https://newsapi.org/v2/everything'
                    params = {
                        'q': f'"{symbol} stock" OR "{symbol} shares"',
                        'from': from_date.strftime('%Y-%m-%d'),
                        'to': to_date.strftime('%Y-%m-%d'),
                        'language': 'en', 'sortBy': 'publishedAt',
                        'apiKey': self.news_api_key, 'pageSize': 100
                    }
                    response = requests.get(url, params=params, timeout=20) # Added timeout
                    response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)

                    data = response.json()
                    articles = data.get('articles', [])
                    processed_articles = 0

                    for article in articles:
                        processed_articles += 1
                        text = f"{article.get('title', '')} {article.get('description', '')}"
                        tb_score, tb_label = self.analyze_textblob(text)
                        fb_score, fb_label, fb_conf = self.analyze_finbert(text)
                        combined_score, combined_label = self.combine_sentiments(tb_score, fb_score, fb_conf)

                        try:
                            query = """
                            INSERT IGNORE INTO news_sentiment
                            (symbol, source, title, description, published_at, textblob_score, textblob_label,
                             finbert_score, finbert_label, finbert_confidence, combined_score, combined_label)
                            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                            """
                            published = article.get('publishedAt', '')
                            published_dt = datetime.now() # Default value
                            if published:
                                try:
                                    # Handle different possible formats, prioritize ISO with Z
                                    if 'Z' in published:
                                       published_dt = datetime.strptime(published, '%Y-%m-%dT%H:%M:%SZ')
                                    elif '+' in published: # Handle timezone offset like +00:00
                                        published_dt = datetime.strptime(published, '%Y-%m-%dT%H:%M:%S%z')
                                    else: # Assume UTC if no timezone info
                                        published_dt = datetime.strptime(published, '%Y-%m-%dT%H:%M:%S')
                                except ValueError:
                                     self.logger.warning(f"Could not parse date '{published}' for {symbol}, using current time.")

                            values = (
                                symbol, article.get('source', {}).get('name', 'Unknown')[:100],
                                article.get('title', '')[:500], article.get('description', '')[:1000],
                                published_dt.replace(tzinfo=None), # Store timezone-naive in DB
                                tb_score, tb_label, fb_score, fb_label, fb_conf,
                                combined_score, combined_label
                            )
                            cursor.execute(query, values)
                            if cursor.rowcount > 0:
                                total_articles_saved += 1
                                symbol_articles += 1

                        except mysql.connector.Error as e:
                            if "Duplicate entry" not in str(e):
                                self.logger.error(f"DB Error inserting news article for {symbol}: {e}")
                        except Exception as inner_e:
                            self.logger.error(f"Unexpected error processing article for {symbol}: {inner_e}")

                    safe_print(f"  Processed {processed_articles} articles. Saved {symbol_articles} new.")
                    break # Success, exit retry loop

                except requests.exceptions.HTTPError as e:
                    if e.response.status_code == 426: # Free plan limit
                         self.logger.warning("NewsAPI free plan limit likely reached. Upgrade for more results.")
                         safe_print(" NewsAPI limit reached.")
                         retries = 0 # Don't retry rate limit errors
                         break # Stop processing this symbol
                    elif e.response.status_code == 429: # Rate limit
                         self.logger.warning(f"NewsAPI rate limit hit for {symbol}. Retrying in 5s... ({retries-1} left)")
                         safe_print(" Rate limit hit, retrying...")
                         time.sleep(5)
                         retries -= 1
                    else:
                         self.logger.error(f"NewsAPI HTTP error for {symbol}: {e.response.status_code} - {e.response.text[:100]}")
                         safe_print(f" HTTP Error {e.response.status_code}.")
                         retries = 0 # Don't retry other HTTP errors immediately
                         break
                except requests.exceptions.RequestException as e:
                    self.logger.error(f"NewsAPI Request error for {symbol}: {e}")
                    safe_print(" Network error, retrying...")
                    time.sleep(5)
                    retries -= 1
                except Exception as e:
                    self.logger.error(f"General Error scraping news for {symbol}: {e}")
                    safe_print(" General error.")
                    retries = 0 
                    break 
            # End of while loop
            if retries == 0 and symbol_articles == 0 :
                 safe_print(f"  Failed to fetch news for {symbol} after retries.")
            time.sleep(1.5) # Pause between symbols

        try:
            conn.commit()
        except mysql.connector.Error as e:
            self.logger.error(f"Error committing News data: {e}")
        finally:
            cursor.close()
            conn.close()

        self.logger.info(f"Finished News scrape. Saved {total_articles_saved} new articles overall.")
        return True


    def aggregate_daily_sentiment(self, date_to_aggregate=None):
        """Aggregate dual sentiment data by symbol and date"""
        if date_to_aggregate is None:
            date_to_aggregate = datetime.now().date()

        self.logger.info(f"Aggregating daily sentiment for date: {date_to_aggregate}")

        conn = self.connect_db()
        if not conn: return False
        cursor = conn.cursor()

        # Get list of active symbols
        try:
            cursor.execute("SELECT DISTINCT symbol FROM stocks WHERE is_active = 1")
            symbols = [row[0] for row in cursor.fetchall()]
            if not symbols:
                self.logger.warning("No active symbols found for aggregation.")
                cursor.close()
                conn.close()
                return False
        except mysql.connector.Error as e:
             self.logger.error(f"Error fetching active symbols for aggregation: {e}")
             cursor.close()
             conn.close()
             return False


        aggregated_count = 0
        for symbol in symbols:
            try:
                # Aggregate Reddit sentiment
                reddit_query = """
                SELECT AVG(textblob_score), AVG(finbert_score), AVG(combined_score), COUNT(*)
                FROM reddit_sentiment WHERE symbol = %s AND DATE(created_utc) = %s
                """
                cursor.execute(reddit_query, (symbol, date_to_aggregate))
                r_res = cursor.fetchone()
                r_tb, r_fb, r_comb, r_count = (r or 0 for r in (r_res if r_res else (0,0,0,0)))


                # Aggregate News sentiment
                news_query = """
                SELECT AVG(textblob_score), AVG(finbert_score), AVG(combined_score), COUNT(*)
                FROM news_sentiment WHERE symbol = %s AND DATE(published_at) = %s
                """
                cursor.execute(news_query, (symbol, date_to_aggregate))
                n_res = cursor.fetchone()
                n_tb, n_fb, n_comb, n_count = (n or 0 for n in (n_res if n_res else (0,0,0,0)))

                # Calculate overall sentiment (weighted by count)
                total_count = r_count + n_count
                o_tb = o_fb = o_comb = 0
                if total_count > 0:
                    o_tb = ((r_tb * r_count) + (n_tb * n_count)) / total_count
                    o_fb = ((r_fb * r_count) + (n_fb * n_count)) / total_count
                    o_comb = ((r_comb * r_count) + (n_comb * n_count)) / total_count

                # Insert or update aggregated data
                insert_query = """
                INSERT INTO daily_sentiment
                (symbol, date, reddit_textblob_avg, reddit_finbert_avg, reddit_combined_avg, reddit_post_count,
                 news_textblob_avg, news_finbert_avg, news_combined_avg, news_article_count,
                 overall_textblob, overall_finbert, overall_combined, total_mentions)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON DUPLICATE KEY UPDATE
                    reddit_textblob_avg=VALUES(reddit_textblob_avg), reddit_finbert_avg=VALUES(reddit_finbert_avg),
                    reddit_combined_avg=VALUES(reddit_combined_avg), reddit_post_count=VALUES(reddit_post_count),
                    news_textblob_avg=VALUES(news_textblob_avg), news_finbert_avg=VALUES(news_finbert_avg),
                    news_combined_avg=VALUES(news_combined_avg), news_article_count=VALUES(news_article_count),
                    overall_textblob=VALUES(overall_textblob), overall_finbert=VALUES(overall_finbert),
                    overall_combined=VALUES(overall_combined), total_mentions=VALUES(total_mentions)
                """
                cursor.execute(insert_query, (
                    symbol, date_to_aggregate, r_tb, r_fb, r_comb, r_count,
                    n_tb, n_fb, n_comb, n_count, o_tb, o_fb, o_comb, total_count
                ))
                aggregated_count += 1

            except mysql.connector.Error as e:
                self.logger.error(f"Error aggregating sentiment for {symbol} on {date_to_aggregate}: {e}")
                conn.rollback() 
            except Exception as e:
                self.logger.error(f"Unexpected error aggregating {symbol} on {date_to_aggregate}: {e}")
                conn.rollback()

        try:
            conn.commit() 
        except mysql.connector.Error as e:
            self.logger.error(f"Final commit error during aggregation: {e}")
        finally:
            cursor.close()
            conn.close()

        self.logger.info(f"Finished aggregation for {date_to_aggregate}. Processed {aggregated_count}/{len(symbols)} symbols.")
        return True


    def get_stock_symbols(self):
        """Get all *active* tracked stock symbols from database"""
        conn = self.connect_db()
        if not conn: return []
        cursor = conn.cursor()
        symbols = []
        try:
            cursor.execute("SELECT symbol FROM stocks WHERE is_active = 1 ORDER BY symbol")
            symbols = [row[0] for row in cursor.fetchall()]
        except mysql.connector.Error as e:
            self.logger.error(f"Error fetching active symbols: {e}")
        finally:
            cursor.close()
            conn.close()

        if not symbols:
            self.logger.warning("No active stocks found in database. Run '01_data_collection_FIXED.py' first.")
        return symbols

# --- END CELL 3 ---


In [26]:
# --- CELL 4: Initialize Enhanced Scraper ---

# ============================================================================
# Banner: a blank line, a 70-character rule, the title, and a closing rule.
safe_print("\n" + "=" * 70, "INITIALIZING ENHANCED SENTIMENT SCRAPER", "=" * 70, sep="\n")

scraper = EnhancedSentimentScraper(reddit_config, news_api_key, db_config)

# Load the FinBERT model; on failure the scraper's FinBERT scores stay neutral.
safe_print("\nLoading FinBERT model (this may take 1-2 minutes)...")
safe_print(
    "FinBERT loaded successfully"
    if scraper.load_finbert()
    else "FinBERT loading failed - will use TextBlob only"
)

# Create the sentiment tables or migrate them to the current schema.
safe_print("\nCreating/Updating sentiment tables in database...")
safe_print(
    "Database tables are up-to-date."
    if scraper.create_sentiment_tables()
    else "Failed to create/update database tables!"
)

# Read the active stock symbols that later cells iterate over.
safe_print("\nFetching active stocks...")
symbols = scraper.get_stock_symbols()
safe_print(f"\nSetup complete! Tracking {len(symbols)} stocks: {symbols}")

# --- END CELL 4 ---


2025-10-26 01:05:23,276 - INFO - Loading FinBERT model...



INITIALIZING ENHANCED SENTIMENT SCRAPER

Loading FinBERT model (this may take 1-2 minutes)...


2025-10-26 01:05:25,055 - INFO - FinBERT loaded successfully on cpu
2025-10-26 01:05:25,105 - INFO - Executing CREATE IF NOT EXISTS for reddit_sentiment...
2025-10-26 01:05:25,121 - INFO - Executing CREATE IF NOT EXISTS for news_sentiment...
2025-10-26 01:05:25,128 - INFO - Executing CREATE IF NOT EXISTS for daily_sentiment...
2025-10-26 01:05:25,135 - INFO - Initial table creation check complete.
2025-10-26 01:05:25,139 - INFO - Checking/Updating daily_sentiment schema...
2025-10-26 01:05:25,155 - INFO - Existing columns in daily_sentiment: {'overall_combined', 'reddit_textblob_avg', 'news_combined_avg', 'news_article_count', 'reddit_post_count', 'total_mentions', 'date', 'created_at', 'news_finbert_avg', 'reddit_finbert_avg', 'symbol', 'id', 'news_textblob_avg', 'overall_finbert', 'overall_textblob', 'reddit_combined_avg'}
2025-10-26 01:05:25,155 - INFO - daily_sentiment schema check/update complete.
2025-10-26 01:05:25,155 - INFO - Enhanced sentiment tables created/updated successfu

FinBERT loaded successfully

Creating/Updating sentiment tables in database...
Database tables are up-to-date.

Fetching active stocks...

Setup complete! Tracking 20 stocks: ['AAPL', 'AMZN', 'BLK', 'ERO', 'FXP', 'GOOGL', 'GXC', 'JPM', 'KR', 'MDT', 'META', 'MSFT', 'NVDA', 'OXY', 'PGJ', 'RSP', 'SPY', 'TSLA', 'VGK', 'XPP']


In [27]:
# --- CELL 5: Scrape Reddit with Dual Sentiment ---
# Calls scraper.scrape_reddit for the symbols loaded in Cell 4, using
# limit=50 and time_filter='week'.

# ============================================================================
safe_print(f"\n{'=' * 70}")
safe_print("SCRAPING REDDIT WITH DUAL SENTIMENT ANALYSIS")
safe_print('=' * 70)
safe_print("Analyzing with TextBlob (general) + FinBERT (finance-specific)")
safe_print("This will take 5-10 minutes...\n")

if not symbols:
    # Nothing to scrape until Cell 4 has produced a non-empty symbol list.
    safe_print("No symbols to scrape. Run Cell 4 to load symbols.")
else:
    scraper.scrape_reddit(symbols, limit=50, time_filter='week')
    safe_print("\nReddit scraping complete!")

# --- END CELL 5 ---


2025-10-26 01:05:32,742 - INFO - Reddit API initialized successfully
2025-10-26 01:05:32,795 - INFO - Scraping 8 subreddits: ['stocks', 'investing', 'wallstreetbets', 'SecurityAnalysis', 'StockMarket', 'finance', 'options', 'daytrading']



SCRAPING REDDIT WITH DUAL SENTIMENT ANALYSIS
Analyzing with TextBlob (general) + FinBERT (finance-specific)
This will take 5-10 minutes...


[1/20] Processing AAPL...
  -> Searching r/stocks... 1 posts processed.
  -> Searching r/investing... 0 posts processed.
  -> Searching r/wallstreetbets... 3 posts processed.
  -> Searching r/SecurityAnalysis... 0 posts processed.
  -> Searching r/StockMarket... 1 posts processed.
  -> Searching r/finance... 0 posts processed.
  -> Searching r/options... 0 posts processed.
  -> Searching r/daytrading... 8 posts processed.
  Saved 13 new posts for AAPL.

[2/20] Processing AMZN...
  -> Searching r/stocks... 0 posts processed.
  -> Searching r/investing... 5 posts processed.
  -> Searching r/wallstreetbets... 9 posts processed.
  -> Searching r/SecurityAnalysis... 0 posts processed.
  -> Searching r/StockMarket... 1 posts processed.
  -> Searching r/finance... 0 posts processed.
  -> Searching r/options... 1 posts processed.
  -> Searching r/daytrad

2025-10-26 01:13:46,053 - INFO - Finished Reddit scrape. Saved 148 new posts overall.


  Saved 0 new posts for XPP.

Reddit scraping complete!


In [28]:
# --- CELL 6: Scrape News with Dual Sentiment ---
# Calls scraper.scrape_news for the symbols loaded in Cell 4 with days_back=7.

# ============================================================================
safe_print(f"\n{'=' * 70}")
safe_print("SCRAPING NEWS WITH DUAL SENTIMENT ANALYSIS")
safe_print('=' * 70)
safe_print("This will take 1-2 minutes...\n")

if not symbols:
    # Nothing to scrape until Cell 4 has produced a non-empty symbol list.
    safe_print("No symbols to scrape. Run Cell 4 to load symbols.")
else:
    scraper.scrape_news(symbols, days_back=7)
    safe_print("\nNews scraping complete!")

# --- END CELL 6 ---


SCRAPING NEWS WITH DUAL SENTIMENT ANALYSIS
This will take 1-2 minutes...


[1/20] Fetching news for AAPL...
  Processed 22 articles. Saved 22 new.

[2/20] Fetching news for AMZN...
  Processed 15 articles. Saved 15 new.

[3/20] Fetching news for BLK...
  Processed 1 articles. Saved 1 new.

[4/20] Fetching news for ERO...
  Processed 0 articles. Saved 0 new.

[5/20] Fetching news for FXP...
  Processed 0 articles. Saved 0 new.

[6/20] Fetching news for GOOGL...
  Processed 12 articles. Saved 12 new.

[7/20] Fetching news for GXC...
  Processed 0 articles. Saved 0 new.

[8/20] Fetching news for JPM...
  Processed 0 articles. Saved 0 new.

[9/20] Fetching news for KR...
  Processed 0 articles. Saved 0 new.

[10/20] Fetching news for MDT...
  Processed 0 articles. Saved 0 new.

[11/20] Fetching news for META...
  Processed 19 articles. Saved 19 new.

[12/20] Fetching news for MSFT...
  Processed 3 articles. Saved 3 new.

[13/20] Fetching news for NVDA...
  Processed 13 articles. Saved 13 

2025-10-26 01:20:59,695 - INFO - Finished News scrape. Saved 109 new articles overall.



News scraping complete!


In [30]:
# --- CELL 7: Aggregate Daily Sentiment ---
# Runs scraper.aggregate_daily_sentiment for today and for the previous day.

# ============================================================================
safe_print("\n" + "="*70)
safe_print("AGGREGATING DUAL SENTIMENT SCORES")
safe_print("="*70)

# Read the clock once so both runs share the same reference day. Evaluating
# datetime.now() separately for each run could, if the cell straddles midnight,
# aggregate the same date twice and skip the intended previous day.
aggregation_today = datetime.now().date()
aggregation_yesterday = aggregation_today - timedelta(days=1)

safe_print("Aggregating scores for today...")
scraper.aggregate_daily_sentiment(date_to_aggregate=aggregation_today)

safe_print("\nAggregating scores for yesterday (to ensure completeness)...")
scraper.aggregate_daily_sentiment(date_to_aggregate=aggregation_yesterday)

safe_print("\nSentiment aggregation complete!")

# --- END CELL 7 ---


2025-10-26 01:23:10,736 - INFO - Aggregating daily sentiment for date: 2025-10-26
2025-10-26 01:23:10,888 - INFO - Finished aggregation for 2025-10-26. Processed 20/20 symbols.
2025-10-26 01:23:10,890 - INFO - Aggregating daily sentiment for date: 2025-10-25



AGGREGATING DUAL SENTIMENT SCORES
Aggregating scores for today...

Aggregating scores for yesterday (to ensure completeness)...


2025-10-26 01:23:11,049 - INFO - Finished aggregation for 2025-10-25. Processed 20/20 symbols.



Sentiment aggregation complete!


In [31]:
# --- CELL 8: Compare TextBlob vs FinBERT Performance ---
# Prints five read-only reports comparing the TextBlob and FinBERT scores
# stored in reddit_sentiment, news_sentiment and daily_sentiment.

# ============================================================================
safe_print("\n" + "="*70)
safe_print("TEXTBLOB VS FINBERT COMPARISON")
safe_print("="*70)

# Bound before the try block so the finally clause can always inspect it.
conn = None
try:
    conn = scraper.connect_db() # Use method to connect
    if conn:
        # Overall statistics (rows where both scores are present)
        safe_print("\n1. OVERALL SENTIMENT STATISTICS")
        safe_print("-" * 70)
        stats_query = """
        SELECT 'Reddit' as source, COUNT(*) as total, AVG(textblob_score) as avg_textblob, AVG(finbert_score) as avg_finbert, AVG(combined_score) as avg_combined,
               STDDEV(textblob_score) as std_textblob, STDDEV(finbert_score) as std_finbert, AVG(ABS(textblob_score - finbert_score)) as avg_diff
        FROM reddit_sentiment WHERE textblob_score IS NOT NULL AND finbert_score IS NOT NULL
        UNION ALL
        SELECT 'News' as source, COUNT(*), AVG(textblob_score), AVG(finbert_score), AVG(combined_score),
               STDDEV(textblob_score), STDDEV(finbert_score), AVG(ABS(textblob_score - finbert_score))
        FROM news_sentiment WHERE textblob_score IS NOT NULL AND finbert_score IS NOT NULL
        """
        stats_df = pd.read_sql(stats_query, conn)
        safe_print(stats_df.to_string(index=False))

        # Sentiment distribution comparison (Reddit only)
        safe_print("\n2. REDDIT SENTIMENT LABEL DISTRIBUTION")
        safe_print("-" * 70)
        label_query = """
        SELECT 'TextBlob' as method, textblob_label as label, COUNT(*) as count, ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
        FROM reddit_sentiment GROUP BY textblob_label
        UNION ALL
        SELECT 'FinBERT' as method, finbert_label as label, COUNT(*) as count, ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
        FROM reddit_sentiment GROUP BY finbert_label ORDER BY method, label
        """
        label_df = pd.read_sql(label_query, conn)
        safe_print(label_df.to_string(index=False))

        # Agreement rate: share of rows where both methods assign the same label
        safe_print("\n3. TEXTBLOB-FINBERT AGREEMENT RATE")
        safe_print("-" * 70)
        agreement_query = """
        SELECT 'Reddit' as source, COUNT(*) as total, SUM(CASE WHEN textblob_label = finbert_label THEN 1 ELSE 0 END) as agreements,
               ROUND(SUM(CASE WHEN textblob_label = finbert_label THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) as agreement_rate
        FROM reddit_sentiment
        UNION ALL
        SELECT 'News' as source, COUNT(*), SUM(CASE WHEN textblob_label = finbert_label THEN 1 ELSE 0 END),
               ROUND(SUM(CASE WHEN textblob_label = finbert_label THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2)
        FROM news_sentiment
        """
        agreement_df = pd.read_sql(agreement_query, conn)
        safe_print(agreement_df.to_string(index=False))

        # Top stocks by sentiment
        # NOTE(review): the heading says "by combined sentiment", but the query
        # ranks rows by total_mentions; confirm which ordering is intended.
        # NOTE(review): CURDATE() is evaluated by the database server; confirm
        # it matches the Python-side date used when aggregating.
        safe_print("\n4. TOP 10 STOCKS BY COMBINED SENTIMENT (Today)")
        safe_print("-" * 70)
        top_stocks_query = """
        SELECT symbol, total_mentions, ROUND(overall_textblob, 4) as textblob_score, ROUND(overall_finbert, 4) as finbert_score,
               ROUND(overall_combined, 4) as combined_score,
               CASE WHEN overall_combined > 0.1 THEN 'POSITIVE' WHEN overall_combined < -0.1 THEN 'NEGATIVE' ELSE 'NEUTRAL' END as sentiment
        FROM daily_sentiment WHERE date = CURDATE() AND total_mentions > 0 ORDER BY total_mentions DESC LIMIT 10
        """
        top_stocks_df = pd.read_sql(top_stocks_query, conn)
        if not top_stocks_df.empty:
            safe_print(top_stocks_df.to_string(index=False))
        else:
            safe_print("No sentiment data for today yet.")

        # Divergence analysis: largest absolute gap between the two overall scores
        safe_print("\n5. STOCKS WITH HIGHEST TEXTBLOB-FINBERT DIVERGENCE (Today)")
        safe_print("-" * 70)
        divergence_query = """
        SELECT symbol, total_mentions, ROUND(overall_textblob, 4) as textblob, ROUND(overall_finbert, 4) as finbert,
               ROUND(ABS(overall_textblob - overall_finbert), 4) as divergence
        FROM daily_sentiment WHERE date = CURDATE() AND total_mentions > 0 ORDER BY ABS(overall_textblob - overall_finbert) DESC LIMIT 10
        """
        divergence_df = pd.read_sql(divergence_query, conn)
        if not divergence_df.empty:
            safe_print(divergence_df.to_string(index=False))
        else:
            safe_print("No sentiment data for today yet.")
    else:
        safe_print("Could not connect to database for analysis.")

except Exception as e:
    safe_print(f"An error occurred during analysis: {e}")
    safe_print("Please ensure your database is running and tables were created.")
finally:
    # Release the connection even when one of the queries above raised;
    # previously close() was only reached when every query succeeded.
    if conn:
        conn.close()

# --- END CELL 8 ---



TEXTBLOB VS FINBERT COMPARISON

1. OVERALL SENTIMENT STATISTICS
----------------------------------------------------------------------
source  total  avg_textblob  avg_finbert  avg_combined  std_textblob  std_finbert  avg_diff
Reddit    148      0.079616    -0.047336     -0.009251       0.14692     0.378545  0.272745
  News    109      0.072703     0.076991      0.075703       0.19819     0.668125  0.544229

2. REDDIT SENTIMENT LABEL DISTRIBUTION
----------------------------------------------------------------------
  method    label  count  percentage
 FinBERT negative     25       16.89
 FinBERT  neutral    112       75.68
 FinBERT positive     11        7.43
TextBlob negative      8        5.41
TextBlob  neutral     91       61.49
TextBlob positive     49       33.11

3. TEXTBLOB-FINBERT AGREEMENT RATE
----------------------------------------------------------------------
source  total  agreements  agreement_rate
Reddit    148        72.0           48.65
  News    109        45.0  