In [2]:
def create_summary_df(ticker_stats):
    """
    Create a DataFrame with summary statistics for each ticker
    """
    data = []
    for ticker, stats in ticker_stats.items():
        avg_score = stats['total_upvotes'] / stats['total_mentions'] if stats['total_mentions'] > 0 else 0
        data.append({
            'ticker': ticker,
            'total_mentions': stats['total_mentions'],
            'total_upvotes': stats['total_upvotes'],
            'post_mentions': stats['post_mentions'],
            'comment_mentions': stats['comment_mentions'],
            'avg_score': avg_score
        })
    return pd.DataFrame(data)

def plot_ticker_analysis(ticker_stats, metric='mentions', top_n=15):
    """
    Create visualizations for ticker analysis
    
    Parameters:
    - ticker_stats: Dictionary containing ticker statistics
    - metric: String indicating which metric to plot ('mentions', 'upvotes', or 'avg_score')
    - top_n: Number of top tickers to display
    """
    plt.figure(figsize=(12, 6))
    
    # Create DataFrame from ticker stats
    df = create_summary_df(ticker_stats)
    
    # Sort based on the selected metric
    if metric == 'mentions':
        df = df.sort_values('total_mentions', ascending=False)
        plot_data = df['total_mentions']
        title = f'Top {top_n} Tickers by Total Mentions'
        ylabel = 'Number of Mentions'
    elif metric == 'upvotes':
        df = df.sort_values('total_upvotes', ascending=False)
        plot_data = df['total_upvotes']
        title = f'Top {top_n} Tickers by Total Upvotes'
        ylabel = 'Number of Upvotes'
    else:  # avg_score
        df = df.sort_values('avg_score', ascending=False)
        plot_data = df['avg_score']
        title = f'Top {top_n} Tickers by Average Score per Mention'
        ylabel = 'Average Score'
    
    # Select top N tickers
    plot_data = plot_data.head(top_n)
    
    # Create bar plot
    sns.barplot(x=plot_data.index, y=plot_data.values, palette='viridis')
    
    # Customize plot
    plt.title(title, fontsize=14, pad=20)
    plt.xlabel('Ticker Symbol', fontsize=12)
    plt.ylabel(ylabel, fontsize=12)
    plt.xticks(rotation=45, ha='right')
    
    # Add value labels on top of bars
    for i, v in enumerate(plot_data.values):
        if metric == 'avg_score':
            plt.text(i, v, f'{v:.1f}', ha='center', va='bottom')
        else:
            plt.text(i, v, str(int(v)), ha='center', va='bottom')
    
    plt.tight_layout()
    plt.show()

def plot_mention_distribution(ticker_stats):
    """
    Create a pie chart showing the distribution of mentions between posts and comments
    """
    total_post_mentions = sum(stats['post_mentions'] for stats in ticker_stats.values())
    total_comment_mentions = sum(stats['comment_mentions'] for stats in ticker_stats.values())
    
    plt.figure(figsize=(8, 8))
    plt.pie([total_post_mentions, total_comment_mentions], 
            labels=['Posts', 'Comments'],
            autopct='%1.1f%%',
            colors=['lightblue', 'lightgreen'])
    plt.title('Distribution of Ticker Mentions')
    plt.axis('equal')
    plt.show()

def plot_ticker_timeline(ticker_stats, top_n=5):
    """
    Create a timeline of mentions for top tickers
    """
    # Get top N tickers by total mentions
    df = create_summary_df(ticker_stats)
    top_tickers = df.nlargest(top_n, 'total_mentions')['ticker'].tolist()
    
    # Create timeline data
    timeline_data = []
    for ticker in top_tickers:
        stats = ticker_stats[ticker]
        
        # Combine post and comment data
        all_mentions = []
        for post in stats['post_data']:
            all_mentions.append({
                'ticker': ticker,
                'time': post['created_utc'],
                'score': post['score'],
                'type': 'post'
            })
        for comment in stats['comment_data']:
            all_mentions.append({
                'ticker': ticker,
                'time': comment['created_utc'],
                'score': comment['score'],
                'type': 'comment'
            })
        timeline_data.extend(all_mentions)
    
    # Convert to DataFrame and sort by time
    timeline_df = pd.DataFrame(timeline_data)
    timeline_df['datetime'] = pd.to_datetime(timeline_df['time'], unit='s')
    timeline_df = timeline_df.sort_values('datetime')
    
    # Plot
    plt.figure(figsize=(15, 8))
    for ticker in top_tickers:
        ticker_data = timeline_df[timeline_df['ticker'] == ticker]
        plt.scatter(ticker_data['datetime'], 
                   ticker_data['score'],
                   label=ticker,
                   alpha=0.6)
    
    plt.title(f'Timeline of Mentions and Scores for Top {top_n} Tickers')
    plt.xlabel('Time')
    plt.ylabel('Score')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
import praw
import re
import json
from time import sleep
from datetime import datetime

# --- Reddit API Authentication ---
CLIENT_ID = "ayjJix181j8z0QOyQoPSbg"
CLIENT_SECRET = "VRLoz1o0NAPgIziOEAeY8Kb7Gr2e2Q"
USER_AGENT = "WSB_Scraper/1.0 by u/FlyingOoze43453"

# --- Predefined Market Tickers & Company Mapping ---
MARKET_TICKERS = [
    "JPM", "BRK.B", "2222.SR", "1398.HK", "BAC", "AMZN", "0939.HK", "MSFT", 
    "1288.HK", "GOOGL", "TM", "AAPL", "3988.HK", "XOM", "HSBC", "WFC", "SHEL",
    "PTR", "UNH", "WMT", "005930.KS", "CVX", "GS", "META", "TTE", "MS", "RY",
    "C", "2318.HK", "0941.HK", "ALV.DE", "BNP.PA", "VZ", "3968.HK", "UBS",
    "SAN", "T", "0700.HK", "MUFG", "CMCSA", "BABA", "SNP", "VOW.DE", "TD",
    "JNJ", "TSM", "BP", "MC.PA", "RELIANCE.NS", "NESN.SW", "CS.PA", "MBG.DE",
    "AXP", "PBR", "SBIN.NS", "3328.HK", "1658.HK", "TSLA", "PG", "STLA",
    "BMW.DE", "DTE.DE", "SMFG", "CVS", "HDB", "BBVA", "SONY", "SIE.DE", "CBA.AX",
    "LICI.NS", "GM", "9432.T", "ACA.PA", "AVGO", "PEP", "HD", "ORCL", "8058.T",
    "ROG.SW", "EQNR", "ELV", "CEO", "601166.SS", "ISP.MI", "IBM", "CB", "ITUB",
    "BUD", "ABBV", "CSCO", "HMC", "BNS", "005380.KS", "ENEL.MI", "CI", "COST",
    "MUV2.DE", "ZURN.SW", "NVS", "KO", "GE", "CAT", "DE", "BMO", "MFG", "AIR.PA",
    "INTC", "8031.T", "COP", "601668.SS", "NVDA", "RIO", "0267.HK", "RTX",
    "GLEN.L", "BHP", "LFC", "USB", "2317.TW", "IBE.MC", "AZN", "SNY", "UL",
    "F", "COF", "NAB.AX", "APO", "V", "UCG.MI", "300750.SZ", "AIG", "ANZ.AX",
    "DG.PA", "0998.HK", "TMO", "SCHW", "PGR", "D05.SI", "MPC", "PNC", "G.MI",
    "IBN", "8001.T", "WBC.AX", "UPS", "DB", "CM", "1088.HK", "8766.T", "NEE",
    "E", "1211.HK", "ING", "LYG", "DIS", "PSX", "MET", "CRM", "600000.SS",
    "BDORY", "1299.HK", "MRK", "ABT", "0728.HK", "DELL", "BCS", "LIN", "FDX",
    "6501.T", "ACN", "LMT", "OR.PA", "NDA-FI.HE", "LLY", "KKR", "PM", "NWG",
    "VLO", "ENB", "JD", "BK", "VALE", "GSK", "PRU", "LOW", "CHTR", "HON",
    "MCK", "DUK", "QCOM", "HCA", "MDT", "AMGN", "ENGI.PA", "SO", "CABK.MC",
    "MFC", "DHR", "MDLZ", "SU.PA"
]

COMPANY_TO_TICKER = {
    "JPMorgan Chase": "JPM",
    "Berkshire Hathaway": "BRK.B",
    "Saudi Arabian Oil Company": "2222.SR",
    "Saudi Aramco": "2222.SR",
    "ICBC": "1398.HK",
    "Bank of America": "BAC",
    "Amazon": "AMZN",
    "China Construction Bank": "0939.HK",
    "Microsoft": "MSFT",
    "Agricultural Bank of China": "1288.HK",
    "Alphabet": "GOOGL",
    "Google": "GOOGL",
    "Toyota Motor": "TM",
    "Toyota": "TM",
    "Apple": "AAPL",
    "Bank of China": "3988.HK",
    "ExxonMobil": "XOM",
    "Exxon Mobil": "XOM",
    "HSBC Holdings": "HSBC",
    "HSBC": "HSBC",
    "Wells Fargo": "WFC",
    "Shell": "SHEL",
    "PetroChina": "PTR",
    "UnitedHealth Group": "UNH",
    "UnitedHealth": "UNH",
    "Walmart": "WMT",
    "Samsung Electronics": "005930.KS",
    "Samsung": "005930.KS",
    "Chevron": "CVX",
    "Goldman Sachs Group": "GS",
    "Goldman Sachs": "GS",
    "Meta Platforms": "META",
    "Meta": "META",
    "Facebook": "META",
    "TotalEnergies": "TTE",
    "Morgan Stanley": "MS",
    "RBC": "RY",
    "Royal Bank of Canada": "RY",
    "Citigroup": "C",
    "Citi": "C",
    "Ping An Insurance Group": "2318.HK",
    "Ping An": "2318.HK",
    "China Mobile": "0941.HK",
    "Allianz": "ALV.DE",
    "BNP Paribas": "BNP.PA",
    "Verizon Communications": "VZ",
    "Verizon": "VZ",
    "China Merchants Bank": "3968.HK",
    "UBS": "UBS",
    "Santander": "SAN",
    "AT&T": "T",
    "Tencent Holdings": "0700.HK",
    "Tencent": "0700.HK",
    "Mitsubishi UFJ Financial": "MUFG",
    "Comcast": "CMCSA",
    "Alibaba Group": "BABA",
    "Alibaba": "BABA",
    "Sinopec": "SNP",
    "Volkswagen Group": "VOW.DE",
    "Volkswagen": "VOW.DE",
    "VW": "VOW.DE",
    "TD Bank Group": "TD",
    "TD Bank": "TD",
    "Johnson & Johnson": "JNJ",
    "Taiwan Semiconductor": "TSM",
    "TSMC": "TSM",
    "BP": "BP",
    "LVMH": "MC.PA",
    "Louis Vuitton": "MC.PA",
    "LVMH Moët Hennessy Louis Vuitton": "MC.PA",
    "Reliance Industries": "RELIANCE.NS",
    "Reliance": "RELIANCE.NS",
    "Nestlé": "NESN.SW",
    "Nestle": "NESN.SW",
    "AXA Group": "CS.PA",
    "AXA": "CS.PA",
    "Mercedes-Benz Group": "MBG.DE",
    "Mercedes-Benz": "MBG.DE",
    "Mercedes": "MBG.DE",
    "American Express": "AXP",
    "AmEx": "AXP",
    "Petrobras": "PBR",
    "State Bank of India": "SBIN.NS",
    "SBI": "SBIN.NS",
    "Bank of Communications": "3328.HK",
    "Postal Savings Bank Of China": "1658.HK",
    "PSBC": "1658.HK",
    "Tesla": "TSLA",
    "Procter & Gamble": "PG",
    "P&G": "PG",
    "Stellantis": "STLA",
    "BMW Group": "BMW.DE",
    "BMW": "BMW.DE",
    "Deutsche Telekom": "DTE.DE",
    "Sumitomo Mitsui Financial": "SMFG",
    "CVS Health": "CVS",
    "CVS": "CVS",
    "HDFC Bank": "HDB",
    "BBVA": "BBVA",
    "Banco Bilbao Vizcaya": "BBVA",
    "Sony": "SONY",
    "Siemens": "SIE.DE",
    "Commonwealth Bank": "CBA.AX",
    "Life Insurance Corp. of India": "LICI.NS",
    "LIC": "LICI.NS",
    "General Motors": "GM",
    "Nippon Telegraph & Tel": "9432.T",
    "NTT": "9432.T",
    "Credit Agricole": "ACA.PA",
    "Broadcom": "AVGO",
    "PepsiCo": "PEP",
    "Pepsi": "PEP",
    "The Home Depot": "HD",
    "Home Depot": "HD",
    "Oracle": "ORCL",
    "Mitsubishi": "8058.T",
    "Roche Holding": "ROG.SW",
    "Roche": "ROG.SW",
    "Equinor": "EQNR",
    "Elevance Health": "ELV",
    "CNOOC": "CEO",
    "Industrial Bank": "601166.SS",
    "Intesa Sanpaolo": "ISP.MI",
    "IBM": "IBM",
    "International Business Machines": "IBM",
    "Chubb": "CB",
    "Itaú Unibanco": "ITUB",
    "Itau": "ITUB",
    "Anheuser-Busch InBev": "BUD",
    "Anheuser-Busch": "BUD",
    "AbbVie": "ABBV",
    "Cisco Systems": "CSCO",
    "Cisco": "CSCO",
    "Honda Motor": "HMC",
    "Honda": "HMC",
    "Bank of Nova Scotia": "BNS",
    "Scotiabank": "BNS",
    "Hyundai Motor": "005380.KS",
    "Hyundai": "005380.KS",
    "Enel": "ENEL.MI",
    "Cigna": "CI",
    "Costco Wholesale": "COST",
    "Costco": "COST",
    "Munich Re": "MUV2.DE",
    "Zurich Insurance Group": "ZURN.SW",
    "Zurich Insurance": "ZURN.SW",
    "Novartis": "NVS",
    "Coca-Cola": "KO",
    "Coke": "KO"
}

# --- Build a Ticker Regex with Strict Token Matching ---
def build_ticker_regex(tickers):
    """
    Precompile a regex that matches a ticker as a standalone token,
    optionally preceded by a '$'. The ticker must be surrounded by whitespace,
    punctuation, or start/end of string.
    """
    escaped = [re.escape(ticker) for ticker in tickers]
    # Use a non-capturing group for the left context and capture the ticker.
    pattern = r'(?:^|\s)\$?(' + '|'.join(escaped) + r')(?=\s|$|[.,;:!?])'
    return re.compile(pattern)

ticker_regex = build_ticker_regex(MARKET_TICKERS)

def extract_mentions(text):
    """
    Extract ticker symbols from text using the precompiled regex.
    Returns a list of tickers (in uppercase) found as standalone tokens.
    """
    if not text:
        return []
    return ticker_regex.findall(text.upper())

# --- Reddit Rate Limiter ---
class RedditRateLimiter:
    def __init__(self):
        self.last_request = datetime.now()
        self.request_count = 0
        self.REQUESTS_PER_MINUTE = 30
        self.MIN_REQUEST_INTERVAL = 1
        
    def wait_if_needed(self):
        now = datetime.now()
        time_since_last = (now - self.last_request).total_seconds()
        if time_since_last < self.MIN_REQUEST_INTERVAL:
            sleep(self.MIN_REQUEST_INTERVAL - time_since_last)
        if time_since_last >= 60:
            self.request_count = 0
        if self.request_count >= self.REQUESTS_PER_MINUTE:
            sleep_time = 60 - time_since_last
            if sleep_time > 0:
                print(f"Approaching rate limit, waiting {sleep_time:.1f} seconds...")
                sleep(sleep_time)
            self.request_count = 0
        self.request_count += 1
        self.last_request = datetime.now()

# --- Scrape WSB Data ---
def scrape_wsb_data(post_limit=100, comments_per_post=100):
    """
    Scrape WallStreetBets posts and comments that mention tickers.
    Outputs raw scraped data.
    """
    import praw
    reddit = praw.Reddit(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        user_agent=USER_AGENT
    )
    subreddit = reddit.subreddit("wallstreetbets")
    wsb_data = []
    rate_limiter = RedditRateLimiter()
    posts_processed = 0
    
    print(f"Starting to scrape up to {post_limit} posts...")
    for submission in subreddit.new(limit=post_limit):
        try:
            rate_limiter.wait_if_needed()
            title_tickers = extract_mentions(submission.title)
            selftext_tickers = extract_mentions(submission.selftext)
            post_info = {
                "post_id": submission.id,
                "title": submission.title or "",
                "selftext": submission.selftext or "",
                "created_utc": submission.created_utc,
                "score": submission.score,
                "upvote_ratio": submission.upvote_ratio,
                "title_tickers": title_tickers,
                "selftext_tickers": selftext_tickers,
                "comments": []
            }
            if title_tickers or selftext_tickers:
                try:
                    rate_limiter.wait_if_needed()
                    submission.comments.replace_more(limit=0)
                    all_comments = submission.comments.list()
                    for comment in all_comments[:comments_per_post]:
                        if hasattr(comment, 'body') and comment.body:
                            comment_tickers = extract_mentions(comment.body)
                            if comment_tickers:
                                comment_data = {
                                    "comment_id": comment.id,
                                    "comment_body": comment.body,
                                    "tickers": comment_tickers,
                                    "score": comment.score,
                                    "created_utc": comment.created_utc
                                }
                                post_info["comments"].append(comment_data)
                except Exception as e:
                    print(f"Error processing comments for post {submission.id}: {str(e)}")
            if title_tickers or selftext_tickers or post_info["comments"]:
                wsb_data.append(post_info)
            posts_processed += 1
            if posts_processed % 10 == 0:
                print(f"Processed {posts_processed} posts...")
        except Exception as e:
            print(f"Error processing post {submission.id}: {str(e)}")
            continue
    print(f"Finished processing {posts_processed} posts.")
    return wsb_data

# --- Main Execution ---
if __name__ == "__main__":
    print("Starting WSB scraping...")
    scraped_data = scrape_wsb_data()
    
    # Save scraped data to JSON file with a timestamp.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f"wsb_scraped_data_{timestamp}.json"
    with open(output_filename, "w", encoding="utf-8") as f:
        json.dump(scraped_data, f, indent=2, ensure_ascii=False)
    print(f"Scraping complete. Data saved to {output_filename}")


Starting WSB scraping...
Starting to scrape up to 100 posts...
Processed 10 posts...
Processed 20 posts...
Approaching rate limit, waiting 60.0 seconds...
Processed 30 posts...
Processed 40 posts...
