In [1]:
import os
import json
import re
import requests

# ---------- Configuration ----------
PRIOR_WEIGHT = 10.0   # Higher value pulls low-vote items closer to neutral.
NEUTRAL_SCORE = 5.0

# ---------- Helper Function to Strip Redundant Prefix ----------
def strip_prefix(sentiment_text: str) -> str:
    """
    Remove a leading "Score=<number>, Reasons=" prefix from a model response.

    The match is case-insensitive and tolerates whitespace before the prefix.
    Everything after "Reasons=" is kept, including any further lines, so a
    multi-line explanation is returned in full.

    Args:
        sentiment_text: Raw response text, with or without the prefix.

    Returns:
        The explanation with surrounding whitespace stripped. If the prefix
        is absent, the whole input is returned stripped.
    """
    # DOTALL lets "(.*)" run across newlines; without it only the first line
    # of the explanation would be captured.
    pattern = r"\s*Score=\s*\d+(?:\.\d+)?\s*,\s*Reasons=(.*)"
    match = re.match(pattern, sentiment_text, re.IGNORECASE | re.DOTALL)
    if match:
        return match.group(1).strip()
    return sentiment_text.strip()

# ---------- Function to call the LLM for sentiment analysis ----------
def analyze_sentiment(text: str, ticker: str, *, timeout: float = 120.0) -> str:
    """
    Ask the LLM endpoint for a sentiment analysis of ``text`` about ``ticker``.

    The prompt instructs the model to answer strictly in the format
      Score=<number between 1 and 10>, Reasons=<detailed explanation>
    and to output a score of exactly 5 when the text is contradictory,
    ambiguous, or confusing.

    The endpoint host is read from the LLAMA3HOST environment variable, with a
    built-in fallback host name.

    Args:
        text: The post or comment text to analyse.
        ticker: The ticker symbol the text is associated with.
        timeout: Seconds to wait for the endpoint before giving up. requests
            waits indefinitely when no timeout is given, which would stall the
            whole run on a single unresponsive call.

    Returns:
        The model's raw response text, or the literal string "Error" when the
        request fails or the response does not have the expected shape.
    """
    prompt = (
        f"Analyze this WallStreetBets text about {ticker}. "
        "Provide your analysis strictly in the following format:\n\n"
        "Score=<number between 1 and 10>, Reasons=<detailed explanation>\n\n"
        "A score of 1 means extremely bearish/negative sentiment, and a score of 10 means extremely bullish/positive sentiment.\n"
        "IMPORTANT: If the text is contradictory, ambiguous, or confusing, output a score of exactly 5 and explain that the text is ambiguous.\n\n"
        f"Text: {text}"
    )

    payload = {
        "model": "meta/llama-3.3-70b-instruct",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 128
    }

    host = os.environ.get("LLAMA3HOST", "atl1-1-03-010-15-0")
    url = f"http://{host}:8000/v1/chat/completions"
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json"
    }

    try:
        response = requests.post(
            url, headers=headers, data=json.dumps(payload), timeout=timeout
        )
        response.raise_for_status()
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they are reported here too.
        print(f"Error: Request failed: {e}")
        return "Error"

    try:
        result = response.json()
        content = result["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # IndexError: empty "choices" list; TypeError: JSON body that is not
        # the expected nesting of dicts and lists.
        print(f"Error: Unable to parse response: {e}")
        return "Error"

    if not isinstance(content, str):
        # Keep the "always returns str" contract even if the field is null.
        print(f"Error: Unable to parse response: unexpected content {content!r}")
        return "Error"
    return content

# ---------- Helper Function to Parse the Numeric Score ----------
def parse_sentiment_score(sentiment_text: str) -> float:
    """
    Extract the numeric sentiment score from the sentiment analysis output.

    The number that follows a "Score=" (or "Score:") label is preferred, so
    digits that appear earlier in the response are not mistaken for the
    score. When no labelled score is present, the first number anywhere in
    the text is used instead.

    Args:
        sentiment_text: Raw response text from the sentiment analysis.

    Returns:
        The parsed score as a float, or NEUTRAL_SCORE if the text contains no
        number at all.
    """
    number = r"(\d+(?:\.\d+)?)"
    found = re.search(r"Score\s*[=:]\s*" + number, sentiment_text, re.IGNORECASE)
    if found is None:
        # Fallback for responses that ignore the requested format.
        found = re.search(number, sentiment_text)
    if found is None:
        return NEUTRAL_SCORE
    # The pattern only admits digits with an optional fraction, so float()
    # cannot fail here.
    return float(found.group(1))

# ---------- Function to Compute Weighted Aggregate for a Single Ticker ----------
def _vote_weight(raw_votes):
    """Turn a raw vote count into a non-negative averaging weight."""
    # None (missing or null count) counts as 0, and negative counts are
    # clamped to 0: a negative weight would let the "average" leave the
    # range spanned by the individual scores.
    return max(raw_votes or 0, 0)


def _accumulate_weighted(items, id_key, votes_by_id, neutral_score):
    """Return (sum of weight * score, sum of weights) over sentiment items."""
    weighted_sum = 0.0
    total_weight = 0.0
    for item in items:
        score = item["sentiment"].get("Score", neutral_score)
        weight = _vote_weight(votes_by_id.get(item[id_key]))
        weighted_sum += weight * score
        total_weight += weight
    return weighted_sum, total_weight


def _smoothed_mean(weighted_sum, total_weight, prior_weight, neutral_score):
    """Weighted mean pulled toward neutral_score by prior_weight pseudo-votes."""
    denominator = total_weight + prior_weight
    if denominator <= 0:
        return neutral_score
    return (weighted_sum + prior_weight * neutral_score) / denominator


def compute_weighted_for_ticker(ticker_data, sentiment_results_for_ticker,
                                prior_weight=None, neutral_score=None):
    """
    Compute vote-weighted aggregate sentiment for a single ticker.

    Each item's sentiment score is weighted by its vote count, looked up by
    id in ``ticker_data``. Every aggregate is smoothed toward the neutral
    score:

        (sum(weight * score) + prior_weight * neutral_score)
            / (sum(weight) + prior_weight)

    Items with no matching, null, or negative vote count get weight 0, so
    they do not move the aggregate.

    Args:
        ticker_data: Dict with optional "post_content" and "comment_content"
            lists; entries are read for "post_id"/"comment_id" and "score".
        sentiment_results_for_ticker: Dict with "posts" and "comments" lists;
            each entry holds an id ("post_id"/"comment_id") and a "sentiment"
            dict whose "Score" is used (neutral score if absent).
        prior_weight: Smoothing strength; defaults to PRIOR_WEIGHT.
        neutral_score: Value to smooth toward; defaults to NEUTRAL_SCORE.

    Returns:
        Dict with the smoothed sentiment and total weight for posts, for
        comments, and for both combined.
    """
    if prior_weight is None:
        prior_weight = PRIOR_WEIGHT
    if neutral_score is None:
        neutral_score = NEUTRAL_SCORE

    # Build lookup dictionaries for vote counts.
    post_votes = {
        post.get("post_id"): post.get("score", 0)
        for post in ticker_data.get("post_content", [])
    }
    comment_votes = {
        comment.get("comment_id"): comment.get("score", 0)
        for comment in ticker_data.get("comment_content", [])
    }

    sum_post_weighted, total_post_weight = _accumulate_weighted(
        sentiment_results_for_ticker.get("posts", ()), "post_id",
        post_votes, neutral_score,
    )
    sum_comment_weighted, total_comment_weight = _accumulate_weighted(
        sentiment_results_for_ticker.get("comments", ()), "comment_id",
        comment_votes, neutral_score,
    )

    total_weight = total_post_weight + total_comment_weight
    sum_overall = sum_post_weighted + sum_comment_weighted

    return {
        "posts_weighted_sentiment": _smoothed_mean(
            sum_post_weighted, total_post_weight, prior_weight, neutral_score),
        "total_post_votes": total_post_weight,
        "comments_weighted_sentiment": _smoothed_mean(
            sum_comment_weighted, total_comment_weight, prior_weight, neutral_score),
        "total_comment_votes": total_comment_weight,
        "overall_weighted_sentiment": _smoothed_mean(
            sum_overall, total_weight, prior_weight, neutral_score),
        "total_votes": total_weight
    }

# ---------- Function to Save Results to a JSON File ----------
def save_json(data, filename):
    """
    Write ``data`` to ``filename`` as indented JSON, replacing it atomically.

    The JSON is first written to a sibling "<filename>.tmp" file and then
    moved over the target with os.replace. If serialisation fails or the
    write is interrupted, the previously saved file is left untouched
    instead of being truncated.

    Args:
        data: JSON-serialisable object to store.
        filename: Destination path.

    Raises:
        TypeError: If ``data`` contains values json cannot serialise.
        OSError: If the temporary file cannot be written or moved.
    """
    tmp_name = f"{filename}.tmp"
    try:
        with open(tmp_name, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4)
        os.replace(tmp_name, filename)
    finally:
        # After a successful replace the temp file no longer exists; this
        # only cleans up a partial file left behind by a failed write.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)

# ---------- Main Processing ----------
sentiment_output_filename = "wsb_sentiment_results.json"
weighted_output_filename = "wsb_weighted_sentiment.json"

# Load the original scraped JSON file. The encoding is given explicitly so
# decoding does not depend on the platform's default locale encoding.
input_filename = "wsb_data.json"
with open(input_filename, "r", encoding="utf-8") as f:
    data = json.load(f)

sentiment_results = {}  # Will store raw sentiment results per ticker.
weighted_results = {}   # Will store weighted aggregate sentiment per ticker.

# Process one ticker at a time.
for ticker, ticker_data in data.get("ticker_stats", {}).items():
    print(f"\n=== Processing ticker: {ticker} ===")
    sentiment_results[ticker] = {"posts": [], "comments": []}

    # Process all posts for this ticker.
    for post in ticker_data.get("post_content", []):
        post_id = post.get("post_id")
        # "or" also covers fields that are present but null, which would
        # otherwise break the string concatenation below.
        title = post.get("title") or ""
        selftext = post.get("selftext") or ""
        full_text = (title + "\n" + selftext) if selftext else title

        sentiment_response = analyze_sentiment(full_text, ticker)
        score_val = parse_sentiment_score(sentiment_response)
        # "Reasons" keeps the model's full response, prefix included.
        result_entry = {"Score": score_val, "Reasons": sentiment_response}
        sentiment_results[ticker]["posts"].append({
            "post_id": post_id,
            "sentiment": result_entry
        })
        # Print the result without repeating the prefix.
        clean_reasons = strip_prefix(sentiment_response)
        print(f"Analyzed post {post_id} for ticker {ticker}: Score={score_val}, Reasons={clean_reasons}")

        # Save raw sentiment results after each post so that progress
        # survives an interrupted run.
        save_json(sentiment_results, sentiment_output_filename)

    # Process all comments for this ticker.
    for comment in ticker_data.get("comment_content", []):
        comment_id = comment.get("comment_id")
        # A null body is sent as empty text rather than as None.
        comment_text = comment.get("comment_body") or ""
        sentiment_response = analyze_sentiment(comment_text, ticker)
        score_val = parse_sentiment_score(sentiment_response)
        result_entry = {"Score": score_val, "Reasons": sentiment_response}
        sentiment_results[ticker]["comments"].append({
            "comment_id": comment_id,
            "sentiment": result_entry
        })
        clean_reasons = strip_prefix(sentiment_response)
        print(f"Analyzed comment {comment_id} for ticker {ticker}: Score={score_val}, Reasons={clean_reasons}")

        # Save raw sentiment results after each comment.
        save_json(sentiment_results, sentiment_output_filename)

    # After processing all posts and comments for this ticker, compute the weighted aggregate sentiment.
    weighted_results[ticker] = compute_weighted_for_ticker(ticker_data, sentiment_results[ticker])
    save_json(weighted_results, weighted_output_filename)
    overall_score = weighted_results[ticker]["overall_weighted_sentiment"]
    print(f"Weighted aggregate for ticker {ticker}: Score={overall_score}")
    print(f"Detailed aggregate: {json.dumps(weighted_results[ticker], indent=4)}\n")

print("Processing complete. Check the output JSON files for results.")



=== Processing ticker: T ===
Analyzed post 1ivq6dn for ticker T: Score=8.0, Reasons=The text expresses a highly positive and enthusiastic sentiment towards the author's newfound sense of belonging among WallStreetBets investors, particularly the "degen" community. The author draws parallels between their own thought patterns and behaviors associated with schizophrenia and those of skilled investors, conveying a sense of pride and camaraderie. The tone is optimistic, and the author's language suggests a strong sense of excitement and gratitude. However, the text does not explicitly discuss the stock "T" or provide any direct analysis or prediction, which prevents the score from being a perfect 10. Additionally, the PS section at the end,
Analyzed post 1ivkuuk for ticker T: Score=1.0, Reasons=The text expresses extreme anxiety and concern about the potential loss of money invested in PLTR, which is affecting the author's personal life, including their ability to pay their mortgage. The 

KeyboardInterrupt: 