In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import yfinance as yf
import os

# Social Media

## Twitter Sentiment Analysis

In [None]:


sentiment_df = pd.read_csv('sentiment_data.csv')

"""sentiment_df['date'] = pd.to_datetime(sentiment_df['date'])

sentiment_df = sentiment_df.set_index(['date', 'symbol'])

sentiment_df['engagement_ratio'] = sentiment_df['twitterComments']/sentiment_df['twitterLikes']

sentiment_df = sentiment_df[(sentiment_df['twitterLikes']>20)&(sentiment_df['twitterComments']>10)]

sentiment_df"""
sentiment_df['symbol']

(27235,)

### Aggregate Monthly and Calculate Average Sentiment for the Month

In [None]:
aggragated_df = (sentiment_df.reset_index('symbol').groupby([pd.Grouper(freq='M'), 'symbol'])
                    [['engagement_ratio']].mean())

aggragated_df['rank'] = (aggragated_df.groupby(level=0)['engagement_ratio']
                         .transform(lambda x: x.rank(ascending=False)))

aggragated_df

### Select Top 5 Stocks on Cross-Sectional Ranking

In [None]:
filtered_df = aggragated_df[aggragated_df['rank']<6].copy()

filtered_df = filtered_df.reset_index(level=1)

filtered_df.index = filtered_df.index+pd.DateOffset(1)

filtered_df = filtered_df.reset_index().set_index(['date', 'symbol'])

filtered_df.head(20)

### Extract the Stocks to Form Portfolios

In [None]:
dates = filtered_df.index.get_level_values('date').unique().tolist()

fixed_dates = {}

for d in dates:
    
    fixed_dates[d.strftime('%Y-%m-%d')] = filtered_df.xs(d, level=0).index.tolist()
    
fixed_dates

### Download Fresh Stock Prices for Shortlisted Stocks

In [None]:
stocks_list = sentiment_df.index.get_level_values('symbol').unique().tolist()

prices_df = yf.download(tickers=stocks_list,
                        start='2021-01-01',
                        end='2023-03-01')

### Calculate Portfolio Returns with Monthly Rebalancing

In [None]:
returns_df = np.log(prices_df['Adj Close']).diff().dropna()

portfolio_df = pd.DataFrame()

for start_date in fixed_dates.keys():
    
    end_date = (pd.to_datetime(start_date)+pd.offsets.MonthEnd()).strftime('%Y-%m-%d')
    
    cols = fixed_dates[start_date]
    
    temp_df = returns_df[start_date:end_date][cols].mean(axis=1).to_frame('portfolio_return')
    
    portfolio_df = pd.concat([portfolio_df, temp_df], axis=0)
    
portfolio_df

### Download NASDAQ Prices and Calculate Returns to Compare Strategies

In [None]:
qqq_df = yf.download(tickers='QQQ',
                     start='2021-01-01',
                     end='2023-03-01')

qqq_ret = np.log(qqq_df['Adj Close']).diff().to_frame('nasdaq_return')

portfolio_df = portfolio_df.merge(qqq_ret,
                                  left_index=True,
                                  right_index=True)

portfolio_df

In [None]:
plt.style.use('ggplot')

portfolios_cumulative_return = np.exp(np.log1p(portfolio_df).cumsum()).sub(1)

portfolios_cumulative_return.plot(figsize=(16,6))

plt.title('Twitter Engagement Ratio Strategy Return Over Time')

plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1))

plt.ylabel('Return')

plt.show()

## Reddit Sentiment

In [2]:
import praw
import pandas as pd
import numpy as np
import yfinance as yf
from nltk.sentiment import SentimentIntensityAnalyzer
from datetime import datetime, timedelta
import os
from dotenv import load_dotenv
import nltk

# Download required NLTK data
nltk.download('vader_lexicon')

# Load environment variables
load_dotenv()

class StockSentimentAnalyzer:
    def __init__(self):
        # Initialize Reddit API
        self.reddit = praw.Reddit(
            client_id=os.getenv('REDDIT_CLIENT_ID'),
            client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
            user_agent=os.getenv('REDDIT_USER_AGENT')
        )
        self.sia = SentimentIntensityAnalyzer()

    def get_reddit_posts(self, stock_symbol, limit=100):
        """Fetch Reddit posts about a specific stock."""
        posts = []
        for post in self.reddit.subreddit('stocks+investing+wallstreetbets').search(
            f'{stock_symbol} stock', limit=limit
        ):
            posts.append({
                'title': post.title,
                'text': post.selftext,
                'score': post.score,
                'created_utc': datetime.fromtimestamp(post.created_utc)
            })
        return pd.DataFrame(posts)

    def analyze_sentiment(self, text):
        """Analyze sentiment of text using VADER."""
        return self.sia.polarity_scores(text)['compound']

    def get_stock_data(self, stock_symbol, days=30):
        """Fetch historical stock data."""
        end_date = datetime.now()
        start_date = end_date - timedelta(days=days)
        stock = yf.Ticker(stock_symbol)
        return stock.history(start=start_date, end=end_date)

    def predict_trend(self, stock_symbol):
        """Predict stock trend based on Reddit sentiment."""
        # Get Reddit posts
        posts_df = self.get_reddit_posts(stock_symbol)
        
        # Calculate average sentiment
        posts_df['sentiment'] = posts_df['text'].apply(self.analyze_sentiment)
        avg_sentiment = posts_df['sentiment'].mean()
        
        # Get stock data
        stock_data = self.get_stock_data(stock_symbol)
        
        # Simple trend prediction
        if avg_sentiment > 0.2:
            return "Bullish (Positive sentiment detected)"
        elif avg_sentiment < -0.2:
            return "Bearish (Negative sentiment detected)"
        else:
            return "Neutral (Mixed sentiment)"

def main():
    # Initialize analyzer
    analyzer = StockSentimentAnalyzer()
    
    # Get user input
    stock_symbol = 'AAPL'
    
    try:
        # Get prediction
        prediction = analyzer.predict_trend(stock_symbol)
        print(f"\nAnalysis for {stock_symbol}:")
        print(prediction)
        
        # Show some sample posts
        posts = analyzer.get_reddit_posts(stock_symbol, limit=5)
        print("\nRecent Reddit posts about this stock:")
        for _, post in posts.iterrows():
            print(f"\nTitle: {post['title']}")
            print(f"Sentiment: {analyzer.analyze_sentiment(post['text']):.2f}")
            
    except Exception as e:
        print(f"Error: {str(e)}")

if __name__ == "__main__":
    main() 

[nltk_data] Downloading package vader_lexicon to C:\Users\Luke
[nltk_data]     Gearin\AppData\Roaming\nltk_data...


MissingRequiredAttributeException: Required configuration setting 'client_id' missing. 
This setting can be provided in a praw.ini file, as a keyword argument to the Reddit class constructor, or as an environment variable.

# News Sources
* News APIs / RSS - Bloomberg, Routers, Financial Times, Nasdaq, etc.
* Press Releases / SEC Filings - via EDGAR / SEC API
* Stock Messaging Boards (potential)

How I want it to work given grouping of stocks (each researched at different time):
1. Google search stock and go to "News" page
2. Grab every news article from the prior month on the stock (maybe save titles and results incase overlap)
    - Potentially filter out less reputable OR underweight in overall rating
3. Run sentiment analysis on articles and calculate sentiment ratio

In [10]:
from GoogleNews import GoogleNews
import pandas as pd
from datetime import date, timedelta

def fetch_google_news(ticker: str, days_back: int = 30, max_pages: int = 5):
    end_date = date.today()
    start_date = end_date - timedelta(days=days_back)
    
    gn = GoogleNews(lang='en')
    gn.set_time_range(start_date.strftime('%m/%d/%Y'), end_date.strftime('%m/%d/%Y'))

    query = f"{ticker} stock"
    gn.search(query)

    all_results = []
    for i in range(1, max_pages + 1):
        gn.getpage(i)
        results = gn.result()
        all_results.extend(results)

    # Remove duplicates
    seen = set()
    unique_results = []
    for item in all_results:
        if item['link'] not in seen:
            seen.add(item['link'])
            unique_results.append(item)

    df = pd.DataFrame(unique_results)
    return df[['title', 'media', 'date', 'link']] if not df.empty else pd.DataFrame()

# Example
df = fetch_google_news("AAPL", days_back=30, max_pages=5)
print(df)


                                                title  \
0   Is Tim Cook Stepping Down? Apple CEO's 65th Bi...   
1   Everyone’s Hacking iPhones! Apple Increases iO...   
2   Apple Shares Face Mounting Pressure from Trade...   
3                 Apple (AAPL) Stock Might Be Rotting   
4   Why Google, Microsoft, Apple Pouring Billions ...   
5   Apple beats Elon Musk to acquire Prompt AI for...   
6   Apple (AAPL) Price Target Stays at $220 as UBS...   
7                    Is There a Future for Sirius XM?   
8                Where Will Apple Stock Be in 1 Year?   
9          Apple's Descent into Competing with Xiaomi   
10  Is Tim Cook Stepping Down? Apple CEO's 65th Bi...   
11  Everyone’s Hacking iPhones! Apple Increases iO...   
12  Apple Shares Face Mounting Pressure from Trade...   
13                Apple (AAPL) Stock Might Be Rotting   
14  Why Google, Microsoft, Apple Pouring Billions ...   
15  Apple beats Elon Musk to acquire Prompt AI for...   
16  Apple (AAPL) Price Target S

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   60 non-null     object
 1   media   60 non-null     object
 2   date    60 non-null     object
 3   link    60 non-null     object
dtypes: object(4)
memory usage: 2.0+ KB
