In [164]:
# --- 1. Import Libraries & Load Data ---------------------------------------
import pandas as pd
import yfinance as yf
import nltk
from nltk.corpus import stopwords

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

nltk.download('stopwords', download_dir='/Users/indranili/nltk_data')

tweets_df = pd.read_csv("../data/filtered_stocks_tweets.csv")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/indranili/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [165]:
print(tweets_df.columns.tolist())
print(tweets_df.head())


['tweet_id', 'date', 'stock_ticker', 'text', 'user_followers', 'user_screen_name', 'user_verified', 'word_count', 'caps_ratio', 'has_url', 'emoji_count', 'cashtag_count', 'total_engagement', 'is_retweet', 'original_author', 'original_retweets', 'original_favorites', 'current_retweets', 'current_favorites', 'text_cleaned']
             tweet_id        date stock_ticker  \
0  459825460810383360  2014-04-26            V   
1  459827071418925056  2014-04-26            V   
2  459977378496667648  2014-04-26            V   
3  459994705191632896  2014-04-26            V   
4  459996785579028480  2014-04-26            V   

                                                text  user_followers  \
0  RT @philstockworld: Yesterday's Featured $AAPL...          459556   
1  RT @philstockworld: Yesterday's Featured $AAPL...          459554   
2  RT @philstockworld: Yesterday's Featured $AAPL...          459595   
3  RT @philstockworld: Yesterday's Featured $AAPL...          459590   
4  RT @philstoc

In [166]:
tweets_df['date'] = pd.to_datetime(tweets_df['date']).dt.date


tweets_df['vader'] = tweets_df['text_cleaned'].apply(
    lambda x: analyzer.polarity_scores(str(x))['compound']
)

daily = (
    tweets_df
    .groupby(['stock_ticker', 'date'])
    .agg(
        avg_vader    = ('vader', 'mean'),
        std_vader    = ('vader', 'std'),
        tweet_count  = ('text', 'count'),
        avg_length   = ('text', lambda x: x.str.len().mean())
    )
    .reset_index()
)




In [167]:
start_date = tweets_df['date'].min().strftime('%Y-%m-%d')
end_date   = tweets_df['date'].max().strftime('%Y-%m-%d')

symbols = daily['stock_ticker'].unique()
price_list = []

for sym in symbols:
    dfp = yf.download(sym, start=start_date, end=end_date, auto_adjust=True)

    # ---- FORCE FLATTEN MULTIINDEX ----
    dfp.columns = dfp.columns.get_level_values(0)

    # now dfp has: Open, High, Low, Close, Volume
    dfp = dfp[['Close']]
    dfp['return'] = dfp['Close'].pct_change()
    dfp['stock_ticker'] = sym
    dfp['date'] = dfp.index.date
    price_list.append(dfp)

price_df = pd.concat(price_list).reset_index(drop=True)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['BBL']: YFTzMissingError('possibly delisted; no timezone found')
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed downloa

In [168]:
# =====================================================================
# FULL PIPELINE FOR MULTI-HORIZON RETURNS (5D, 10D, 20D)
# =====================================================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor


# ============================================================
# 1. MERGE TWEET FEATURES + PRICE DATA
# ============================================================
merged = pd.merge(
    daily,
    price_df[['stock_ticker', 'date', 'Close', 'return']],
    on=['stock_ticker', 'date'],
    how='inner'
)

# ============================================================
# 2. TARGET HORIZONS: 5D, 10D, 20D FUTURE RETURNS
# ============================================================
merged['target_5d'] = merged.groupby('stock_ticker')['Close'].pct_change(-5)
merged['target_10d'] = merged.groupby('stock_ticker')['Close'].pct_change(-10)
merged['target_20d'] = merged.groupby('stock_ticker')['Close'].pct_change(-20)

# ============================================================
# 3. MOMENTUM FEATURES (past returns)
# ============================================================
merged['mom_5d'] = merged.groupby('stock_ticker')['Close'].pct_change(5)
merged['mom_10d'] = merged.groupby('stock_ticker')['Close'].pct_change(10)
merged['mom_20d'] = merged.groupby('stock_ticker')['Close'].pct_change(20)

# 1-day momentum (baseline)
merged['prev_return_1'] = merged.groupby('stock_ticker')['return'].shift(1)

# ============================================================
# 4. SENTIMENT EWMA (captures sentiment trends)
# ============================================================
merged['sent_ewma'] = (
    merged.groupby('stock_ticker')['avg_vader']
          .transform(lambda x: x.ewm(alpha=0.3).mean())
)

# ============================================================
# 5. VOLATILITY FEATURES
# ============================================================
merged['vol_5'] = merged.groupby('stock_ticker')['return'].transform(lambda x: x.rolling(5).std())
merged['vol_10'] = merged.groupby('stock_ticker')['return'].transform(lambda x: x.rolling(10).std())
merged['vol_20'] = merged.groupby('stock_ticker')['return'].transform(lambda x: x.rolling(20).std())

# Mean return features
merged['mean_ret_5'] = merged.groupby('stock_ticker')['return'].transform(lambda x: x.rolling(5).mean())
merged['mean_ret_10'] = merged.groupby('stock_ticker')['return'].transform(lambda x: x.rolling(10).mean())
merged['mean_ret_20'] = merged.groupby('stock_ticker')['return'].transform(lambda x: x.rolling(20).mean())

# ============================================================
# 6. PRICE RANGE VOLATILITY
# ============================================================
merged['range_vol'] = (
    merged.groupby('stock_ticker')['Close']
          .transform(lambda x: (x - x.rolling(5).min()) / x.rolling(5).min())
)

# ============================================================
# 7. CLEAN UP DATA
# ============================================================
merged['std_vader'] = merged['std_vader'].fillna(0)

merged = merged.dropna(subset=[
    'mom_5d', 'mom_10d', 'mom_20d',
    'vol_5', 'vol_10', 'vol_20',
    'mean_ret_5', 'mean_ret_10', 'mean_ret_20',
    'range_vol',
    'target_5d', 'target_10d', 'target_20d'
])

print("Merged shape:", merged.shape)

# ============================================================
# 8. FEATURE MATRIX
# ============================================================
feature_cols = [
    'avg_vader', 'std_vader', 'tweet_count', 'avg_length', 'sent_ewma',

    # Momentum
    'prev_return_1',
    'mom_5d', 'mom_10d', 'mom_20d',

    # Volatility
    'vol_5', 'vol_10', 'vol_20',

    # Trend
    'mean_ret_5', 'mean_ret_10', 'mean_ret_20',

    # Range volatility
    'range_vol'
]

X = merged[feature_cols]

# ============================================================
# FUNCTION: Evaluate model for given target horizon
# ============================================================
def evaluate_horizon(target):
    print("\n==============================")
    print(f"Evaluating horizon: {target}")
    print("==============================")

    y = merged[target]

    # Time split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=False
    )

    # ---- Elastic Net ----
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled  = scaler.transform(X_test)

    enet = ElasticNet(alpha=0.001, l1_ratio=0.5, max_iter=5000)
    enet.fit(X_train_scaled, y_train)
    print("Elastic Net R²:", enet.score(X_test_scaled, y_test))

    # ---- EN → LR ----
    coefs = pd.Series(enet.coef_, index=X.columns)
    selected = list(coefs[coefs != 0].index)
    if len(selected) == 0:
        selected = X.columns

    lr = LinearRegression()
    lr.fit(X_train[selected], y_train)
    print("LR (selected) R²:", lr.score(X_test[selected], y_test))

    # ---- Random Forest ----
    rf = RandomForestRegressor(n_estimators=400, max_depth=8, min_samples_leaf=20)
    rf.fit(X_train, y_train)
    print("Random Forest R²:", rf.score(X_test, y_test))

    # ---- HGB ----
    hgb = HistGradientBoostingRegressor(max_depth=5, learning_rate=0.05, max_leaf_nodes=31)
    hgb.fit(X_train, y_train)
    print("HGB R²:", hgb.score(X_test, y_test))


# ============================================================
# 9. Evaluate all horizons (this is the main output)
# ============================================================
evaluate_horizon('target_5d')
evaluate_horizon('target_10d')
evaluate_horizon('target_20d')


Merged shape: (242, 23)

Evaluating horizon: target_5d
Elastic Net R²: 0.0223918552938992
LR (selected) R²: -0.008004206817759929
Random Forest R²: 0.034611174593545346
HGB R²: -0.05083219838959363

Evaluating horizon: target_10d
Elastic Net R²: -0.42769813614671315
LR (selected) R²: -0.44623893679267046
Random Forest R²: -0.1717585827664212
HGB R²: -0.3725153978031084

Evaluating horizon: target_20d
Elastic Net R²: -0.5799568294339654
LR (selected) R²: -0.6361190966933363
Random Forest R²: -0.560462272173941
HGB R²: -0.5860664528761528
