In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet,LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [3]:
df = pd.read_csv('../data/dataset.csv')
df = df.sort_values(["ticker", "date"])

# previous 1-day return
df["prev_ret_1d"] = df.groupby("ticker")["daily_return"].shift(1)

# previous 3-day return (past window)
df["prev_ret_3d"] = (
    df.groupby("ticker")["daily_return"]
      .rolling(3).sum()
      .shift(1)
      .reset_index(level=0, drop=True)
)

# previous 5-day return (past window)
df["prev_ret_5d"] = (
    df.groupby("ticker")["daily_return"]
      .rolling(5).sum()
      .shift(1)
      .reset_index(level=0, drop=True)
)


In [4]:
target = 'ret_5d'
features = [
    "sent_raw",
    "sent_weighted",
    "sent_ewma_2",
    "sent_ewma_5",
    "sent_ewma_10",
    "followers_mean",
    "tweet_count",
    "prev_ret_1d",
    "prev_ret_3d",
    "prev_ret_5d",
]


X = df[features].copy()
y = df[target].copy()

mask = X.notna().all(axis=1) & y.notna()
X = X[mask]
y = y[mask]

X_train,X_test,y_train,y_test = train_test_split(
    X,y,test_size=0.2,random_state=42
)

In [5]:
#Feature selection 

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)

enet = ElasticNet(alpha=0.001,l1_ratio=0.5,max_iter=5000,random_state=42)
enet.fit(X_train_s,y_train)
y_pred = enet.predict(X_test_s)

coefs = pd.Series(enet.coef_, index=features)
print("Elastic Net coefs:\n", coefs)


selected = list(coefs[coefs != 0].index)
if not selected:
    selected = features
print("\nSelected features:", selected)


print("MSE:", mean_squared_error(y_test, y_pred))
print("R^2:", r2_score(y_test, y_pred))


Elastic Net coefs:
 sent_raw          0.000000
sent_weighted     0.000000
sent_ewma_2       0.000000
sent_ewma_5       0.000000
sent_ewma_10     -0.000000
followers_mean   -0.000000
tweet_count       0.000073
prev_ret_1d      -0.000069
prev_ret_3d      -0.001719
prev_ret_5d      -0.000000
dtype: float64

Selected features: ['tweet_count', 'prev_ret_1d', 'prev_ret_3d']
MSE: 0.0010645100365252922
R^2: 0.0004464834758999281


In [6]:
scaler_sel = StandardScaler()
X_train_sel_s = scaler_sel.fit_transform(X_train[selected])
X_test_sel_s  = scaler_sel.transform(X_test[selected])

linreg = LinearRegression()
linreg.fit(X_train_sel_s, y_train)
y_pred = linreg.predict(X_test_sel_s)

print("\nMSE:", mean_squared_error(y_test, y_pred))
print("R^2:", r2_score(y_test, y_pred))
print("\nLinReg coefs:\n", pd.Series(linreg.coef_, index=selected))


MSE: 0.0010655567324309866
R^2: -0.0005363429300240252

LinReg coefs:
 tweet_count    0.000569
prev_ret_1d   -0.000386
prev_ret_3d   -0.002033
dtype: float64
