In [1]:
!pip install vaderSentiment



In [2]:
import pandas as pd
import numpy as np

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [3]:
# Load the merged dataset from the working directory.
# NOTE(review): unpickling can execute arbitrary code - only load a
# "merged_dataset.pkl" that was produced by a trusted pipeline.
df = pd.read_pickle("merged_dataset.pkl")

# Quick look: (rows, columns) plus the first few records.
(df.shape, df.head())

((81803, 19),
   ticker        date                created_at  \
 0     VZ  2014-01-05 2014-01-05 20:12:05+00:00   
 1     VZ  2014-01-15 2014-01-15 09:28:46+00:00   
 2      T  2014-01-15 2014-01-15 09:28:46+00:00   
 3     VZ  2014-01-21 2014-01-21 17:04:22+00:00   
 4    IBM  2014-01-21 2014-01-21 15:24:20+00:00   
 
                                           clean_text  \
 0                 the s amp ps worst sectors in 2013   
 1  ahah had to search ticker as well after i saw ...   
 2  ahah had to search ticker as well after i saw ...   
 3  sorry t mobile verizon is still the mightiest ...   
 4  hoy reportaran al cierre de mercado entre otra...   
 
                                            base_text  user.followers_count  \
 0  $VZ The S&amp;Ps Worst Sectors in 2013 http://...                     9   
 1  @maoxian ahah had to search ticker as well aft...                  6069   
 2  @maoxian ahah had to search ticker as well aft...                  6069   
 3  $VZ - Sorry T-

In [4]:
df = (
    df
    # Reduce `date` to plain calendar days (`datetime.date` objects).
    .assign(date=lambda frame: pd.to_datetime(frame["date"]).dt.date)
    # Drop rows without returns (if any)
    .dropna(subset=["daily_return", "ret_3d", "ret_5d", "ret_7d"])
    # Sort for later EWMA
    .sort_values(["ticker", "date", "created_at"])
)

# Remaining (rows, columns) after the filter.
df.shape

(34851, 19)

In [5]:
# One shared VADER analyzer is reused for every tweet.
analyzer = SentimentIntensityAnalyzer()

# Cast to `str` first so non-string entries are scored as text too.
tweet_texts = df["clean_text"].astype(str)

# Keep only VADER's "compound" score as the raw per-tweet sentiment.
df["sent_raw"] = tweet_texts.map(
    lambda text: analyzer.polarity_scores(text)["compound"]
)

# Distribution summary of the per-tweet scores.
df["sent_raw"].describe()

count    34851.000000
mean         0.107053
std          0.302995
min         -0.922500
25%          0.000000
50%          0.000000
75%          0.318200
max          0.953800
Name: sent_raw, dtype: float64

In [7]:
def agg_group(g):
    """Collapse the tweets of one (ticker, date) group into a single row.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of one group. Must contain the columns ``sent_raw``,
        ``user.followers_count``, ``daily_return``, ``ret_3d``, ``ret_5d``
        and ``ret_7d``.

    Returns
    -------
    pandas.Series
        ``sent_raw``       - plain mean of the per-tweet sentiment.
        ``sent_weighted``  - follower-weighted mean of the sentiment
                             (NaN when no tweet in the group has a score).
        ``followers_mean`` - mean follower count.
        ``tweet_count``    - number of rows in the group.
        ``daily_return``, ``ret_3d``, ``ret_5d``, ``ret_7d`` - values taken
        from the first row of the group.
    """
    sent = g["sent_raw"]

    # Missing follower counts become 0, then every weight is floored at 1 so
    # that accounts with no (known) followers still contribute.
    w = g["user.followers_count"].fillna(0).clip(lower=1)

    # Only rows that actually carry a sentiment score may contribute weight.
    # Otherwise a NaN score drops out of the numerator but its weight stays in
    # the denominator, dragging the weighted mean toward 0 and making it
    # inconsistent with the NaN-skipping plain mean.
    scored = sent.notna()
    weight_total = w[scored].sum()
    if weight_total > 0:
        sent_weighted = (sent[scored] * w[scored]).sum() / weight_total
    else:
        sent_weighted = float("nan")

    return pd.Series({
        "sent_raw": sent.mean(),
        "sent_weighted": sent_weighted,
        "followers_mean": g["user.followers_count"].mean(),
        "tweet_count": len(g),
        # NOTE(review): presumably the return columns are constant within a
        # (ticker, date) group, so the first row is representative - confirm
        # against the upstream merge.
        "daily_return": g["daily_return"].iloc[0],
        "ret_3d": g["ret_3d"].iloc[0],
        "ret_5d": g["ret_5d"].iloc[0],
        "ret_7d": g["ret_7d"].iloc[0],
    })

# Columns that `agg_group` reads. Selecting them explicitly keeps the grouping
# keys out of the frames handed to `apply`, which silences the pandas
# deprecation warning about `apply` operating on the grouping columns and
# behaves the same on versions where the keys are always excluded.
agg_input_cols = [
    "sent_raw",
    "user.followers_count",
    "daily_return",
    "ret_3d",
    "ret_5d",
    "ret_7d",
]

# One row per (ticker, date); `reset_index()` turns the group keys back into
# ordinary columns on a fresh 0..n-1 index.
daily = (
    df
    .groupby(["ticker", "date"])[agg_input_cols]
    .apply(agg_group)
    .reset_index()
)

daily.shape, daily.head()

  .apply(agg_group)


((11177, 10),
   ticker        date  sent_raw  sent_weighted  followers_mean  tweet_count  \
 0   AAPL  2014-01-02  0.103657       0.072854     1182.571429          7.0   
 1   AAPL  2014-01-03  0.159500      -0.038990     4002.333333          3.0   
 2   AAPL  2014-01-06  0.079844       0.012992     9452.555556          9.0   
 3   AAPL  2014-01-07  0.354350       0.455834     3946.500000         14.0   
 4   AAPL  2014-01-08  0.281275       0.279246     6444.500000          4.0   
 
    daily_return    ret_3d    ret_5d    ret_7d  
 0     -0.014064 -0.023665 -0.030029 -0.031457  
 1     -0.021966  0.004584 -0.014862  0.010000  
 2      0.005453 -0.013623 -0.015075  0.024691  
 3     -0.007151 -0.013147  0.011758  0.026313  
 4      0.006333 -0.014224  0.025577 -0.005134  )

In [8]:
# The exponentially weighted mean is order-dependent, so sort each ticker's
# rows by date before smoothing.
daily = daily.sort_values(["ticker", "date"])

# Daily mean sentiment, grouped per ticker so smoothing never crosses tickers.
sent_by_ticker = daily.groupby("ticker")["sent_raw"]

# One smoothed column per span; `adjust=False` selects the recursive form of
# the exponentially weighted mean.
for span in (2, 5, 10):
    ewma_col = f"sent_ewma_{span}"
    daily[ewma_col] = sent_by_ticker.transform(
        lambda s: s.ewm(span=span, adjust=False).mean()
    )

daily.head()

Unnamed: 0,ticker,date,sent_raw,sent_weighted,followers_mean,tweet_count,daily_return,ret_3d,ret_5d,ret_7d,sent_ewma_2,sent_ewma_5,sent_ewma_10
0,AAPL,2014-01-02,0.103657,0.072854,1182.571429,7.0,-0.014064,-0.023665,-0.030029,-0.031457,0.103657,0.103657,0.103657
1,AAPL,2014-01-03,0.1595,-0.03899,4002.333333,3.0,-0.021966,0.004584,-0.014862,0.01,0.140886,0.122271,0.11381
2,AAPL,2014-01-06,0.079844,0.012992,9452.555556,9.0,0.005453,-0.013623,-0.015075,0.024691,0.100192,0.108129,0.107635
3,AAPL,2014-01-07,0.35435,0.455834,3946.5,14.0,-0.007151,-0.013147,0.011758,0.026313,0.269631,0.190203,0.152492
4,AAPL,2014-01-08,0.281275,0.279246,6444.5,4.0,0.006333,-0.014224,0.025577,-0.005134,0.277394,0.22056,0.175907


In [9]:
# Return horizon used as the prediction target.
target_col = "ret_5d"

# Sentiment and activity features offered to the model.
feature_cols = [
    "sent_raw",
    "sent_weighted",
    "sent_ewma_2",
    "sent_ewma_5",
    "sent_ewma_10",
    "followers_mean",
    "tweet_count",
]

# Output layout: identifiers first, then features, then every return horizon.
id_cols = ["ticker", "date"]
return_cols = ["daily_return", "ret_3d", "ret_5d", "ret_7d"]
ml_columns = id_cols + feature_cols + return_cols

# Keep only rows that have a target value, restricted to the columns above.
ml = daily.dropna(subset=[target_col])[ml_columns].copy()

ml.shape, ml.head()

((11177, 13),
   ticker        date  sent_raw  sent_weighted  sent_ewma_2  sent_ewma_5  \
 0   AAPL  2014-01-02  0.103657       0.072854     0.103657     0.103657   
 1   AAPL  2014-01-03  0.159500      -0.038990     0.140886     0.122271   
 2   AAPL  2014-01-06  0.079844       0.012992     0.100192     0.108129   
 3   AAPL  2014-01-07  0.354350       0.455834     0.269631     0.190203   
 4   AAPL  2014-01-08  0.281275       0.279246     0.277394     0.220560   
 
    sent_ewma_10  followers_mean  tweet_count  daily_return    ret_3d  \
 0      0.103657     1182.571429          7.0     -0.014064 -0.023665   
 1      0.113810     4002.333333          3.0     -0.021966  0.004584   
 2      0.107635     9452.555556          9.0      0.005453 -0.013623   
 3      0.152492     3946.500000         14.0     -0.007151 -0.013147   
 4      0.175907     6444.500000          4.0      0.006333 -0.014224   
 
      ret_5d    ret_7d  
 0 -0.030029 -0.031457  
 1 -0.014862  0.010000  
 2 -0.015075 

In [10]:
# Persist the modelling frame twice: a CSV without the index column, and a
# pickle of the same frame.
ml.to_csv(path_or_buf="ml_dataset_high_impact.csv", index=False)
ml.to_pickle(path="ml_dataset_high_impact.pkl")