In [130]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor


df = pd.read_csv("../data/tweets_vader_sentiment.csv")
df.head()


Unnamed: 0,Date,Tweet,Stock Name,Company Name,Open,High,Low,Close,Adj Close,Volume,clean_text,vader_score,vader_label
0,2021-09-30,I bought my first $AAPL stock in 2010. \n\nSin...,AAPL,Apple Inc.,143.660004,144.380005,141.279999,141.5,140.478485,89056700.0,bought first aapl stock 2010 since seen 137229...,0.3182,1
1,2021-09-30,The media is really pushing hard for their big...,AAPL,Apple Inc.,143.660004,144.380005,141.279999,141.5,140.478485,89056700.0,media really pushing hard big money clients ts...,0.1585,1
2,2021-09-30,"In 2020, Tim Cook, CEO of $AAPL, earned $265,0...",AAPL,Apple Inc.,143.660004,144.380005,141.279999,141.5,140.478485,89056700.0,2020 tim cook ceo aapl earned 265000000 total ...,0.0,0
3,2021-09-30,This thread is just a broad overview of the ba...,AAPL,Apple Inc.,143.660004,144.380005,141.279999,141.5,140.478485,89056700.0,thread broad overview balance sheet want speci...,0.4767,1
4,2021-09-30,"All about this trendline now on $AAPL, continu...",AAPL,Apple Inc.,143.660004,144.380005,141.279999,141.5,140.478485,89056700.0,trendline aapl continuing reject,-0.4019,-1


In [131]:
df['Date'] = pd.to_datetime(df['Date'])

price_df = df[['Stock Name','Date','Close']].drop_duplicates()

price_df = price_df.sort_values(['Stock Name','Date']).reset_index(drop=True)

price_df.head()


price_df['next_close'] = price_df.groupby('Stock Name')['Close'].shift(-1)

price_df['next_return'] = (price_df['next_close'] - price_df['Close']) / price_df['Close']

price_df.tail()

df = df.merge(
    price_df[['Stock Name', 'Date', 'next_close', 'next_return']],
    on=['Stock Name', 'Date'],
    how='left'
)

df[['Stock Name', 'Date', 'Close', 'next_close', 'next_return']].sample(10)





Unnamed: 0,Stock Name,Date,Close,next_close,next_return
31785,PYPL,2022-09-17,,,
65191,TSLA,2022-08-10,294.356659,286.630005,-0.026249
15119,KO,2022-05-19,60.0,60.98,0.016333
26285,NIO,2022-06-17,20.77,,
6800,AMD,2022-06-18,,,
21037,MSFT,2022-06-06,268.75,272.5,0.013953
53606,TSLA,2022-04-13,340.790009,328.333344,-0.036552
40164,TSLA,2021-12-06,336.33667,350.583344,0.042358
70797,TSM,2021-11-15,118.120003,118.080002,-0.000339
68066,TSLA,2022-09-14,302.609985,303.75,0.003767


In [132]:
df['log_volume'] = np.log1p(df['Volume'])

df[['Volume','log_volume']].head()


df['abs_vader'] = df['vader_score'].abs()

df[['vader_score', 'abs_vader']].head()

Unnamed: 0,vader_score,abs_vader
0,0.3182,0.3182
1,0.1585,0.1585
2,0.0,0.0
3,0.4767,0.4767
4,-0.4019,0.4019


In [133]:
# --- Feature engineering ---


df['tweet_length'] = df['clean_text'].apply(
    lambda x: len(str(x).split())
)


df['day_of_week'] = df['Date'].dt.dayofweek

df[['clean_text', 'tweet_length', 'day_of_week', 'vader_score']].head()


Unnamed: 0,clean_text,tweet_length,day_of_week,vader_score
0,bought first aapl stock 2010 since seen 137229...,12,3,0.3182
1,media really pushing hard big money clients ts...,29,3,0.1585
2,2020 tim cook ceo aapl earned 265000000 total ...,26,3,0.0
3,thread broad overview balance sheet want speci...,21,3,0.4767
4,trendline aapl continuing reject,4,3,-0.4019
