In [1]:
# Packages
import pickle
from textblob import TextBlob
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from collections import Counter
import nltk, spacy, re, json
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.stem import PorterStemmer
nltk.download("punkt")
nltk.download('stopwords')

# Set up
% matplotlib inline
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', -1)

[nltk_data] Downloading package punkt to /jet/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /jet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Combine Features

### Import sentiment features

In [2]:
### Import features: sentiment scores of Elon's Tesla-related tweets
# NOTE: unpickling can execute arbitrary code; only load files you trust.
sentiment_path = '../../data/features/2019_05_17_nlp_sentiments.pickle'
with open(sentiment_path, "rb") as sentiment_file:
    elon_features = pickle.load(sentiment_file)

In [3]:
elon_features.shape

(6251, 5)

In [4]:
elon_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6251 entries, 0 to 6250
Data columns (total 5 columns):
id              6251 non-null object
Datetime        6251 non-null object
text            6251 non-null object
sentiment       6251 non-null float64
subjectivity    6251 non-null float64
dtypes: float64(2), object(3)
memory usage: 244.3+ KB


In [5]:
# Number of distinct timestamps; fewer than the row count means that several
# tweets share the same Datetime value.
elon_features['Datetime'].nunique(dropna=False)

6150

In [6]:
# Datetime cannot serve as a key: some tweets share the same timestamp.
# Tweets are therefore identified by id rather than by Datetime.
# Show every row whose Datetime repeats an earlier row's value.
repeated_timestamp = elon_features['Datetime'].duplicated()
elon_features.loc[repeated_timestamp]

Unnamed: 0,id,Datetime,text,sentiment,subjectivity
259,1108890675922112512,2019-03-22 00:38:00+00:00,new tesla referral program just releasedhttps://www.tesla.com/blog/teslas-new-customer-referral-program?redirect=no â€¦,0.136364,0.454545
264,1108812577327341568,2019-03-21 19:28:00+00:00,"jim, no response?",0.000000,0.000000
292,1107024770304868352,2019-03-16 21:04:00+00:00,"definitely down the road, once construction & upgrades arenâ€™t so crazy",-0.377778,0.594444
389,1102344922554724353,2019-03-03 23:08:00+00:00,i believe so,0.000000,0.000000
476,1098774731140956160,2019-02-22 02:41:00+00:00,"guys, i have feelings â€¦",0.000000,0.000000
491,1098653939141009408,2019-02-21 18:41:00+00:00,"merlins. the max chamber pressure run damaged raptor sn 1 (as expected). a lot of the parts are fine for reuse, but next tests will be with sn 2, which is almost done.",0.036111,0.216667
612,1091813829925732352,2019-02-02 21:41:00+00:00,i do like nonsense memes. itâ€™s true.,0.175000,0.325000
621,1091271584839393280,2019-02-01 09:46:00+00:00,logarithms,0.000000,0.000000
820,1079819504710967297,2018-12-31 19:20:00+00:00,happy new year!,0.485227,0.727273
841,1078022229927968768,2018-12-26 20:18:00+00:00,yes,0.000000,0.000000


In [7]:
elon_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6251 entries, 0 to 6250
Data columns (total 5 columns):
id              6251 non-null object
Datetime        6251 non-null object
text            6251 non-null object
sentiment       6251 non-null float64
subjectivity    6251 non-null float64
dtypes: float64(2), object(3)
memory usage: 244.3+ KB


### Add datetime features

In [36]:
### Import features: datetime related (year, month, day, DayofWeek, Weekend, BinaryTrading, DayDistance)
# NOTE: unpickling can execute arbitrary code; only load files you trust.
datetime_path = '../../data/features/2019_05_17_Datetime.pickle'
with open(datetime_path, "rb") as datetime_file:
    datetime_features = pickle.load(datetime_file)

In [37]:
datetime_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6251 entries, 0 to 6250
Data columns (total 20 columns):
Unnamed: 0       6251 non-null int64
Datetime         6251 non-null datetime64[ns, UTC]
retweets         6251 non-null int64
favorites        6251 non-null int64
text             6251 non-null object
id               6251 non-null object
emoji            186 non-null object
isreply          6242 non-null object
replyto          3950 non-null object
origin           6242 non-null float64
keyword          6251 non-null bool
year             6251 non-null int64
month            6251 non-null int64
day              6251 non-null int64
date             6251 non-null object
DayofWeek        6251 non-null int64
Weekend          6251 non-null bool
Time             6251 non-null object
BinaryTrading    6251 non-null int64
DayDistance      6251 non-null int64
dtypes: bool(2), datetime64[ns, UTC](1), float64(1), int64(9), object(7)
memory usage: 891.3+ KB


In [38]:
#elon_features['Datetime'] = pd.to_datetime(elon_features['Datetime'], utc=True)

In [39]:
elon_features.shape, datetime_features.shape

((6251, 5), (6251, 20))

In [40]:
elon_features[elon_features['id'].duplicated()]

Unnamed: 0,id,Datetime,text,sentiment,subjectivity


In [41]:
datetime_features[datetime_features['id'].duplicated()]

Unnamed: 0.1,Unnamed: 0,Datetime,retweets,favorites,text,id,emoji,isreply,replyto,origin,keyword,year,month,day,date,DayofWeek,Weekend,Time,BinaryTrading,DayDistance


In [42]:
# features = elon_features.set_index('Datetime').drop(['text'], axis = 1).merge(
#     datetime_features.set_index('Datetime'), left_index=True, right_index=True, how='inner')

In [43]:
### Merge sentiment and datetime features
# Join on the tweet id: Datetime values repeat across tweets, whereas id has
# no duplicates in either table. 'text' and 'Datetime' are dropped from the
# right-hand table because elon_features already carries them.
# validate='one_to_one' makes pandas raise a MergeError instead of silently
# multiplying rows if either table ever contains a repeated id.
features = elon_features.merge(
    datetime_features.drop(columns=['text', 'Datetime']),
    on='id',
    how='inner',
    validate='one_to_one',
)

In [44]:
features.shape

(6251, 22)

In [45]:
#features[features['Datetime'].duplicated()]

### Add Google Trend Variables

In [18]:
# Load the per-tweet table that carries the Google Trends score.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
gtrend_path = '../../data/features/2019_05_7_ElonGoogleTrend_Jeff.pickle'
with open(gtrend_path, "rb") as gtrend_file:
    googletrend_features = pickle.load(gtrend_file)

In [46]:
googletrend_features.shape

(6251, 17)

In [47]:
# Preview the first rows only rather than rendering the whole frame.
googletrend_features.head(10)

Unnamed: 0.1,Unnamed: 0,Datetime,retweets,favorites,text,id,emoji,isreply,replyto,origin,keyword,year,month,day,date,DayofWeek,gtrend
0,0,2019-04-14 23:31:00+00:00,107,4213,starship will land on a ring of fire,1117571159195668480,,True,jasonsellspa,1.117564e+18,False,2019,4,14,2019-04-14,6,28
1,1,2019-04-14 23:10:00+00:00,83,4981,6.5,1117565769829818368,,True,JaneidyEve,1.117564e+18,False,2019,4,14,2019-04-14,6,28
2,2,2019-04-14 23:06:00+00:00,61,4041,yes,1117564774190075904,,True,Johankyu7,1.117564e+18,False,2019,4,14,2019-04-14,6,28
3,3,2019-04-14 23:02:00+00:00,10739,117153,winter is coming,1117563769159286784,,False,,1.117564e+18,False,2019,4,14,2019-04-14,6,28
4,4,2019-04-14 23:01:00+00:00,12245,162803,thinking about adding giant stainless steel dragon wings to starship,1117563679099240449,,False,,1.117564e+18,False,2019,4,14,2019-04-14,6,28
5,5,2019-04-14 22:54:00+00:00,131,5201,"that tweet did take immense effort fair point tho. powerwall production is now ramping fast. tesla was cell-starved last year, so we had to switch all lines to make packs for cars, which meant powerwall production was living off scraps.",1117561885040283648,ðŸ¤£ ðŸ¤£,True,EforElectric,1.117554e+18,True,2019,4,14,2019-04-14,6,28
6,6,2019-04-14 22:46:00+00:00,65,2181,deal,1117559770955812865,,True,yourloyalpal,1.117554e+18,False,2019,4,14,2019-04-14,6,28
7,7,2019-04-14 22:21:00+00:00,4864,54282,please support my campaign to rebrand @ wsj as emoji!,1117553530615648256,ðŸ§¦,False,,1.117554e+18,False,2019,4,14,2019-04-14,6,28
8,8,2019-04-14 22:17:00+00:00,2073,42205,"at some point, @ theonion and i should just a get a room â€¦",1117552423373918208,,False,,1.117552e+18,False,2019,4,14,2019-04-14,6,28
9,9,2019-04-14 22:14:00+00:00,177,1979,just use this handy guidehttps://www.theonion.com/the-onion-s-guide-to-blockchain-technology-1829819640 â€¦,1117551672039841793,,True,techreview,1.117533e+18,False,2019,4,14,2019-04-14,6,28


In [48]:
# Attach the Google Trends score (gtrend) to each tweet, matched on tweet id.
# Only the key and the score are taken from the Google Trends table.
gtrend_by_id = googletrend_features[['id', 'gtrend']]
features = features.merge(gtrend_by_id, on='id', how='inner')

In [49]:
# Preview the first rows only rather than rendering the whole frame.
features.head(10)

Unnamed: 0.1,id,Datetime,text,sentiment,subjectivity,Unnamed: 0,retweets,favorites,emoji,isreply,...,year,month,day,date,DayofWeek,Weekend,Time,BinaryTrading,DayDistance,gtrend
0,1117571159195668480,2019-04-14 23:31:00+00:00,starship will land on a ring of fire,0.000000,0.000000,0,107,4213,,True,...,2019,4,14,2019-04-14,6,True,23:31:00,0,1,28
1,1117565769829818368,2019-04-14 23:10:00+00:00,6.5,0.000000,0.000000,1,83,4981,,True,...,2019,4,14,2019-04-14,6,True,23:10:00,0,1,28
2,1117564774190075904,2019-04-14 23:06:00+00:00,yes,0.000000,0.000000,2,61,4041,,True,...,2019,4,14,2019-04-14,6,True,23:06:00,0,1,28
3,1117563769159286784,2019-04-14 23:02:00+00:00,winter is coming,0.000000,0.000000,3,10739,117153,,False,...,2019,4,14,2019-04-14,6,True,23:02:00,0,1,28
4,1117563679099240449,2019-04-14 23:01:00+00:00,thinking about adding giant stainless steel dragon wings to starship,0.100000,0.600000,4,12245,162803,,False,...,2019,4,14,2019-04-14,6,True,23:01:00,0,1,28
5,1117561885040283648,2019-04-14 22:54:00+00:00,"that tweet did take immense effort fair point tho. powerwall production is now ramping fast. tesla was cell-starved last year, so we had to switch all lines to make packs for cars, which meant powerwall production was living off scraps.",0.183333,0.538889,5,131,5201,ðŸ¤£ ðŸ¤£,True,...,2019,4,14,2019-04-14,6,True,22:54:00,0,1,28
6,1117559770955812865,2019-04-14 22:46:00+00:00,deal,0.000000,0.000000,6,65,2181,,True,...,2019,4,14,2019-04-14,6,True,22:46:00,0,1,28
7,1117553530615648256,2019-04-14 22:21:00+00:00,please support my campaign to rebrand @ wsj as emoji!,0.000000,0.000000,7,4864,54282,ðŸ§¦,False,...,2019,4,14,2019-04-14,6,True,22:21:00,0,1,28
8,1117552423373918208,2019-04-14 22:17:00+00:00,"at some point, @ theonion and i should just a get a room â€¦",0.000000,0.000000,8,2073,42205,,False,...,2019,4,14,2019-04-14,6,True,22:17:00,0,1,28
9,1117551672039841793,2019-04-14 22:14:00+00:00,just use this handy guidehttps://www.theonion.com/the-onion-s-guide-to-blockchain-technology-1829819640 â€¦,0.600000,0.900000,9,177,1979,,True,...,2019,4,14,2019-04-14,6,True,22:14:00,0,1,28


In [50]:
features.shape

(6251, 23)

### Use only observations from 2015-05-23 onward, when Yahoo Finance data are available

In [51]:
import datetime

# First day for which Yahoo Finance data are accessible; tweets dated
# earlier are dropped, tweets on the cutoff day itself are kept.
first_available_day = datetime.date(2015, 5, 23)
on_or_after_cutoff = features['date'] >= first_available_day
features = features[on_or_after_cutoff]

In [52]:
features.shape

(5175, 23)

In [53]:
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5175 entries, 0 to 5174
Data columns (total 23 columns):
id               5175 non-null object
Datetime         5175 non-null object
text             5175 non-null object
sentiment        5175 non-null float64
subjectivity     5175 non-null float64
Unnamed: 0       5175 non-null int64
retweets         5175 non-null int64
favorites        5175 non-null int64
emoji            186 non-null object
isreply          5171 non-null object
replyto          3655 non-null object
origin           5171 non-null float64
keyword          5175 non-null bool
year             5175 non-null int64
month            5175 non-null int64
day              5175 non-null int64
date             5175 non-null object
DayofWeek        5175 non-null int64
Weekend          5175 non-null bool
Time             5175 non-null object
BinaryTrading    5175 non-null int64
DayDistance      5175 non-null int64
gtrend           5175 non-null int64
dtypes: bool(2), float64(3), in

In [54]:
features.shape

(5175, 23)

### Add sentiment features from the comments on Elon's tweets

In [55]:
# NOTE(review): h5py does not appear to be used in the visible cells, and
# pandas is already imported in the first cell; both imports are kept as-is.
import h5py
import pandas as pd

# Load the features derived from the comments on each tweet.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
comments_path = '../../data/features/2019_05_15_Eloncomments1hour.pickle'
with open(comments_path, "rb") as comments_file:
    elon_comments = pickle.load(comments_file)

In [56]:
elon_comments.shape

(6245, 15)

In [57]:
elon_comments.columns

Index(['retweets', 'favorites', 'CommentSentimental', 'CommentSubjectivity',
       'date', 'id', 'origin', 'countComment', 'posCommentSum',
       'negCommentSum', 'CommentSD', 'posCommentSD', 'negCommentSD',
       'negCommentPercent', 'posCommentPercent'],
      dtype='object')

In [58]:
# Preview the first rows only rather than rendering the whole frame.
elon_comments.head(10)

Unnamed: 0,retweets,favorites,CommentSentimental,CommentSubjectivity,date,id,origin,countComment,posCommentSum,negCommentSum,CommentSD,posCommentSD,negCommentSD,negCommentPercent,posCommentPercent
0,10513,120734,63.134057,229.326642,2019-04-14 23:31:00,1117571159195668480,1117563679099240449,1379,93.863996,-30.729939,0.205439,0.245963,0.208518,0.088470,0.207397
1,10513,120734,63.134057,229.326642,2019-04-14 23:10:00,1117565769829818368,1117563679099240449,1379,93.863996,-30.729939,0.205439,0.245963,0.208518,0.088470,0.207397
2,10513,120734,63.134057,229.326642,2019-04-14 23:06:00,1117564774190075904,1117563679099240449,1379,93.863996,-30.729939,0.205439,0.245963,0.208518,0.088470,0.207397
3,10513,120734,63.134057,229.326642,2019-04-14 23:02:00,1117563769159286784,1117563679099240449,1379,93.863996,-30.729939,0.205439,0.245963,0.208518,0.088470,0.207397
4,373,19413,30.295510,123.411165,2019-04-14 22:54:00,1117561885040283648,1117553530615648256,727,50.753336,-20.457826,0.211838,0.228215,0.216880,0.101788,0.210454
5,10513,120734,63.134057,229.326642,2019-04-14 23:01:00,1117563679099240449,1117563679099240449,1379,93.863996,-30.729939,0.205439,0.245963,0.208518,0.088470,0.207397
6,373,19407,30.470510,121.227832,2019-04-14 22:46:00,1117559770955812865,1117553530615648256,715,50.240836,-19.770326,0.211555,0.228942,0.212865,0.102098,0.209790
7,372,19383,27.652166,109.879184,2019-04-14 22:21:00,1117553530615648256,1117553530615648256,681,45.409991,-17.757826,0.205035,0.226954,0.205873,0.096916,0.198238
8,133,4825,21.880338,74.959134,2019-04-14 22:18:00,1117552682141532160,1117552423373918208,402,30.832074,-8.951736,0.204717,0.214156,0.161391,0.097015,0.213930
9,133,4823,21.680338,74.625800,2019-04-14 22:17:00,1117552423373918208,1117552423373918208,400,30.632074,-8.951736,0.205082,0.214723,0.161391,0.097500,0.212500


In [59]:
#elon_comments[elon_comments['id'] != elon_comments['origin']]

In [60]:
# These column names also exist in the tweet-level features table; dropping
# them here avoids clashing columns when the comment features are merged on id.
redundant_cols = ['retweets', 'favorites', 'date', 'origin']
elon_comments = elon_comments.drop(columns=redundant_cols)

In [61]:
features.shape

(5175, 23)

In [62]:
# Inner-join the comment-derived features onto the tweet table by tweet id.
features = features.merge(elon_comments, on='id', how='inner')

In [63]:
features.shape

(5175, 33)

In [64]:
# Preview the first rows only rather than rendering the whole frame.
features.head(10)

Unnamed: 0.1,id,Datetime,text,sentiment,subjectivity,Unnamed: 0,retweets,favorites,emoji,isreply,...,CommentSentimental,CommentSubjectivity,countComment,posCommentSum,negCommentSum,CommentSD,posCommentSD,negCommentSD,negCommentPercent,posCommentPercent
0,1117571159195668480,2019-04-14 23:31:00+00:00,starship will land on a ring of fire,0.000000,0.000000,0,107,4213,,True,...,63.134057,229.326642,1379,93.863996,-30.729939,0.205439,0.245963,0.208518,0.088470,0.207397
1,1117565769829818368,2019-04-14 23:10:00+00:00,6.5,0.000000,0.000000,1,83,4981,,True,...,63.134057,229.326642,1379,93.863996,-30.729939,0.205439,0.245963,0.208518,0.088470,0.207397
2,1117564774190075904,2019-04-14 23:06:00+00:00,yes,0.000000,0.000000,2,61,4041,,True,...,63.134057,229.326642,1379,93.863996,-30.729939,0.205439,0.245963,0.208518,0.088470,0.207397
3,1117563769159286784,2019-04-14 23:02:00+00:00,winter is coming,0.000000,0.000000,3,10739,117153,,False,...,63.134057,229.326642,1379,93.863996,-30.729939,0.205439,0.245963,0.208518,0.088470,0.207397
4,1117563679099240449,2019-04-14 23:01:00+00:00,thinking about adding giant stainless steel dragon wings to starship,0.100000,0.600000,4,12245,162803,,False,...,63.134057,229.326642,1379,93.863996,-30.729939,0.205439,0.245963,0.208518,0.088470,0.207397
5,1117561885040283648,2019-04-14 22:54:00+00:00,"that tweet did take immense effort fair point tho. powerwall production is now ramping fast. tesla was cell-starved last year, so we had to switch all lines to make packs for cars, which meant powerwall production was living off scraps.",0.183333,0.538889,5,131,5201,ðŸ¤£ ðŸ¤£,True,...,30.295510,123.411165,727,50.753336,-20.457826,0.211838,0.228215,0.216880,0.101788,0.210454
6,1117559770955812865,2019-04-14 22:46:00+00:00,deal,0.000000,0.000000,6,65,2181,,True,...,30.470510,121.227832,715,50.240836,-19.770326,0.211555,0.228942,0.212865,0.102098,0.209790
7,1117553530615648256,2019-04-14 22:21:00+00:00,please support my campaign to rebrand @ wsj as emoji!,0.000000,0.000000,7,4864,54282,ðŸ§¦,False,...,27.652166,109.879184,681,45.409991,-17.757826,0.205035,0.226954,0.205873,0.096916,0.198238
8,1117552423373918208,2019-04-14 22:17:00+00:00,"at some point, @ theonion and i should just a get a room â€¦",0.000000,0.000000,8,2073,42205,,False,...,21.680338,74.625800,400,30.632074,-8.951736,0.205082,0.214723,0.161391,0.097500,0.212500
9,1117551672039841793,2019-04-14 22:14:00+00:00,just use this handy guidehttps://www.theonion.com/the-onion-s-guide-to-blockchain-technology-1829819640 â€¦,0.600000,0.900000,9,177,1979,,True,...,2.121852,4.341574,36,2.295000,-0.173148,0.187566,0.312288,0.018987,0.055556,0.222222


In [65]:
#features[features['emoji'].isna() == False]

In [66]:
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5175 entries, 0 to 5174
Data columns (total 33 columns):
id                     5175 non-null object
Datetime               5175 non-null object
text                   5175 non-null object
sentiment              5175 non-null float64
subjectivity           5175 non-null float64
Unnamed: 0             5175 non-null int64
retweets               5175 non-null int64
favorites              5175 non-null int64
emoji                  186 non-null object
isreply                5173 non-null object
replyto                3657 non-null object
origin                 5173 non-null float64
keyword                5175 non-null bool
year                   5175 non-null int64
month                  5175 non-null int64
day                    5175 non-null int64
date                   5175 non-null object
DayofWeek              5175 non-null int64
Weekend                5175 non-null bool
Time                   5175 non-null object
BinaryTrading         

In [69]:
# Encode the reply flag as 0/1.
# isreply is an object column that contains a few missing values, and
# astype(int) raises on NaN/None, so the gaps are filled before casting.
# NOTE(review): a missing flag is treated as "not a reply" (0) -- confirm
# this matches how the gaps were handled in the original run.
features['isreply'] = features['isreply'].fillna(False).astype(int)

In [70]:
# Remove the leftover index column and the raw tweet metadata that are not
# kept in the final feature table.
unused_cols = ['Unnamed: 0', 'emoji', 'replyto', 'origin', 'retweets', 'favorites']
features = features.drop(columns=unused_cols)

In [71]:
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5175 entries, 0 to 5174
Data columns (total 27 columns):
id                     5175 non-null object
Datetime               5175 non-null object
text                   5175 non-null object
sentiment              5175 non-null float64
subjectivity           5175 non-null float64
isreply                5175 non-null int64
keyword                5175 non-null bool
year                   5175 non-null int64
month                  5175 non-null int64
day                    5175 non-null int64
date                   5175 non-null object
DayofWeek              5175 non-null int64
Weekend                5175 non-null bool
Time                   5175 non-null object
BinaryTrading          5175 non-null int64
DayDistance            5175 non-null int64
gtrend                 5175 non-null int64
CommentSentimental     5175 non-null float64
CommentSubjectivity    5175 non-null float64
countComment           5175 non-null int64
posCommentSum         

In [72]:
# Preview the first rows only rather than rendering the whole frame.
features.head(10)

Unnamed: 0,id,Datetime,text,sentiment,subjectivity,isreply,keyword,year,month,day,...,CommentSentimental,CommentSubjectivity,countComment,posCommentSum,negCommentSum,CommentSD,posCommentSD,negCommentSD,negCommentPercent,posCommentPercent
0,1117571159195668480,2019-04-14 23:31:00+00:00,starship will land on a ring of fire,0.000000,0.000000,1,False,2019,4,14,...,63.134057,229.326642,1379,93.863996,-30.729939,0.205439,0.245963,0.208518,0.088470,0.207397
1,1117565769829818368,2019-04-14 23:10:00+00:00,6.5,0.000000,0.000000,1,False,2019,4,14,...,63.134057,229.326642,1379,93.863996,-30.729939,0.205439,0.245963,0.208518,0.088470,0.207397
2,1117564774190075904,2019-04-14 23:06:00+00:00,yes,0.000000,0.000000,1,False,2019,4,14,...,63.134057,229.326642,1379,93.863996,-30.729939,0.205439,0.245963,0.208518,0.088470,0.207397
3,1117563769159286784,2019-04-14 23:02:00+00:00,winter is coming,0.000000,0.000000,0,False,2019,4,14,...,63.134057,229.326642,1379,93.863996,-30.729939,0.205439,0.245963,0.208518,0.088470,0.207397
4,1117563679099240449,2019-04-14 23:01:00+00:00,thinking about adding giant stainless steel dragon wings to starship,0.100000,0.600000,0,False,2019,4,14,...,63.134057,229.326642,1379,93.863996,-30.729939,0.205439,0.245963,0.208518,0.088470,0.207397
5,1117561885040283648,2019-04-14 22:54:00+00:00,"that tweet did take immense effort fair point tho. powerwall production is now ramping fast. tesla was cell-starved last year, so we had to switch all lines to make packs for cars, which meant powerwall production was living off scraps.",0.183333,0.538889,1,True,2019,4,14,...,30.295510,123.411165,727,50.753336,-20.457826,0.211838,0.228215,0.216880,0.101788,0.210454
6,1117559770955812865,2019-04-14 22:46:00+00:00,deal,0.000000,0.000000,1,False,2019,4,14,...,30.470510,121.227832,715,50.240836,-19.770326,0.211555,0.228942,0.212865,0.102098,0.209790
7,1117553530615648256,2019-04-14 22:21:00+00:00,please support my campaign to rebrand @ wsj as emoji!,0.000000,0.000000,0,False,2019,4,14,...,27.652166,109.879184,681,45.409991,-17.757826,0.205035,0.226954,0.205873,0.096916,0.198238
8,1117552423373918208,2019-04-14 22:17:00+00:00,"at some point, @ theonion and i should just a get a room â€¦",0.000000,0.000000,0,False,2019,4,14,...,21.680338,74.625800,400,30.632074,-8.951736,0.205082,0.214723,0.161391,0.097500,0.212500
9,1117551672039841793,2019-04-14 22:14:00+00:00,just use this handy guidehttps://www.theonion.com/the-onion-s-guide-to-blockchain-technology-1829819640 â€¦,0.600000,0.900000,1,False,2019,4,14,...,2.121852,4.341574,36,2.295000,-0.173148,0.187566,0.312288,0.018987,0.055556,0.222222


### Output the table containing all features

In [73]:
### Save the cleaned features table to a pickle file
# The highest available pickle protocol is used for the dump.
output_path = '../../data/features/2019_05_17_all_features.pickle'
with open(output_path, 'wb') as output_file:
    pickle.dump(features, output_file, protocol=pickle.HIGHEST_PROTOCOL)