# (1) Twitter Data
## (1.1) Getting twitter data

In [1]:
import pandas as pd
from path import Path

from twarc import Twarc2, expansions
import json
import datetime
import pandas as pd
from pprint import pprint

In [2]:
from config import bearer_token

In [3]:
client = Twarc2(bearer_token=bearer_token)

In [4]:
user = 'elonmusk'
posts_dict = {
    'date':[],
    'text':[],
    'like_count':[],
    'quote_count':[],
    'reply_count':[],
    'retweet_count':[]
}

In [5]:
# pull posts from Twitter and create a dictionary
user_timeline = client.timeline(user=user, exclude_replies=True, start_time=datetime.datetime(2017,1,20, 0, 0, 0) )
for page in user_timeline:
    result = expansions.flatten(page)
    for tweet in result:
        posts_dict['date'].append(tweet['created_at'])
        posts_dict['text'].append(tweet['text'])
        posts_dict['like_count'].append(tweet['public_metrics']['like_count'])
        posts_dict['quote_count'].append(tweet['public_metrics']['quote_count'])
        posts_dict['reply_count'].append(tweet['public_metrics']['reply_count'])
        posts_dict['retweet_count'].append(tweet['public_metrics']['retweet_count'])

In [6]:
# convert dictionary of posts to dataframe
twitter_df = pd.DataFrame.from_dict(posts_dict)
twitter_df.head()
twitter_df.shape

(849, 6)

## (1.2) Clean the twitter data

In [7]:
# Drop the NaNs
twitter_df.dropna()

Unnamed: 0,date,text,like_count,quote_count,reply_count,retweet_count
0,2021-07-14T23:35:39.000Z,Review of Model S Plaid by Dan Neil\nhttps://t...,29872,189,5195,2617
1,2021-07-14T21:30:25.000Z,Some light reading with lil X https://t.co/MHj...,105566,405,5502,4433
2,2021-07-14T02:42:29.000Z,RT @Tesla: You can stream Netflix &amp; YouTub...,0,0,0,2793
3,2021-07-13T03:05:20.000Z,those who attack space\nmaybe don’t realize th...,247850,13307,31170,22968
4,2021-07-13T02:37:57.000Z,"Loki is pretty good. Basically, live-action @R...",135134,2863,7204,9106
...,...,...,...,...,...,...
844,2020-06-21T07:03:08.000Z,Mars is my souldog,187069,911,4137,10615
845,2020-06-21T06:19:41.000Z,"If heat death is the end of the universe, it r...",144392,896,3530,12618
846,2020-06-21T05:18:44.000Z,RT @cleantechnica: Exclusive Pro Photos: Tesla...,0,0,0,529
847,2020-06-21T00:31:25.000Z,RT @Tesla: https://t.co/26o1bAP14v,0,0,0,2167


In [8]:
# Determine data types for each column
twitter_df.dtypes

date             object
text             object
like_count        int64
quote_count       int64
reply_count       int64
retweet_count     int64
dtype: object

In [9]:
twitter_df.shape

(849, 6)

In [10]:
# Let's convert the date.
twitter_df['date'] = pd.to_datetime(twitter_df['date']).dt.date.astype('datetime64')
twitter_df.head()

Unnamed: 0,date,text,like_count,quote_count,reply_count,retweet_count
0,2021-07-14,Review of Model S Plaid by Dan Neil\nhttps://t...,29872,189,5195,2617
1,2021-07-14,Some light reading with lil X https://t.co/MHj...,105566,405,5502,4433
2,2021-07-14,RT @Tesla: You can stream Netflix &amp; YouTub...,0,0,0,2793
3,2021-07-13,those who attack space\nmaybe don’t realize th...,247850,13307,31170,22968
4,2021-07-13,"Loki is pretty good. Basically, live-action @R...",135134,2863,7204,9106


In [11]:
from pandas import Series, DataFrame

def f(x):
     return Series(dict(like_count = x['like_count'].sum(),
                        quote_count = x['quote_count'].sum(),
                        reply_count = x['reply_count'].sum(),
                        retweet_count = x['retweet_count'].sum(),
                        text = "{%s}" % ', '.join(x['text'])))

In [12]:
twitter_df = twitter_df.groupby('date').apply(f).reset_index()
twitter_df.head()


Unnamed: 0,date,like_count,quote_count,reply_count,retweet_count,text
0,2020-06-19,402524,2590,5885,31264,{Juneteenth is henceforth considered a US holi...
1,2020-06-21,735049,4879,17986,57388,{2019 seems so quaint &amp; long ago https://t...
2,2020-06-22,133410,892,5246,5438,{Tentative date for Tesla Shareholder Meeting ...
3,2020-06-25,259070,1039,4758,10803,{RT @GerberKawasaki: First thoughts driving my...
4,2020-06-26,2246092,24437,32927,300068,{.@JeffBezos is a copy 🐈 haha https://t.co/plR...


## (1.3) Preprocessing the Twitter data

**Preprocess the data by making it all lowercase. Remove a reasonable set of stopwords from the dataset and tokenize. Then, report the 10 most common words and their count. We need to iterate this process, adding some stop words as we understand the structure of the data. Justify additional stop words we've added.**

In [13]:
# Data Pre-processing and make the tweets all lowercase and remove stopwords.
from nltk.corpus import stopwords
en_stop_words = set(stopwords.words('english'))
list(en_stop_words)[:5]

['weren', 'didn', 't', 'about', "hasn't"]

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from datetime import datetime
from nltk.stem import PorterStemmer

import nltk
nltk.download('stopwords')

import pandas as pd
import re
import math

date_col='date'
tweet_col='text'
like_count= 'like_count'
quote_count= 'quote_count'
reply_count= 'reply_count'
retweet_count= 'retweet_count'

# lower the tweets
twitter_df['preprocessed_' + tweet_col] = twitter_df[tweet_col].str.lower()

# remove apostrophe from words and curly braces
twitter_df['preprocessed_text'] = [re.sub("^{", "", row) for row in twitter_df['preprocessed_text']]
twitter_df['preprocessed_text'] = [re.sub("}\Z", "", row) for row in twitter_df['preprocessed_text']]
twitter_df['preprocessed_text'] = [re.sub("('[a-z])\s", "", row) for row in twitter_df['preprocessed_text']]

                                      
# filter out stop words and URLs
en_stop_words = set(stopwords.words('english'))
extended_stop_words = en_stop_words | \
                    {
                        '&amp;', 'rt',                            
                          'th','co', 're', 've', 'kim', 'daca', 'us', 'it', 'th', 'you', 'haha', 'st', 'et', 'so', 'iii',
                        'also', 've', 'la', 're', 'the', 'https', 'wow', 'actually', 'due', 'ft', 'pcr', 'via', 'am', 'gt',
                        'com', 'since', 'in', 'me', 'and', 'btw', 'yesterday', 'ii', 'inu', 'on', 'http', 'to', 'vs', 'rd', 
                        'ur', 'of', 'bs', 'km', 'est', 'em', 'lz', 'kms', 'aft', 'nd',  'here’s'
                    }
print(extended_stop_words)

url_re = '(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'        

twitter_df['preprocessed_' + tweet_col] = twitter_df['preprocessed_' + tweet_col].apply(lambda row: ' '.join([word for word in row.split() if (not word in extended_stop_words) and (not re.match(url_re, word))]))

# tokenize the tweets
tokenizer = RegexpTokenizer('[a-zA-Z]\w+\'?\w*')


#create an object of class PorterStemmer
porter = PorterStemmer()

# apply stemming
twitter_df['preprocessed_text'] = [porter.stem(row) for row in twitter_df['preprocessed_text']]   

twitter_df['tokenized_' + tweet_col] = twitter_df['preprocessed_' + tweet_col].apply(lambda row: tokenizer.tokenize(row))

df_tweets_clean = twitter_df
df_tweets_clean.head()

{'weren', 'didn', 't', 'about', "hasn't", 'be', 'wasn', 'since', 'or', 'btw', 'whom', "aren't", 'down', 'during', 'they', 'is', 'kms', 'est', 'her', 'just', 'doesn', 'bs', 'was', "that'll", 'vs', "you're", 'yourselves', 'ii', "should've", 'an', 'actually', 'hasn', 'not', 'so', 'herself', 'also', 'only', 'yesterday', 'won', 'you', 'hadn', 'there', 'most', 'mustn', 'don', 'shan', 'here’s', 'et', 'off', 'ain', 'all', "mustn't", 'above', 're', "wasn't", 'such', 'ourselves', 'nd', 'than', 'who', 'us', 'its', "it's", 'via', 'own', 'now', "isn't", 'some', 'ur', "won't", 'where', 'what', 'after', 'can', 'gt', 'here', 'over', 've', 'has', 'ours', 'it', 'these', 'through', 'this', 'my', "couldn't", 'your', 'daca', 'those', 'very', 'under', "you'd", 'am', 'to', "you've", "haven't", 'will', 'himself', 'does', 'myself', "wouldn't", 'rt', '&amp;', 'needn', 'yours', 'when', 'before', 'further', 'itself', 'were', "weren't", "shan't", 'that', 'com', 'and', 'against', 'inu', 'same', 'until', 'th', 'me',

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zkirsan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,date,like_count,quote_count,reply_count,retweet_count,text,preprocessed_text,tokenized_text
0,2020-06-19,402524,2590,5885,31264,{Juneteenth is henceforth considered a US holi...,juneteenth henceforth considered holiday tesla...,"[juneteenth, henceforth, considered, holiday, ..."
1,2020-06-21,735049,4879,17986,57388,{2019 seems so quaint &amp; long ago https://t...,"2019 seems quaint long ago purpose, tesla biow...","[seems, quaint, long, ago, purpose, tesla, bio..."
2,2020-06-22,133410,892,5246,5438,{Tentative date for Tesla Shareholder Meeting ...,tentative date tesla shareholder meeting batte...,"[tentative, date, tesla, shareholder, meeting,..."
3,2020-06-25,259070,1039,4758,10803,{RT @GerberKawasaki: First thoughts driving my...,@gerberkawasaki: first thoughts driving new te...,"[gerberkawasaki, first, thoughts, driving, new..."
4,2020-06-26,2246092,24437,32927,300068,{.@JeffBezos is a copy 🐈 haha https://t.co/plR...,".@jeffbezos copy 🐈 controls memes, controls un...","[jeffbezos, copy, controls, memes, controls, u..."


In [15]:
df_tweets_clean = df_tweets_clean[['date', 'text', 'preprocessed_text', 'tokenized_text', 'like_count', 'quote_count', 'reply_count', 'retweet_count']]
df_tweets_clean.head(10)

Unnamed: 0,date,text,preprocessed_text,tokenized_text,like_count,quote_count,reply_count,retweet_count
0,2020-06-19,{Juneteenth is henceforth considered a US holi...,juneteenth henceforth considered holiday tesla...,"[juneteenth, henceforth, considered, holiday, ...",402524,2590,5885,31264
1,2020-06-21,{2019 seems so quaint &amp; long ago https://t...,"2019 seems quaint long ago purpose, tesla biow...","[seems, quaint, long, ago, purpose, tesla, bio...",735049,4879,17986,57388
2,2020-06-22,{Tentative date for Tesla Shareholder Meeting ...,tentative date tesla shareholder meeting batte...,"[tentative, date, tesla, shareholder, meeting,...",133410,892,5246,5438
3,2020-06-25,{RT @GerberKawasaki: First thoughts driving my...,@gerberkawasaki: first thoughts driving new te...,"[gerberkawasaki, first, thoughts, driving, new...",259070,1039,4758,10803
4,2020-06-26,{.@JeffBezos is a copy 🐈 haha https://t.co/plR...,".@jeffbezos copy 🐈 controls memes, controls un...","[jeffbezos, copy, controls, memes, controls, u...",2246092,24437,32927,300068
5,2020-06-28,"{Btw, Tesla actually receives *least* subsidie...","btw, tesla receives *least* subsidies automake...","[btw, tesla, receives, least, subsidies, autom...",549375,5325,10657,55441
6,2020-06-30,{Your GPS just got slightly better https://t.c...,gps got slightly better @spacex: falcon 9’s fi...,"[gps, got, slightly, better, spacex, falcon, f...",156672,688,2610,21575
7,2020-07-01,{Tesla Impact Report (repost). We do everythin...,tesla impact report (repost). everything human...,"[tesla, impact, report, repost, everything, hu...",26857,208,1657,2530
8,2020-07-02,{Thanks Tesla owners &amp; investors! Love you...,thanks tesla owners investors! love you!! work...,"[thanks, tesla, owners, investors, love, you, ...",600485,11573,31284,39497
9,2020-07-04,{Please take a moment to report accounts clear...,please take moment report accounts clearly eng...,"[please, take, moment, report, accounts, clear...",377757,2674,15608,32526


In [16]:
# the most common words and their count
def get_most_freq_words(str, n=None):
    vect = CountVectorizer().fit(str)
    bag_of_words = vect.transform(str)
    sum_words = bag_of_words.sum(axis=0) 
    freq = [(word, sum_words[0, idx]) for word, idx in vect.vocabulary_.items()]
    freq =sorted(freq, key = lambda x: x[1], reverse=True)
    return freq[:n]
  
get_most_freq_words([ word for tweet in df_tweets_clean.tokenized_text for word in tweet],10)

[('spacex', 170),
 ('tesla', 114),
 ('launch', 56),
 ('dragon', 55),
 ('falcon', 53),
 ('first', 44),
 ('nasa', 36),
 ('crew', 35),
 ('model', 29),
 ('mission', 27)]

In [17]:
df_tweets_clean.to_csv('data/tweets_data.csv', index=False)

# (2 ) Stock data

## (2.1) Getting the stock data

In [18]:
import requests
import pandas as pd
from yahoo_fin.stock_info import get_data

In [19]:
# historical daily data from Yahoo finance
tesla_df = get_data("tsla", start_date = None, end_date = None, index_as_date = False, interval="1d")
tesla_df

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker
0,2010-06-29,3.800000,5.000000,3.508000,4.778000,4.778000,93831500,TSLA
1,2010-06-30,5.158000,6.084000,4.660000,4.766000,4.766000,85935500,TSLA
2,2010-07-01,5.000000,5.184000,4.054000,4.392000,4.392000,41094000,TSLA
3,2010-07-02,4.600000,4.620000,3.742000,3.840000,3.840000,25699000,TSLA
4,2010-07-06,4.000000,4.000000,3.166000,3.222000,3.222000,34334500,TSLA
...,...,...,...,...,...,...,...,...
2775,2021-07-08,628.369995,654.429993,620.460022,652.809998,652.809998,22773300,TSLA
2776,2021-07-09,653.179993,658.909973,644.690002,656.950012,656.950012,18118500,TSLA
2777,2021-07-12,662.200012,687.239990,662.159973,685.700012,685.700012,25927000,TSLA
2778,2021-07-13,686.320007,693.280029,666.299988,668.539978,668.539978,20847500,TSLA


## (2.2) Clean the stock data

In [20]:
# Drop adjclose column
tesla_df = tesla_df.drop(columns=["adjclose", "ticker"])
tesla_df.head()

Unnamed: 0,date,open,high,low,close,volume
0,2010-06-29,3.8,5.0,3.508,4.778,93831500
1,2010-06-30,5.158,6.084,4.66,4.766,85935500
2,2010-07-01,5.0,5.184,4.054,4.392,41094000
3,2010-07-02,4.6,4.62,3.742,3.84,25699000
4,2010-07-06,4.0,4.0,3.166,3.222,34334500


In [21]:
# Determine data types for each column
tesla_df.dtypes

date      datetime64[ns]
open             float64
high             float64
low              float64
close            float64
volume             int64
dtype: object

## (2.3) Preprocessing the Stock Data

In [22]:
# Calculate change in stock price
tesla_df['change'] = tesla_df['close'].diff()
tesla_df.head(10)

Unnamed: 0,date,open,high,low,close,volume,change
0,2010-06-29,3.8,5.0,3.508,4.778,93831500,
1,2010-06-30,5.158,6.084,4.66,4.766,85935500,-0.012
2,2010-07-01,5.0,5.184,4.054,4.392,41094000,-0.374
3,2010-07-02,4.6,4.62,3.742,3.84,25699000,-0.552
4,2010-07-06,4.0,4.0,3.166,3.222,34334500,-0.618
5,2010-07-07,3.28,3.326,2.996,3.16,34608500,-0.062
6,2010-07-08,3.228,3.504,3.114,3.492,38557000,0.332
7,2010-07-09,3.516,3.58,3.31,3.48,20253000,-0.012
8,2010-07-12,3.59,3.614,3.4,3.41,11012500,-0.07
9,2010-07-13,3.478,3.728,3.38,3.628,13400500,0.218


In [23]:
tesla_df.to_csv('data/tesla_stocks', index=False)