# (1) Twitter Data
## (1.1) Getting twitter data

In [1]:
import pandas as pd
from path import Path

from twarc import Twarc2, expansions
import json
import datetime
import pandas as pd
from pprint import pprint

In [2]:
from config import bearer_token

In [3]:
client = Twarc2(bearer_token=bearer_token)

In [4]:
user = 'elonmusk'
posts_dict = {
    'date':[],
    'text':[],
    'like_count':[],
    'quote_count':[],
    'reply_count':[],
    'retweet_count':[]
}

In [5]:
# pull posts from Twitter and create a dictionary
user_timeline = client.timeline(user=user, exclude_replies=True, start_time=datetime.datetime(2017,1,20, 0, 0, 0) )
for page in user_timeline:
    result = expansions.flatten(page)
    for tweet in result:
        posts_dict['date'].append(tweet['created_at'])
        posts_dict['text'].append(tweet['text'])
        posts_dict['like_count'].append(tweet['public_metrics']['like_count'])
        posts_dict['quote_count'].append(tweet['public_metrics']['quote_count'])
        posts_dict['reply_count'].append(tweet['public_metrics']['reply_count'])
        posts_dict['retweet_count'].append(tweet['public_metrics']['retweet_count'])

In [6]:
# convert dictionary of posts to dataframe
twitter_df = pd.DataFrame.from_dict(posts_dict)
twitter_df.head()
twitter_df.shape

(849, 6)

## (1.2) Clean the twitter data

In [7]:
# Drop the NaNs
twitter_df.dropna()

Unnamed: 0,date,text,like_count,quote_count,reply_count,retweet_count
0,2021-07-14T21:30:25.000Z,Some light reading with lil X https://t.co/MHj...,32561,208,3052,1750
1,2021-07-14T02:42:29.000Z,RT @Tesla: You can stream Netflix &amp; YouTub...,0,0,0,2624
2,2021-07-13T03:05:20.000Z,those who attack space\nmaybe don’t realize th...,239487,12754,30247,22302
3,2021-07-13T02:37:57.000Z,"Loki is pretty good. Basically, live-action @R...",131552,2791,7032,8884
4,2021-07-13T02:30:16.000Z,🤯 https://t.co/Z11qszTY4v,292920,1886,9360,21895
...,...,...,...,...,...,...
844,2020-06-21T06:19:41.000Z,"If heat death is the end of the universe, it r...",144421,896,3530,12620
845,2020-06-21T05:18:44.000Z,RT @cleantechnica: Exclusive Pro Photos: Tesla...,0,0,0,529
846,2020-06-21T00:31:25.000Z,RT @Tesla: https://t.co/26o1bAP14v,0,0,0,2167
847,2020-06-19T17:06:03.000Z,Juneteenth is henceforth considered a US holid...,402576,2590,5885,31266


In [8]:
# Determine data types for each column
twitter_df.dtypes

date             object
text             object
like_count        int64
quote_count       int64
reply_count       int64
retweet_count     int64
dtype: object

In [9]:
twitter_df.shape

(849, 6)

In [10]:
# Let's convert the date.
twitter_df['date'] = pd.to_datetime(twitter_df['date']).dt.date.astype('datetime64')
twitter_df.head()

Unnamed: 0,date,text,like_count,quote_count,reply_count,retweet_count
0,2021-07-14,Some light reading with lil X https://t.co/MHj...,32561,208,3052,1750
1,2021-07-14,RT @Tesla: You can stream Netflix &amp; YouTub...,0,0,0,2624
2,2021-07-13,those who attack space\nmaybe don’t realize th...,239487,12754,30247,22302
3,2021-07-13,"Loki is pretty good. Basically, live-action @R...",131552,2791,7032,8884
4,2021-07-13,🤯 https://t.co/Z11qszTY4v,292920,1886,9360,21895


In [11]:
from pandas import Series, DataFrame

def f(x):
     return Series(dict(like_count = x['like_count'].sum(),
                        quote_count = x['quote_count'].sum(),
                        reply_count = x['reply_count'].sum(),
                        retweet_count = x['retweet_count'].sum(),
                        text = "{%s}" % ', '.join(x['text'])))

In [12]:
twitter_df = twitter_df.groupby('date').apply(f).reset_index()
twitter_df.head()


Unnamed: 0,date,like_count,quote_count,reply_count,retweet_count,text
0,2020-06-18,0,0,0,1794,{RT @SpaceX: More than 100 spacecraft have bee...
1,2020-06-19,402576,2590,5885,31266,{Juneteenth is henceforth considered a US holi...
2,2020-06-21,735154,4879,17987,57393,{2019 seems so quaint &amp; long ago https://t...
3,2020-06-22,133437,892,5246,5438,{Tentative date for Tesla Shareholder Meeting ...
4,2020-06-25,259114,1039,4758,10807,{RT @GerberKawasaki: First thoughts driving my...


## (1.3) Preprocessing the Twitter data

**Preprocess the data by making it all lowercase. Remove a reasonable set of stopwords from the dataset and tokenize. Then, report the 10 most common words and their count. We need to iterate this process, adding some stop words as we understand the structure of the data. Justify additional stop words we've added.**

In [27]:
# Data Pre-processing and make the tweets all lowercase and remove stopwords.
from nltk.corpus import stopwords
en_stop_words = set(stopwords.words('english'))
list(en_stop_words)[:5]

['needn', 'before', 'own', 'weren', 'what']

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from datetime import datetime

import nltk
nltk.download('stopwords')

import pandas as pd
import re
import math

date_col='date'
tweet_col='text'
like_count= 'like_count'
quote_count= 'quote_count'
reply_count= 'reply_count'
retweet_count= 'retweet_count'


# lower the tweets
twitter_df['preprocessed_' + tweet_col] = twitter_df[tweet_col].str.lower()

# remove apostrophe from words
twitter_df.preprocessed_text = [re.sub("(‘[a-z])\s", "", row) for row in twitter_df.preprocessed_text] 
                                       
# filter out stop words and URLs
en_stop_words = set(stopwords.words('english'))
extended_stop_words = en_stop_words | \
                    {
                        '&amp;', 'rt',                            
                          'th','co', 're', 've', 'kim', 'daca', 'us', 'it', 'th', 'you', 'haha', 'st', 'et', 'so', 'iii',
                        'also', 've', 'la', 're', 'the', 'https', 'wow', 'actually', 'due', 'ft', 'pcr', 'via', 'am', 'gt',
                        'com', 'since', 'in', 'me', 'and', 'btw', 'yesterday', 'ii', 'inu', 'on', 'http', 'to', 'vs', 'rd', 
                        'ur', 'of', 'bs', 'km', 'est', 'em', 'lz', 'kms', 'aft', 'nd',  'here’s'
                    }
print(extended_stop_words)

url_re = '(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'        

twitter_df['preprocessed_' + tweet_col] = twitter_df['preprocessed_' + tweet_col].apply(lambda row: ' '.join([word for word in row.split() if (not word in extended_stop_words) and (not re.match(url_re, word))]))

# tokenize the tweets
tokenizer = RegexpTokenizer('[a-zA-Z]\w+\'?\w*')
twitter_df['tokenized_' + tweet_col] = twitter_df['preprocessed_' + tweet_col].apply(lambda row: tokenizer.tokenize(row))

df_tweets_clean = twitter_df
df_tweets_clean.head()

{'needn', 'before', 'own', 'weren', 'what', "didn't", 'for', "doesn't", 'the', "you'd", 'having', 'herself', 'during', 'will', 'km', 'was', 'mightn', "it's", 'than', 'as', "mustn't", 'into', 'also', 'to', 'until', 'actually', 'them', 'kim', 'from', 'after', 'daca', 'yesterday', 'further', 'rt', 'do', 'with', 'due', 'themselves', 'your', 'some', 'its', 'an', 'iii', 'yours', 'his', 'should', "couldn't", '&amp;', 'of', 'haha', 'th', 'bs', 'they', 'we', 'above', 'are', 'est', 'btw', 'which', 'don', 'has', 'co', 'and', "wouldn't", 'hers', 'kms', 'my', 'any', 'gt', 'not', 'ft', 'only', 'whom', 'o', "you'll", 'just', 'but', "won't", 'aft', 'each', 'no', 'again', 'yourselves', 'can', "you've", 'him', 'about', 'be', 'up', "haven't", 'where', 'now', 'her', 'how', "don't", 'out', 'vs', 'is', 'ain', 'i', 'd', 'wasn', 'myself', 'doing', 'while', 'lz', 'being', 'haven', 'shan', 'been', 'this', 'these', 'doesn', 'isn', "shouldn't", 'et', 'since', 'our', 'couldn', 'same', 't', 'didn', 'st', 'those', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zkirsan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,date,like_count,quote_count,reply_count,retweet_count,text,preprocessed_text,tokenized_text
0,2020-06-18,0,0,0,1794,{RT @SpaceX: More than 100 spacecraft have bee...,{rt @spacex: 100 spacecraft signed fly falcon ...,"[rt, spacex, spacecraft, signed, fly, falcon, ..."
1,2020-06-19,402576,2590,5885,31266,{Juneteenth is henceforth considered a US holi...,{juneteenth henceforth considered holiday tesl...,"[juneteenth, henceforth, considered, holiday, ..."
2,2020-06-21,735154,4879,17987,57393,{2019 seems so quaint &amp; long ago https://t...,"{2019 seems quaint long ago purpose, tesla bio...","[seems, quaint, long, ago, purpose, tesla, bio..."
3,2020-06-22,133437,892,5246,5438,{Tentative date for Tesla Shareholder Meeting ...,{tentative date tesla shareholder meeting batt...,"[tentative, date, tesla, shareholder, meeting,..."
4,2020-06-25,259114,1039,4758,10807,{RT @GerberKawasaki: First thoughts driving my...,{rt @gerberkawasaki: first thoughts driving ne...,"[rt, gerberkawasaki, first, thoughts, driving,..."


In [24]:
df_tweets_clean = df_tweets_clean[['date', 'text', 'preprocessed_text', 'tokenized_text', 'like_count', 'quote_count', 'reply_count', 'retweet_count']]
df_tweets_clean.head(10)

Unnamed: 0,date,text,preprocessed_text,tokenized_text,like_count,quote_count,reply_count,retweet_count
0,2020-06-18,{RT @SpaceX: More than 100 spacecraft have bee...,{rt @spacex: 100 spacecraft signed fly falcon ...,"[rt, spacex, spacecraft, signed, fly, falcon, ...",0,0,0,1794
1,2020-06-19,{Juneteenth is henceforth considered a US holi...,{juneteenth henceforth considered holiday tesl...,"[juneteenth, henceforth, considered, holiday, ...",402576,2590,5885,31266
2,2020-06-21,{2019 seems so quaint &amp; long ago https://t...,"{2019 seems quaint long ago purpose, tesla bio...","[seems, quaint, long, ago, purpose, tesla, bio...",735154,4879,17987,57393
3,2020-06-22,{Tentative date for Tesla Shareholder Meeting ...,{tentative date tesla shareholder meeting batt...,"[tentative, date, tesla, shareholder, meeting,...",133437,892,5246,5438
4,2020-06-25,{RT @GerberKawasaki: First thoughts driving my...,{rt @gerberkawasaki: first thoughts driving ne...,"[rt, gerberkawasaki, first, thoughts, driving,...",259114,1039,4758,10807
5,2020-06-26,{.@JeffBezos is a copy 🐈 haha https://t.co/plR...,"{.@jeffbezos copy 🐈 controls memes, controls u...","[jeffbezos, copy, controls, memes, controls, u...",2246288,24439,32931,300115
6,2020-06-28,"{Btw, Tesla actually receives *least* subsidie...","{btw, tesla receives *least* subsidies automak...","[btw, tesla, receives, least, subsidies, autom...",549422,5325,10659,55453
7,2020-06-30,{Your GPS just got slightly better https://t.c...,{your gps got slightly better @spacex: falcon ...,"[your, gps, got, slightly, better, spacex, fal...",156694,688,2610,21577
8,2020-07-01,{Tesla Impact Report (repost). We do everythin...,{tesla impact report (repost). everything huma...,"[tesla, impact, report, repost, everything, hu...",26862,208,1657,2530
9,2020-07-02,{Thanks Tesla owners &amp; investors! Love you...,{thanks tesla owners investors! love you!! wor...,"[thanks, tesla, owners, investors, love, you, ...",600565,11573,31287,39509


In [26]:
# the most common words and their count
def get_most_freq_words(str, n=None):
    vect = CountVectorizer().fit(str)
    bag_of_words = vect.transform(str)
    sum_words = bag_of_words.sum(axis=0) 
    freq = [(word, sum_words[0, idx]) for word, idx in vect.vocabulary_.items()]
    freq =sorted(freq, key = lambda x: x[1], reverse=True)
    return freq[:n]
  
get_most_freq_words([ word for tweet in df_tweets_clean.tokenized_text for word in tweet],10)

[('spacex', 176),
 ('tesla', 116),
 ('rt', 67),
 ('launch', 57),
 ('dragon', 56),
 ('falcon', 54),
 ('first', 44),
 ('crew', 37),
 ('nasa', 36),
 ('space_station', 30)]

In [None]:
df_tweets_clean.to_csv('data/tweets_data.csv', index=False)

# (2 ) Stock data

## (2.1) Getting the stock data

In [None]:
import requests
import pandas as pd
from yahoo_fin.stock_info import get_data

In [None]:
# historical daily data from Yahoo finance
tesla_df = get_data("tsla", start_date = None, end_date = None, index_as_date = False, interval="1d")
tesla_df

## (2.2) Clean the stock data

In [None]:
# Drop adjclose column
tesla_df = tesla_df.drop(columns=["adjclose", "ticker"])
tesla_df.head()

In [None]:
# Determine data types for each column
tesla_df.dtypes

## (2.3) Preprocessing the Stock Data

In [None]:
# Calculate change in stock price
tesla_df['change'] = tesla_df['close'].diff()
tesla_df.head(10)

In [None]:
tesla_df.to_csv('data/tesla_stocks', index=False)