# (1) Twitter Data
## (1.1) Getting Twitter data 2021 from API

In [54]:
import pandas as pd
from path import Path
from twarc import Twarc2, expansions
import json
import datetime
import os

In [55]:
from config import bearer_token

In [56]:
client = Twarc2(bearer_token=bearer_token)

In [57]:
user = 'elonmusk'
posts_dict = {
    'date':[],
    'text':[],
    'like_count':[],
    'reply_count':[],
    'retweet_count':[]
}

In [58]:
# pull posts from Twitter and create a dictionary
user_timeline = client.timeline(user=user, exclude_replies=True, start_time=datetime.datetime(2021,1,1, 0, 0, 0) )
for page in user_timeline:
    result = expansions.flatten(page)
    for tweet in result:
        posts_dict['date'].append(tweet['created_at'])
        posts_dict['text'].append(tweet['text'])
        posts_dict['like_count'].append(tweet['public_metrics']['like_count'])
        posts_dict['reply_count'].append(tweet['public_metrics']['reply_count'])
        posts_dict['retweet_count'].append(tweet['public_metrics']['retweet_count'])

In [59]:
# convert dictionary of posts to dataframe
twitter_2021 = pd.DataFrame.from_dict(posts_dict)
twitter_2021.head()

Unnamed: 0,date,text,like_count,reply_count,retweet_count
0,2021-09-18T23:13:01.000Z,Congratulations @Inspiration4x!!!,111859,6234,7887
1,2021-09-18T23:12:22.000Z,"RT @SpaceX: Splashdown! Welcome back to planet Earth, @Inspiration4x! https://t.co/94yLjMBqWt",0,0,15456
2,2021-09-18T21:41:06.000Z,RT @SpaceX: Dragon has entered its last orbit before reentry and splashdown → https://t.co/bJFjLCzWdK https://t.co/rAeaXJLLIb,0,0,2079
3,2021-09-18T19:01:40.000Z,RT @SpaceX: Orbital moonrise https://t.co/vrx8Jzeu1t,0,0,4585
4,2021-09-18T04:54:38.000Z,"Moving at ~23 times speed of sound, circling Earth every ~90 minutes https://t.co/AncsjFpirC",99972,4382,9596


In [60]:
# convert date to datetime datatype
twitter_2021['date'] = pd.to_datetime(twitter_2021['date']).dt.date.astype('datetime64')
twitter_2021.tail()

Unnamed: 0,date,text,like_count,reply_count,retweet_count
542,2021-01-07,This is called the domino effect https://t.co/qpbEW54RvM,363374,4442,36994
543,2021-01-04,"Because of the large footprint, it may seem flat, but actually ranges up to 5 stories tall",57379,1368,1055
544,2021-01-04,Snow falling on Giga Berlin https://t.co/eTXMtYp8hG,147180,3609,6790
545,2021-01-02,"So proud of the Tesla team for achieving this major milestone! At the start of Tesla, I thought we had (optimistically) a 10% chance of surviving at all. https://t.co/xCqTL5TGlE",108925,4104,6157
546,2021-01-02,"RT @Tesla: In 2020, we produced and delivered half a million cars. Huge thanks to all those who made this possible.\nhttps://t.co/q43vz6RMhd",0,0,6175


## (1.2) Getting Twitter data 2011 - 2020 from archive

In [61]:
# load twitter data from csv file
file_to_load = os.path.join('Resources/Data', 'elon_musk_tweets_2011-2021.csv')
twitter_archive = pd.read_csv(file_to_load)
twitter_archive.head()

Unnamed: 0.1,Unnamed: 0,id,conversation_id,created_at,date,timezone,place,tweet,language,hashtags,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,0,1343644462036086785,1343320495127633920,1609185000000.0,2020-12-28 19:46:18,0,,Entertainment will be critical when cars drive themselves,en,[],...,,,,,,[],,,,
1,1,1343619610617077760,1343386617294295040,1609179000000.0,2020-12-28 18:07:33,0,,@kimpaquette Just meeting with Larry Ellison to seek some advice. Back working on Tesla end of quarter tomorrow.,en,[],...,,,,,,"[{'screen_name': 'kimpaquette', 'name': 'Kim Paquette 💫🦄', 'id': '258602317'}]",,,,
2,2,1343608616960491521,1343576442722893825,1609176000000.0,2020-12-28 17:23:51,0,,@richierichhhhh_ Absolutely,en,[],...,,,,,,"[{'screen_name': 'richierichhhhh_', 'name': 'Richard', 'id': '1154974451328405507'}]",,,,
3,3,1343608530998153222,1343320495127633920,1609176000000.0,2020-12-28 17:23:31,0,,What should Tesla do with in-car gaming in an autonomous world?,en,[],...,,,,,,[],,,,
4,4,1343431408052662273,1343043963096326147,1609134000000.0,2020-12-28 05:39:42,0,,@PPathole @WSJ Absolutely,en,[],...,,,,,,"[{'screen_name': 'PPathole', 'name': 'Pranay Pathole', 'id': '1291945442'}, {'screen_name': 'WSJ', 'name': 'The Wall Street Journal', 'id': '3108351'}]",,,,


In [62]:
# select and rename columns
twitter_archive_clean = twitter_archive[['date', 'tweet', 'nlikes', 'nreplies', 'nretweets']]\
                            .loc[(twitter_archive['reply_to'] == '[]') & (twitter_archive['retweet'] == False)]
twitter_archive_clean.columns=['date', 'text', 'like_count', 'reply_count', 'retweet_count']

# convert date to datetime datatype
twitter_archive_clean['date'] = pd.to_datetime(twitter_archive_clean['date']).dt.date.astype('datetime64')

# drop last row with 1 tweet in 2011
twitter_archive_clean.drop(twitter_archive_clean.tail(1).index,inplace=True)

twitter_archive_clean.head()

Unnamed: 0,date,text,like_count,reply_count,retweet_count
0,2020-12-28,Entertainment will be critical when cars drive themselves,55085,2922,2611
3,2020-12-28,What should Tesla do with in-car gaming in an autonomous world?,33830,6932,884
6,2020-12-27,Try playing Polytopia in your Tesla! Great game. Multiplayer online version coming soon.,148037,5355,4186
34,2020-12-25,"Change your horn sound to 🐐, 🐍🎷, 💨 or holiday jingles with latest Tesla software update!",187368,5373,6983
35,2020-12-25,Merry Christmas &amp; happy holidays! 🎁 https://t.co/uk6NSPwR9R,236833,7496,13288


## (1.3) Clean the twitter data

In [63]:
# concatenate 2 datasets to get tweets from 2011 to 2021
twitter_df_merged = pd.concat([twitter_2021, twitter_archive_clean])
twitter_df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4732 entries, 0 to 11715
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           4732 non-null   datetime64[ns]
 1   text           4732 non-null   object        
 2   like_count     4732 non-null   int64         
 3   reply_count    4732 non-null   int64         
 4   retweet_count  4732 non-null   int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 221.8+ KB


In [64]:
# Drop the NaNs
twitter_df_merged.dropna()

Unnamed: 0,date,text,like_count,reply_count,retweet_count
0,2021-09-18,Congratulations @Inspiration4x!!!,111859,6234,7887
1,2021-09-18,"RT @SpaceX: Splashdown! Welcome back to planet Earth, @Inspiration4x! https://t.co/94yLjMBqWt",0,0,15456
2,2021-09-18,RT @SpaceX: Dragon has entered its last orbit before reentry and splashdown → https://t.co/bJFjLCzWdK https://t.co/rAeaXJLLIb,0,0,2079
3,2021-09-18,RT @SpaceX: Orbital moonrise https://t.co/vrx8Jzeu1t,0,0,4585
4,2021-09-18,"Moving at ~23 times speed of sound, circling Earth every ~90 minutes https://t.co/AncsjFpirC",99972,4382,9596
...,...,...,...,...,...
11711,2011-12-04,Am reading a great biography of Ben Franklin by Isaacson. Highly recommended.,65,17,9
11712,2011-12-03,That was a total non sequitur btw,53,31,6
11713,2011-12-03,"Great Voltaire quote, arguably better than Twain. Hearing news of his own death, Voltaire replied the reports were true, only premature.",29,7,25
11714,2011-12-01,I made the volume on the Model S http://t.co/wMCnT53M go to 11. Now I just need to work in a miniature Stonehenge...,78,31,9


In [65]:
# export all tweets for analysis in Tableau
twitter_df_merged.to_csv('Resources/Data/tweets_data_2011_2021_ungrouped.csv', index=False)

## (1.4) Preprocessing the Twitter data

**Preprocess the data by making it all lowercase. Remove a reasonable set of stopwords from the dataset and tokenize. Then, report the 10 most common words and their count. We need to iterate this process, adding some stop words as we understand the structure of the data. Justify additional stop words we've added.**

In [66]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from datetime import datetime
from nltk.stem import PorterStemmer
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gobinaththangaiya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [67]:
# group tweets that posted at the same day
def f(x):
     return pd.Series(dict(like_count = x['like_count'].sum(),
                        reply_count = x['reply_count'].sum(),
                        retweet_count = x['retweet_count'].sum(),
                        text = "{%s}" % ', '.join(x['text'])))

In [68]:
twitter_df_merged = twitter_df_merged.groupby('date').apply(f).reset_index()
twitter_df_merged.head()


Unnamed: 0,date,like_count,reply_count,retweet_count,text
0,2011-12-01,267,63,24,"{I made the volume on the Model S http://t.co/wMCnT53M go to 11. Now I just need to work in a miniature Stonehenge..., Went to Iceland on Sat to ride bumper cars on ice! No, not the country, Vlad's rink in Van Nuys. Awesome family fun :) http://t.co/rBQXJ9IT}"
1,2011-12-03,82,38,31,"{That was a total non sequitur btw, Great Voltaire quote, arguably better than Twain. Hearing news of his own death, Voltaire replied the reports were true, only premature.}"
2,2011-12-04,65,17,9,{Am reading a great biography of Ben Franklin by Isaacson. Highly recommended.}
3,2011-12-21,1330,87,597,"{Yum! Even better than deep fried butter: http://t.co/Ody21NuD, Yeah, this really is me, as my Mom @mayemusk will attest. Not sure I can handle just doing 140 char missives. Will put longer thoughts on G+, Got called randomly by Kanye West today and received a download of his thoughts, ranging from shoes to Moses. He was polite, but opaque., His singing and acting talent will be sorely missed: http://t.co/IIFKob75 South Park sequel coming soon..., Why does the crowd cry over the glorious leader Kim Il Sung's death? Fear of being shot may play a role: http://t.co/hoQrYtG1, Sam Harris also wrote a nice piece on the awesomeness of Hitchens: http://t.co/fPkLiK3v May the good man RIP., Read ""Lying"", the new book by my friend Sam Harris. Excellent cover art and lots of good reasons not to lie!}"
4,2011-12-22,1349,132,206,"{Model S options are out! Performance in red and black for me. I will deliver my car in June/July. http://t.co/acnyP4nh, Hi, I'm Art Garfunkel. Have you heard the sound of silence? Because, you know, it makes a sound... http://t.co/7vgya9xL, Raul Campos invited me to do a guest DJ gig on KCRW. Hear my random holiday season music selections at http://t.co/o6FQASvC}"


In [69]:
twitter_df_merged.count()

date             1767
like_count       1767
reply_count      1767
retweet_count    1767
text             1767
dtype: int64

In [70]:
twitter_df = twitter_df_merged.copy()

In [71]:
# Data Pre-processing and make the tweets all lowercase and remove stopwords.
# lower the tweets
twitter_df['preprocessed_text'] = twitter_df['text'].str.lower()

# remove apostrophe from words and url
twitter_df['preprocessed_text'] = [re.sub("('[a-z]+)\s", " ", row) for row in twitter_df['preprocessed_text']]
twitter_df['preprocessed_text'] = [re.sub("(')\s", " ", row) for row in twitter_df['preprocessed_text']]
twitter_df['preprocessed_text'] = [re.sub("(?:https:\/\/\S+)\s", "", row) for row in twitter_df['preprocessed_text']]

                                      
# filter out rest URLs
url_re = '(?:https?:\/\/)?(?:[^?\/\s]+[?\/])(.*)'
twitter_df['preprocessed_text'] = twitter_df['preprocessed_text'].apply(lambda row: ' '.join([word for word in row.split() if (not re.match(url_re, word))]))

# tokenize the tweets
tokenizer = RegexpTokenizer('[a-zA-Z]\w+\'?\w*')
twitter_df['tokenized_text'] = twitter_df['preprocessed_text'].apply(lambda row: tokenizer.tokenize(row))

#create an object of class PorterStemmer
porter = PorterStemmer()

# apply stemming
twitter_df['preprocessed_text'] = [porter.stem(row) for row in twitter_df['preprocessed_text']]   

# filter out stop words
en_stop_words = nltk.corpus.stopwords.words('english')
additional_stop_words =['amp', 'rt', 'th','co', 're', 've', 'kim', 'daca', 'us', 'it', 'th', 'you', 'haha', 'st', 'et', 'so', 'iii', 'also', 've', 'la', 're', 'the', 'https', 'wow', 'actually', 'due', 'ft', 'pcr', 'via', 'am', 'gt', 'com', 'since', 'in', 'me', 'and', 'btw', 'yesterday', 'ii', 'inu', 'on', 'http', 'to', 'vs', 'rd', 'ur', 'of', 'bs', 'km', 'est', 'em', 'lz', 'kms', 'aft', 'nd',  'here’s', 're', 'mqxfakpzf' 'mph', 'ht', 'etc', 'dm', 'doo']
en_stop_words.extend(additional_stop_words)

twitter_df['tokenized_text'] = twitter_df['tokenized_text'].apply(lambda row: [word for word in row if (not word in en_stop_words)])

df_tweets_clean = twitter_df.copy()
df_tweets_clean.head()

Unnamed: 0,date,like_count,reply_count,retweet_count,text,preprocessed_text,tokenized_text
0,2011-12-01,267,63,24,"{I made the volume on the Model S http://t.co/wMCnT53M go to 11. Now I just need to work in a miniature Stonehenge..., Went to Iceland on Sat to ride bumper cars on ice! No, not the country, Vlad's rink in Van Nuys. Awesome family fun :) http://t.co/rBQXJ9IT}","{i made the volume on the model s go to 11. now i just need to work in a miniature stonehenge..., went to iceland on sat to ride bumper cars on ice! no, not the country, vlad rink in van nuys. awesome family fun :)","[made, volume, model, go, need, work, miniature, stonehenge, went, iceland, sat, ride, bumper, cars, ice, country, vlad, rink, van, nuys, awesome, family, fun]"
1,2011-12-03,82,38,31,"{That was a total non sequitur btw, Great Voltaire quote, arguably better than Twain. Hearing news of his own death, Voltaire replied the reports were true, only premature.}","{that was a total non sequitur btw, great voltaire quote, arguably better than twain. hearing news of his own death, voltaire replied the reports were true, only premature.}","[total, non, sequitur, great, voltaire, quote, arguably, better, twain, hearing, news, death, voltaire, replied, reports, true, premature]"
2,2011-12-04,65,17,9,{Am reading a great biography of Ben Franklin by Isaacson. Highly recommended.},{am reading a great biography of ben franklin by isaacson. highly recommended.},"[reading, great, biography, ben, franklin, isaacson, highly, recommended]"
3,2011-12-21,1330,87,597,"{Yum! Even better than deep fried butter: http://t.co/Ody21NuD, Yeah, this really is me, as my Mom @mayemusk will attest. Not sure I can handle just doing 140 char missives. Will put longer thoughts on G+, Got called randomly by Kanye West today and received a download of his thoughts, ranging from shoes to Moses. He was polite, but opaque., His singing and acting talent will be sorely missed: http://t.co/IIFKob75 South Park sequel coming soon..., Why does the crowd cry over the glorious leader Kim Il Sung's death? Fear of being shot may play a role: http://t.co/hoQrYtG1, Sam Harris also wrote a nice piece on the awesomeness of Hitchens: http://t.co/fPkLiK3v May the good man RIP., Read ""Lying"", the new book by my friend Sam Harris. Excellent cover art and lots of good reasons not to lie!}","{yum! even better than deep fried butter: yeah, this really is me, as my mom @mayemusk will attest. not sure i can handle just doing 140 char missives. will put longer thoughts on g+, got called randomly by kanye west today and received a download of his thoughts, ranging from shoes to moses. he was polite, but opaque., his singing and acting talent will be sorely missed: south park sequel coming soon..., why does the crowd cry over the glorious leader kim il sung fear of being shot may play a role: sam harris also wrote a nice piece on the awesomeness of hitchens: may the good man rip., read ""lying"", the new book by my friend sam harris. excellent cover art and lots of good reasons not to lie!}","[yum, even, better, deep, fried, butter, yeah, really, mom, mayemusk, attest, sure, handle, char, missives, put, longer, thoughts, got, called, randomly, kanye, west, today, received, download, thoughts, ranging, shoes, moses, polite, opaque, singing, acting, talent, sorely, missed, south, park, sequel, coming, soon, crowd, cry, glorious, leader, il, sung, fear, shot, may, play, role, sam, harris, wrote, nice, piece, awesomeness, hitchens, may, good, man, rip, read, lying, new, book, friend, sam, harris, excellent, cover, art, lots, good, reasons, lie]"
4,2011-12-22,1349,132,206,"{Model S options are out! Performance in red and black for me. I will deliver my car in June/July. http://t.co/acnyP4nh, Hi, I'm Art Garfunkel. Have you heard the sound of silence? Because, you know, it makes a sound... http://t.co/7vgya9xL, Raul Campos invited me to do a guest DJ gig on KCRW. Hear my random holiday season music selections at http://t.co/o6FQASvC}","{model s options are out! performance in red and black for me. i will deliver my car in hi, i art garfunkel. have you heard the sound of because, you know, it makes a sound... raul campos invited me to do a guest dj gig on kcrw. hear my random holiday season music selections at","[model, options, performance, red, black, deliver, car, hi, art, garfunkel, heard, sound, know, makes, sound, raul, campos, invited, guest, dj, gig, kcrw, hear, random, holiday, season, music, selections]"


In [72]:
df_tweets_clean = df_tweets_clean[['date', 'text', 'tokenized_text', 'like_count', 'reply_count', 'retweet_count']]
df_tweets_clean.head(10)

Unnamed: 0,date,text,tokenized_text,like_count,reply_count,retweet_count
0,2011-12-01,"{I made the volume on the Model S http://t.co/wMCnT53M go to 11. Now I just need to work in a miniature Stonehenge..., Went to Iceland on Sat to ride bumper cars on ice! No, not the country, Vlad's rink in Van Nuys. Awesome family fun :) http://t.co/rBQXJ9IT}","[made, volume, model, go, need, work, miniature, stonehenge, went, iceland, sat, ride, bumper, cars, ice, country, vlad, rink, van, nuys, awesome, family, fun]",267,63,24
1,2011-12-03,"{That was a total non sequitur btw, Great Voltaire quote, arguably better than Twain. Hearing news of his own death, Voltaire replied the reports were true, only premature.}","[total, non, sequitur, great, voltaire, quote, arguably, better, twain, hearing, news, death, voltaire, replied, reports, true, premature]",82,38,31
2,2011-12-04,{Am reading a great biography of Ben Franklin by Isaacson. Highly recommended.},"[reading, great, biography, ben, franklin, isaacson, highly, recommended]",65,17,9
3,2011-12-21,"{Yum! Even better than deep fried butter: http://t.co/Ody21NuD, Yeah, this really is me, as my Mom @mayemusk will attest. Not sure I can handle just doing 140 char missives. Will put longer thoughts on G+, Got called randomly by Kanye West today and received a download of his thoughts, ranging from shoes to Moses. He was polite, but opaque., His singing and acting talent will be sorely missed: http://t.co/IIFKob75 South Park sequel coming soon..., Why does the crowd cry over the glorious leader Kim Il Sung's death? Fear of being shot may play a role: http://t.co/hoQrYtG1, Sam Harris also wrote a nice piece on the awesomeness of Hitchens: http://t.co/fPkLiK3v May the good man RIP., Read ""Lying"", the new book by my friend Sam Harris. Excellent cover art and lots of good reasons not to lie!}","[yum, even, better, deep, fried, butter, yeah, really, mom, mayemusk, attest, sure, handle, char, missives, put, longer, thoughts, got, called, randomly, kanye, west, today, received, download, thoughts, ranging, shoes, moses, polite, opaque, singing, acting, talent, sorely, missed, south, park, sequel, coming, soon, crowd, cry, glorious, leader, il, sung, fear, shot, may, play, role, sam, harris, wrote, nice, piece, awesomeness, hitchens, may, good, man, rip, read, lying, new, book, friend, sam, harris, excellent, cover, art, lots, good, reasons, lie]",1330,87,597
4,2011-12-22,"{Model S options are out! Performance in red and black for me. I will deliver my car in June/July. http://t.co/acnyP4nh, Hi, I'm Art Garfunkel. Have you heard the sound of silence? Because, you know, it makes a sound... http://t.co/7vgya9xL, Raul Campos invited me to do a guest DJ gig on KCRW. Hear my random holiday season music selections at http://t.co/o6FQASvC}","[model, options, performance, red, black, deliver, car, hi, art, garfunkel, heard, sound, know, makes, sound, raul, campos, invited, guest, dj, gig, kcrw, hear, random, holiday, season, music, selections]",1349,132,206
5,2011-12-24,"{The Russians are having some challenges with their rockets. Too many of the engineers that designed them have retired: http://t.co/rEs7spSU, We had a long and interesting conversation on many subjects. He has exciting ideas for extending his creative talents beyond music., Kanye stopped by the SpaceX rocket factory today. http://t.co/6z7gHBn6}","[russians, challenges, rockets, many, engineers, designed, retired, long, interesting, conversation, many, subjects, exciting, ideas, extending, creative, talents, beyond, music, kanye, stopped, spacex, rocket, factory, today]",117113,1370,8434
6,2011-12-26,"{Walked around a neighborhood recently rebuilt with help from APJ and others http://t.co/KYHjsS1k, It was Xmas, so we brought presents for the kids at the orphanage. They don't usually get much. http://t.co/r8qfluIG, Met with UNICEF, Doctors Without Borders and Artists for Peace & Justice. I support them and would recommend others do too., Just returned from a trip to Haiti. Covered a lot of ground and saw many tough situations. They need a lot of help., Single character Tweets are the ulitmate extension of the Twitmeme..., I}","[walked, around, neighborhood, recently, rebuilt, help, apj, others, xmas, brought, presents, kids, orphanage, usually, get, much, met, unicef, doctors, without, borders, artists, peace, justice, support, would, recommend, others, returned, trip, haiti, covered, lot, ground, saw, many, tough, situations, need, lot, help, single, character, tweets, ulitmate, extension, twitmeme]",558,102,171
7,2011-12-27,{If you ever wanted to know the *real* truth about the moon landings ...(best Onion article ever) http://t.co/pgNEJsjI},"[ever, wanted, know, real, truth, moon, landings, best, onion, article, ever]",39,13,34
8,2011-12-28,{@TheOnion So true :)},"[theonion, true]",12,7,1
9,2011-12-29,"{Am not saying that is *necessarily* good or bad, but reality will force us to live with the consequences of our actions http://t.co/fnXmhUok, Interesting Economist article about how humanity's collective actions have created a fundamentally new geological age -- the Anthropocene., @om Cool personal essay. It really resonated with me, as I felt the same way after coming very close to dying from malaria ten years ago., @richardbranson Liked ""Screw Business as Usual"" a lot. This approach should be taken to heart by all, as it really is the smart move., @kanyewest Just returned from Haiti. For those who want to help, I recommend donating to MSF, UNICEF and Artists for Peace & Justice.}","[saying, necessarily, good, bad, reality, force, live, consequences, actions, interesting, economist, article, humanity, collective, actions, created, fundamentally, new, geological, age, anthropocene, om, cool, personal, essay, really, resonated, felt, way, coming, close, dying, malaria, ten, years, ago, richardbranson, liked, screw, business, usual, lot, approach, taken, heart, really, smart, move, kanyewest, returned, haiti, want, help, recommend, donating, msf, unicef, artists, peace, justice]",187,39,41


In [73]:
# count unique words
def get_most_freq_words(str, n=None):
    vect = CountVectorizer().fit(str)
    bag_of_words = vect.transform(str)
    sum_words = bag_of_words.sum(axis=0) 
    freq = [(word, sum_words[0, idx]) for word, idx in vect.vocabulary_.items()]
    freq =sorted(freq, key = lambda x: x[1], reverse=True)
    return freq[:n]
  
len(get_most_freq_words([ word for tweet in df_tweets_clean.tokenized_text for word in tweet]))

8253

In [74]:
df_tweets_clean.to_csv('Resources/data/tweets_data_2011_2021.csv', index=False)

## (1.5) Upload dataset to SQL Database

In [75]:
import sqlalchemy
from sqlalchemy import create_engine, inspect
import psycopg2
from config import db_password

In [76]:
# Create engine
# engine = create_engine(f'postgresql+psycopg2://{user}:{password}@{hostname}/twitter_vs_stocks')
engine = create_engine(f"postgresql://postgres:{db_password}@127.0.0.1:5432/twitter_vs_stocks")
# Use the Inspector to explore the database
inspector = inspect(engine)

In [77]:
df_tweets_clean.to_sql('tweets_text', engine, if_exists ='replace',method='multi', index=False)

# (2 ) Stock data

## (2.1) Getting the stock data

In [78]:
from yahoo_fin.stock_info import get_data

In [79]:
# historical daily data from Yahoo finance
tesla_df = get_data("tsla", start_date = '2011-01-01', end_date = None, index_as_date = False, interval="1d")
tesla_df

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker
0,2011-01-03,5.368000,5.400000,5.180000,5.324000,5.324000,6415000,TSLA
1,2011-01-04,5.332000,5.390000,5.204000,5.334000,5.334000,5937000,TSLA
2,2011-01-05,5.296000,5.380000,5.238000,5.366000,5.366000,7233500,TSLA
3,2011-01-06,5.366000,5.600000,5.362000,5.576000,5.576000,10306000,TSLA
4,2011-01-07,5.600000,5.716000,5.580000,5.648000,5.648000,11239500,TSLA
...,...,...,...,...,...,...,...,...
2691,2021-09-13,740.210022,744.780029,708.849976,743.000000,743.000000,22952500,TSLA
2692,2021-09-14,742.570007,754.469971,736.400024,744.489990,744.489990,18524900,TSLA
2693,2021-09-15,745.000000,756.859985,738.359985,755.830017,755.830017,15357700,TSLA
2694,2021-09-16,752.830017,758.909973,747.609985,756.989990,756.989990,13923400,TSLA


## (2.2) Clean the stock data

In [80]:
# Drop adjclose column
tesla_df = tesla_df.drop(columns=["adjclose", "ticker"])
tesla_df.head()

Unnamed: 0,date,open,high,low,close,volume
0,2011-01-03,5.368,5.4,5.18,5.324,6415000
1,2011-01-04,5.332,5.39,5.204,5.334,5937000
2,2011-01-05,5.296,5.38,5.238,5.366,7233500
3,2011-01-06,5.366,5.6,5.362,5.576,10306000
4,2011-01-07,5.6,5.716,5.58,5.648,11239500


In [81]:
# Determine data types for each column
tesla_df.dtypes

date      datetime64[ns]
open             float64
high             float64
low              float64
close            float64
volume             int64
dtype: object

## (2.3) Preprocessing the Stock Data

In [82]:
# Calculate change in stock price
tesla_df['change'] = tesla_df['close'].diff()
tesla_df.head(10)

Unnamed: 0,date,open,high,low,close,volume,change
0,2011-01-03,5.368,5.4,5.18,5.324,6415000,
1,2011-01-04,5.332,5.39,5.204,5.334,5937000,0.01
2,2011-01-05,5.296,5.38,5.238,5.366,7233500,0.032
3,2011-01-06,5.366,5.6,5.362,5.576,10306000,0.21
4,2011-01-07,5.6,5.716,5.58,5.648,11239500,0.072
5,2011-01-10,5.634,5.736,5.61,5.69,6713500,0.042
6,2011-01-11,5.718,5.742,5.384,5.392,8551000,-0.298
7,2011-01-12,5.402,5.48,5.304,5.392,4822000,0.0
8,2011-01-13,5.392,5.394,5.232,5.244,3618000,-0.148
9,2011-01-14,5.23,5.316,5.122,5.15,5960000,-0.094


In [84]:
tesla_df.to_csv('Resources/data/tesla_stocks.csv', index=False)

## (2.4) Upload dataset to SQL Database

In [85]:
tesla_df.to_sql('stock', engine, if_exists ='replace',method='multi', index=False)