# (1) Twitter Data
## (1.1) Getting Twitter data 2021 from API

In [23]:
import pandas as pd
from path import Path
from twarc import Twarc2, expansions
import json
import datetime
import os

In [24]:
from config import bearer_token

In [25]:
client = Twarc2(bearer_token=bearer_token)

In [26]:
user = 'elonmusk'
posts_dict = {
    'date':[],
    'text':[],
    'like_count':[],
    'reply_count':[],
    'retweet_count':[]
}

In [27]:
# pull posts from Twitter and create a dictionary
user_timeline = client.timeline(user=user, exclude_replies=True, start_time=datetime.datetime(2021,1,1, 0, 0, 0) )
for page in user_timeline:
    result = expansions.flatten(page)
    for tweet in result:
        posts_dict['date'].append(tweet['created_at'])
        posts_dict['text'].append(tweet['text'])
        posts_dict['like_count'].append(tweet['public_metrics']['like_count'])
        posts_dict['reply_count'].append(tweet['public_metrics']['reply_count'])
        posts_dict['retweet_count'].append(tweet['public_metrics']['retweet_count'])

In [28]:
# convert dictionary of posts to dataframe
twitter_2021 = pd.DataFrame.from_dict(posts_dict)
twitter_2021.head()

Unnamed: 0,date,text,like_count,reply_count,retweet_count
0,2021-09-04T04:04:40.000Z,RT @ErcXspace: Starbase 2023.\n\n#SpaceX #Star...,0,0,5595
1,2021-09-03T08:47:10.000Z,Time is the ultimate currency,430739,29370,64948
2,2021-09-02T00:37:51.000Z,"Looks promising that Beta 10.1, about 2 weeks ...",30841,3341,1499
3,2021-09-02T00:32:46.000Z,FSD Beta 10 rolls out midnight Friday next week,59586,4232,3618
4,2021-09-01T05:35:30.000Z,RT @SpaceX: Landing in the dark through clouds...,0,0,10182


In [29]:
# convert date to datetime datatype
twitter_2021['date'] = pd.to_datetime(twitter_2021['date']).dt.date.astype('datetime64')
twitter_2021.tail()

Unnamed: 0,date,text,like_count,reply_count,retweet_count
195,2021-05-20,How much is that Doge in the window? https://t...,309568,72696,54256
196,2021-05-20,Tesla Model S Plaid delivery event\nJune 3 at ...,199113,18368,14519
197,2021-05-19,Credit to our Master of Coin,174594,15780,13385
198,2021-05-19,Tesla has 💎 🙌,463150,71078,57292
199,2021-05-18,Aiming for extreme precision with next gen Mod...,95357,6358,4445


## (1.2) Getting Twitter data 2011 - 2020 from archive

In [30]:
# load twitter data from csv file
file_to_load = os.path.join('Data', 'elon_musk_tweets_2011-2021.csv')
twitter_archive = pd.read_csv(file_to_load)
twitter_archive.head()

Unnamed: 0.1,Unnamed: 0,id,conversation_id,created_at,date,timezone,place,tweet,language,hashtags,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,0,1343644462036086785,1343320495127633920,1609185000000.0,2020-12-28 19:46:18,0,,Entertainment will be critical when cars drive...,en,[],...,,,,,,[],,,,
1,1,1343619610617077760,1343386617294295040,1609179000000.0,2020-12-28 18:07:33,0,,@kimpaquette Just meeting with Larry Ellison t...,en,[],...,,,,,,"[{'screen_name': 'kimpaquette', 'name': 'Kim P...",,,,
2,2,1343608616960491521,1343576442722893825,1609176000000.0,2020-12-28 17:23:51,0,,@richierichhhhh_ Absolutely,en,[],...,,,,,,"[{'screen_name': 'richierichhhhh_', 'name': 'R...",,,,
3,3,1343608530998153222,1343320495127633920,1609176000000.0,2020-12-28 17:23:31,0,,What should Tesla do with in-car gaming in an ...,en,[],...,,,,,,[],,,,
4,4,1343431408052662273,1343043963096326147,1609134000000.0,2020-12-28 05:39:42,0,,@PPathole @WSJ Absolutely,en,[],...,,,,,,"[{'screen_name': 'PPathole', 'name': 'Pranay P...",,,,


In [31]:
# select and rename columns
twitter_archive_clean = twitter_archive[['date', 'tweet', 'nlikes', 'nreplies', 'nretweets']]\
                            .loc[(twitter_archive['reply_to'] == '[]') & (twitter_archive['retweet'] == False)]
twitter_archive_clean.columns=['date', 'text', 'like_count', 'reply_count', 'retweet_count']

# convert date to datetime datatype
twitter_archive_clean['date'] = pd.to_datetime(twitter_archive_clean['date']).dt.date.astype('datetime64')

# drop last row with 1 tweet in 2011
twitter_archive_clean.drop(twitter_archive_clean.tail(1).index,inplace=True)

twitter_archive_clean.head()

Unnamed: 0,date,text,like_count,reply_count,retweet_count
0,2020-12-28,Entertainment will be critical when cars drive...,55085,2922,2611
3,2020-12-28,What should Tesla do with in-car gaming in an ...,33830,6932,884
6,2020-12-27,Try playing Polytopia in your Tesla! Great gam...,148037,5355,4186
34,2020-12-25,"Change your horn sound to 🐐, 🐍🎷, 💨 or holiday ...",187368,5373,6983
35,2020-12-25,Merry Christmas &amp; happy holidays! 🎁 https...,236833,7496,13288


## (1.3) Clean the twitter data

In [32]:
# concatenate 2 datasets to get tweets from 2011 to 2021
twitter_df_merged = pd.concat([twitter_2021, twitter_archive_clean])
twitter_df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4385 entries, 0 to 11715
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           4385 non-null   datetime64[ns]
 1   text           4385 non-null   object        
 2   like_count     4385 non-null   int64         
 3   reply_count    4385 non-null   int64         
 4   retweet_count  4385 non-null   int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 205.5+ KB


In [33]:
# Drop the NaNs
twitter_df_merged.dropna()

Unnamed: 0,date,text,like_count,reply_count,retweet_count
0,2021-09-04,RT @ErcXspace: Starbase 2023.\n\n#SpaceX #Star...,0,0,5595
1,2021-09-03,Time is the ultimate currency,430739,29370,64948
2,2021-09-02,"Looks promising that Beta 10.1, about 2 weeks ...",30841,3341,1499
3,2021-09-02,FSD Beta 10 rolls out midnight Friday next week,59586,4232,3618
4,2021-09-01,RT @SpaceX: Landing in the dark through clouds...,0,0,10182
...,...,...,...,...,...
11711,2011-12-04,Am reading a great biography of Ben Franklin b...,65,17,9
11712,2011-12-03,That was a total non sequitur btw,53,31,6
11713,2011-12-03,"Great Voltaire quote, arguably better than Twa...",29,7,25
11714,2011-12-01,I made the volume on the Model S http://t.co/...,78,31,9


In [34]:
# export all tweets for analysis in Tableau
twitter_df_merged.to_csv('Data/tweets_data_2011_2021_ungrouped.csv', index=False)

## (1.4) Preprocessing the Twitter data

**Preprocess the data by making it all lowercase. Remove a reasonable set of stopwords from the dataset and tokenize. Then, report the 10 most common words and their count. We need to iterate this process, adding some stop words as we understand the structure of the data. Justify additional stop words we've added.**

In [35]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from datetime import datetime
from nltk.stem import PorterStemmer
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gobinaththangaiya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
# group tweets that posted at the same day
def f(x):
     return pd.Series(dict(like_count = x['like_count'].sum(),
                        reply_count = x['reply_count'].sum(),
                        retweet_count = x['retweet_count'].sum(),
                        text = "{%s}" % ', '.join(x['text'])))

In [37]:
twitter_df_merged = twitter_df_merged.groupby('date').apply(f).reset_index()
twitter_df_merged.head()


Unnamed: 0,date,like_count,reply_count,retweet_count,text
0,2011-12-01,267,63,24,{I made the volume on the Model S http://t.co...
1,2011-12-03,82,38,31,"{That was a total non sequitur btw, Great Volt..."
2,2011-12-04,65,17,9,{Am reading a great biography of Ben Franklin ...
3,2011-12-21,1330,87,597,{Yum! Even better than deep fried butter: htt...
4,2011-12-22,1349,132,206,{Model S options are out! Performance in red a...


In [38]:
twitter_df_merged.count()

date             1648
like_count       1648
reply_count      1648
retweet_count    1648
text             1648
dtype: int64

In [39]:
twitter_df = twitter_df_merged.copy()

In [40]:
# Data Pre-processing and make the tweets all lowercase and remove stopwords.
# lower the tweets
twitter_df['preprocessed_text'] = twitter_df['text'].str.lower()

# remove apostrophe from words and url
twitter_df['preprocessed_text'] = [re.sub("('[a-z]+)\s", " ", row) for row in twitter_df['preprocessed_text']]
twitter_df['preprocessed_text'] = [re.sub("(')\s", " ", row) for row in twitter_df['preprocessed_text']]
twitter_df['preprocessed_text'] = [re.sub("(?:https:\/\/\S+)\s", "", row) for row in twitter_df['preprocessed_text']]

                                      
# filter out rest URLs
url_re = '(?:https?:\/\/)?(?:[^?\/\s]+[?\/])(.*)'
twitter_df['preprocessed_text'] = twitter_df['preprocessed_text'].apply(lambda row: ' '.join([word for word in row.split() if (not re.match(url_re, word))]))

# tokenize the tweets
tokenizer = RegexpTokenizer('[a-zA-Z]\w+\'?\w*')
twitter_df['tokenized_text'] = twitter_df['preprocessed_text'].apply(lambda row: tokenizer.tokenize(row))

#create an object of class PorterStemmer
porter = PorterStemmer()

# apply stemming
twitter_df['preprocessed_text'] = [porter.stem(row) for row in twitter_df['preprocessed_text']]   

# filter out stop words
en_stop_words = nltk.corpus.stopwords.words('english')
additional_stop_words =['amp', 'rt', 'th','co', 're', 've', 'kim', 'daca', 'us', 'it', 'th', 'you', 'haha', 'st', 'et', 'so', 'iii', 'also', 've', 'la', 're', 'the', 'https', 'wow', 'actually', 'due', 'ft', 'pcr', 'via', 'am', 'gt', 'com', 'since', 'in', 'me', 'and', 'btw', 'yesterday', 'ii', 'inu', 'on', 'http', 'to', 'vs', 'rd', 'ur', 'of', 'bs', 'km', 'est', 'em', 'lz', 'kms', 'aft', 'nd',  'here’s', 're', 'mqxfakpzf' 'mph', 'ht', 'etc', 'dm', 'doo']
en_stop_words.extend(additional_stop_words)

twitter_df['tokenized_text'] = twitter_df['tokenized_text'].apply(lambda row: [word for word in row if (not word in en_stop_words)])

df_tweets_clean = twitter_df.copy()
df_tweets_clean.head()

Unnamed: 0,date,like_count,reply_count,retweet_count,text,preprocessed_text,tokenized_text
0,2011-12-01,267,63,24,{I made the volume on the Model S http://t.co...,{i made the volume on the model s go to 11. no...,"[made, volume, model, go, need, work, miniatur..."
1,2011-12-03,82,38,31,"{That was a total non sequitur btw, Great Volt...","{that was a total non sequitur btw, great volt...","[total, non, sequitur, great, voltaire, quote,..."
2,2011-12-04,65,17,9,{Am reading a great biography of Ben Franklin ...,{am reading a great biography of ben franklin ...,"[reading, great, biography, ben, franklin, isa..."
3,2011-12-21,1330,87,597,{Yum! Even better than deep fried butter: htt...,{yum! even better than deep fried butter: yeah...,"[yum, even, better, deep, fried, butter, yeah,..."
4,2011-12-22,1349,132,206,{Model S options are out! Performance in red a...,{model s options are out! performance in red a...,"[model, options, performance, red, black, deli..."


In [41]:
df_tweets_clean = df_tweets_clean[['date', 'text', 'tokenized_text', 'like_count', 'reply_count', 'retweet_count']]
df_tweets_clean.head(10)

Unnamed: 0,date,text,tokenized_text,like_count,reply_count,retweet_count
0,2011-12-01,{I made the volume on the Model S http://t.co...,"[made, volume, model, go, need, work, miniatur...",267,63,24
1,2011-12-03,"{That was a total non sequitur btw, Great Volt...","[total, non, sequitur, great, voltaire, quote,...",82,38,31
2,2011-12-04,{Am reading a great biography of Ben Franklin ...,"[reading, great, biography, ben, franklin, isa...",65,17,9
3,2011-12-21,{Yum! Even better than deep fried butter: htt...,"[yum, even, better, deep, fried, butter, yeah,...",1330,87,597
4,2011-12-22,{Model S options are out! Performance in red a...,"[model, options, performance, red, black, deli...",1349,132,206
5,2011-12-24,{The Russians are having some challenges with ...,"[russians, challenges, rockets, many, engineer...",117113,1370,8434
6,2011-12-26,{Walked around a neighborhood recently rebuilt...,"[walked, around, neighborhood, recently, rebui...",558,102,171
7,2011-12-27,{If you ever wanted to know the *real* truth a...,"[ever, wanted, know, real, truth, moon, landin...",39,13,34
8,2011-12-28,{@TheOnion So true :)},"[theonion, true]",12,7,1
9,2011-12-29,{Am not saying that is *necessarily* good or b...,"[saying, necessarily, good, bad, reality, forc...",187,39,41


In [42]:
# count unique words
def get_most_freq_words(str, n=None):
    vect = CountVectorizer().fit(str)
    bag_of_words = vect.transform(str)
    sum_words = bag_of_words.sum(axis=0) 
    freq = [(word, sum_words[0, idx]) for word, idx in vect.vocabulary_.items()]
    freq =sorted(freq, key = lambda x: x[1], reverse=True)
    return freq[:n]
  
len(get_most_freq_words([ word for tweet in df_tweets_clean.tokenized_text for word in tweet]))

7974

In [43]:
df_tweets_clean.to_csv('data/tweets_data_2011_2021.csv', index=False)

## (1.5) Upload dataset to SQL Database

In [44]:
import sqlalchemy
from sqlalchemy import create_engine, inspect
import psycopg2
from config import user, password, hostname

ImportError: cannot import name 'user' from 'config' (/Users/gobinaththangaiya/Documents/GitHub/Datalogy_Final/config.py)

In [None]:
# Create engine
engine = create_engine(f'postgresql+psycopg2://{user}:{password}@{hostname}/twitter_vs_stocks')

# Use the Inspector to explore the database
inspector = inspect(engine)

In [None]:
df_tweets_clean.to_sql('tweets_text', engine, if_exists ='replace',method='multi', index=False)