In [136]:
# Packages
import pandas as pd
import numpy as np
from collections import Counter
import nltk, spacy, re, json
from nltk.corpus import stopwords
#nltk.download("punkt")
#nltk.download('stopwords')

# Set up
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', -1)

In [137]:
filename="../../data/twitter/elon_clean.csv"

In [138]:
elon = pd.read_csv(filename)

In [139]:
elon.head()

Unnamed: 0,username,date,retweets,favorites,text,mentions,hashtags,permalink
0,elonmusk,2019-04-02 20:38,993,9263,Dogecoin value may vary https://www. theonion.com/bitcoin-plunge -reveals-possible-vulnerabilities-in-craz-1821134169 …,,,https://twitter.com/elonmusk/status/1113178951403180032
1,elonmusk,2019-04-02 20:16,7238,55271,Dogecoin rulz pic.twitter.com/flWWUgAgLU,,,https://twitter.com/elonmusk/status/1113173498384441344
2,elonmusk,2019-04-02 19:40,368,9159,Uh oh,,,https://twitter.com/elonmusk/status/1113164389929160706
3,elonmusk,2019-04-02 09:24,1317,6176,Dogecoin might be my fav cryptocurrency. It’s pretty cool.,,,https://twitter.com/elonmusk/status/1113009339743100929
4,elonmusk,2019-04-02 09:21,1557,26925,Yup https:// twitter.com/nasa/status/11 12860196043452417 …,,,https://twitter.com/elonmusk/status/1113008497006804992


In [140]:
tweets=elon['text'][0:10]

In [141]:
tweets

0    Dogecoin value may vary https://www. theonion.com/bitcoin-plunge -reveals-possible-vulnerabilities-in-craz-1821134169 …
1    Dogecoin rulz pic.twitter.com/flWWUgAgLU                                                                               
2    Uh oh                                                                                                                  
3    Dogecoin might be my fav cryptocurrency. It’s pretty cool.                                                             
4    Yup https:// twitter.com/nasa/status/11 12860196043452417 …                                                            
5    Some challenges with ice formation in the cryogenic propellant prevalves. Hopefully overcome soon.                     
6    What could possibly go wrong?                                                                                          
7    No                                                                                                                     


In [143]:
nltk_twitter_tokens=[]
for tweet in tweets:
    nltk_twitter_tokens.append(nltk.casual_tokenize(tweet))

In [144]:
### Remember to follow the order here: match from first to last
regexes=(
    # Keep usernames together (any token starting with @, followed by A-Z, a-z, 0-9)
    r"(?:@[\w_]+)",
    # Keep hashtags together (any token starting with #, followed by A-Z, a-z, 0-9, _, or -)
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",
    # Keep words with apostrophes, hyphens and underscores together
    r"(?:[a-z][a-z’'\-_]+[a-z])",
    # Keep all other sequences of A-Z, a-z, 0-9, _ together
    r"(?:[\w_]+)",
    # Everything else that's not whitespace
    r"(?:\S)")
big_regex="|".join(regexes)
extensible_tokenizer = re.compile(big_regex, re.VERBOSE | re.I | re.UNICODE)

def extensible_tokenize(text):
    return extensible_tokenizer.findall(text)

In [145]:
# Tokenize tweets data 
tweets_tokens=[]
for tweet in tweets:
    tokens=extensible_tokenize(tweet.lower())
    tweets_tokens.append(tokens)

In [146]:
tweets_tokens

[['dogecoin',
  'value',
  'may',
  'vary',
  'https',
  ':',
  '/',
  '/',
  'www',
  '.',
  'theonion',
  '.',
  'com',
  '/',
  'bitcoin-plunge',
  '-',
  'reveals-possible-vulnerabilities-in-craz',
  '-',
  '1821134169',
  '…'],
 ['dogecoin', 'rulz', 'pic', '.', 'twitter', '.', 'com', '/', 'flwwugaglu'],
 ['uh', 'oh'],
 ['dogecoin',
  'might',
  'be',
  'my',
  'fav',
  'cryptocurrency',
  '.',
  'it’s',
  'pretty',
  'cool',
  '.'],
 ['yup',
  'https',
  ':',
  '/',
  '/',
  'twitter',
  '.',
  'com',
  '/',
  'nasa',
  '/',
  'status',
  '/',
  '11',
  '12860196043452417',
  '…'],
 ['some',
  'challenges',
  'with',
  'ice',
  'formation',
  'in',
  'the',
  'cryogenic',
  'propellant',
  'prevalves',
  '.',
  'hopefully',
  'overcome',
  'soon',
  '.'],
 ['what', 'could', 'possibly', 'go', 'wrong', '?'],
 ['no'],
 ['you’re', 'so', 'right', '.', 'working', 'on', 'it', '!'],
 ['that',
  'car',
  'seemed',
  'so',
  'advanced',
  'when',
  'i',
  'watched',
  'that',
  'show',
  'a

In [147]:
# from nltk.stem import PorterStemmer
# ps = PorterStemmer()
# token=ps.stem(token)

## Extract Features

In [151]:
tweets_tokens_flat=[]
for tokens in tweets_tokens:
    for token in tokens:
        tweets_tokens_flat.append(token)

In [152]:
tweets_tokens_flat

['dogecoin',
 'value',
 'may',
 'vary',
 'https',
 ':',
 '/',
 '/',
 'www',
 '.',
 'theonion',
 '.',
 'com',
 '/',
 'bitcoin-plunge',
 '-',
 'reveals-possible-vulnerabilities-in-craz',
 '-',
 '1821134169',
 '…',
 'dogecoin',
 'rulz',
 'pic',
 '.',
 'twitter',
 '.',
 'com',
 '/',
 'flwwugaglu',
 'uh',
 'oh',
 'dogecoin',
 'might',
 'be',
 'my',
 'fav',
 'cryptocurrency',
 '.',
 'it’s',
 'pretty',
 'cool',
 '.',
 'yup',
 'https',
 ':',
 '/',
 '/',
 'twitter',
 '.',
 'com',
 '/',
 'nasa',
 '/',
 'status',
 '/',
 '11',
 '12860196043452417',
 '…',
 'some',
 'challenges',
 'with',
 'ice',
 'formation',
 'in',
 'the',
 'cryogenic',
 'propellant',
 'prevalves',
 '.',
 'hopefully',
 'overcome',
 'soon',
 '.',
 'what',
 'could',
 'possibly',
 'go',
 'wrong',
 '?',
 'no',
 'you’re',
 'so',
 'right',
 '.',
 'working',
 'on',
 'it',
 '!',
 'that',
 'car',
 'seemed',
 'so',
 'advanced',
 'when',
 'i',
 'watched',
 'that',
 'show',
 'as',
 'a',
 'kid',
 '!',
 'knight',
 'industries',
 'two',
 'thousa

In [153]:
# find most frequent words
Counter(tweets_tokens_flat).most_common(100)

[('.', 10),
 ('/', 9),
 ('dogecoin', 3),
 ('com', 3),
 ('https', 2),
 (':', 2),
 ('-', 2),
 ('…', 2),
 ('twitter', 2),
 ('so', 2),
 ('!', 2),
 ('that', 2),
 ('value', 1),
 ('may', 1),
 ('vary', 1),
 ('www', 1),
 ('theonion', 1),
 ('bitcoin-plunge', 1),
 ('reveals-possible-vulnerabilities-in-craz', 1),
 ('1821134169', 1),
 ('rulz', 1),
 ('pic', 1),
 ('flwwugaglu', 1),
 ('uh', 1),
 ('oh', 1),
 ('might', 1),
 ('be', 1),
 ('my', 1),
 ('fav', 1),
 ('cryptocurrency', 1),
 ('it’s', 1),
 ('pretty', 1),
 ('cool', 1),
 ('yup', 1),
 ('nasa', 1),
 ('status', 1),
 ('11', 1),
 ('12860196043452417', 1),
 ('some', 1),
 ('challenges', 1),
 ('with', 1),
 ('ice', 1),
 ('formation', 1),
 ('in', 1),
 ('the', 1),
 ('cryogenic', 1),
 ('propellant', 1),
 ('prevalves', 1),
 ('hopefully', 1),
 ('overcome', 1),
 ('soon', 1),
 ('what', 1),
 ('could', 1),
 ('possibly', 1),
 ('go', 1),
 ('wrong', 1),
 ('?', 1),
 ('no', 1),
 ('you’re', 1),
 ('right', 1),
 ('working', 1),
 ('on', 1),
 ('it', 1),
 ('car', 1),
 ('seeme

Sentiment Analysis

In [160]:
tweets_tokens_flat=[re.sub('[^a-zA-Z\ ]' ,' ',w) for w in tweets_tokens_flat]
[w for w in tweets_tokens_flat if w]

['dogecoin',
 'value',
 'may',
 'vary',
 'https',
 ' ',
 ' ',
 ' ',
 'www',
 ' ',
 'theonion',
 ' ',
 'com',
 ' ',
 'bitcoin plunge',
 ' ',
 'reveals possible vulnerabilities in craz',
 ' ',
 '          ',
 ' ',
 'dogecoin',
 'rulz',
 'pic',
 ' ',
 'twitter',
 ' ',
 'com',
 ' ',
 'flwwugaglu',
 'uh',
 'oh',
 'dogecoin',
 'might',
 'be',
 'my',
 'fav',
 'cryptocurrency',
 ' ',
 'it s',
 'pretty',
 'cool',
 ' ',
 'yup',
 'https',
 ' ',
 ' ',
 ' ',
 'twitter',
 ' ',
 'com',
 ' ',
 'nasa',
 ' ',
 'status',
 ' ',
 '  ',
 '                 ',
 ' ',
 'some',
 'challenges',
 'with',
 'ice',
 'formation',
 'in',
 'the',
 'cryogenic',
 'propellant',
 'prevalves',
 ' ',
 'hopefully',
 'overcome',
 'soon',
 ' ',
 'what',
 'could',
 'possibly',
 'go',
 'wrong',
 ' ',
 'no',
 'you re',
 'so',
 'right',
 ' ',
 'working',
 'on',
 'it',
 ' ',
 'that',
 'car',
 'seemed',
 'so',
 'advanced',
 'when',
 'i',
 'watched',
 'that',
 'show',
 'as',
 'a',
 'kid',
 ' ',
 'knight',
 'industries',
 'two',
 'thousa

In [133]:
# remove punctuation
tweets_tokens_flat=[re.sub('[^a-zA-Z ]' ,'',w) for w in tweets_tokens_flat]
tweets_tokens_flat=[w for w in tweets_tokens_flat if w]

In [134]:
# remove stopwords
tweets_tokens_flat = [w for w in tweets_tokens_flat if w not in stopwords.words("english")]

In [135]:
tweets_tokens_flat

['dogecoin',
 'value',
 'may',
 'vary',
 'https',
 'www',
 'theonion',
 'com',
 'bitcoinplunge',
 'revealspossiblevulnerabilitiesincraz',
 'dogecoin',
 'rulz',
 'pic',
 'twitter',
 'com',
 'flwwugaglu',
 'uh',
 'oh',
 'dogecoin',
 'might',
 'fav',
 'cryptocurrency',
 'pretty',
 'cool',
 'yup',
 'https',
 'twitter',
 'com',
 'nasa',
 'status',
 'challenges',
 'ice',
 'formation',
 'cryogenic',
 'propellant',
 'prevalves',
 'hopefully',
 'overcome',
 'soon',
 'could',
 'possibly',
 'go',
 'wrong',
 'youre',
 'right',
 'working',
 'car',
 'seemed',
 'advanced',
 'watched',
 'show',
 'kid',
 'knight',
 'industries',
 'two',
 'thousand']