In [0]:
import nltk
# Using the following tutorial for sentiment analysis: https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk

Since this is a sentiment analysis project, I googled up a tutorial on how to do that using NLTK's tweets corpus, and will follow that for the initial model. The goal, ultimately, is to classify the tweets of Elon Musk and use a markov generator to create Markov-generated Elon Musk tweets.



In [0]:
# get twitter samples from nltk

nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [0]:
# group tweets into positive and negative

from nltk.corpus import twitter_samples
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')

In [0]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [0]:
# get tokens from tweets

from nltk.tag import pos_tag
from nltk.corpus import twitter_samples

tweet_tokens = twitter_samples.tokenized('positive_tweets.json')

In [0]:
# use nltk for lemmatization

from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence

In [0]:
import re, string

# function to clean tweet text data--removes @s and links
# also removes punctuation and stop words

def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [0]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
# get stop words

from nltk.corpus import stopwords
stop_words = stopwords.words('english')


['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']


In [0]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

#print(remove_noise(tweet_tokens[0], stop_words))

positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

In [0]:
# get most common positive words

def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

all_pos_words = get_all_words(positive_cleaned_tokens_list)

In [0]:
from nltk import FreqDist

freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(30))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253), ('u', 245), ('day', 242), ('like', 229), ('see', 195), ('happy', 192), ("i'm", 183), ('great', 175), ('hi', 173), ('go', 167), ('back', 163), ('know', 161), ('new', 147), ('make', 145), (':p', 139), ('<3', 135), ('one', 131), ('..', 129), ('time', 125), ('hope', 123), ('us', 115)]


In [0]:
def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

In [0]:
import random

positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

In [0]:
import pandas as pd
import numpy as np


In [0]:
td = pd.DataFrame(train_data)


Unnamed: 0,0,1
0,"{'yelaaaaaaa': True, ':(': True}",Negative
1,"{'check': True, 'new': True, 'van': True, 'out...",Positive
2,"{'every': True, 'night': True, 'take': True, '...",Negative
3,"{'yes': True, 'always': True, 'selfish': True,...",Positive
4,{':(': True},Negative


In [0]:
# Use naive bayes classifier

from nltk import classify
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0.996
Most Informative Features
                      :( = True           Negati : Positi =   2056.5 : 1.0
                      :) = True           Positi : Negati =   1662.7 : 1.0
                follower = True           Positi : Negati =     36.8 : 1.0
                     bam = True           Positi : Negati =     21.3 : 1.0
                     sad = True           Negati : Positi =     19.6 : 1.0
                  arrive = True           Positi : Negati =     18.8 : 1.0
               community = True           Positi : Negati =     15.2 : 1.0
                 welcome = True           Positi : Negati =     14.6 : 1.0
                    blog = True           Positi : Negati =     13.8 : 1.0
                    poor = True           Negati : Positi =     13.5 : 1.0
None


Let's test the model out by classifying Elon Musk's tweets. Elon Musk is a very interesting individual, so we'll start by classifying his tweets first (according to this NLTK model), and then use a markov chain generator to generate random tweets that look like Elon Musk may have written them.

In [0]:
# test on Elon Musk tweets

elon_tweets = pd.read_csv("https://www.dropbox.com/s/s9tp2lv32l95r0g/user_tweets.csv?raw=1", sep = ",")

In [0]:
elon_tweets.head()

Unnamed: 0,text,username,linktotweet,tweetembedcode,createdat
0,@highqualitysh1t I love the thought of a car d...,elonmusk,http://twitter.com/elonmusk/status/93704198630...,"<blockquote class=""twitter-tweet""><p lang=""en""...",2017-12-02T19:33:00
1,@novaspivack Asimov's Foundation books should ...,elonmusk,http://twitter.com/elonmusk/status/93709071522...,"<blockquote class=""twitter-tweet""><p lang=""en""...",2017-12-02T22:46:00
2,@novaspivack That's certainly the right way to...,elonmusk,http://twitter.com/elonmusk/status/93710961569...,"<blockquote class=""twitter-tweet""><p lang=""en""...",2017-12-03T00:01:00
3,To preserve the transcendent majesty &amp; spe...,elonmusk,http://twitter.com/elonmusk/status/93739733099...,"<blockquote class=""twitter-tweet""><p lang=""en""...",2017-12-03T19:05:00
4,@harrisonlingren @JW8888888 Busted,elonmusk,http://twitter.com/elonmusk/status/93739781363...,"<blockquote class=""twitter-tweet""><p lang=""en""...",2017-12-03T19:07:00


In [0]:
from nltk.tokenize import word_tokenize

In [0]:
# classify Elon's tweets

classifications = []
elon_tweets = elon_tweets.dropna()
for tweet in elon_tweets['text']:

  custom_tokens = remove_noise(word_tokenize(tweet))
  classifications.append(classifier.classify(dict([token, True] for token in custom_tokens)))

In [0]:
elon_tweets['class'] = classifications
elon_tweets.head()
neg_elon_tweets = elon_tweets[elon_tweets['class'] == 'Negative']
pos_elon_tweets = elon_tweets[elon_tweets['class'] == 'Positive']
print(neg_elon_tweets.shape)
print(pos_elon_tweets.shape)
elon_tweets.head()
del elon_tweets['linktotweet']
del elon_tweets['tweetembedcode']
del elon_tweets['username']


(3174, 3)
(3722, 3)


KeyError: ignored

In [0]:
elon_tweets.head(30)

Unnamed: 0,text,createdat,class
0,@highqualitysh1t I love the thought of a car d...,2017-12-02T19:33:00,Positive
1,@novaspivack Asimov's Foundation books should ...,2017-12-02T22:46:00,Positive
2,@novaspivack That's certainly the right way to...,2017-12-03T00:01:00,Negative
3,To preserve the transcendent majesty &amp; spe...,2017-12-03T19:05:00,Negative
4,@harrisonlingren @JW8888888 Busted,2017-12-03T19:07:00,Negative
5,@IvanEscobosa Yes,2017-12-03T19:07:00,Positive
6,Hat,2017-12-03T19:20:00,Negative
7,Every 5000th buyer of our boringly boring hat ...,2017-12-03T19:24:00,Positive
8,@TheRealUtkarsh Because it's stupid,2017-12-03T19:29:00,Negative
9,@maralkalajian Maybe,2017-12-03T19:28:00,Negative


Elon apparently has nearly as many negative tweets as positive tweets according to the NLTK tweet corpus. This is probably not that good of a model. It seems the NLTK twitter corpus was just not that good.

In [0]:
import spacy
!pip install markovify
import markovify

Collecting markovify
  Downloading https://files.pythonhosted.org/packages/de/c3/2e017f687e47e88eb9d8adf970527e2299fb566eba62112c2851ebb7ab93/markovify-0.8.0.tar.gz
Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 4.1MB/s 
[?25hBuilding wheels for collected packages: markovify
  Building wheel for markovify (setup.py) ... [?25l[?25hdone
  Created wheel for markovify: filename=markovify-0.8.0-cp36-none-any.whl size=10694 sha256=ca8a650180069eeb9ec4dd5df4ffa3f56752c6aaee8882909120a65e399f9bef
  Stored in directory: /root/.cache/pip/wheels/5d/a8/92/35e2df870ff15a65657679dca105d190ec3c854a9f75435e40
Successfully built markovify
Installing collected packages: unidecode, markovify
Successfully installed markovify-0.8.0 unidecode-1.1.1


In [0]:
nlp = spacy.load('en')

# below is necessary to avoid memory error of SpaCy
nlp.max_length = 20000000

# all the processing work is done below, so it may take a while
twitter_doc = nlp(" ".join(elon_tweets.text))

In [0]:
elon_negative_doc = nlp(" ".join(elon_tweets[elon_tweets["class"]=="Negative"].text))
elon_negative_sents = " ".join([sent.text for sent in elon_negative_doc.sents if len(sent.text) > 1])



In [0]:
elon_negative_generator = markovify.Text(elon_negative_sents, state_size = 3)

# three randomly generated negative sentences
for i in range(20):
    print(elon_negative_generator.make_sentence(tries=100))

# three randomly-generated negative sentences of no more than 100 characters
for i in range(20):
    print(elon_negative_generator.make_short_sentence(200, tries=100))

@kimbal you too @austinbarnard45 @flcnhvy @Joe__Wakefield @tjq1190 @tyger_cyber @fawfulfan @_Mikemo This is a significant policy difference in Italy vs most other countries.
@BrandonJHavard No @arctechinc @BrandonJHavard No @CarolineGee8 @NathanBomey @USATODAY @Tesla @mayemusk That was a mistake.
@PPathole Black &amp; white interior available only for Model 3 Performance https://t.co/Vejb9fTY5Q RT @SpaceX: Dragon is holding at the capture point ~10 meters from the @Space_Station.
We can solar power all of human civilization with a tiny % of the US prob helps convert some naysayers.
This is simple replacement of the Autopilot team has been working all weekend to resolve last minute issues.
@DMC_Ryan Can someone please do that!? I would def recommend this @williamwinters @austinhopperrrr @maysacha @thanr @JamesWorldSpace Exactly.
RT @CRcars: Consumer Reports tested the @Tesla Model 3 and 5-star safety ratings: name a more iconic duo.
@martinengwicht @Erdayastronaut @DiscoverMag Even conn

The sentiments in the Markov-generated Elon negative tweets do not seem to be negative *at all*. It seems that the NLTK twitter corpus is extremely lacking once the luxury of emoticons gets lost. It seems Elon Musk is generally a pretty positive guy.

In [0]:
elon_positive_doc = nlp(" ".join(elon_tweets[elon_tweets["class"]=="Positive"].text))
elon_positive_sents = " ".join([sent.text for sent in elon_positive_doc.sents if len(sent.text) > 1])


In [0]:
elon_positive_generator = markovify.Text(elon_positive_sents, state_size = 3)

# three randomly generated negative sentences
for i in range(20):
    print(elon_positive_generator.make_sentence(tries=100))

# three randomly-generated negative sentences of no more than 100 characters
for i in range(20):
    print(elon_positive_generator.make_short_sentence(200, tries=100))

@Kristennetten @mayapolarbear Technically, his bro RT @wonderofscience : This is what it would look a bit like a Mars simulator.
Really need to bring it to a Tesla service center. https://t.co/KCIFtliZr8 @ElectrekCo I was just a simple nucleotide, drifting alone in small crevice with 3 trillion siblings.
Essence of a good editor @lexiheft @Tesla Coming soon @CathieDWood Thank you for your trust in the @SpaceX team.
@Erdayastronaut @SpaceX Super proud of Tesla Autopilot team!
@NASA @SpaceX @Space_Station @Commercial_Crew Most likely, but this is an important clarification @ThePhoenixFlare @MKBHD @HyperChangeTV Yeah, news is actually super good.
It's amazing. https://t.co/eLqr4pLeIX @thanr Sure @aparanjape Prob early next year Just finished an engineering review with SpaceX Propulsion.
Weather is over 90% favorable for today's launch attempt - https://t.co/gtC39uBC7z RT @NASA: How many worlds exist outside our solar system?
Aiming to finish initial construction this summer, start Model 3

It seems the positive *and* negative tweets have about similar sentiments. There seems to be lots of talk about Elon's companies.