In [121]:
%matplotlib inline
import twitter, tweepy
import scattertext as st
import re, io, itertools
from pprint import pprint
import pandas as pd
import numpy as np
import spacy.en
import os, pkgutil, json, urllib, datetime, time, itertools
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
import warnings
# Suppress DeprecationWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=DeprecationWarning) 
# Inject a CSS rule that widens the notebook's `.container` element to 98% of the page.
display(HTML("<style>.container { width:98% !important; }</style>"))

In [122]:
# Twitter API credentials come from environment variables, never from the notebook
# itself. A missing variable raises KeyError on the corresponding line.
secret_key = os.environ['TWITTER_SECRET_KEY']
consumer_key = os.environ['TWITTER_CONSUMER_KEY']
access_token = os.environ['TWITTER_ACCESS_TOKEN']
token_secret = os.environ['TWITTER_TOKEN_SECRET']

In [123]:
# Build a python-twitter client from the four credentials loaded from the environment.
# NOTE(review): the name `api` is rebound to a tweepy client in the following cell, so
# this python-twitter client is no longer reachable once that cell has run.
api = twitter.Api(consumer_key=consumer_key,
                  consumer_secret=secret_key,
                  access_token_key=access_token,
                  access_token_secret=token_secret)

In [124]:
# Authenticate with tweepy: create the OAuth handler from the consumer key/secret,
# then attach the user's access token and token secret.
auth = tweepy.OAuthHandler(consumer_key, secret_key)
auth.set_access_token(access_token, token_secret)
# `api` is the module-level client that pull_statuses() reads as a global.
api = tweepy.API(auth)

In [127]:
def pull_statuses(screen_name, verbose=False, last_id = None):
    """Collect every timeline page tweepy yields for one account.

    Pages are requested through ``tweepy.Cursor`` over the module-level ``api``
    client's ``user_timeline`` endpoint.

    Parameters
    ----------
    screen_name : str
        Twitter handle whose timeline is fetched.
    verbose : bool, optional
        When true, print the number of statuses on each page as it arrives.
    last_id : optional
        Accepted for interface compatibility; the body does not use it.

    Returns
    -------
    list
        One entry per page, in the order the cursor produced them.
    """
    timeline_pages = tweepy.Cursor(api.user_timeline, screen_name=screen_name).pages()
    collected = []
    for batch in timeline_pages:
        if verbose:
            print(len(batch))
        collected.append(batch)
    return collected

In [128]:
# Fetch all timeline pages for the account under analysis through the tweepy client.
pages = pull_statuses('themaddimension')

In [None]:
# Flatten every status on every page into one record per tweet, using the raw JSON
# dict that each status object exposes as `_json`.
status_records = [status._json for page in pages for status in page]
if not status_records:
    # The per-page pd.concat this replaces also raised ValueError on empty input.
    raise ValueError('No statuses were returned; nothing to build status_df from')
# Building a single frame gives a unique 0..n-1 index. Concatenating one frame per
# page kept each page's own 0-based index, so index labels repeated across pages.
status_df = pd.DataFrame(status_records)
# Parse the timestamp strings so the later cells can compare them against dates.
status_df['created_at'] = pd.to_datetime(status_df['created_at'])

In [98]:
# Split the tweets at the start of 2016. The cutoff has to agree with the
# 'Before 2016' / '2016 and later' labels: the previous cutoff of 2015-10-01 filed
# tweets from October-December 2015 under '2016 and later'.
SEGMENT_CUTOFF = datetime.datetime(2016, 1, 1)
# Vectorised comparison; rows that do not compare as earlier than the cutoff
# (including missing timestamps) fall into '2016 and later'.
status_df['time_segment'] = np.where(status_df['created_at'] < SEGMENT_CUTOFF,
                                     'Before 2016',
                                     '2016 and later')

In [99]:
# Instantiate the English pipeline from the `spacy.en` module imported at the top.
# NOTE(review): `spacy.en.English()` looks like an older spaCy entry point -- confirm
# that the installed spaCy version still provides it.
nlp = spacy.en.English()
# Run the pipeline over each tweet's text and store the result in a 'parse' column.
status_df['parse'] = status_df['text'].apply(nlp)

In [100]:
# Build a Scattertext corpus from the parsed tweets: the 'time_segment' column
# supplies each document's category and the 'parse' column supplies the parsed text.
corpus = st.CorpusFromParsedDocuments(status_df, 
                                      category_col='time_segment', 
                                      parsed_col='parse').build()

In [104]:
# Drop every term containing '@' (e.g. user mentions) from the corpus.
# The term list is read from the index of the corpus's term-frequency table.
corpus = corpus.remove_terms([t for t in corpus.get_term_freq_df().index if '@' in t])

## Let's look at how the language in his tweets differed between the period before 2016 and the period from 2016 onward, when we started the Wes Bellamy recall drive
* Before 2016, he tweeted a lot about baseball, Gamergate, the science fair clock/bomb mix-up, atheism, and pro-Netanyahu issues
* He expressed very strong opinions, but nothing too radical

In [108]:
# Render an interactive Scattertext plot contrasting the 'Before 2016' tweets with
# the remaining tweets; each tweet's timestamp string is attached as metadata.
# NOTE(review): `pmi_filter_thresold` is spelled exactly as the original call passes
# it -- check the scattertext signature before "correcting" the spelling.
html = st.produce_scattertext_explorer(corpus,
                                       category='Before 2016',
                                       category_name='Before 2016',
                                       not_category_name='2016 and later',
                                       use_full_doc=True,
                                       minimum_term_frequency=5,
                                       pmi_filter_thresold=6,
                                       #minimum_not_category_term_frequency=10,
                                       width_in_pixels=1000,
                                       metadata=status_df['created_at'].astype(str),
                                       sort_by_dist=False)
file_name = 'other_jk.html'
# Write through a context manager so the handle is flushed and closed before the
# IFrame below loads the file; a bare open(...).write(...) never closes it explicitly.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
# Last expression of the cell: embeds the saved page in the notebook output.
IFrame(src=file_name, width = 1300, height=700)

## Before and after last Orioles Tweet and start of 2016
### It seems he stopped tweeting about baseball and moved on to other issues.  Let's examine what those issues were, and what he was talking about pre-radicalization.
* In the baseball era, he talked about baseball (Orioles, Royals, Baltimore, etc.), as well as animal rights
* After he stopped tweeting about baseball (Oct 12, 2014 was his last Orioles tweet), he began tweeting about standard alt-right topics along with topics about his writing
* "Women" tends to be used in tweets critical of the pay gap
* Issues around rape reporting, Gamergate
* He also begins talking about race.  The term "white" was only used in his post-baseball period, and occurs frequently and in racially charged contexts

In [120]:
def _era_label(created_at):
    """Map a tweet timestamp to its era label.

    Earlier than 2014-10-12 -> 'Orioles Era'; earlier than 2016-01-01 ->
    'Emerging Radical'; anything else -> 'Radical'.
    """
    if created_at < datetime.datetime(2014, 10, 12):
        return 'Orioles Era'
    if created_at < datetime.datetime(2016, 1, 1):
        return 'Emerging Radical'
    return 'Radical'


status_df['era'] = status_df.created_at.apply(_era_label)
# Only the two pre-2016 eras are compared in this plot.
pre_rad_status_df = status_df[status_df['era'].isin(['Orioles Era', 'Emerging Radical'])]
# Build the corpus with 'era' as the category, then take its stoplisted unigram view.
corpus = (st.CorpusFromParsedDocuments(pre_rad_status_df,
                                       category_col='era',
                                       parsed_col='parse')
          .build().get_stoplisted_unigram_corpus())
# Drop every term containing '@' (e.g. user mentions).
corpus = corpus.remove_terms([t for t in corpus.get_term_freq_df().index if '@' in t])
# NOTE(review): `pmi_filter_thresold` is spelled exactly as the original call passes
# it -- check the scattertext signature before "correcting" the spelling.
html = st.produce_scattertext_explorer(corpus,
                                       category='Orioles Era',
                                       category_name='Orioles Era',
                                       not_category_name='Emerging Radical',
                                       use_full_doc=True,
                                       minimum_term_frequency=2,
                                       pmi_filter_thresold=6,
                                       minimum_not_category_term_frequency=10,
                                       width_in_pixels=1000,
                                       metadata=pre_rad_status_df['created_at'].astype(str))
file_name = 'other_jk_pre_rad.html'
# Write through a context manager so the handle is flushed and closed before the
# IFrame below loads the file; a bare open(...).write(...) never closes it explicitly.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
# Last expression of the cell: embeds the saved page in the notebook output.
IFrame(src=file_name, width = 1300, height=700)