In [1]:
import pandas as pd
import chardet
import re
import spacy
import multiprocessing

In [2]:
congress_tweets = pd.read_csv("politician_tweets.csv")
troll_tweets = pd.read_csv("troll_tweets.csv")
trump_tweets = pd.read_csv("trump_tweets.csv", encoding ='Windows-1252')

#### Subsetting and renaming columns, grouping by author

Congressional Tweets

In [3]:
congress_tweets = congress_tweets[['Handle', 'Tweet']]
congress_tweets.columns = ['author', 'text']
congress_tweets['account_category'] = 'politician'
print(congress_tweets.shape)
congress_tweets.head()

(67559, 3)


Unnamed: 0,author,text,account_category
0,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P...",politician
1,RepDarrenSoto,Hurricane Maria left approx $90 billion in dam...,politician
2,RepDarrenSoto,.@realDonaldTrump official policy to separate ...,politician
3,RepDarrenSoto,Thank you to my mom Jean and all the mothers a...,politician
4,RepDarrenSoto,We paid our respects at Nat’l Law Enforcement ...,politician


In [4]:
# Grouping by author
congress_tweets = congress_tweets.groupby('author')['text'].apply(' '.join).reset_index()

Trump Tweets

In [5]:
trump_tweets['author'] = 'realdonaldtrump'
trump_tweets['account_category'] = 'politician'
trump_tweets = trump_tweets.sample(n=200)
print(trump_tweets.shape)
trump_tweets.head()

(200, 3)


Unnamed: 0,text,author,account_category
174,James Comey is a proven LEAKER &amp; LIAR. Vir...,realdonaldtrump,politician
460,After years of rebuilding OTHER nations we are...,realdonaldtrump,politician
55,....great people of Montana will not stand for...,realdonaldtrump,politician
576,The Democrats are pushing for Universal Health...,realdonaldtrump,politician
79,.@JimRenacci has worked so hard on Tax Reducti...,realdonaldtrump,politician


In [6]:
trump_tweets = trump_tweets.groupby('author')['text'].apply(' '.join).reset_index()

Troll Tweets

In [4]:
troll_tweets = troll_tweets[['author', 'content', 'account_category']]
troll_tweets['account_category'] = 'troll'
troll_tweets.rename(columns = {'content': 'text'}, inplace = True)
print(troll_tweets.shape)
troll_tweets.head()

(457464, 3)


Unnamed: 0,author,text,account_category
0,10_GOP,"""We have a sitting Democrat US Senator on tria...",troll
1,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,troll
2,10_GOP,JUST IN: President Trump dedicates Presidents ...,troll
3,10_GOP,"Dan Bongino: ""Nobody trolls liberals better th...",troll
4,10_GOP,'@SenatorMenendez @CarmenYulinCruz Doesn't mat...,troll


In [95]:
troll_tweets = troll_tweets.groupby('author')['text'].apply(' '.join).reset_index()

In [96]:
troll_tweets['author'].nunique()

325

#### Merge Data Frame

In [6]:
labeled_tweets = pd.merge(congress_tweets, trump_tweets, how = 'outer')
labeled_tweets = pd.merge(labeled_tweets, troll_tweets, how = 'outer')
print(labeled_tweets.shape)

(525223, 3)


#### Dropping URLs from tweets

In [7]:
def drop_url(tweet):
    return re.sub(r'http\S+', '', tweet)

In [8]:
labeled_tweets['text'] = labeled_tweets['text'].apply(drop_url)

Stack Overflow

In [116]:
nlp = spacy.load('en')

tweets = ['This is a dummy tweet for stack overflow',
         'What do we do with generator objects?']

spacy_tweets = []
for tweet in tweets:
    doc_tweet = nlp.pipe(tweet, batch_size = 10, n_threads = 3)
    spacy_tweets.append(doc_tweet)
    
for tweet in spacy_tweets:
    print(tweet)
    
for tweet in spacy_tweets[0]:
    print(tweet)

<generator object Language.pipe at 0x0000023B65AD4410>
<generator object Language.pipe at 0x0000023B667C26D0>
T
h
i
s
 
i
s
 
a
 
d
u
m
m
y
 
t
w
e
e
t
 
f
o
r
 
s
t
a
c
k
 
o
v
e
r
f
l
o
w


#### Tokenize and process

In [36]:
nlp = spacy.load('en')

In [9]:
def log_progress(sequence, every=None, size=None, name='Items'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

In [16]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS
import string
punctuations = string.punctuation

In [49]:
def spacy_tokenizer(tweet):
    tweet = [tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_ for tok in tweet]
    tweet = [tok for tok in tweet if (tok not in stopwords and tok not in punctuations)] 
    return tweet

In [39]:
%%time
spacy_tweets = []
for tweet in log_progress(labeled_tweets.iloc[:10000,1], every = 1):
    doc_tweet = spacy_tokenizer(tweet)
    spacy_tweets.append(doc_tweet)

TypeError: Argument 'vocab' has incorrect type (expected spacy.vocab.Vocab, got generator)

In [101]:
%%time
# this creates a generator object which does not currently work with the lemmatizer

spacy_tweets = []
for tweet in log_progress(labeled_tweets.iloc[:1000,1], every = 1):
    doc_tweet = nlp.pipe(tweet, batch_size = 10, n_threads = 3)
    spacy_tweets.append(doc_tweet)

Wall time: 780 ms


In [89]:
%%time
spacy_tweets = []
for tweet in log_progress(labeled_tweets.iloc[:1000,1], every = 1):
    doc_tweet = nlp(tweet)
    spacy_tweets.append(doc_tweet)

Wall time: 12.1 s


Lemmatizing and removing stopwords

In [102]:
test_tweets = spacy_tweets[:10]

In [110]:
for tweet in test_tweets:
    print(tweet)

<generator object Language.pipe at 0x0000023B667C2048>
<generator object Language.pipe at 0x0000023B667A7EB8>
<generator object Language.pipe at 0x0000023B667A7E60>
<generator object Language.pipe at 0x0000023B667A7DB0>
<generator object Language.pipe at 0x0000023B667A7D58>
<generator object Language.pipe at 0x0000023B667A7D00>
<generator object Language.pipe at 0x0000023B667A7CA8>
<generator object Language.pipe at 0x0000023B667A7BF8>
<generator object Language.pipe at 0x0000023B667A7C50>
<generator object Language.pipe at 0x0000023B667A7BA0>


In [68]:
for tweet in test_tweets:
    print(spacy_tokenizer(tweet))

AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'lemma_'

In [105]:
print(test_tweets)

[<generator object Language.pipe at 0x0000023B667C2048>, <generator object Language.pipe at 0x0000023B667A7EB8>, <generator object Language.pipe at 0x0000023B667A7E60>, <generator object Language.pipe at 0x0000023B667A7DB0>, <generator object Language.pipe at 0x0000023B667A7D58>, <generator object Language.pipe at 0x0000023B667A7D00>, <generator object Language.pipe at 0x0000023B667A7CA8>, <generator object Language.pipe at 0x0000023B667A7BF8>, <generator object Language.pipe at 0x0000023B667A7C50>, <generator object Language.pipe at 0x0000023B667A7BA0>]
