In [42]:
import pandas as pd
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from operator import itemgetter
import collections
from datetime import datetime, date
# Directory (relative path) holding the IRAhandle_tweets_<n>.csv files read by the loading cell.
data = "russian-troll-tweets/"

In [None]:
# Loading data
# Read the nine CSV parts (IRAhandle_tweets_1.csv ... IRAhandle_tweets_9.csv) and
# concatenate them in a single call. DataFrame.append no longer exists in
# pandas >= 2.0, and appending inside a loop copies the growing frame each time.
all_data_df = pd.concat(
    [pd.read_csv(data + "IRAhandle_tweets_{}.csv".format(i)) for i in range(1, 10)],
    ignore_index=True,
)

# Cleaning data
# Drop every row that has at least one missing value.
all_data_df = all_data_df.dropna()
# Parse the timestamp strings, then keep only the calendar date so that tweets
# can later be grouped per day.
all_data_df["publish_date"] = pd.to_datetime(
    all_data_df["publish_date"], format='%m/%d/%Y %H:%M'
).dt.date

# Distinct account categories that remain after cleaning.
account_categories = all_data_df.account_category.unique()

In [None]:
# Preview the first rows of the loaded frame.
all_data_df.head()

In [None]:
## HELPER FUNCTIONS ##

def get_hashtags(df, category):
    """Collect every hashtag used in the tweets of one account category.

    A hashtag is any whitespace-separated token of the tweet text that starts
    with "#". Tokens are returned in order of appearance and are NOT
    de-duplicated, so the result can be fed straight into a frequency count.

    Parameters:
        df: DataFrame with `account_category` and `content` columns.
        category: value of `account_category` to select.

    Returns:
        list of hashtag strings (including the leading "#").
    """
    hashtags = []
    for text in df[df.account_category == category].content.values:
        # Tokenise on whitespace instead of matching the greedy pattern "#.* ":
        # that pattern required a space after the last hashtag, so a hashtag at
        # the very end of a tweet (the most common position) was silently lost.
        for token in text.split():
            if token.startswith("#"):
                hashtags.append(token)
    return hashtags

def get_tweets(category, with_hashtags=None, keywords=None, retweets_only=False):
    """Return the tweet texts of one account category, optionally filtered.

    Reads from the notebook-level `all_data_df` frame.

    Parameters:
        category: value of `account_category` to select.
        with_hashtags: optional iterable of strings; when given, only tweets
            containing at least one of them (substring match) are kept.
        keywords: optional iterable of strings; when given, only tweets
            containing at least one of them (substring match) are kept.
            Applied whether or not `with_hashtags` is given.
        retweets_only: when True, only rows with `retweet == 1` are considered.

    Returns:
        The `content` values of the selected rows when no text filter is
        given, otherwise a list of the tweets that passed the filter(s).
    """
    df = all_data_df
    if retweets_only:
        df = df[df.retweet == 1]
    tweets = df[df.account_category == category].content.values

    # The two filters are independent: previously `keywords` was silently
    # ignored unless `with_hashtags` was also supplied.
    if with_hashtags is not None:
        tweets = [t for t in tweets if any(tag in t for tag in with_hashtags)]
    if keywords is not None:
        tweets = [t for t in tweets if any(word in t for word in keywords)]
    return tweets

def frequencies(hashtags):
    """Count how often each item occurs in `hashtags`.

    Returns a plain dict mapping every distinct item to its number of
    occurrences as a float, with keys in order of first appearance.
    """
    counts = {}
    for tag in hashtags:
        if tag in counts:
            counts[tag] += 1.0
        else:
            counts[tag] = 1.0
    return counts

# Plot a word cloud from a {word: weight} mapping
def word_graph(freqs, title=None, save_path='wc_ing.png'):
    """Draw a word cloud for `freqs` on a new figure and optionally save it.

    Parameters:
        freqs: mapping of word -> frequency, passed to
            WordCloud.generate_from_frequencies.
        title: optional title shown above the word cloud.
        save_path: file the rendered cloud is written to with plt.imsave.
            Defaults to 'wc_ing.png' (the previously hard-coded name, which
            every call overwrote); pass a different path per plot to keep
            several images, or None to skip saving.
    """
    plt.figure(figsize=(10, 10))
    plt.title(title)
    wc = WordCloud(background_color='white', width=1000, height=500).generate_from_frequencies(freqs)
    plt.imshow(wc, interpolation='bilinear')
    if save_path is not None:
        plt.imsave(save_path, wc)
    # The axes carry no information for a word cloud, so hide them.
    plt.axis('off')
    
def hashtag_graph(df, category):
    """Show a word cloud of the hashtags used by accounts of `category` in `df`."""
    tag_counts = frequencies(get_hashtags(df, category))
    heading = "{} most used hashtags".format(category)
    word_graph(tag_counts, title=heading)
    
def histogram(df, title=None, category=None, size=(15,7), ax=None):
    """Area plot of the number of tweets per publish date.

    Parameters:
        df: DataFrame with `publish_date` and `content` columns (and
            `account_category` when `category` is given).
        title: optional plot title.
        category: when given, only rows of this `account_category` are counted.
        size: figure size forwarded to DataFrame.plot.
        ax: optional matplotlib Axes to draw on.

    Returns:
        The (handles, labels) pair from the Axes' get_legend_handles_labels().
    """
    if category is not None:
        df = df[df.account_category == category]
    # Select the two needed columns from `df` itself. The previous version
    # derived the columns to drop from the global `all_data_df`, which only
    # worked when `df` had exactly the same columns as that frame.
    daily_counts = (
        df[["publish_date", "content"]]
        .rename(columns={"content": "Amount of Tweets", "publish_date": "Time"})
        .groupby("Time")
        .count()
    )
    ax = daily_counts.plot(figsize=size, kind="area", title=title, ax=ax)
    return ax.get_legend_handles_labels()

In [None]:
# Complete histogram
# Tweets per publish date over the whole data set (no category filter).
_ = histogram(all_data_df, "All Data")  # returned legend handles/labels are not needed here
plt.show()

Observations: The IRA activity is focused between 2015 and 2017

In [None]:
# One tweets-per-day panel per account category, laid out two per row.
# The number of rows is derived from the number of categories rather than
# hard-coded, so the grid neither overflows nor needs editing when the set of
# categories changes.
n_cols = 2
n_categories = len(account_categories)
n_rows = max(1, (n_categories + n_cols - 1) // n_cols)  # ceiling division
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, squeeze=False)

# axes.flat walks the grid row by row, i.e. panel i sits at (i // 2, i % 2).
for ax, cat in zip(axes.flat, account_categories):
    histogram(all_data_df, cat, cat, (25, 15), ax=ax)
    # Per-panel legends are identical; a single figure-level legend is added below.
    ax.get_legend().remove()

# Hide any panel left over when the number of categories is odd.
for unused_ax in axes.flat[n_categories:]:
    unused_ax.set_visible(False)

handles, labels = axes[0, 0].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center')
plt.subplots_adjust(hspace=0.5)
plt.show()

In [None]:
# Word cloud of the hashtags used by accounts in the "RightTroll" category.
hashtag_graph(all_data_df, "RightTroll")

#MAGA: Make America Great Again
#tcot: Top Conservatives On Twitter
#PJNET: Patriot Journalist Network

Others: #NeverHillary, #ISIS, #WakeUpAmerica, #iceisis

In [None]:
# Word cloud of the hashtags used by accounts in the "LeftTroll" category.
hashtag_graph(all_data_df, "LeftTroll")

### Imitating real users
#NowPlaying was used a lot and often found with '#hiphop', '#RAPStationRadio', '#rap', '#music', '#power1044fm.com', '#HipHop', '#checkitout', '#EDM', '#Rap', '#HouseMusic', '#RnB', '#NewMusic', '#Reggae', '#ListenLive', '#HOuseMusic', '#Pop', '#TheIndieHour'

In [None]:
# Word cloud of the hashtags used by accounts in the "HashtagGamer" category.
hashtag_graph(all_data_df, "HashtagGamer")

In [None]:
# "HashtagGamer" rows flagged retweet == 1 whose text contains #MustBeBanned
# and at least one of the listed keywords.
hashtags = ["#MustBeBanned"]
keywords = ["Clinton", "Hillary", "Democrats", "Obama"]
tweets = get_tweets("HashtagGamer", with_hashtags=hashtags, keywords=keywords, retweets_only=True)
# Ten most repeated tweet texts with their counts.
collections.Counter(tweets).most_common(10)

In [None]:
# Tokens that `related` ignores: common function words, a few frequent filler
# words, and stray symbols. Stored as a frozenset so each membership test is
# O(1) instead of a linear scan of a list.
# NOTE(review): the last two entries are U+FFFD replacement characters,
# presumably mis-encoded symbols -- confirm the intended characters.
_RELATED_STOPWORDS = frozenset([
    "", "the", "-", "of", "is", "to", "a", "in", "for", "and", "The", "on", "you", "our", "not", "&", "with",
    "We", "are", "I", "be", "we", "your", "To", "will", "who", "In", "via", "that",
    "Is", "this", "it", "by", "THE", "their", "my", "1.", "2.", "all", "at", "Get", "get", "up",
    "out", "new", "some", "about", "This", "have", "was", "as", "from", "they", "has", "his", "You",
    "what", "he", "an", "but", "so", "if", "more", "do", "like", "just", "can", "how",
    "If", "A", "or", "should", "For", "no", "Of", "one", "With", "It's", "He", "I'm", "want", "And", "They", "when",
    "It", "My", "would", "US", "4", "back", "What", "us", "New", "going", "Great", "over", "time", "don't", "after",
    "know", "than", "think", "need", "take", "On", "because", "me", "Who", "��", "�",
])


def related(df, hashtag, category=None):
    """Count what appears alongside `hashtag` in the tweets of `df`.

    Every tweet whose text contains `hashtag` (substring match) is split on
    single spaces. Tokens that are not stop-words are sorted into three
    groups: hashtags (start with "#"), mentions (start with "@") and all
    other words. For each group the 30 most frequent tokens are printed,
    most frequent first.

    Parameters:
        df: DataFrame with a `content` column (and `account_category` when
            `category` is given).
        hashtag: text that a tweet must contain to be considered.
        category: when given, only rows of this `account_category` are used.

    Returns:
        Tuple (hashtag_counts, mention_counts, word_counts) of the dicts
        produced by `frequencies`.
    """
    hashtags = []
    people = []
    words = []

    if category is not None:
        df = df[df.account_category == category]

    for tweet in df.content.values:
        if hashtag not in tweet:
            continue
        for word in tweet.split(" "):
            # The empty string is itself a stop-word, so `word[0]` is always safe below.
            if word in _RELATED_STOPWORDS:
                continue
            if word[0] == "#":
                hashtags.append(word)
            elif word[0] == "@":
                people.append(word)
            else:
                words.append(word)

    all_ = [frequencies(hashtags), frequencies(people), frequencies(words)]
    for counter in all_:
        # Stable sort: tokens with equal counts keep their order of first appearance.
        print(sorted(counter, key=counter.get, reverse=True)[:30])
        print()
    return tuple(all_)

In [None]:
# Hashtags, mentions and other words that co-occur with "#NowPlaying" across all categories.
hashtags, people, words = related(all_data_df, "#NowPlaying")