In [39]:
import re
import string
import pickle
import pandas as pd
import numpy as np
import matplotlib.dates as md
import datetime as dt
from datetime import datetime
import matplotlib.pyplot as plt
from collections import Counter
from nltk.stem import PorterStemmer

from bokeh.palettes import Category10, brewer
from bokeh.plotting import *
from bokeh.models import *
from bokeh.plotting import figure, show, output_file
from bokeh.layouts import gridplot

In [2]:
def get_date_time(x):
    """
    Parse a timestamp string and truncate it to midnight of the same day.

    input:
        x    : str - timestamp formatted as '%m/%d/%Y %H:%M'
    output:
        date : datetime.datetime - same year/month/day as `x`, with the
               time-of-day components set to zero
    raises:
        ValueError if `x` does not match the expected format.
    """
    date_time = dt.datetime.strptime(x, '%m/%d/%Y %H:%M')
    # Only the calendar day is kept; the hour/minute part is discarded.
    return date_time.replace(hour=0, minute=0, second=0, microsecond=0)


def cleaning_plot(df, dataset):
    """
    Clean one IRA tweet file and keep only the relevant tweets.

    Kept rows: language is 'English', account_category is 'RightTroll' or
    'LeftTroll', and the (day-truncated) publish date is strictly later than
    2014-10-01. The input frame is not modified.

    input:
        df      - pd.DataFrame: Containing one of the IRA datasets. Must have
                  a 'language' column plus every column in RELEVANT_COL;
                  'publish_date' holds '%m/%d/%Y %H:%M' strings.
        dataset - int         : The IRA dataset ID
    output:
        df_tmp  - pd.DataFrame: Cleaned IRA dataset with the RELEVANT_COL
                  columns, 'publish_date' truncated to midnight, and an extra
                  'dataset' column holding `dataset`. The original row index
                  is preserved.
    """
    RELEVANT_COL = ['author', 'publish_date', 'account_category', 'content', 'following', 'followers', 'retweet']

    # We will only select tweets in english. This represents 77% of the datasets.
    is_english = df.language == 'English'
    # Keeping only right troll and left troll tweets. (57% datasets)
    is_troll = df.account_category.isin(['RightTroll', 'LeftTroll'])

    # Explicit copy so the column assignments below never write into a view of `df`.
    df_tmp = df.loc[is_english & is_troll, RELEVANT_COL].copy()

    # Vectorised parse, then truncate to midnight so tweets can be grouped per day.
    df_tmp['publish_date'] = pd.to_datetime(
        df_tmp['publish_date'], format='%m/%d/%Y %H:%M'
    ).dt.normalize()

    # Strict comparison on the truncated date: tweets from 2014-10-01 itself are excluded.
    df_tmp = df_tmp[df_tmp.publish_date > pd.Timestamp(2014, 10, 1)].copy()

    df_tmp['dataset'] = dataset  # from which dataset it comes

    return df_tmp

In [3]:
# Locations of the raw inputs, relative to the notebook.
DATA_FOLDER = 'Data/'
TWEET_DATA = 'russian-troll-tweets/IRAhandle_tweets_'
APPROVAL_DATA = 'approval_polllist.csv'
APPROVAL_POLLLIST = 'approval_polllist.csv'  # same file name as APPROVAL_DATA; both names kept
CAMPAIGN_POLLS = 'presidential_polls.csv'

# The dataset is composed of 9 sub_datasets, stored in files numbered 1..9.
# (The previous `range(1, 9)` stopped at file 8 and silently skipped the last one.)
N_TWEET_FILES = 9

# Raw approval poll list; cleaned in a later cell.
df = pd.read_csv(DATA_FOLDER + APPROVAL_DATA)

# Clean each file separately, then concatenate once: repeatedly appending to a
# growing frame copies all previous rows on every iteration, and
# DataFrame.append no longer exists in pandas 2.x.
tweet_frames = [
    cleaning_plot(pd.read_csv(DATA_FOLDER + TWEET_DATA + str(dataset) + '.csv'), dataset)
    for dataset in range(1, N_TWEET_FILES + 1)
]
tweet_data = pd.concat(tweet_frames, ignore_index=True)

In [4]:
# Columns of the approval poll list that are not used in this analysis.
UNRELEVANT = ['president', 'modeldate', 'startdate', 'enddate', 'url',
              'question_id', 'tracking', 'multiversions', 'timestamp']

# Optional (disabled): keep only well-graded pollsters by adding
#   .loc[lambda polls: polls.grade.isin(['A+', 'A', 'A-', 'B+'])]
# to the chain below.

# Built in a single chain from the raw frame `df`, so re-running this cell
# always gives the same result:
#   1. drop the unused columns,
#   2. parse 'createddate' ('%m/%d/%Y' strings) into datetimes in one vectorised call,
#   3. order the polls chronologically and renumber the rows.
approval_df = (
    df.drop(columns=UNRELEVANT)
      .assign(createddate=lambda polls: pd.to_datetime(polls['createddate'], format='%m/%d/%Y'))
      .sort_values(by=['createddate'])
      .reset_index(drop=True)
)
approval_df.head(10)

Unnamed: 0,subgroup,pollster,grade,samplesize,population,weight,influence,approve,disapprove,adjusted_approve,adjusted_disapprove,poll_id,createddate
0,All polls,Morning Consult,B-,1992.0,rv,0.946437,0.0,46.0,37.0,42.9784,39.17828,49249,2017-01-23
1,Voters,Morning Consult,B-,1992.0,rv,0.946437,0.0,46.0,37.0,44.03498,38.65438,49249,2017-01-23
2,Adults,Gallup,B,1500.0,a,0.245429,0.0,45.0,45.0,45.21144,43.51557,49253,2017-01-23
3,All polls,Gallup,B,1500.0,a,0.245429,0.0,45.0,45.0,46.03609,43.29538,49253,2017-01-23
4,All polls,Gallup,B,1500.0,a,0.226788,0.0,45.0,46.0,46.03609,44.29538,49262,2017-01-24
5,Adults,Gallup,B,1500.0,a,0.226788,0.0,45.0,46.0,45.21144,44.51557,49262,2017-01-24
6,All polls,Rasmussen Reports/Pulse Opinion Research,C+,1500.0,lv,0.22039,0.0,57.0,43.0,51.62358,43.74511,49266,2017-01-25
7,All polls,Gallup,B,1500.0,a,0.212047,0.0,46.0,45.0,47.03609,43.29538,49236,2017-01-25
8,Adults,Gallup,B,1500.0,a,0.212047,0.0,46.0,45.0,46.21144,43.51557,49236,2017-01-25
9,All polls,Public Policy Polling,B,1043.0,rv,1.165807,0.0,44.0,44.0,43.32141,44.38618,49237,2017-01-25


In [5]:
# Hand-picked events that coincide with spikes in tweet density.
# Each record is (date, event description, troll category that reacted).
_event_records = [
    ("2015-07-21", "Chattanooga shootings", "RightTroll"),
    ("2015-11-15", "Democrate Debate", "RightTroll"),
    ("2015-11-15", "Jamar Clark was shot by Minneapolis Police Department - BlackLivesMatter", "LeftTroll"),
    ("2016-03-22", "Brussels Bombings", "RightTroll"),
    ("2016-09-16", "CNN releases poll of polls: Hillary ahead by 2 points", "RightTroll"),
    ("2016-09-26", "Presidential Debate", "RightTroll"),
    ("2016-09-27", "Alfredo Olango Police shooting - Blacklivesmatter", "LeftTroll"),
    ("2016-10-04", "VP debate", "RightTroll"),
    ("2016-10-06", "Istanbul bombings", "RightTroll"),
    ("2016-10-07", "Grab her by the pussy", "LeftTroll"),
    ("2016-10-07", "Obama admin. says Russia hacked DNC", "RightTroll"),
    ("2016-10-07", "Wikileaks releases Clinton emails", "LeftTroll"),
    ("2016-11-08", "Election Day", "Both"),
    ("2016-11-09", "Trump Elected", "RightTroll"),
    ("2017-07-24", "Imran Awan scandal", "RightTroll"),
    ("2017-07-31", "Charlie Baker, GOP Gov is reelected as Massachusset gov", "RightTroll"),
    ("2017-08-03", "Leaked telephone conversations between Donald Trump and foreign leaders are leaked. ", "RightTroll"),
    ("2017-08-08", "North Korea Crisis", "RightTroll"),
    ("2017-08-11", "Charlottesville riot", "RightTroll"),
]

# Column-wise views of the records, kept under their original names.
events_date = [date for date, _, _ in _event_records]
events_list = [event for _, event, _ in _event_records]
events_cat = [category for _, _, category in _event_records]

events_df = pd.DataFrame({
    'Dates': events_date,
    'Events': events_list,
    'Category': events_cat,
})
events_df

Unnamed: 0,Dates,Events,Category
0,2015-07-21,Chattanooga shootings,RightTroll
1,2015-11-15,Democrate Debate,RightTroll
2,2015-11-15,Jamar Clark was shot by Minneapolis Police Dep...,LeftTroll
3,2016-03-22,Brussels Bombings,RightTroll
4,2016-09-16,CNN releases poll of polls: Hillary ahead by 2...,RightTroll
5,2016-09-26,Presidential Debate,RightTroll
6,2016-09-27,Alfredo Olango Police shooting - Blacklivesmatter,LeftTroll
7,2016-10-04,VP debate,RightTroll
8,2016-10-06,Istanbul bombings,RightTroll
9,2016-10-07,Grab her by the pussy,LeftTroll


In [6]:
# A hashtag token: '#' followed by letters, digits, '-', '.' or '_'.
_HASHTAG_RE = re.compile(r"#[A-Za-z0-9\-._]+")
# Translation table that deletes every ASCII punctuation character.
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)


def hashtag_extractor(text):
    """
    Extract the hashtags of a tweet, without '#' and without punctuation.

    The text is split on whitespace and only words that *start* with '#'
    are considered (a '#' in the middle of a word is ignored). All
    punctuation is removed from the match, so '#Stand_For-It' becomes
    'StandForIt'.

    input:
        text : str - tweet content; any non-string value (e.g. NaN for a
               missing tweet) yields an empty list
    output:
        hashtags : list of str - hashtags in order of appearance, case
                   preserved, duplicates kept; tags that are empty once the
                   punctuation is stripped (e.g. '#...') are skipped
    """
    if not isinstance(text, str):
        return []

    hashtags = []
    for word in text.split():
        found = _HASHTAG_RE.match(word)
        if not found:
            continue
        tag = found.group(0).translate(_STRIP_PUNCTUATION)
        if tag:
            hashtags.append(tag)
    return hashtags

In [7]:
# Apply the extractor to the 'content' column directly: a row-wise
# DataFrame.apply(axis=1) builds a Series object for every tweet, which is far
# slower, and it returns a DataFrame instead of a Series when the frame is empty.
tweet_data['hashtags'] = tweet_data['content'].apply(hashtag_extractor)
tweet_data.head()

Unnamed: 0,author,publish_date,account_category,content,following,followers,retweet,dataset,hashtags
0,10_GOP,2017-10-01,RightTroll,"""We have a sitting Democrat US Senator on tria...",1052,9636,0,1,[]
1,10_GOP,2017-10-01,RightTroll,Marshawn Lynch arrives to game in anti-Trump s...,1054,9637,0,1,[]
2,10_GOP,2017-10-01,RightTroll,Daughter of fallen Navy Sailor delivers powerf...,1054,9637,1,1,[BoycottNFL]
3,10_GOP,2017-10-01,RightTroll,JUST IN: President Trump dedicates Presidents ...,1062,9642,0,1,[]
4,10_GOP,2017-10-01,RightTroll,"19,000 RESPECTING our National Anthem! #StandF...",1050,9645,1,1,[StandForOurAnthem]


In [8]:
# ---------------------------------------------------------------------------
# Top panel: adjusted approval ratings
# ---------------------------------------------------------------------------
top_figure = figure(plot_width=850, plot_height=450, x_axis_type='datetime')
top_figure.title.text = 'Donald Trump Approval Rates - Ajusted'
top_figure.yaxis.axis_label = 'Approval Rates [%]'

# One row per poll, in the row order of approval_df. The mean and the 25%/75%
# quantiles are computed over a trailing window of up to 200 rows (polls), not
# over calendar days.
plot_df = pd.DataFrame(approval_df.adjusted_approve)
plot_df['date'] = approval_df.createddate

rolling_approval = plot_df.adjusted_approve.rolling(window=200, min_periods=1)
plot_df['lower'] = rolling_approval.quantile(0.25)
plot_df['mean'] = rolling_approval.mean()
plot_df['upper'] = rolling_approval.quantile(0.75)

approval_source = ColumnDataSource(plot_df)

top_figure.scatter(x='date', y='adjusted_approve', line_color=None, fill_alpha=0.1, size=5,
                   source=approval_source, legend='Poll Results')
mean_line = top_figure.line(x='date', y='mean', source=approval_source,
                            line_width=2, alpha=0.8, legend='Mean Approval Rates')
# Shaded band between the rolling lower and upper quantiles.
band = Band(base='date', lower='lower', upper='upper', source=approval_source, level='underlay',
            fill_alpha=0.5, line_width=1, line_color='black')
top_figure.add_layout(band)

top_figure.legend.location = 'top_left'
top_figure.legend.click_policy = 'hide'

# NOTE(review): the formatter keys below are bare column names; newer Bokeh
# releases document keys with the '@' prefix ('@date'). Confirm against the
# installed Bokeh version.
approval_hover = HoverTool(
    tooltips=[
        ('Date', '@date{%b %d, %Y}'),
        ('Mean Approval', '@mean %')],

    formatters={
        'date': 'datetime',  # use 'datetime' formatter for the 'date' field
        'mean': 'printf',    # use 'printf' formatter for the 'mean' field
    },

    # display a tooltip whenever the cursor is vertically in line with a glyph
    mode='vline',
    renderers=[mean_line],  # only the mean line triggers the tooltip
    attachment='above'
)

top_figure.tools.append(approval_hover)

# ---------------------------------------------------------------------------
# Bottom panel: number of tweets per day and per troll category
# ---------------------------------------------------------------------------
# Only tweets published on or after the first approval poll are shown.
plot_tweet = tweet_data[tweet_data.publish_date >= approval_df.createddate.min()]
bottom_figure = figure(plot_width=850, plot_height=450, x_axis_type='datetime')
bottom_figure.title.text = 'Tweet density'
bottom_figure.yaxis.axis_label = 'Number of tweets'

categories = plot_tweet.account_category.unique()
cat_color = ["tomato", "dodgerblue"]
# Colours are bound to the category name rather than to the order in which the
# categories happen to appear in the data.
category_colors = {"RightTroll": cat_color[0], "LeftTroll": cat_color[1]}

for category in categories:
    # The mask is built from plot_tweet itself so that it lines up with the
    # frame being filtered (a mask taken from the unfiltered tweet_data has a
    # different index).
    daily_counts = (
        plot_tweet.loc[plot_tweet.account_category == category, 'publish_date']
        .value_counts()
        .sort_index()
    )
    # Explicit field names: the naming of value_counts() output differs between
    # pandas versions, while the glyph and tooltips below rely on 'index'
    # (the day) and 'publish_date' (the number of tweets on that day).
    tweet_source = ColumnDataSource(data={
        'index': daily_counts.index,
        'publish_date': daily_counts.values,
    })
    bottom_figure.line(x='index', y='publish_date', source=tweet_source,
                       line_width=2, alpha=0.8, legend=category, color=category_colors[category])


bottom_figure.legend.location = 'top_left'
bottom_figure.legend.click_policy = 'hide'

tweet_hover = HoverTool(
    tooltips=[
        ('Date', '@index{%b %d, %Y}'),
        ('Number of tweets', '@publish_date')],

    formatters={
        'index': 'datetime',       # use 'datetime' formatter for the day
        'publish_date': 'printf',  # use 'printf' formatter for the tweet count
    },

    # display a tooltip whenever the cursor is vertically in line with a glyph
    mode='vline'
)

bottom_figure.tools.append(tweet_hover)

# Stack the two panels vertically and direct Bokeh output to the notebook.
p = gridplot([[top_figure], [bottom_figure]])
output_notebook()



In [9]:
# Display the grid layout `p` (approval panel above, tweet-density panel below).
show(p)

In [24]:
# Hashtag lists of every tweet, indexed by (publish day, troll category).
# NOTE: this rebinds `df`, which earlier held the raw approval poll list.
df = (
    tweet_data[['publish_date', 'account_category', 'hashtags']]
    .set_index(["publish_date", "account_category"])
)

In [36]:
# A hashtag is kept only if it is used MORE than this many times on a given
# day by a given account category (i.e. at least 4 times with the value 3).
# NOTE(review): the earlier comment said "discarding less than 3 appearance",
# which does not match the `> 3` test; the code's behaviour is preserved here.
MIN_DAILY_COUNT = 3

# One row per hashtag occurrence, lower-cased so that counting is
# case-insensitive. explode() turns each list element into its own row and
# leaves NaN for tweets with no hashtag, which dropna() removes.
hash_per_day = (
    df.hashtags
      .explode()
      .dropna()
      .str.lower()
      .to_frame('hashtags')
)

# Number of occurrences of each hashtag per day and per account category.
hash_per_day = (
    hash_per_day
    .groupby(["publish_date", "account_category", "hashtags"])
    .size()
    .to_frame('count')
)

hash_per_day = hash_per_day[hash_per_day['count'] > MIN_DAILY_COUNT]
hash_per_day.head()
