In [265]:
#import nbpresent
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import PCA
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import string
from nltk.corpus import stopwords
import math
from collections import Counter
nltk.download('stopwords')   

from gensim.models import Word2Vec

[nltk_data] Downloading package stopwords to /Users/Jean-
[nltk_data]     BaptistePROST/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Preprocessing

In [3]:
def get_date(x):
    """from string to datatime.date()"""
    return dt.datetime.strptime(x, '%m/%d/%Y %H:%M').replace(minute=0, hour=0, second=0)

def cleaning_content(df,dataset,  right=True, left=True):
    df_tmp=df[df.language=='English'].drop(columns={'harvested_date', 'language'})
    df_tmp=df.loc[:,['publish_date','content', 'account_category']]

    #category
    if not left:
        df_tmp=df_tmp[(df_tmp.account_category=='RightTroll')]
    elif not right:
        df_tmp=df_tmp[(df_tmp.account_category=='LeftTroll')]
    else:        
        df_tmp=df_tmp[(df_tmp.account_category=='RightTroll') | (df_tmp.account_category=='LeftTroll')  ]

    #date 
    df_tmp['publish_date']=df_tmp.publish_date.apply(lambda x: get_date(x))
    df_tmp=df_tmp[df_tmp.publish_date > dt.datetime(2014,10,1,0,0,0)].drop(columns='publish_date') #selecting after Oct 2014

    df_tmp['dataset']=dataset #from which dataset it comes

    return df_tmp

In [4]:
data='Data/russian-troll-tweets/IRAhandle_tweets_'
    
df=pd.DataFrame()
for dataset in range(1,9):
    df_tmp=cleaning_content(pd.read_csv(data+str(dataset)+'.csv'), dataset)
    df=df.append(df_tmp, ignore_index=True)

In [171]:
df.head()

Unnamed: 0,content,account_category,dataset
0,"""We have a sitting Democrat US Senator on tria...",RightTroll,1
1,Marshawn Lynch arrives to game in anti-Trump s...,RightTroll,1
2,Daughter of fallen Navy Sailor delivers powerf...,RightTroll,1
3,JUST IN: President Trump dedicates Presidents ...,RightTroll,1
4,"19,000 RESPECTING our National Anthem! #StandF...",RightTroll,1


In [157]:
link_numbers=('http', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-')
stemmer = PorterStemmer()
stop_w=[word.replace('\'','') for word in stopwords.words('english')]+ ['', '&amp', 'amp','rt']

def tokenize(text): #__call__
    text_cleaned = ''.join(ch for ch in text if ch not in '#!"$%&\()*+,./:;<=>?@[\\]^_{|}~\'').split(' ') 
    
    words= [word.lower().encode('ascii',errors='ignore').decode() for word in text_cleaned \
            if not ( (word.startswith(link_numbers)) | (word.endswith(link_numbers)) )]
    
    words=[stemmer.stem(word) for word in words if word not in stop_w ]
    
    if len(words) > 0:
        return words
    else: #tweets that contains only links or emojiis ...
        pass

def top_vocabulary(text, top=10):
    words = [stemmer.stem(word) for word in tokenize(text) if word not in stop_w] 
    voc=pd.DataFrame.from_dict(Counter(words), orient='index')\
            .rename(columns={0:'Count'})\
            .sort_values(by='Count', ascending=False)[:top]
    return voc

In [181]:
df['Clean_tweet']=df.apply(lambda row: tokenize(row['content']), axis=1 )

In [252]:
df.head()

Unnamed: 0,content,account_category,dataset,Clean_tweet
0,"""We have a sitting Democrat US Senator on tria...",RightTroll,1,"[sit, democrat, us, senat, trial, corrupt, bar..."
1,Marshawn Lynch arrives to game in anti-Trump s...,RightTroll,1,"[marshawn, lynch, arriv, game, anti-trump, shi..."
2,Daughter of fallen Navy Sailor delivers powerf...,RightTroll,1,"[daughter, fallen, navi, sailor, deliv, power,..."
3,JUST IN: President Trump dedicates Presidents ...,RightTroll,1,"[presid, trump, dedic, presid, cup, golf, tour..."
4,"19,000 RESPECTING our National Anthem! #StandF...",RightTroll,1,"[respect, nation, anthem, standforouranthem]"


In [221]:
#non empty clean tweet
tweets=df[df.Clean_tweet.isnull()==False]['Clean_tweet']
model = Word2Vec(tweets.tolist(), min_count=10, size= 100)

In [222]:
print(model)

Word2Vec(vocab=31377, size=100, alpha=0.025)


In [302]:
def word_orientation(df, vocab,):
    tot_words=len(vocab)
    words_party= pd.DataFrame(np.zeros(tot_words), columns={'Score'},index=vocab)

    voc=[]
    for tweet in df[(df.Clean_tweet.isnull()==False) & (df.account_category=='RightTroll')].Clean_tweet:
        voc+=tweet
    count=pd.DataFrame.from_dict(Counter(voc), columns={'R'}, orient='index')

    words_party=words_party.merge(count, how='left', right_index=True, left_index=True)

    voc=[]
    for tweet in df[(df.Clean_tweet.isnull()==False) & (df.account_category=='LeftTroll')].Clean_tweet:
        voc+=tweet
    count=pd.DataFrame.from_dict(Counter(voc), columns={'L'}, orient='index')

    words_party=words_party.merge(count, how='left', right_index=True, left_index=True)
    words_party.fillna(0,inplace=True)
    words_party['Score']=words_party.apply(lambda row: (row['R']-row['L'])/(row['R']+row['L']), axis=1)

    return words_party

In [None]:
# fit a 2d PCA model to the vectors
X = model.wv.vectors
pca = PCA(n_components=3)

result = pca.fit_transform(X)
result = pd.DataFrame(pca.fit_transform(X), columns={'X','Y','Z'})
result.index=list(model.wv.vocab)
result=result.merge(word_orientation(df,model.wv.vocab), how='left', right_index=True, left_index=True)

In [301]:
vocab=list(model.wv.vocab)
tot_words=len(vocab)
words_party= pd.DataFrame(np.zeros(tot_words), columns={'Score'},index=vocab)

voc=[]
for tweet in df[(df.Clean_tweet.isnull()==False) & (df.account_category=='RightTroll')].Clean_tweet:
    voc+=tweet
count=pd.DataFrame.from_dict(Counter(voc), columns={'R'}, orient='index')

words_party=words_party.merge(count, how='left', right_index=True, left_index=True)

voc=[]
for tweet in df[(df.Clean_tweet.isnull()==False) & (df.account_category=='LeftTroll')].Clean_tweet:
    voc+=tweet
count=pd.DataFrame.from_dict(Counter(voc), columns={'L'}, orient='index')

words_party=words_party.merge(count, how='left', right_index=True, left_index=True)
words_party.fillna(0,inplace=True)
words_party['Score']=words_party.apply(lambda row: (row['R']-row['L'])/(row['R']+row['L']), axis=1)


Unnamed: 0,Score,R,L
sit,0.020845,906.0,869.0
democrat,0.795162,13654.0,1558.0
us,0.350144,28218.0,13582.0
senat,0.661033,7027.0,1434.0
trial,0.070137,740.0,643.0
corrupt,0.756555,4589.0,636.0
bare,-0.075829,195.0,227.0
heard,0.019608,1196.0,1150.0
peep,-0.463768,74.0,202.0
mainstream,0.654558,1207.0,252.0


## Plotting:

In [None]:
from bokeh.palettes import Category10
from bokeh.plotting import *
from bokeh.models import *

#interactive plot

p = figure(plot_width=850, plot_height=450) # x_axis_type='datetime')
p.title.text = 'Tweets topic clusters'

source = ColumnDataSource(data=result)
p.scatter(x='X', y='Y', source=source, fill_color='#4292c6', fill_alpha=0.7, line_color=None)

p.legend.location = 'top_left'
p.legend.click_policy='hide'

hover_tool=tools.HoverTool(
    tooltips=[
        ('Word', '@index')],
    formatters={'index' : 'printf', },
    # display a tooltip whenever the cursor is vertically in line with a glyph
    mode='mouse'
)

p.tools.append(hover_tool)



#p.tools.append(hover_tool)

output_notebook()#\interactive_legend.html\)# title=\interactive_legend.py example\)\n

In [None]:
show(p)

In [150]:
from bokeh.palettes import Category10
from bokeh.plotting import *
from bokeh.models import *

#interactive plot
p = figure(plot_width=850, plot_height=450) # x_axis_type='datetime')
p.title.text = 'Tweets topic clusters'

source = ColumnDataSource(data=result)
p.scatter(x='X', y='Z', source=source)#,fill_color=colors, fill_alpha=0.6, line_color=None)

p.legend.location = 'top_left'
p.legend.click_policy='hide'

hover_tool=tools.HoverTool(
    tooltips=[
        ('Word', '@index')],

    formatters={'index' : 'printf', },
    # display a tooltip whenever the cursor is vertically in line with a glyph
    mode='mouse'
)

p.tools.append(hover_tool)



#p.tools.append(hover_tool)

output_notebook()#\interactive_legend.html\)# title=\interactive_legend.py example\)\n

In [151]:
show(p)

In [152]:
model.most_similar("macron")

  """Entry point for launching an IPython kernel.


[('emmanuel', 0.9620119333267212),
 ('tusk', 0.9490189552307129),
 ('contenti', 0.9106845855712891),
 ('pen', 0.9074980020523071),
 ('bold', 0.9004539847373962),
 ('armageddon', 0.8982289433479309),
 ('holland', 0.8949971795082092),
 ('dynasti', 0.8894515633583069),
 ('trump-lessdeb', 0.8851872086524963),
 ('realdonaldtrmup', 0.8840502500534058)]