In [None]:
import pandas as pd 
import numpy as np 
from tqdm import tqdm_notebook 

import re
import requests
from io import BytesIO
from PIL import Image
from wordcloud import WordCloud
from csaps import csaps
from bs4 import BeautifulSoup

import networkx as nx
from matplotlib import rcParams
import matplotlib.pyplot as plt
import community.community_louvain
from pyvis.network import Network
import operator
from collections import Counter
from collections import OrderedDict

import tweepy
import time
import advertools as adv

import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.models.wrappers import LdaMallet

In [None]:
import spacy

# Extra stop words layered on top of spaCy's built-in lists.
# NOTE(review): `Defaults` looks like a class-level attribute, so pipelines of
# the same language presumably end up sharing one stop-word set - confirm.
extra_stops_description_nl = {'brabant','een','twee','my','are','on','once'}

extra_stops_tweets_en = {
    'de','der','van','een','nee','dank','en','bron','morge','het','precie','precies',
    'regard','regards','lol','dm','thnks','thnk','lot','op','minute','minutes','hour',
    'hours','day','days','week','weeks','weekend','weekends','month','months','year',
    'years','today','todays','morning','afternoon','pm','tonight','night','evening',
    'tomorrow','end','people','guy','guys','shit','thing','things','w/','bit','ppm',
    'yr','yrs',
}

extra_stops_tweets_nl = {
    'de','van','een','nee','dank','en','bron','morge','avond','het','dag','dagen','jaar',
    'dm','gister','morgen','weekje','weken','week','maand','maanden','uur','uurtje',
    'minuut','minuten','mensen','weekend','Spui','ur','stuk','luister',
}

# Pipelines used for tweet text.
nlp_nl = spacy.load("nl_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")

# Pipelines used for account descriptions.
nlp_des_nl = spacy.load("nl_core_news_sm")
nlp_des_en = spacy.load("en_core_web_sm")

nlp_des_nl.Defaults.stop_words |= extra_stops_description_nl
nlp_en.Defaults.stop_words |= extra_stops_tweets_en
nlp_nl.Defaults.stop_words |= extra_stops_tweets_nl

In [None]:
#input Twitter credentials
# The values are read from environment variables so that secrets do not have to
# be saved inside the notebook. Each lookup falls back to "" (the original
# placeholder); replace a fallback only if you must paste a value locally.
import os

consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "")
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "")

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# Module-level client used by the network helper functions in this notebook.
api = tweepy.API(auth, wait_on_rate_limit=True)

#return all friends and followers' id in two lists 
def get_friends_and_followers(username_or_id):
    """Return ``(friend_ids, follower_ids)`` for one account.

    Both lookups go through the module-level ``api`` client and pass the
    argument as ``user_id``. Only what a single call to each endpoint returns
    is used; no paging is done here.
    """
    return (
        api.get_friend_ids(user_id=username_or_id),
        api.get_follower_ids(user_id=username_or_id),
    )

#returns reciprocal friends in a list 
def setwise_friends_followers_analysis(screen_name, friends_ids, followers_ids):
    """Print a follow/follower summary and return the reciprocal friends.

    Args:
        screen_name: Label used in the printed summary lines.
        friends_ids: Ids of the accounts the user follows.
        followers_ids: Ids of the accounts following the user.

    Returns:
        A list with the ids present in both inputs (duplicates collapsed).
    """
    following = set(friends_ids)
    followed_by = set(followers_ids)

    not_following_back = following.difference(followed_by)
    not_followed_back = followed_by.difference(following)
    mutual = following.intersection(followed_by)

    summary_lines = (
        '{0} is following {1}'.format(screen_name, len(following)),
        '{0} is being followed by {1}'.format(screen_name, len(followed_by)),
        '{0} of {1} are not following {2} back'.format(
            len(not_following_back), len(following), screen_name),
        '{0} of {1} are not being followed back by {2}'.format(
            len(not_followed_back), len(followed_by), screen_name),
        '{0} has {1} mutual friends'.format(screen_name, len(mutual)),
    )
    for line in summary_lines:
        print(line)

    return list(mutual)

#return the follower count of the user's reciprocal friends.
def get_your_friends_follower_count(reciprocal_friends):
    """Map each reciprocal friend's user id to that friend's follower count.

    The ids are looked up through the module-level ``api`` client in batches,
    because the user lookup only returns information for 100 accounts at a
    time.

    Compared with the previous implementation:
      * every batch is requested once instead of twice,
      * no empty trailing batch is sent when the number of ids is an exact
        multiple of 100, and an empty input makes no request at all,
      * keys always use the ``id`` value from the API payload unchanged
        (previously lists of up to 100 ids produced ``str`` keys while longer
        lists produced the raw value, so the same account could appear under
        two different keys).

    Args:
        reciprocal_friends: Sequence of user ids to look up.

    Returns:
        dict of ``{user id: followers_count}`` in lookup order; empty when no
        ids are given.
    """
    batch_size = 100
    info = {}

    for offset in range(0, len(reciprocal_friends), batch_size):
        batch = reciprocal_friends[offset:offset + batch_size]
        for account in api.lookup_users(user_id=batch):
            profile = account._json
            info[profile['id']] = profile['followers_count']

    return info

#get the top 5 influential friend
def get_top_5_friends(info):
    """Return the five entries of ``info`` with the highest follower counts.

    The previous implementation matched ids to counts by value, so accounts
    with equal follower counts were appended several times and then paired with
    the wrong counts (dropping genuine top entries). Entries are now ranked as
    ``(id, count)`` pairs, which keeps every id attached to its own count.

    Args:
        info: dict of ``{user id: followers_count}``.

    Returns:
        ``info`` itself when it holds fewer than five entries; otherwise a new
        dict with the five largest counts in ascending order. Ties keep their
        relative order from ``info``.
    """
    if len(info) < 5:
        return info

    # sorted() is stable, so equal counts stay in insertion order.
    ranked = sorted(info.items(), key=lambda pair: pair[1])
    return dict(ranked[-5:])

#function calls all previous functions to generate distance 1 friends with the top 5 highest follower counts.
def get_distance_one_friends(username_or_id):
    """Return the top reciprocal friends of one account, ranked by followers.

    Chains the helpers: fetch friends and followers, reduce them to the
    reciprocal set (printing the summary), look up follower counts, and keep
    the top five.
    """
    following, followers = get_friends_and_followers(username_or_id)
    mutual = setwise_friends_followers_analysis(username_or_id, following, followers)
    follower_counts = get_your_friends_follower_count(mutual)
    return get_top_5_friends(follower_counts)

#use return data of distance 1 friends to find information about top 5 most popular followers from each user.
def get_distance_two_friends(username_or_id):
    """Return the distance-2 ring as a list of dicts.

    The distance-1 ring of ``username_or_id`` is computed first; the result
    holds one ``get_distance_one_friends`` dict per distance-1 friend, in the
    key order of that first ring.
    """
    first_ring = get_distance_one_friends(username_or_id)
    return [get_distance_one_friends(friend) for friend in list(first_ring.keys())]

#input list of distance 2 friends to find information about the top 5 most popular followers from each user.
def get_distance_three_friends(data):
    """Return the distance-3 ring for a list of distance-2 dicts.

    For every key of every dict in ``data`` the top reciprocal friends are
    fetched with ``get_distance_one_friends``.

    Fixes over the previous version:
      * The exception class is resolved at call time: ``TweepyException`` when
        the installed tweepy provides it, otherwise the older ``TweepError``
        name the code used before.
      * A failing lookup is retried once after the 15 minute pause instead of
        being skipped, and if it fails again an empty dict is stored. The
        result therefore always has exactly one entry per input key, in order,
        which is what ``network`` relies on when it pairs entry ``l`` of this
        list with the ``l``-th distance-2 id.

    Args:
        data: list of ``{user id: followers_count}`` dicts.

    Returns:
        list of dicts, one per key found in ``data``.
    """
    api_error = getattr(tweepy, "TweepyException", None) or getattr(tweepy, "TweepError")
    pause_seconds = 60 * 15

    information = []
    for element in data:
        for friend_id in list(element.keys()):
            try:
                information.append(get_distance_one_friends(friend_id))
            except api_error:
                # Wait out the failure (e.g. a rate-limit window), then retry once.
                time.sleep(pause_seconds)
                try:
                    information.append(get_distance_one_friends(friend_id))
                except api_error:
                    # Placeholder keeps positions aligned with the input ids.
                    information.append({})

    return information

def network(user_id, name, data_1, data_2, data_3):
    """Draw the three-ring friend network and return its Louvain partition.

    Args:
        user_id: Node placed at the centre of the graph.
        name: Used in the plot title and as the PNG file stem (``name + '.png'``).
        data_1: dict whose keys are the distance-1 friends.
        data_2: list of dicts; entry ``j`` holds the friends of the ``j``-th
            key of ``data_1``.
        data_3: list of dicts; entry ``l`` holds the friends of the ``l``-th
            id in the flattened keys of ``data_2``.

    Returns:
        The node -> community mapping produced by ``best_partition``.

    Side effects: changes the global matplotlib ``rcParams``, saves and shows
    the figure, and prints node/edge counts, diameter, average shortest path
    length and modularity.
    """
    rcParams["figure.figsize"] = 50, 40
    rcParams['axes.titlesize'] = 70

    graph = nx.MultiGraph()
    graph.add_node(user_id)

    # Flatten each ring into an ordered list of ids.
    ring_1 = list(data_1.keys())
    graph.add_nodes_from(ring_1)
    ring_2 = [friend for entry in data_2 for friend in entry.keys()]

    # centre -> ring 1
    for friend in ring_1:
        graph.add_edges_from([(user_id, friend)])

    # ring 1 -> ring 2 (entry j belongs to the j-th ring-1 id)
    for position, entry in enumerate(data_2):
        for friend in list(entry.keys()):
            graph.add_edges_from([(ring_1[position], friend)])

    # ring 2 -> ring 3 (entry l belongs to the l-th flattened ring-2 id)
    for position, entry in enumerate(data_3):
        for friend in list(entry.keys()):
            graph.add_edges_from([(ring_2[position], friend)])

    # Louvain partition (fixed random_state) and the modularity it achieves.
    louvain = community.community_louvain
    part = louvain.best_partition(graph, random_state=10)
    mod = louvain.modularity(part, graph)

    # Colour every node by its community id.
    node_colours = [part.get(node) for node in graph.nodes()]

    plt.title(name+ "'s Twitter Network")
    layout = nx.spring_layout(graph, scale=4)
    nx.draw(
        graph,
        layout,
        cmap=plt.cm.RdYlBu,
        node_size=5000,
        node_color=node_colours,
        font_size=12,
        font_weight='bold',
        with_labels=False,
    )
    plt.savefig(name+'.png', dpi=300, bbox_inches='tight')
    plt.show()

    # All statistics are computed before anything is printed.
    edge_total = graph.number_of_edges()
    diameter = nx.diameter(graph)
    mean_distance = nx.average_shortest_path_length(graph)
    node_total = len(graph.nodes())

    print("This graph contains {0} distinct nodes with {1} edges. ".format(node_total, edge_total))
    print("The Diameter of this network is: {0} ".format(diameter))
    print("The average distance between each node in the Graph is: {0}".format(mean_distance))
    print('The modularity is', mod)

    return part

In [None]:
#functions for preprocessing and topic modeling

# Location of the MALLET executable handed to LdaMallet; must be filled in
# before any topic model can be trained.
PATH_TO_MALLET = ''#input local path to MALLET

# MIN_DF and MAX_DF are passed to Dictionary.filter_extremes as no_below and
# no_above when the tweet vocabulary is built.
# NOTE(review): gensim documents no_below as "at least this many documents",
# i.e. >= 3 rather than "more than 3" - confirm.
MIN_DF = 3 # Minimum number of tweets a word has to appear in to be kept
MAX_DF = 0.7 # Maximum share of the tweets (70%) a word may appear in to be kept

#function for cleaning text
def clean_text(tweet):
    """Strip markup and Twitter-specific noise from one piece of text.

    Steps, in order: drop HTML markup, then replace each of the following with
    a space and collapse whitespace: mentions/hashtags, retweet markers, URLs,
    digits (together with a directly adjacent non-space character), a set of
    sign characters, empty parentheses and the literal name "Eric".

    Changes over the previous version:
      * The retweet marker is only removed when "RT" starts at a word
        boundary; the old bare ``(RT)`` alternative also cut "RT" out of the
        middle of words (e.g. "SMART" became "SMA").
      * Patterns are raw strings, so sequences such as ``\\w`` or ``\\$`` are no
        longer invalid string escapes.
      * BeautifulSoup gets an explicit parser instead of guessing one.

    Args:
        tweet: Text to clean.

    Returns:
        The cleaned text with single spaces between the remaining tokens.
    """
    tweet = BeautifulSoup(tweet, "html.parser").get_text()

    patterns = (
        r"(@[a-zA-Z0-9_]+)|(#[A-Za-z0-9_]+)",  # mentions and hashtags
        r"\bRT(?:[a-zA-Z0-9_].:| :|\b)",  # retweet indication
        r"http\S+",  # urls
        r"(\w+:\/\/\S+)",  # other scheme://links
        r"(\$[0-9]\S)|(\S[0-9])|([0-9]\S)|([0-9]+)",  # numbers
        r"(\>)|(\<)|(\|)|(\+)|(\=)|(\*)|(\^)|(\$)|(\-)",  # signs
        r"\(\)",  # empty parentheses
        r"Eric",  # hard-coded name removed by the original analysis
    )
    for pattern in patterns:
        tweet = ' '.join(re.sub(pattern, " ", tweet).split())

    return tweet

def pre_process(df):
    """Add year and cleaned-text columns, then keep English and Dutch tweets.

    The columns ``tweet_year``, ``dup_text`` and ``clean_text`` are added to
    the frame that is passed in; the return value is the subset of rows whose
    ``tweet_lang`` is ``'en'`` or ``'nl'``.
    """
    kept_languages = ['en', 'nl']

    df['tweet_year'] = pd.DatetimeIndex(df['tweet_created_at']).year

    # Non-ASCII runs (icons, emoji) are removed before the regex clean-up.
    df['dup_text'] = df['tweet_full_text'].str.replace(r'[^\x00-\x7F]+', '', regex=True)
    df['clean_text'] = list(map(clean_text, df['dup_text'].values))

    language_mask = df['tweet_lang'].isin(kept_languages)
    return df.loc[language_mask]

def spacy(df, nlp):
    """Tokenise ``df['clean_text']`` into noun lemmas and build a corpus.

    Note: this function is named like the ``spacy`` package, so once this cell
    has run the name ``spacy`` refers to the function.

    Args:
        df: Frame with a ``clean_text`` column; a ``tokenized_texts`` column is
            added to it.
        nlp: Pipeline object providing ``pipe``.

    Returns:
        ``(df, dictionary, corpus)`` where ``corpus`` is the bag-of-words form
        of every row after ``filter_extremes`` has pruned the dictionary.
    """
    clean_texts = df['clean_text']
    docs = list(tqdm_notebook(nlp.pipe(clean_texts, disable=["ner", "parser"]),
                              total=len(clean_texts)))

    # Keep lower-cased lemmas of nouns that are neither punctuation nor stop words.
    noun_lemmas = []
    for doc in tqdm_notebook(docs):
        kept = []
        for token in doc:
            if token.is_punct or token.is_stop or token.pos_ != 'NOUN':
                continue
            kept.append(token.lemma_.lower())
        noun_lemmas.append(kept)

    df['tokenized_texts'] = noun_lemmas

    # Vocabulary, pruned by the MIN_DF / MAX_DF document-frequency bounds.
    dictionary = Dictionary(df['tokenized_texts'])
    dictionary.filter_extremes(no_below=MIN_DF, no_above=MAX_DF)

    corpus = []
    for tokens in df['tokenized_texts']:
        corpus.append(dictionary.doc2bow(tokens))

    return df, dictionary, corpus

def spacy_description(df, nlp):
    """Tokenise ``df['clean_description']`` into lemmas and build a corpus.

    Unlike the tweet variant, every part of speech is kept and the dictionary
    is not pruned by document frequency.

    Args:
        df: Frame with a ``clean_description`` column; a ``tokenized_texts``
            column is added to it.
        nlp: Pipeline object providing ``pipe``.

    Returns:
        ``(df, dictionary, corpus)``.
    """
    descriptions = df['clean_description']
    docs = list(tqdm_notebook(nlp.pipe(descriptions, disable=["ner", "parser"]),
                              total=len(descriptions)))

    # Lower-cased lemmas of everything that is not punctuation or a stop word.
    lemmas_per_doc = []
    for doc in tqdm_notebook(docs):
        kept = []
        for token in doc:
            if token.is_punct or token.is_stop:
                continue
            kept.append(token.lemma_.lower())
        lemmas_per_doc.append(kept)

    df['tokenized_texts'] = lemmas_per_doc

    dictionary = Dictionary(df['tokenized_texts'])

    corpus = []
    for tokens in df['tokenized_texts']:
        corpus.append(dictionary.doc2bow(tokens))

    return df, dictionary, corpus

#for plotting
def smooth(x, y, smooth=0.95):
    """Evaluate a smoothed curve through ``(x, y)`` on 150 evenly spaced points.

    Args:
        x: Sample positions; the first and last element bound the output grid.
        y: Sample values.
        smooth: Smoothing parameter forwarded to ``csaps``.

    Returns:
        ``(xs, ys)``: the evaluation grid and the values ``csaps`` returns on it.
    """
    grid = np.linspace(x[0], x[-1], 150)
    return grid, csaps(x, y, grid, smooth=smooth)

#calculate the coherence value for different topic numbers
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """Train one MALLET LDA model per topic count and score its coherence.

    Topic counts are ``range(start, limit, step)``.

    Args:
        dictionary: Vocabulary used as ``id2word`` and for the coherence model.
        corpus: Bag-of-words corpus the models are trained on.
        texts: Tokenised texts used for the ``c_v`` coherence measure.
        limit: Exclusive upper bound for the topic count.
        start: First topic count.
        step: Increment between topic counts.

    Returns:
        ``(model_list, coherence_values)``, both ordered by topic count.
    """
    model_list, coherence_values = [], []

    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(
            PATH_TO_MALLET,
            corpus=corpus,
            num_topics=num_topics,
            id2word=dictionary,
        )
        scorer = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')

        model_list.append(model)
        coherence_values.append(scorer.get_coherence())

    return model_list, coherence_values

# Show coherence graph for different topic number
def topic_coherence(name, coherence_values, start=1, limit=20, step=3):
    """Plot, save and print coherence scores per number of topics.

    The x-axis grid used to be hard-coded to ``range(1, 20, 3)``; it is now
    configurable through ``start``/``limit``/``step`` (same defaults), so the
    plot can follow whatever grid the scores were computed on. The former
    ``plt.legend("")`` call, which was given no labels, and the unused plot
    handle were dropped.

    Args:
        name: Appended to the plot title and used as the PNG file stem.
        coherence_values: One score per topic count in the grid.
        start: First topic count of the grid.
        limit: Exclusive upper bound of the grid.
        step: Increment between topic counts.

    Raises:
        ValueError: If the number of scores does not match the grid length.
    """
    x = range(start, limit, step)
    if len(x) != len(coherence_values):
        raise ValueError(
            "expected {0} coherence values for range({1}, {2}, {3}), got {4}".format(
                len(x), start, limit, step, len(coherence_values)))

    plt.plot(x, coherence_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    plt.title('Coherence score per number of topics \n'+name)
    plt.savefig(name+'.png', dpi=600)
    plt.show()

    # Print the coherence scores
    for m, cv in zip(x, coherence_values):
        print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

# train the Latent Dirichlet Allocation algorithm
def train_LDA(corpus, dictionary, N_TOPICS):
    """Train and return a MALLET LDA model with ``N_TOPICS`` topics.

    Uses 2000 training iterations, ``optimize_interval=10`` and a fixed
    ``random_seed`` of 10; the MALLET location comes from ``PATH_TO_MALLET``.
    """
    mallet_options = dict(
        corpus=corpus,
        id2word=dictionary,
        num_topics=N_TOPICS,
        optimize_interval=10,
        iterations=2000,  # number of training iterations
        random_seed=10,
    )
    return LdaMallet(PATH_TO_MALLET, **mallet_options)

def combine_distribution(df, lda, N_TOPICS):
    """Attach the per-document topic weights of ``lda`` to the tweets.

    Args:
        df: Frame with ``user_screen_name``, ``tweet_year`` and ``clean_text``.
        lda: Model whose ``load_document_topics()`` yields, per document, a
            sequence of pairs whose second item is the topic weight.
        N_TOPICS: Number of topics; names the columns ``topic_0`` ...

    Returns:
        A frame with a fresh 0..n-1 index holding the three tweet columns
        followed by one ``topic_i`` column per topic.
    """
    tweet_columns = ["user_screen_name", "tweet_year", "clean_text"]
    topic_columns = ['topic_{}'.format(i) for i in range(N_TOPICS)]

    # Renumber rows so they line up positionally with the topic weights.
    tweets = df[tweet_columns].reset_index().drop(["index"], axis=1)

    weight_rows = []
    for doc in lda.load_document_topics():
        weight_rows.append([pair[1] for pair in doc])
    weights = pd.DataFrame(weight_rows, columns=topic_columns)

    return pd.concat([tweets, weights], axis=1, ignore_index=False)

#plot topic distribution by account
def topic_user(name, col, topic, user_topics=None):
    """Bar chart of the five accounts with the highest value in ``col``.

    The function used to read the notebook global ``joined_user``
    unconditionally; the frame can now be passed explicitly, and the global is
    only used as a fallback so existing three-argument calls keep working. The
    stray ``print`` of the bar container and the commented-out tick code were
    removed.

    Args:
        name: Plot title prefix and PNG file stem (``name + '.png'``).
        col: Column of the per-account frame to rank by.
        topic: Human-readable topic label used in the title.
        user_topics: Frame indexed by account with one column per topic;
            defaults to the global ``joined_user``.
    """
    if user_topics is None:
        user_topics = joined_user  # backward-compatible fallback

    print(col)
    leaders = user_topics.sort_values(col, ascending=False)[:5]

    plt.bar(leaders.index, leaders[col])
    plt.xticks(rotation=90)
    plt.xlabel('Accounts')
    plt.title(name+'\n Leading accounts for '+topic)
    plt.ylabel('Topic probability')
    plt.savefig(name+'.png', dpi=600)
    plt.show()
    
#plot wordclouds
def wordcloud(name, lda, TOPIC2PLOT, MAX_WORDS):
    """Draw and save a circular word cloud for one topic of ``lda``.

    Robustness changes over the previous version:
      * the mask download has a timeout and fails loudly on an HTTP error
        instead of handing an error page to the image decoder,
      * the macOS-specific font is only used when the file exists; otherwise
        ``font_path=None`` lets WordCloud use its default font.

    Args:
        name: Used in the plot title and as the PNG file stem.
        lda: Model providing ``show_topic(topic, n)`` -> ``(word, weight)`` pairs.
        TOPIC2PLOT: Index of the topic to draw.
        MAX_WORDS: Number of words requested from ``show_topic``.

    Raises:
        Whatever ``requests`` raises when the mask cannot be downloaded.
    """
    import os

    mask_url = 'https://i.ibb.co/kHNWRYD/black-circle-better.png'
    request_timeout_s = 30
    response = requests.get(mask_url, timeout=request_timeout_s)
    response.raise_for_status()
    circle_mask = np.array(Image.open(BytesIO(response.content)))

    font_path = '/System/Library/Fonts/Supplemental/DIN Alternate Bold.ttf'
    if not os.path.exists(font_path):
        font_path = None

    cloud = WordCloud(background_color='#fff',
                      font_path=font_path,
                      color_func=lambda *args, **kwargs: (0,0,0),
                      mask=circle_mask)
    cloud.generate_from_frequencies(frequencies=dict(lda.show_topic(TOPIC2PLOT, MAX_WORDS)))

    plt.axis("off")
    plt.imshow(cloud, interpolation="bilinear")
    plt.title('Wordcloud for '+name)
    plt.savefig(name+'.png', dpi=600)
    
    
# Credentials for advertools' Twitter client. They are read from environment
# variables so that secrets do not have to be saved inside the notebook; each
# lookup falls back to "" (the original placeholder).
import os

auth_params = {
    'app_key': os.environ.get("TWITTER_CONSUMER_KEY", ""),
    'app_secret': os.environ.get("TWITTER_CONSUMER_SECRET", ""),
    'oauth_token': os.environ.get("TWITTER_ACCESS_TOKEN", ""),
    'oauth_token_secret': os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", ""),
}
adv.twitter.set_auth_params(**auth_params)

#get tweets
def tweet(user_id):
    """Fetch a user's timeline and keep the columns used by the analysis.

    The timeline is requested through advertools with ``count=4000`` and
    ``tweet_mode='extended'``; the result is reduced to the user and tweet
    columns listed below.
    """
    wanted_columns = [
        'user_id', 'user_name', 'user_screen_name', 'user_description',
        'tweet_created_at', 'tweet_id', 'tweet_full_text', 'tweet_lang',
    ]
    timeline = adv.twitter.get_user_timeline(user_id=user_id, count=4000, tweet_mode='extended')
    return timeline[wanted_columns]

In [None]:
# Distance-1 ring: the seed account's top reciprocal friends by follower count.
# NOTE(review): `id` is not assigned in any visible cell; unless it is set
# elsewhere this passes Python's builtin `id` function - confirm and define
# the seed account id before running.
data_1=get_distance_one_friends(id)
data_1

In [None]:
# Distance-2 ring: one dict of top reciprocal friends per distance-1 friend.
# get_distance_two_friends recomputes the distance-1 ring from `id` itself
# rather than reusing data_1.
# NOTE(review): `id` is not assigned in any visible cell - confirm it is set.
data_2=get_distance_two_friends(id)
data_2

In [None]:
# Distance-3 ring: top reciprocal friends of every account found in data_2.
# The helper sleeps for 15 minutes whenever a lookup raises the tweepy error
# it catches, so this cell can take a long time.
data_3=get_distance_three_friends(data_2)
data_3

In [None]:
#plot the network
# Draws and saves the graph (name + '.png') and keeps the node -> community
# mapping returned by network() in `part`.
# NOTE(review): neither `id` nor `name` is assigned in any visible cell -
# confirm both are defined before running.
part=network(id, name, data_1, data_2, data_3)

In [None]:
#order the nodes by the community id assigned to them
sorted_node = sorted(part.items(), key=operator.itemgetter(1))
sorted_dict = OrderedDict(sorted_node)

#count how many nodes fall into each cluster
count = Counter(sorted_dict.values())
print(count)
print()

#group the nodes that share a cluster and print the groups
res = {}
for node, cluster in sorted_dict.items():
    res.setdefault(cluster, []).append(node)
print(res)

In [None]:
#get tweets from the users in the influence community
users=[]#input the id of the users to get their tweets

#one timeline frame per user, stacked into a single frame
all_tweets=[tweet(user_id) for user_id in users]

all_tweet= pd.concat(all_tweets)

In [None]:
# One row per distinct account, built from the user columns of the tweets.
user = (
    all_tweet[['user_id', 'user_name', 'user_screen_name', 'user_description']]
    .rename(columns={'user_id': 'id',
                     'user_name': 'name',
                     'user_screen_name': 'screen_name',
                     'user_description': 'description'})
    .drop_duplicates()
    .copy()
)

# Clean the descriptions: strip non-ASCII runs (icons), then apply clean_text.
# Missing descriptions become '' so clean_text always receives a string.
user['clean_description'] = (
    user['description'].fillna('').str.replace(r'[^\x00-\x7F]+', '', regex=True)
)
# The original read the values from an undefined name (`i_user`), which raised
# NameError; they come from `user` itself.
descrip_text = user.clean_description.values
user['clean_description'] = [clean_text(text) for text in descrip_text]


user, dictionary_user, corpus_user= spacy_description(user, nlp_des_nl)

# Dedicated file name: name + '.csv' is also written by the tweet cell further
# down, which would overwrite this table.
user.to_csv(name+'_users.csv')

# A single-topic model is used only to obtain word weights for the word cloud.
lda_user=train_LDA(corpus_user, dictionary_user, 1)

#wordcloud of users description
rcParams["figure.figsize"]=10,5
rcParams['axes.titlesize']=12
# Distinct name so the PNG does not replace the network figure (name + '.png').
wordcloud(name+' user descriptions', lda_user, 0, 80)

In [None]:
all_tweet=pre_process(all_tweet)#clean tweet text

#separate tweets into two dataframes (English and Dutch)
# .copy() gives each language its own frame, so the column added during
# tokenisation is not written into a slice of all_tweet.
all_tweet_en=all_tweet.loc[all_tweet['tweet_lang'] == 'en'].copy()
all_tweet_nl=all_tweet.loc[all_tweet['tweet_lang'] == 'nl'].copy()

#preprocess tweet text with spaCy
all_tweet_en, dictionary_en, corpus_en= spacy(all_tweet_en, nlp_en)
all_tweet_nl, dictionary_nl, corpus_nl= spacy(all_tweet_nl, nlp_nl)

# save the dataframes
# Both frames used to be written to name + '.csv', so the Dutch file replaced
# the English one; each language now gets its own file.
all_tweet_en.to_csv(name+'_en.csv')
all_tweet_nl.to_csv(name+'_nl.csv')

In [None]:
#compute the coherence value
#topic counts tried for both languages: 1, 4, 7, ..., 19
coherence_grid = dict(start=1, limit=20, step=3)

model_en, coherence_values_en = compute_coherence_values(
    dictionary_en, corpus_en, all_tweet_en['tokenized_texts'], **coherence_grid)

model_nl, coherence_values_nl = compute_coherence_values(
    dictionary_nl, corpus_nl, all_tweet_nl['tokenized_texts'], **coherence_grid)

In [None]:
#plot the coherence value for different topic number
# topic_coherence plots against range(1, 20, 3), which matches the
# start=1, limit=20, step=3 grid used when the scores were computed.
# The first argument is also the PNG file stem, so each call writes its own file.
topic_coherence(name+' community - English tweets', coherence_values_en)
topic_coherence(name+' community - Dutch tweets', coherence_values_nl)

In [None]:
TOPICS_en=5 #topic number

#final English model with the chosen number of topics
lda_en=train_LDA(corpus_en, dictionary_en, TOPICS_en)

#list the 10 most relevant words of every topic
for topic in range(TOPICS_en):
    words = lda_en.show_topic(topic, topn=10)
    topic_n_words = ' '.join(entry[0] for entry in words)
    print(f'Topic {topic}: {topic_n_words}')

#attach the per-tweet topic weights to the English tweets
joined_en=combine_distribution(all_tweet_en, lda_en, TOPICS_en)

In [None]:
# overall topic distribution per user
# numeric_only=True keeps the text column (clean_text) out of the mean, which
# otherwise fails on pandas versions that no longer drop such columns silently.
joined_user = joined_en.groupby('user_screen_name').mean(numeric_only=True)

columns = joined_user.columns

#plot the topic distribution for each users within a topic
# The original call used `col`, which was never defined, and `topic`, which was
# only the leftover loop counter of an earlier cell. One chart is now drawn per
# topic column; the topic label is added to the name so every chart gets its
# own PNG instead of all of them writing name + '.png'.
for col in columns:
    if not str(col).startswith('topic_'):
        continue  # e.g. tweet_year is not a topic
    topic = str(col).replace('_', ' ')
    topic_user(name+' - '+topic, col, topic)

In [None]:
#print the topic distribution per user and find the most used topic within the community
user_tweet_count=all_tweet_en.user_screen_name.value_counts()
user_tweet_count= pd.DataFrame(user_tweet_count)
# The label of the single count column is not the same across pandas versions,
# so it is selected by position instead of by the name 'user_screen_name'.
tweets_per_user = user_tweet_count.iloc[:, 0]

# Work on the topic columns explicitly. The previous `columns[1:]` slice relied
# on the column order after groupby().sum() and could include the text column.
# The summed tweet_year column is no longer part of the printed table.
topic_columns = [c for c in joined_en.columns if str(c).startswith('topic_')]
user_topic_dis=joined_en.groupby('user_screen_name')[topic_columns].sum()

for i in topic_columns:
    user_topic_dis[i] = user_topic_dis[i] / tweets_per_user#divide the tweet number of the user

# Community-wide total of every per-user average, repeated on each row.
for i in topic_columns:
    user_topic_dis[i+' sum']=user_topic_dis[i].sum()

print(user_topic_dis)

In [None]:
# Word cloud for English topic 0, built from the 80 words requested from the model.
# The first argument is used both in the plot title and as the PNG file stem,
# so the saved file name contains the newline embedded in this string.
wordcloud(name+' community \n English tweets - Topic 0',lda_en, 0, 80)