### Import Libraries

In [1]:
from tweepy import API 
from tweepy import OAuthHandler
from tweepy import Cursor
import tweepy

import pandas as pd
import time,json
from twitter_client import get_twitter_client

import spacy
import time
from datetime import datetime

import os.path
import sys 
import os
from glob import glob

### Define Functions

In [56]:
def rescue_code(function):
    import inspect
    get_ipython().set_next_input("".join(inspect.getsourcelines(function)[0]))

In [57]:
rescue_code(get_top_interactions)

In [111]:
auth = tweepy.OAuthHandler('key', 'key')
auth.set_access_token('key', 'key')
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

def get_tweets (user_list, dir_name):
    '''
    The funciton takes in a list of users to collect their tweets and directory name
    where the tweet jsonl's will be saved
    '''
    
    path = f'{dir_name}'
    try:
        os.mkdir(path)
    except OSError:
            print ("Creation of the directory %s failed" % path)
    else:
            print ("Successfully created the directory %s " % path)
    client = api
    for user in user_list:
        fname = f'{dir_name}/user_timeline_{user}.jsonl'
        with open(fname, 'w') as f:
            for page in Cursor(client.user_timeline, tweet_mode='extended', screen_name=user, count=200).pages(16):
                for status in page:
                    f.write(json.dumps(status._json)+"\n")
                    
def get_urls(tweets_df):
    '''
    The function takes in a tweets dataframe and returns urls that the user tweeted 
    '''
    
    tweets_urls = pd.DataFrame(tweets_df['entities'].values.tolist(), index=tweets_df.index)
    tweets_urls = pd.DataFrame(tweets_data['urls'].values.tolist(), index=tweets_urls.index)
    tweets_urls = pd.DataFrame(tweets_urls[0].values.tolist())
    tweets_urls = pd.DataFrame(tweets_urls[0].values.tolist())
    urls = tweets_urls[0].dropna().apply(pd.Series)
    return urls
                    
def get_tweet_id_list(filepath):
    '''
    The function takes in a tweets jsonl filepath and return a list of tweet id's. The list can be
    later used to collect retweeters, up to 100 per tweet
    '''
    tweets = pd.read_json (filepath, lines = True)
    tweet_ids = tweets['id'].to_list()
    return tweet_ids

def get_top_interactions(file_path): 
    '''
    The function takes in a filepath and returns a list of retweeters and returns a dataframe with
    a count of interactions
    '''
    with open(file_path) as f:
        lines = f.read()
    df = pd.DataFrame(lines.strip('[').strip(']').split(","))
    df = pd.DataFrame(df[0].value_counts()).reset_index()
    df.rename(columns = {'index':'user_id', 0:'weight'}, inplace = True)
    df = df[df.weight > 10]
    return df


def get_retweeters(list_of_tweet_ids):
    ''' 
    The function takes in a list of tweet ids and returns users who retweeted the tweet
    '''
    
    retweeters = []
    for x in list_of_tweet_ids:
        for status in api.retweets(x):
            retweeters.append(status.user.id)
    return retweeters

def get_screen_names(user_id_list):
    '''
    The function converts user id's to user_names by using Twitter API
    '''
    screen_names = []
    for user_id in user_id_list:
        try:
            screen_name = api.get_user(user_id).screen_name
            screen_names.append(screen_name)
        except:
            pass
        
    return screen_names

def get_tweets_for_retweeters(network_scv_filepath, tweets_storage_directory_name):
    '''
    The funciton takes in the network csv and an output directiory path.
    The names are used to collect tweets and the directory name indicates
    a path to stroe the collected tweets
    '''
    df = pd.read_csv(network_scv_filepath)
    user_list = df['user_name'].tolist()
    get_tweets(user_list, tweets_storage_directory_name)
    

def get_network_from_txt (input_file_fullpath, output_file_path_fullpath):
    '''
    The function takes in a filepath to a text file containing twitter ids for retweeters
    and outputs a csv with users who have interacted with the account more than 10 times,
    the function also coverts the users ids to screen names
    '''
    df = get_top_interactions(input_file_fullpath)
    account_l_length = len(df['user_id'])
    print(f'{account_l_length} total accounts to process')
    df['user_name']=get_screen_names(df['user_id'])
    network = df.astype(str)
    network.to_csv(f'{output_file_path_fullpath}.csv')

def get_post_freq(depository_path): #gets a number of posts a day per user form a json in a directory
    import glob, os
    os.chdir(depository_path)
    name_avg_posts= []
    for file in glob.glob("*.jsonl"):
        df = pd.read_json (file, lines = True)
        df['dates'] = df['created_at'].dt.date
        dates = pd.DataFrame(df['dates'].value_counts()).reset_index()
        name=depository_path[31:-6]
        retweet_percentage = len(df[df['full_text'].str.contains('RT')])/len(df['full_text'])
        name_avg_posts.append([name,dates['dates'].mean(),retweet_percentage])
    freq_matrix_df = pd.DataFrame(name_avg_posts, columns = ['name', 'post_freq', 'retweet_perc'])
    freq_matrix_df.to_csv(f'{depository_path}freq_matrix.csv')
    return freq_matrix_df

def get_mentions_from_json(json_file):
    ''' 
    The function takes the tweets json file and extracts all the entities that were
    mentioned by the author
    '''
    try:
        mentions = pd.read_json (json_file, lines = True) #read the json into a pd dataframe
        mentions = pd.DataFrame( mentions['entities'].values.tolist(), index=mentions.index) #break down the entities column
        mentions = pd.DataFrame( mentions['user_mentions'].values.tolist(), index=mentions.index) #bread down the user mentions
        mentions = mentions.stack().reset_index() #put all the mentions in one column
        mentions = pd.DataFrame( mentions[0].values.tolist(), index=mentions.index)
        mentions_count = pd.DataFrame(mentions['screen_name'].value_counts().reset_index()) #count all the screename values
        mentions_count.rename(columns = {'index':'user_names','screen_name':'count'}, inplace = True) #rename columns
        name = json_file.rsplit('\\', 1)[1] #convert the json name to a name by dropping the path
        name=name[14:-6] #convert the jason name to a variable name by dropping .jsonl
        mentions_count['user']=name #creating a user column to indicate the outgoing node
        return mentions_count
    except:
        print (f"couldn't get entities from {json_file}")
        pass

def get_all_nodes_edges(path):
    import json
    import os
    import glob
    import pprint
    edges_for_nodes = []
    for filename in glob.glob(os.path.join(path, '*.jsonl')): #only process .JSON files in folder.
        edges_for_nodes.append(get_mentions_from_json(f'{filename}'))
    df = pd.concat(edges_for_nodes)
    df.rename(columns = {'count':'weight', 'user_names':'target', 'user':'source'}, inplace = True)
    df.to_csv(f'{path}full_edge_list.csv',  index=False)

def get_friends(screeen_name_list):
    friends_collection = pd.DataFrame()
    for x in screeen_name_list:
        ids = []
        for page in tweepy.Cursor(api.followers_ids, screen_name=x).pages():
            ids.extend(page)

        screen_names = [user.screen_name for user in api.lookup_users(user_ids=ids)]
        screen_names_df = pd.DataFrame(screen_names)
        screen_names_df['name'] = x
        friends_collection = pd.concat(friends_collection,screen_names_df)
    return friends_collection



In [128]:
RT_articles = pd.read_json('data/user_timeline_RT_com.jsonl', lines = True )

In [102]:
def get_urls(tweets_df):
    '''
    The function takes in a tweets dataframe and returns urls that the user tweeted 
    '''
    
    tweets_data = pd.DataFrame(tweets_df['entities'].values.tolist(), index=tweets_df.index)
    tweets_urls = pd.DataFrame(tweets_data['urls'].values.tolist(), index=tweets_urls.index)
    tweets_urls = pd.DataFrame(tweets_urls[0].values.tolist())
    urls = tweets_urls[0].dropna().apply(pd.Series)
    return urls

In [154]:
RT_articles = pd.read_json('data/user_timeline_RT_com.jsonl', lines = True )
RT_articles = RT_articles[RT_articles['retweet_count']>80]
print(len(RT_articles['id']))
tweets_urls = pd.DataFrame(RT_articles['entities'].values.tolist(), index=RT_articles.index)
tweets_urls = pd.DataFrame(tweets_urls['urls'].values.tolist(), index=tweets_urls.index)
tweets_urls = pd.DataFrame(tweets_urls[0].values.tolist(), index=tweets_urls.index)
tweets_urls
#tweets_urls = pd.DataFrame(tweets_urls[0].values.tolist())
#urls = tweets_urls[0].dropna().apply(pd.Series)
tweets_urls.to_csv('RT_urls_with_over_20')

264


In [None]:
def get_urls_over_80(jsonfile_path):   
    RT_articles = pd.read_json('jsonfile_path', lines = True )
    RT_articles = RT_articles[RT_articles['retweet_count']>20]
    print(len(RT_articles['id']))
    tweets_urls = pd.DataFrame(RT_articles['entities'].values.tolist(), index=RT_articles.index)
    tweets_urls = pd.DataFrame(tweets_urls['urls'].values.tolist(), index=tweets_urls.index)
    tweets_urls = pd.DataFrame(tweets_urls[0].values.tolist(), index=tweets_urls.index)
    tweets_urls
    #tweets_urls = pd.DataFrame(tweets_urls[0].values.tolist())
    #urls = tweets_urls[0].dropna().apply(pd.Series)
    tweets_urls.to_csv('sputnik_urls_with_over_20')

In [143]:
get_urls_over_80 ('data/user_timeline_Ruptly.jsonl')
get_urls_over_80 ('data/user_timeline_SputnikInt.jsonl')

357
         0
7     None
14    None
18    None
19    None
72    None
...    ...
3121  None
3126  None
3136  None
3170  None
3174  None

[357 rows x 1 columns]
21
                          url             expanded_url      display_url  \
196   https://t.co/KyJ7g0MFSO  https://sptnkne.ws/Cpfx  sptnkne.ws/Cpfx   
201   https://t.co/GwHNa68NdW  https://sptnkne.ws/CpeJ  sptnkne.ws/CpeJ   
403   https://t.co/95P0rwHJEB  https://sptnkne.ws/CnDm  sptnkne.ws/CnDm   
415   https://t.co/exbLARklZ1  https://sptnkne.ws/CnBt  sptnkne.ws/CnBt   
433   https://t.co/dyPOpJ9abE  https://sptnkne.ws/Cn8H  sptnkne.ws/Cn8H   
651   https://t.co/HzSzZKllMO  https://sptnkne.ws/CmUY  sptnkne.ws/CmUY   
883   https://t.co/TePZRDhjsF  https://sptnkne.ws/Cmsn  sptnkne.ws/Cmsn   
1091  https://t.co/W1sYEglSd8  https://sptnkne.ws/CkFr  sptnkne.ws/CkFr   
1333  https://t.co/qX2t2dDWku   http://sptnkne.ws/CjXt  sptnkne.ws/CjXt   
1376  https://t.co/BK6VM1ixb2  https://sptnkne.ws/CjXt  sptnkne.ws/CjXt   
1469  https:

KeyError: 0

In [74]:
get_all_nodes_edges('output/rt_Sputnik/')

couldn't get entities from output/rt_Sputnik\user_timeline_Marlyn99802970.jsonl


In [67]:
get_tweets_for_retweeters('C:\\Users\\Viktor Avdulov\\Milestone\\output\\sputnik_retweeters.txt', 'C:\\Users\\Viktor Avdulov\\Milestone\\output\\rt_Sputnik\\')

Creation of the directory C:\Users\Viktor Avdulov\Milestone\output\rt_Sputnik\ failed
Empty DataFrame
Columns: [user_id, weight]
Index: []


In [71]:
df = pd.read_csv('C:\\Users\\Viktor Avdulov\\Milestone\\output\\sputnik_retweeters.txt', header = None).T

In [28]:
def get_friends(screeen_name_list):
    friends_collection = pd.DataFrame()
    for x in screeen_name_list:
        ids = []
        for page in tweepy.Cursor(api.followers_ids, screen_name=x).pages():
            ids.extend(page)

        screen_names = [user.screen_name for user in api.lookup_users(user_ids=ids)]
        screen_names_df = pd.DataFrame(screen_names)
        screen_names_df['name'] = x
        friends_collection = pd.concat(friends_collection,screen_names_df)
    return friends_collection



    

In [55]:
friends_of_RT = get_friends(friends_of)

Rate limit reached. Sleeping for: 896
Rate limit reached. Sleeping for: 895
Rate limit reached. Sleeping for: 895
Rate limit reached. Sleeping for: 896


KeyboardInterrupt: 

In [48]:
users_df = pd.read_csv('output/rt_RT/full_RT_edge_list.csv')

In [49]:


degree_2 = pd.DataFrame(users_df['target'].value_counts().reset_index())

In [50]:
degree_2 = degree_2[degree_2['target']>5]

In [53]:
friends_of = degree_2['index'].tolist()
len(friends_of)

3749

### Set a list of entities to discover and screen settings to fully display the dataframe fields

During this step we defined the list of entities that were of interest to us. Specifically the media outlets contorlled by Russia

In [54]:
rus_entities = ['SputnikInt', 'Ruptly', 'RT_com']

### Change view of a dataframe to display full columns and full text

In [78]:
pd.set_option('display.max_columns', 350) #show all columns
#pd.set_option('display.max_colwidth', -1) #show all the text in a column

### Collect tweets for the Russian media channels

In [None]:
get_tweets(rus_entities, 'data/')

### Get all tweet ids for the Russian media channels

In [None]:
ruptly_tweet_ids = get_tweet_id_list('data/user_timeline_Ruptly.jsonl')

In [4]:
sputnik_tweet_ids = get_tweet_id_list('data/user_timeline_SputnikInt.jsonl')

In [5]:
RT_tweet_ids = get_tweet_id_list('data/user_timeline_RT_com.jsonl')

### Get twitter id's of the retweeters and save it to a txt file

In [None]:
sputnik_retweeters = get_retweeters(sputnik_tweet_ids)
with open('output/sputnik_retweeters_all.txt', 'w') as filehandle:
    filehandle.write(str(sputnik_retweeters))
    
RT_retweeters = get_retweeters(RT_tweet_ids)
with open('output/RT_retweeters_all.txt', 'w') as filehandle:
    filehandle.write(str(RT_retweeters))
    
ruptly_retweeters = get_retweeters(ruptly_tweet_ids)
with open('output/ruptly_retweeters_all.txt', 'w') as filehandle:
    filehandle.write(str(ruptly_retweeters))

### Get top retweeters

In [None]:
sputnik_interactions = get_top_interactions ('output/rt_Sputnik/sputnik_retweeters_all.txt')
RT_interactions = get_top_interactions ('output/RT_retweeters_all.txt')
ruptly_interactions = get_top_interactions ('output/ruptly_retweeters_all.txt')

### Convert twitter ids to names

In [None]:
sputnik_names  = get_screen_names(sputnik_interactions)
RT_names = get_screen_names(RT_interactions)
ruptly_names = get_screen_names(ruptly_interactions)

### Get tweets for all retweeters

In [None]:
get_tweets(sputnik_retweeters_set, 'output/sputink_retweeters')
get_tweets(RT_retweeters_set, 'output/RT_retweeters')
get_tweets(ruptly_retweeters_set, 'output/ruptly_retweeters')

### Extract all mentions from the tweets

In [None]:
get_post_freq

In [116]:
Explorador_IT = pd.read_json ('output/rt_ruptly/user_timeline_Explorador_IT.jsonl', lines = True)

In [1]:
sputnik_interactions = get_top_interactions ('output/rt_Sputnik/sputnik_retweeters_all.txt')

NameError: name 'get_top_interactions' is not defined

In [145]:
import json
import os
import glob
import pprint
edges_for_nodes = []
path = 'output/rt_ruptly/'
for filename in glob.glob(os.path.join(path, '*.jsonl')): #only process .JSON files in folder.
    edges_for_nodes.append(get_mentions_from_json(filename))
    

In [None]:
edges_for_nodes_df = edges_for_nodes

In [2]:
edges_for_nodes_df

NameError: name 'edges_for_nodes_df' is not defined

In [3]:
name='output/rt_ruptly/user_timeline_Explorador_IT.jsonl'
name=name[31:-6]
name

'Explorador_IT'

retweeters_sputnik = []
for x in sputnik_tweet_id_list:
    for status in api.retweets(x):
        retweeters_sputnik.append(status.user.id)

In [138]:
rt_sputnik_df = pd.DataFrame(retweeters_sputnik)
rt_sput_df = pd.DataFrame(rt_sputnik_df[0].value_counts())

In [140]:
rt_sput_df.reset_index(inplace = True)

In [142]:
rt_sput_df.rename(columns={0:'count', 'index':'user_id'}, inplace=True)

In [144]:
rt_sput_df.to_csv('spuntic_count.txt', header=None, index=None, sep=' ')