In [1]:
from timeit import default_timer as timer
import itertools
import os
import sys
import uuid
from glob import glob
import json
import tweepy
import numpy as np
import pandas as pd
import multiprocessing as mp
import psutil
import socket
from functools import partial
import time

# Params

In [2]:
cutoff = 500
print('Save Data After Downloading',cutoff,'Timelines')


Save Data After Downloading 500 Timelines


In [3]:
def get_env_var(varname,default):
    
    if os.environ.get(varname) != None:
        var = int(os.environ.get(varname))
        print(varname,':', var)
    else:
        var = default
        print(varname,':', var,'(Default)')
    return var

# Choose Number of Nodes To Distribute Credentials: e.g. jobarray=0-4, cpu_per_task=20, credentials = 90 (<100)
SLURM_JOB_ID            = get_env_var('SLURM_JOB_ID',0)
SLURM_ARRAY_TASK_ID     = get_env_var('SLURM_ARRAY_TASK_ID',0)
SLURM_ARRAY_TASK_COUNT  = get_env_var('SLURM_ARRAY_TASK_COUNT',1)
SLURM_JOB_CPUS_PER_NODE = get_env_var('SLURM_JOB_CPUS_PER_NODE',mp.cpu_count())

SLURM_JOB_ID : 0 (Default)
SLURM_ARRAY_TASK_ID : 0 (Default)
SLURM_ARRAY_TASK_COUNT : 1 (Default)
SLURM_JOB_CPUS_PER_NODE : 8 (Default)


In [4]:
path_to_data='../data'
path_to_users = os.path.join(path_to_data + '/users')
path_to_keys = os.path.join('../keys')
path_to_timelines = os.path.join(path_to_data,'timelines','API')
os.makedirs(path_to_timelines, exist_ok=True)
print(path_to_users)
print(path_to_keys)
print(path_to_timelines)


../data/users
../keys
../data\timelines\API


# Credentials

In [5]:
def get_key_files(SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT,SLURM_JOB_CPUS_PER_NODE):

    # Randomize set of key files using constant seed
    np.random.seed(0)
    all_key_files = np.random.permutation(glob(os.path.join(path_to_keys,'key*')))
    auth_file = np.random.permutation(glob(os.path.join(path_to_keys,'auth*')))
    
    # Split file list by node
    key_files = np.array_split(all_key_files,SLURM_ARRAY_TASK_COUNT)[SLURM_ARRAY_TASK_ID]
    
    # Check that node has more CPU than key file 
    if len(key_files) <= SLURM_JOB_CPUS_PER_NODE:
        print('# Credentials Allocated To Node:', len(key_files)) 
    else:
        print('Check environment variables:')
        print('# Credentials (',len(key_files),') > # CPU (', SLURM_JOB_CPUS_PER_NODE,')')
        print('Only keeping', SLURM_JOB_CPUS_PER_NODE, 'credentials')
        key_files = key_files[:SLURM_JOB_CPUS_PER_NODE]
        
    return key_files, auth_file

key_files, auth_file = get_key_files(SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT,SLURM_JOB_CPUS_PER_NODE)
print('\n'.join(key_files))
print('\n'.join(auth_file))

# Credentials Allocated To Node: 7
../keys\key_youssr.json
../keys\key_jihanne.json
../keys\key_cyril.json
../keys\key_naila.json
../keys\key_clemence.json
../keys\key_noemie.json
../keys\key_naila2.json
../keys\auth_naila.json


In [6]:
def get_auth(key_file):
    
    # Import Auth keys
    for auth_file in glob(os.path.join(path_to_keys,'auth*')) :
        with open (auth_file) as f:
            auth_key = json.load(f)
    
    # Import token pairs
    with open(key_file) as f:
        key = json.load(f)

    # OAuth process, using the keys and tokens
    auth = tweepy.OAuthHandler(auth_key['consumer_key'], auth_key['consumer_secret'])
    auth.set_access_token(key['access_token'], key['access_token_secret'])

    # Creation of the actual interface, using authentication
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
    
    try:
        api.verify_credentials()
        print(key_file,": Authentication checked")
    except:
        print(key_file,": error during authentication")
        sys.exit('Exit')
    
    return auth, api

# Users list

In [7]:
print('Import Users in IDF and Paris')
users_idf = pd.read_json(os.path.join(path_to_users, 'users_idf.json'), lines=True)[0].tolist()
users_paris = pd.read_json(os.path.join(path_to_users, 'users_paris.json'), lines=True)[0].tolist()

Import Users in IDF and Paris


In [8]:
# print('Import Users By Account')
# start = timer()

# l = []
# for filename in sorted(glob(os.path.join(path_to_users,'users*.json'))):
#     try:
#         df = pd.read_json(filename, lines=True) 
#         l.append(df)
#     except:
#         print('error importing', filename)
        
# #users_by_account_location=pd.concat(l, axis=0, ignore_index=True)
# #users_by_account_location=users_by_account_location.set_index('city')['user_id']
# #users_by_account_location=users_by_account_location.apply(eval).apply(lambda x:[str(y) for y in x])
# # print('# Locations:', len(users_by_account_location))
# # print('# Users Total:', users_by_account_location.apply(len).sum())

# end = timer()
# print('Computing Time:', round(end - start), 'sec')

In [9]:
# print('Import Locations')
# account_locations=pd.read_pickle(os.path.join(path_to_locations,'account-locations.pkl')) 
# print('# Locations:', len(account_locations))

In [10]:
start = timer()
print('Select Users...')

# Sorted list of users in selected countries
# users=pd.merge(
# users_by_account_location.reindex(
# account_locations.loc[
# account_locations['country_short'].isin(country_codes),'user_location']).dropna().reset_index(),
# account_locations[['user_location','country_short']]).drop('user_location',1).rename(
# columns={'country_short':'country_code'}).explode('user_id').set_index('user_id')['country_code'].sort_index()
#users=pd.DataFrame(users_by_account_location.set_index('user_id')['country'].sort_index())

# Randomize users
#users=users.sample(frac=1,random_state=0)

# del users_by_account_location
# del account_locations

#print('# Users :', len(users)) 
# print(users.reset_index().groupby('country_code').count())

end = timer()
print('Computing Time:', round(end - start), 'sec')

Select Users...
Computing Time: 0 sec


In [11]:
start = timer()
# print('Split Users Across Nodes...')

# print('First user:', users.index[0])
# users=np.array_split(users,SLURM_ARRAY_TASK_COUNT)[SLURM_ARRAY_TASK_ID]
# print('# Users for this node:', len(users)) 
# print('First user for this node:', users.index[0])

# end = timer()
print('Computing Time:', round(end - start), 'sec')

Computing Time: -1 sec


In [12]:
# country_codes = users_by_account_location['country'].unique()

In [13]:
# start = timer()
# print('Remove users whose timeline were successfully downloaded...')

# def get_success(country_code):
    
#     if not os.path.exists(os.path.join(path_to_timelines, country_code, 'success')):
#         return set()
#     else:
#         success = set()
#         with open(os.path.join(path_to_timelines, country_code, 'success'), 'r', encoding='utf-8') as file:
#             for line in file:
#                 success.add(line.strip('\n').split('\t')[0])
#         return set(success)

# # success=set()
# # for country_code in country_codes:
# #     tmp=get_success(country_code)
# #     print(country_code, ':', len(tmp))
# #     success=success.union(tmp)
# # print('# downloaded timelines:', len(success))

# # users.drop(success,errors='ignore',inplace=True)
# # print('# remaining users for this node:', len(users))

# # Group users by country
# users_by_country=users.reset_index().groupby('country')['user_id'].apply(list).reindex(country_codes)

# end = timer()
# print('Computing Time:', round(end - start), 'sec')

# Get timeline

In [14]:
def get_timeline(user_id,api):
    
    timeline = []
    error = None
    
    # Collect All Statuses in Timeline
    try:
        cursor = tweepy.Cursor(
        api.user_timeline, 
        user_id=user_id, 
        count=3200,
        tweet_mode="extended", 
        include_rts=True).items()
        
        for status in cursor:
            timeline.append(status._json)
     
    except tweepy.error.TweepError as e:
        error = str(e)
        
    return pd.DataFrame(timeline), error

# timeline = get_user_timeline('12',get_auth(key_file))

In [15]:
# timelines=pd.DataFrame()
# downloaded_ids = []
# country_code='France'
# index_key=0
# for user_id in users_block[0:3]:
#         print(str(user_id))
#         # Try Downloading Timeline
#         timeline, error = get_timeline(user_id,api)
        
#         if error!=None:
# #             print(user_id,index_key,error)
#             continue
#         # Append
#         print('append')
#         timelines = pd.concat([timelines, timeline],sort=False)
#         downloaded_ids.append(user_id)
            
#         # Save after <cutoff> timelines or when reaching last user
#         print('save')
#         if len(downloaded_ids) == cutoff or user_id == users_block[2]:
            
#             filename = \
#             'timelines-'+\
#             str(SLURM_JOB_ID)+'-'+\
#             str(SLURM_ARRAY_TASK_ID)+'-'+\
#             str(index_key)+'-'+\
#             str(len(downloaded_ids))+'-'+\
#             output_id+'.json.bz2'
            
#             print('Process', index_key, 'saving', len(downloaded_ids), 'timelines with output file:', 
#             os.path.join(path_to_timelines,'IDF',filename))
            
#             # Save as list of dict discarding index
#             print('save as list')
#             timelines.to_json(
#             os.path.join(path_to_timelines,'IDF',filename),
#             orient='records',
#             #force_ascii=False,
#             date_format=None,
#             double_precision=15)
            
#              # Save User Id and File In Which Its Timeline Was Saved
# #             print('save user_id')
# #             with open(os.path.join(path_to_timelines,'IDF','success'), 'a', encoding='utf-8') as file:
# #                 for downloaded_id in downloaded_ids:
# #                     file.write(downloaded_id+'\t'+filename+'\n')
            
#             # Reset Output File ID, Data, and Downloaded Users
#             print('reset')
#             del timelines, downloaded_ids
#             output_id = str(uuid.uuid4())
#             timelines = pd.DataFrame()
#             downloaded_ids = []

In [16]:
# def download_timelines(index_key,users_list):

#     # Create Access For Block of Users
#     api = get_auth(key_files[index_key])
    
#     # Select Block of Users
#     users_block = np.array_split(users_idf,len(key_files))[0]
    
#     # Initialize Output File ID
#     output_id = str(uuid.uuid4())
    
#     # Initialize DataFrame
#     timelines = pd.DataFrame()
    
#     # Initialize Downloaded User List
#     downloaded_ids = []
    
#     for user_id in users_block[0:3]:
#         # Try Downloading Timeline
#         timeline, error = get_timeline(user_id,api)
        
#         if error!=None:
# #             print(user_id,index_key,error)
#             continue
#         # Append
#         print('append')
#         timelines = pd.concat([timelines, timeline],sort=False)
#         downloaded_ids.append(user_id)
            
#         # Save after <cutoff> timelines or when reaching last user
#         print('save')
#         if len(downloaded_ids) == cutoff or user_id == users_block[2]:
            
#             filename = \
#             'timelines-'+\
#             str(SLURM_JOB_ID)+'-'+\
#             str(SLURM_ARRAY_TASK_ID)+'-'+\
#             str(index_key)+'-'+\
#             str(len(downloaded_ids))+'-'+\
#             output_id+'.json.bz2'
            
#             print('Process', index_key, 'saving', len(downloaded_ids), 'timelines with output file:', 
#             os.path.join(path_to_timelines,'IDF',filename))
            
#             # Save as list of dict discarding index
#             print('save as list')
#             timelines.to_json(
#             os.path.join(path_to_timelines,'IDF',filename),
#             orient='records',
#             #force_ascii=False,
#             date_format=None,
#             double_precision=15)
            
#              # Save User Id and File In Which Its Timeline Was Saved
# #             print('save user_id')
# #             with open(os.path.join(path_to_timelines,'IDF','success'), 'a', encoding='utf-8') as file:
# #                 for downloaded_id in downloaded_ids:
# #                     file.write(downloaded_id+'\t'+filename+'\n')
            
#             # Reset Output File ID, Data, and Downloaded Users
#             print('reset')
#             del timelines, downloaded_ids
#             output_id = str(uuid.uuid4())
#             timelines = pd.DataFrame()
#             downloaded_ids = []
#     return 0

In [17]:
import functions_api

SLURM_JOB_ID : 0 (Default)
SLURM_ARRAY_TASK_ID : 0 (Default)
SLURM_ARRAY_TASK_COUNT : 1 (Default)
SLURM_JOB_CPUS_PER_NODE : 8 (Default)
# Credentials Allocated To Node: 7


In [None]:
print('Extract Timelines...\n')
with mp.Pool() as pool:
    pool.map(partial(functions_api.download_timelines, users_list=users_idf), range(len(key_files)))


Extract Timelines...

