In [1]:
from timeit import default_timer as timer
import itertools
import os
import sys
import uuid
from glob import glob
import json
import tweepy
import numpy as np
import pandas as pd
import multiprocessing as mp
import psutil
import socket
from functools import partial
import time

# Params

In [2]:
cutoff = 100
print('Save Data After Downloading',cutoff,'Timelines')

Save Data After Downloading 100 Timelines


In [3]:
def get_env_var(varname,default):
    
    if os.environ.get(varname) != None:
        var = int(os.environ.get(varname))
        print(varname,':', var)
    else:
        var = default
        print(varname,':', var,'(Default)')
    return var

# Choose Number of Nodes To Distribute Credentials: e.g. jobarray=0-4, cpu_per_task=20, credentials = 90 (<100)
SLURM_JOB_ID            = get_env_var('SLURM_JOB_ID',0)
SLURM_ARRAY_TASK_ID     = get_env_var('SLURM_ARRAY_TASK_ID',0)
SLURM_ARRAY_TASK_COUNT  = get_env_var('SLURM_ARRAY_TASK_COUNT',1)
SLURM_JOB_CPUS_PER_NODE = get_env_var('SLURM_JOB_CPUS_PER_NODE',mp.cpu_count())

SLURM_JOB_ID : 0 (Default)
SLURM_ARRAY_TASK_ID : 0 (Default)
SLURM_ARRAY_TASK_COUNT : 1 (Default)
SLURM_JOB_CPUS_PER_NODE : 16 (Default)


In [4]:
path_to_data='../data'
path_to_users = os.path.join(path_to_data + '/users')
path_to_keys = os.path.join('../keys')
path_to_timelines = os.path.join(path_to_data,'timelines','API')
os.makedirs(path_to_timelines, exist_ok=True)
print(path_to_users)
print(path_to_keys)
print(path_to_timelines)


../data/users
../keys
../data/timelines/API


# Credentials

In [5]:
def get_key_files(SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT,SLURM_JOB_CPUS_PER_NODE):

    # Randomize set of key files using constant seed
    np.random.seed(0)
    all_key_files = np.random.permutation(glob(os.path.join(path_to_keys,'key*')))
    auth_file = np.random.permutation(glob(os.path.join(path_to_keys,'auth*')))
    
    # Split file list by node
    key_files = np.array_split(all_key_files,SLURM_ARRAY_TASK_COUNT)[SLURM_ARRAY_TASK_ID]
    
    # Check that node has more CPU than key file 
    if len(key_files) <= SLURM_JOB_CPUS_PER_NODE:
        print('# Credentials Allocated To Node:', len(key_files)) 
    else:
        print('Check environment variables:')
        print('# Credentials (',len(key_files),') > # CPU (', SLURM_JOB_CPUS_PER_NODE,')')
        print('Only keeping', SLURM_JOB_CPUS_PER_NODE, 'credentials')
        key_files = key_files[:SLURM_JOB_CPUS_PER_NODE]
        
    return key_files, auth_file

key_files, auth_file = get_key_files(SLURM_ARRAY_TASK_ID,SLURM_ARRAY_TASK_COUNT,SLURM_JOB_CPUS_PER_NODE)
print('\n'.join(key_files))
print('\n'.join(auth_file))

# Credentials Allocated To Node: 10
../keys/key_jihanne.json
../keys/key_othmane.json
../keys/key_naila.json
../keys/key_sam.json
../keys/key_cyril.json
../keys/key_noemie.json
../keys/key_youssr.json
../keys/key_marc.json
../keys/key_clemence.json
../keys/key_naila2.json
../keys/auth_naila.json


In [6]:
def get_auth(key_file):
    
    # Import Auth keys
    for auth_file in glob(os.path.join(path_to_keys,'auth*')) :
        with open (auth_file) as f:
            auth_key = json.load(f)
    
    # Import token pairs
    with open(key_file) as f:
        key = json.load(f)

    # OAuth process, using the keys and tokens
    auth = tweepy.OAuthHandler(auth_key['consumer_key'], auth_key['consumer_secret'])
    auth.set_access_token(key['access_token'], key['access_token_secret'])

    # Creation of the actual interface, using authentication
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
    
    try:
        api.verify_credentials()
        print(key_file,": Authentication checked")
    except:
        print(key_file,": error during authentication")
        sys.exit('Exit')
    
    return auth, api

# Users list

In [7]:
print('Import Users in IDF and Paris')
users_idf = pd.read_json(os.path.join(path_to_users, 'users_idf.json'), lines=True)[0].tolist()
users_paris = pd.read_json(os.path.join(path_to_users, 'users_paris.json'), lines=True)[0].tolist()

Import Users in IDF and Paris


In [34]:
print("Users for which we don't have the timeline")
with open(os.path.join(path_to_timelines,'IDF','success','downloaded_ids.txt'), 'r', encoding='utf-8') as file:
    users_idf_downloaded = file.read().splitlines()
users_idf_to_download = [item for item in users_idf if item not in users_idf_downloaded]

Users for which we don't have the timeline


# Get timeline

In [8]:
def get_timeline(user_id,api):
    
    timeline = []
    error = None
    
    # Collect All Statuses in Timeline
    try:
        cursor = tweepy.Cursor(
        api.user_timeline, 
        user_id=user_id, 
        count=3200,
        tweet_mode="extended", 
        include_rts=True, 
        sleep_on_rate_limit=False).items()
        
        for status in cursor:
            timeline.append(status._json)
     
    except tweepy.error.TweepError as e:
        error = str(e)
    timeline = pd.DataFrame(timeline, columns = ['id_str','user','full_text','created_at','lang'])
    timeline['user_id'] = timeline['user'].apply(lambda x: x.get('id_str'))
    timeline['user_name'] = timeline['user'].apply(lambda x: x.get('name'))
    return timeline[['id_str','user_id','user_name','full_text','created_at','lang']], error

# timeline = get_user_timeline('12',get_auth(key_file))

In [38]:
cutoff=100

def download_timelines(index_key,users_list):

    # Create Access For Block of Users
    api = get_auth(key_files[index_key])[1]
    
    # Select Block of Users
    users_block = np.array_split(users_list,len(key_files))[index_key]
    
    # Initialize Output File ID
    output_id = str(uuid.uuid4())
    
    # Initialize DataFrame
    timelines = pd.DataFrame()
    
    # Initialize Downloaded User List
    downloaded_ids = []
    
    for user_id in users_block:
        #print(str(user_id))
        # Try Downloading Timeline
        timeline, error = get_timeline(user_id,api)
        
        if error!=None:
            print(user_id,index_key,error)
            continue
        # Append
        timelines = pd.concat([timelines, timeline],sort=False)
        downloaded_ids.append(user_id)

        #print('appended')

        # Save after <cutoff> timelines or when reaching last user
        if len(downloaded_ids) == cutoff or user_id == users_block[-1]:
            print('save')

            filename = \
            'timelines-'+\
            str(SLURM_JOB_ID)+'-'+\
            str(SLURM_ARRAY_TASK_ID)+'-'+\
            str(index_key)+'-'+\
            str(len(downloaded_ids))+'-'+\
            output_id+'.json'
            
            print('Process', index_key, 'saving', len(downloaded_ids), 'timelines with output file:', 
            os.path.join(path_to_timelines,'IDF',filename))
            
            # Save as list of dict discarding index
            print('save as list')
            timelines.to_json(
            os.path.join(path_to_timelines,'IDF',filename),
            orient='records',
            #force_ascii=False,
            date_format=None,
            double_precision=15)
            
            # Save User Id and File In Which Its Timeline Was Saved
            print('save user_id')
            with open(os.path.join(path_to_timelines,'IDF','success','downloaded_ids.txt'), 'a', encoding='utf-8') as file:
                for downloaded_id in downloaded_ids:
                    file.write(str(downloaded_id)+'\n')
            print('user_id_saved')
            
            # Reset Output File ID, Data, and Downloaded Users
            print('reset')
            del timelines, downloaded_ids
            output_id = str(uuid.uuid4())
            timelines = pd.DataFrame()
            downloaded_ids = []
    return 0

In [None]:
print('Extract Timelines...\n')
with mp.Pool() as pool:
    #pool.map(partial(download_timelines, users_list=users_idf), range(len(key_files)))
    pool.map(partial(download_timelines, users_list=users_idf_to_download), range(len(key_files)))

Extract Timelines...

../keys/key_naila.json : Authentication checked
../keys/key_cyril.json : Authentication checked
../keys/key_jihanne.json : Authentication checked
../keys/key_othmane.json : Authentication checked
../keys/key_sam.json : Authentication checked
../keys/key_noemie.json : Authentication checked
../keys/key_youssr.json : Authentication checked
../keys/key_clemence.json : Authentication checked
../keys/key_marc.json : Authentication checked
../keys/key_naila2.json : Authentication checked
464903859 4 Twitter error response: status code = 401
1057794546 0 Twitter error response: status code = 401
2223133986 6 Twitter error response: status code = 401
143728976 3 Twitter error response: status code = 401
1239295644336115713 0 Twitter error response: status code = 401
739568210 7 Twitter error response: status code = 401


Rate limit reached. Sleeping for: 520
Rate limit reached. Sleeping for: 518


1251619861656109058 6 Twitter error response: status code = 401


Rate limit reached. Sleeping for: 497
Rate limit reached. Sleeping for: 496
Rate limit reached. Sleeping for: 482
Rate limit reached. Sleeping for: 481
Rate limit reached. Sleeping for: 478
Rate limit reached. Sleeping for: 473
Rate limit reached. Sleeping for: 469
Rate limit reached. Sleeping for: 464


28349211 2 Twitter error response: status code = 401
988407190057897984 8 Twitter error response: status code = 503
1078627279696265216 8 Twitter error response: status code = 401
1187734345580199937 0 Twitter error response: status code = 401
340422281 7 Twitter error response: status code = 401
1150518223273697280 9 Twitter error response: status code = 401
1248643806922244100 2 Twitter error response: status code = 401
1119976115672633344 8 Twitter error response: status code = 401
154603854 1 Twitter error response: status code = 401
save
Process 5 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-5-100-fc38e4fc-f2d7-4413-afb5-11f307ce04e6.json
save as list
save user_id
user_id_saved
reset
107728519 6 Twitter error response: status code = 401
1247616994792308741 5 Twitter error response: status code = 401
save
Process 6 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-6-100-0a0f76d0-95d8-4358-881f-4ba52ec9533e.json
save as 

Rate limit reached. Sleeping for: 312


save user_id
user_id_saved
reset


Rate limit reached. Sleeping for: 320
Rate limit reached. Sleeping for: 306
Rate limit reached. Sleeping for: 300
Rate limit reached. Sleeping for: 288
Rate limit reached. Sleeping for: 285
Rate limit reached. Sleeping for: 262
Rate limit reached. Sleeping for: 280
Rate limit reached. Sleeping for: 250
Rate limit reached. Sleeping for: 247


348149269 6 Twitter error response: status code = 401
save
Process 5 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-5-100-78c9d18f-7762-47d1-8a79-6ac1610e4fe4.json
save as list
save user_id
user_id_saved
reset
1187965343672741889 8 Twitter error response: status code = 401
1036025028352126978 1 Twitter error response: status code = 401
1229489877877710848 7 Twitter error response: status code = 401
save
Process 6 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-6-100-dae031d5-61b9-4491-b75e-1f4e28b9da42.json
save as list
save user_id
user_id_saved
reset
1235350358727237632 7 Twitter error response: status code = 401
save
Process 9 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-9-100-1087afb7-e519-4d59-8faf-9607efd11c5a.json
save as list
save user_id
user_id_saved
reset
1236303463124008964 7 Twitter error response: status code = 404
1081252262431899650 8 Twitter error response: status code = 40

Rate limit reached. Sleeping for: 360
Rate limit reached. Sleeping for: 368
Rate limit reached. Sleeping for: 360
Rate limit reached. Sleeping for: 359
Rate limit reached. Sleeping for: 348


2609141204 1 Twitter error response: status code = 401


Rate limit reached. Sleeping for: 332
Rate limit reached. Sleeping for: 322
Rate limit reached. Sleeping for: 333
Rate limit reached. Sleeping for: 334
Rate limit reached. Sleeping for: 307


2322240294 5 Twitter error response: status code = 401
save
Process 2 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-2-100-7b7ec00a-c408-4858-bc44-4690c02686f5.json
save as list
save user_id
user_id_saved
reset
515649425 7 Twitter error response: status code = 404
2580378615 4 Twitter error response: status code = 404
274448693 9 Twitter error response: status code = 401
269916766 4 Twitter error response: status code = 401
1163124996232945665 5 Twitter error response: status code = 401
save
Process 5 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-5-100-9b9cedc1-6870-4675-9063-94bb39e619d2.json
save as list
save user_id
user_id_saved
reset
save
Process 8 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-8-100-40bb252d-52c2-40f7-ae68-5a632e302fd1.json
save as list
save user_id
user_id_saved
reset
1305547003 5 Twitter error response: status code = 401
save
Process 6 saving 100 timelines with outp

Rate limit reached. Sleeping for: 365
Rate limit reached. Sleeping for: 347


1222876965503410177 0 Twitter error response: status code = 401


Rate limit reached. Sleeping for: 345


1237708791099076608 6 Twitter error response: status code = 401


Rate limit reached. Sleeping for: 334
Rate limit reached. Sleeping for: 328
Rate limit reached. Sleeping for: 323
Rate limit reached. Sleeping for: 341
Rate limit reached. Sleeping for: 312
Rate limit reached. Sleeping for: 311
Rate limit reached. Sleeping for: 282


2532997170 6 Twitter error response: status code = 401
1213981371414138880 8 Twitter error response: status code = 401
1051547735889973248 2 Twitter error response: status code = 401
save
Process 1 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-1-100-95e78749-b681-4590-aedd-b89d79ef8bcf.json
save as list
save user_id
user_id_saved
reset
1221447443444838401 8 Twitter error response: status code = 401
2394985801 6 Twitter error response: status code = 401
1242814446021627904 8 Twitter error response: status code = 401
1185518208511553536 7 Twitter error response: status code = 401
save
Process 6 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-6-100-7039832d-93ab-499a-aa1f-bd1ea960a046.json
save as list
save user_id
user_id_saved
reset
save
Process 5 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-5-100-a1c4d198-c0e4-49f0-b317-65a884ab36ae.json
save as list
save user_id
user_id_saved
reset
714555

Rate limit reached. Sleeping for: 358
Rate limit reached. Sleeping for: 364


54059293 8 Twitter error response: status code = 401


Rate limit reached. Sleeping for: 348
Rate limit reached. Sleeping for: 349
Rate limit reached. Sleeping for: 337
Rate limit reached. Sleeping for: 327
Rate limit reached. Sleeping for: 300
Rate limit reached. Sleeping for: 320


244863241 3 Twitter error response: status code = 401


Rate limit reached. Sleeping for: 286
Rate limit reached. Sleeping for: 291


save
Process 7 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-7-100-bc31de0a-8d82-4d29-b9f2-2a43c701e500.json
save as list
save user_id
user_id_saved
reset
save
Process 4 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-4-100-b50c6710-4ed3-4d67-aece-03c67f7b883f.json
save as list
save user_id
user_id_saved
reset
290981389 0 Twitter error response: status code = 401
1243551672024080384 8 Twitter error response: status code = 401
save
Process 3 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-3-100-a73d0f99-583c-4327-a72e-0b9e0b990545.json
save as list
save user_id
user_id_saved
reset
1237260994684096512 3 Twitter error response: status code = 401
770310099940286464 7 Twitter error response: status code = 401
save
Process 0 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-0-100-9a77a5c3-15a4-43a6-b767-625bbe2d4b0a.json
save as list
save user_id
user_id_saved
reset
101

Rate limit reached. Sleeping for: 352
Rate limit reached. Sleeping for: 350
Rate limit reached. Sleeping for: 339
Rate limit reached. Sleeping for: 339
Rate limit reached. Sleeping for: 330
Rate limit reached. Sleeping for: 307
Rate limit reached. Sleeping for: 326
Rate limit reached. Sleeping for: 308
Rate limit reached. Sleeping for: 304
Rate limit reached. Sleeping for: 282


1205635771572670465 0 Twitter error response: status code = 401
save
Process 2 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-2-100-47f45178-a4ee-4a28-8c14-edcd20194b99.json
save as list
save user_id
user_id_saved
reset
1545530472 9 Twitter error response: status code = 401
1120042115382353920 8 Twitter error response: status code = 401
315398241 5 Twitter error response: status code = 401
185074648 9 Twitter error response: status code = 401
579668208 6 Twitter error response: status code = 401
save
Process 7 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-7-100-6d46f219-3912-4001-9324-f486b75abb6d.json
save as list
save user_id
user_id_saved
reset
1241883229524615168 7 Twitter error response: status code = 401
410327685 9 Twitter error response: status code = 401
save
Process 4 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-4-100-3c02980b-9a7a-431e-b0a7-84368f17d926.json
save as list
save u

Rate limit reached. Sleeping for: 355
Rate limit reached. Sleeping for: 359
Rate limit reached. Sleeping for: 347
Rate limit reached. Sleeping for: 342
Rate limit reached. Sleeping for: 339
Rate limit reached. Sleeping for: 333
Rate limit reached. Sleeping for: 344
Rate limit reached. Sleeping for: 302
Rate limit reached. Sleeping for: 303
Rate limit reached. Sleeping for: 306


14640579 7 Twitter error response: status code = 401
1196575979478511616 7 Twitter error response: status code = 404
save
Process 0 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-0-100-a68e2ca9-eef6-4f4b-98b2-3d98ee678480.json
save as list
save user_id
user_id_saved
reset
708382559403712514 0 Twitter error response: status code = 401
1026650140147437568 4 Twitter error response: status code = 401
save
Process 9 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-9-100-b398fbdd-5599-4cfd-84dc-7ea0d76c2b97.json
save as list
save user_id
user_id_saved
reset
save
Process 5 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-5-100-6cbb46a0-5bd2-4fa5-b9ee-148a8c88b8dc.json
save as list
save user_id
user_id_saved
reset
2839887351 0 Twitter error response: status code = 401
1220687206181851136 8 Twitter error response: status code = 401
1205865577522384902 9 Twitter error response: status code = 401
798963997

Rate limit reached. Sleeping for: 370
Rate limit reached. Sleeping for: 379
Rate limit reached. Sleeping for: 376
Rate limit reached. Sleeping for: 369
Rate limit reached. Sleeping for: 367
Rate limit reached. Sleeping for: 328
Rate limit reached. Sleeping for: 332


926603249943302144 0 Twitter error response: status code = 401


Rate limit reached. Sleeping for: 326
Rate limit reached. Sleeping for: 332
Rate limit reached. Sleeping for: 325


286082576 7 Twitter error response: status code = 401
save
Process 8 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-8-100-72d2d638-9334-4b95-934e-07734d32bb19.json
save as list
save user_id
user_id_saved
reset
762879373904973824 9 Twitter error response: status code = 401
save
Process 9 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-9-100-2ce78a1e-71a6-4d10-a50b-5edd4ea4477d.json
save as list
save user_id
user_id_saved
reset
79671535 8 Twitter error response: status code = 401
4872874330 7 Twitter error response: status code = 401
save
Process 5 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-5-100-0b73d5b2-f0c6-4218-a110-1bba36c57f01.json
save as list
save user_id
user_id_saved
reset
129287832 7 Twitter error response: status code = 401
4775693663 9 Twitter error response: status code = 401
save
Process 7 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-7-100-31

Rate limit reached. Sleeping for: 424


976418336132214785 0 Twitter error response: status code = 401


Rate limit reached. Sleeping for: 404


save
Process 3 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-3-100-b9442484-2d8e-4cec-b69f-7fb55367510a.json
save as list
save user_id
user_id_saved
reset


Rate limit reached. Sleeping for: 384
Rate limit reached. Sleeping for: 384
Rate limit reached. Sleeping for: 380
Rate limit reached. Sleeping for: 379
Rate limit reached. Sleeping for: 358
Rate limit reached. Sleeping for: 364
Rate limit reached. Sleeping for: 376
Rate limit reached. Sleeping for: 350


1213416215713968128 7 Twitter error response: status code = 401
1209597298726060032 7 Twitter error response: status code = 401
582976740 9 Twitter error response: status code = 401
618340609 4 Twitter error response: status code = 401
save
Process 8 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-8-100-ea3daf59-b613-4621-9fbb-f812f0018ebf.json
save as list
save user_id
user_id_saved
reset
1244579848594612232 9 Twitter error response: status code = 401
1250577438851190784 4 Twitter error response: status code = 401
save
Process 9 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-9-100-f596100c-c51b-480d-834c-e911085bc242.json
save as list
save user_id
user_id_saved
reset
1008171770132787200 4 Twitter error response: status code = 401
1126220377435398144 2 Twitter error response: status code = 401
save
Process 5 saving 100 timelines with output file: ../data/timelines/API/IDF/timelines-0-0-5-100-2aea6d29-5a3d-4d8c-98cd-408853fb