In [1]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd

import collections
import re
import pprint as pp
import numpy as np
import collections

import multiprocessing as mp
from multiprocessing.pool import ThreadPool

import gzip
import pickle as pkl

import math
import gzip
import pickle as pkl
from datetime import datetime
import matplotlib.pyplot as plt

import fonctions
import metric_loading

import itertools
from tqdm.notebook import tqdm

from os import listdir
from os.path import isfile, join

import random
random.seed(0)


Using TensorFlow backend.


In [2]:
def trend_processing(x):
    """Split a raw hashtag field into a list of hashtags.

    A value whose type is exactly ``str`` is split on runs of one or more tab
    characters. Any other value (for instance the NaN pandas uses for an
    empty field) yields a float NaN.
    """
    return re.split(r'\t+', x) if type(x) is str else float('Nan')

def get_trends(directory, doc_name, init):
    """Load one batch file and keep only the hashtag, hour and label columns.

    The file is read as a header-less CSV whose fields are separated by the
    0x01 control character.

    Parameters
    ----------
    directory : str
        Prefix concatenated verbatim with ``doc_name`` (no separator is
        inserted, so it must end with a path separator).
    doc_name : str
        Name of the batch file.
    init : bool
        When truthy, the file is expected to hold the four engagement
        timestamp columns after the twenty feature columns; they are turned
        into 0/1 flags (1 = a timestamp is present). When falsy, the file is
        expected to hold only the twenty feature columns.

    Returns
    -------
    pandas.DataFrame
        The engagement flag columns (only when ``init`` is truthy), followed
        by ``hashtags`` (list of strings, or NaN when the field was not a
        string) and ``tweet_timestamp`` (string ``'YYYY-MM-DD HH'``, UTC).
    """
    all_features = ["text_tokens", "hashtags", "tweet_id",
                    "present_media", "present_links",
                    "present_domains", "tweet_type", "language",
                    "tweet_timestamp", "engaged_with_user_id",
                    "engaged_with_user_follower_count", "engaged_with_user_following_count",
                    "engaged_with_user_is_verified", "engaged_with_user_account_creation",
                    "engaging_user_id", "engaging_user_follower_count",
                    "engaging_user_following_count", "engaging_user_is_verified",
                    "engaging_user_account_creation", "engagee_follows_engager"]
    engagement_labels = ['reply_timestamp', 'retweet_timestamp',
                         'retweet_with_comment_timestamp', 'like_timestamp']

    df = pd.read_csv(directory+doc_name, encoding="utf-8", sep='\x01', header=None)

    if init:
        labels = engagement_labels
        df.columns = all_features + labels
        # A missing timestamp means "no engagement": binarise to 0/1.
        for label in labels:
            df[label] = df[label].notna().astype('int64')
    else:
        # No label columns in this layout. `labels` must still be bound,
        # otherwise the column selection below raises NameError.
        labels = []
        df.columns = all_features

    df['hashtags'] = [trend_processing(x) for x in df['hashtags']]
    # Keep the first 13 characters of 'YYYY-MM-DD HH:MM:SS', i.e. hour resolution.
    df['tweet_timestamp'] = [str(datetime.utcfromtimestamp(int(date)))[0:13] for date in df['tweet_timestamp']]

    df = df.filter(labels + ['hashtags', 'tweet_timestamp'], axis=1)

    return df


def _merge_hashtag_counts(target, source):
    """Fold the per-hashtag counters of ``source`` into ``target`` in place.

    Hashtags already present in ``target`` get their counters summed
    element-wise through ``update_agg``; hashtags not yet present are inserted
    with the counter list taken from ``source``.
    """
    for tag, counts in source.items():
        if tag in target:
            update_agg(target, tag, counts)
        else:
            target[tag] = counts


def hashtags_on_chunk(directory, chunk, chunk_id, init,
                      output_dir='/home/maxime/Desktop/RecSys2020/trends/'):
    """Count engagements per hashtag over a chunk of batch files and pickle the result.

    For every hashtag seen in the chunk a four-slot counter is maintained, in
    the order ``[like, retweet, retweet_with_comment, reply]``. Each slot holds
    the number of hashtag occurrences in rows whose corresponding engagement
    flag equals 1.

    Parameters
    ----------
    directory : str
        Directory prefix handed to ``get_trends``.
    chunk : iterable of str
        Batch file names to process.
    chunk_id : int
        Identifier inserted in the output file name.
    init : bool
        Selects the output file name: ``hashtag_ratio_<chunk_id>.pkl.gz`` when
        truthy, ``update_hashtag_ratio_<chunk_id>.pkl.gz`` otherwise. Batch
        files are always read with their label columns.
    output_dir : str, optional
        Directory receiving the gzip-compressed pickle. Defaults to the
        location that was previously hard-coded.

    Returns
    -------
    bool
        Always ``True``.
    """
    hashtag_ratio = {}
    buff_hashtag_ratio = {}
    engagements = ['like_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'reply_timestamp']
    # Iteration numbers at which the buffer is folded into the main dictionary
    # and reset; whatever is left in the buffer is merged after the loop.
    flush_iterations = {4, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550}

    for iteration, batch_file in enumerate(chunk, start=1):

        df = get_trends(directory, batch_file, True)
        df = df[df['hashtags'].notna()]
        hashtags = np.unique(list(itertools.chain.from_iterable(df.hashtags)))

        # Make sure every hashtag of this batch has a counter, even if it
        # never co-occurs with an engagement.
        for tag in hashtags:
            buff_hashtag_ratio.setdefault(tag, [0, 0, 0, 0])

        for idx, engagement in enumerate(engagements):
            engaged_rows = df[df[engagement] == 1]
            occurrences = collections.Counter(itertools.chain.from_iterable(engaged_rows.hashtags))
            for tag, count in occurrences.items():
                update_eng(buff_hashtag_ratio, tag, count, idx)

        if iteration in flush_iterations:
            _merge_hashtag_counts(hashtag_ratio, buff_hashtag_ratio)
            buff_hashtag_ratio = {}
            print(len(hashtag_ratio))

        print(iteration)

    _merge_hashtag_counts(hashtag_ratio, buff_hashtag_ratio)

    print('cutting and saving...')
    # Pruning of rarely engaged hashtags is currently disabled:
    #hashtag_ratio = { k:v for k,v in hashtag_ratio.items() if sum(v)>1 }

    file_pattern = 'hashtag_ratio_{}.pkl.gz' if init else 'update_hashtag_ratio_{}.pkl.gz'
    with gzip.open(join(output_dir, file_pattern.format(chunk_id)), 'wb') as f:
        pkl.dump(hashtag_ratio, f)

    return True

def update_eng(hashtag_ratio, key,v, idx):
    """Add ``v`` to slot ``idx`` of the counter list stored under ``key``.

    The list held in ``hashtag_ratio[key]`` is modified in place; nothing is
    returned.
    """
    counters = hashtag_ratio[key]
    counters[idx] = counters[idx] + v

def update_agg(hashtag_ratio, k, v):
    """Replace ``hashtag_ratio[k]`` by its element-wise sum with ``v``.

    A new list is stored under ``k``; its length is that of the shorter of
    the two operands. Nothing is returned.
    """
    summed = []
    for current, incoming in zip(hashtag_ratio[k], v):
        summed.append(current + incoming)
    hashtag_ratio[k] = summed
    


In [3]:

batch_path='/home/maxime/Desktop/RecSys2020/data/batches'
# listdir() returns entries in arbitrary order; sorting makes the assignment
# of batch files to chunks reproducible from one run to the next.
batch_list = sorted(f for f in listdir(batch_path) if isfile(join(batch_path, f)))
# presumably splits batch_list into 8 sub-lists, one per worker — TODO confirm
# against fonctions.chunkIt
chunks = fonctions.chunkIt(batch_list, 8)
directory = '/home/maxime/Desktop/RecSys2020/data/batches/'

if __name__ == '__main__':
    # One worker process per chunk; each worker writes its own pickle file.
    processes = [ mp.Process(target=hashtags_on_chunk, args=(directory, chunk, chunk_id, True) ) for chunk_id, chunk in enumerate(chunks) ]

    # Run processes
    for p in processes:
        p.start()

    # Wait for every worker to finish
    for p in processes:
        p.join()

    # A worker that died leaves its output file missing or stale; fail loudly
    # instead of letting later cells aggregate incomplete data.
    failed_workers = [ (p.name, p.exitcode) for p in processes if p.exitcode != 0 ]
    if failed_workers:
        raise RuntimeError('hashtag workers exited abnormally: {}'.format(failed_workers))

1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
27564
4
27953
4
27569
4
27740
27938
4
4
27338
4
27903
4
27305
4
5
5
5
5
5
5
5
5
6
6
6
6
6
6
6
6
7
7
7
7
7
7
7
7
8
8
8
8
8
8
8
8
9
9
9
9
9
9
9
9
10
10
10
10
10
10
10
10
11
11
11
11
11
11
11
11
12
12
12
12
12
12
12
12
13
13
13
13
13
13
13
13
14
14
14
14
14
14
14
14
15
15
15
15
15
15
15
15
16
16
16
16
16
16
16
16
17
17
17
17
17
17
17
17
18
18
18
18
18
18
18
18
19
19
19
19
19
19
19
19
20
20
20
20
20
20
20
20
21
21
21
21
21
21
21
21
22
22
22
22
22
22
22
22
23
23
23
23
23
23
23
23
24
24
24
24
24
24
24
24
25
25
25
25
25
25
25
25
26
26
26
26
26
26
26
26
27
27
27
27
27
27
27
27
28
28
28
28
28
28
28
28
29
29
29
29
29
29
29
30
29
30
30
30
30
30
30
31
31
31
31
30
31
31
31
32
31
32
32
32
32
32
32
32
33
33
33
33
33
33
33
33
34
34
34
34
34
34
34
34
35
35
35
35
35
35
35
35
36
36
36
36
36
36
36
36
37
37
37
37
37
37
37
38
38
37
38
38
38
38
38
38
39
39
39
39
39
39
40
39
39
40
40
40
40
40
40
41
40
41
41
41
41
41
42
41
42
41
42
42
42
42
43
42
42
43
43
43
43
4

272
273
275
273
274
274
275
275
273
274
274
276
275
275
276
274
275
276
275
277
276
277
276
275
276
277
276
277
278
277
278
277
276
278
278
277
278
279
278
277
279
279
278
279
279
279
280
278
280
280
279
280
280
281
280
279
281
281
280
281
281
282
281
282
280
282
281
282
282
283
282
283
281
283
282
283
283
284
282
283
284
284
283
284
284
285
283
284
285
285
284
285
285
286
285
284
286
286
285
286
286
287
286
285
287
287
287
287
286
288
286
287
288
288
288
288
287
289
287
288
289
289
289
288
289
290
288
290
289
290
290
289
290
291
289
291
290
291
291
291
290
292
291
290
292
292
292
291
292
293
293
293
292
291
293
292
293
294
292
294
293
294
295
293
295
293
294
294
294
295
294
295
296
294
296
295
295
296
296
295
296
296
297
297
295
297
297
297
296
298
297
296
298
298
298
297
299
299
298
298
297
299
299
298
299
299
298
574587
299
300
299
574308
300
574484
300
576549
300
575215
300
576566
300
301
301
301
301
574672
300
576928
300
301
302
301
302
302
302
301
302
301
302
303
303
303
303
302


In [4]:
engagements = ['like_timestamp','retweet_timestamp','retweet_with_comment_timestamp','reply_timestamp']
global_hashtags_ratio = {}
global_hashtags_presence = {}

# Merge the eight per-chunk dictionaries into global ones. Keys already known
# are combined with update_agg (element-wise sum); new keys are inserted as is.
# The pickle files are loaded without validation: only read files produced by
# trusted code.
for chunk_id in range(8):

    print(chunk_id)

    with gzip.open('/home/maxime/Desktop/RecSys2020/trends/hashtag_ratio_{}.pkl.gz'.format(chunk_id), 'rb') as f:
        hashtags_ratio = pkl.load(f)

    for k, v in hashtags_ratio.items():
        if k in global_hashtags_ratio:
            update_agg(global_hashtags_ratio, k, v)
        else:
            global_hashtags_ratio[k] = v

    # NOTE(review): the hashtag_presence_*.pkl.gz files are not written by any
    # code visible in this notebook — confirm where they come from.
    with gzip.open('/home/maxime/Desktop/RecSys2020/trends/hashtag_presence_{}.pkl.gz'.format(chunk_id), 'rb') as f:
        hashtags_presence = pkl.load(f)

    # NOTE(review): update_agg sums paired elements; confirm this is the
    # intended way to combine presence values.
    for k, v in hashtags_presence.items():
        if k in global_hashtags_presence:
            update_agg(global_hashtags_presence, k, v)
        else:
            global_hashtags_presence[k] = v

# NOTE(review): the target name is spelled "hastags" (missing "h"), so this
# creates a new variable instead of rebinding global_hashtags_presence. The
# name is kept unchanged here because other code may rely on it.
global_hastags_presence = {k:len(v) for k,v in tqdm( global_hashtags_presence.items() ) }


0
1
2
3
4
5
6
7


HBox(children=(FloatProgress(value=0.0, max=2360218.0), HTML(value='')))




In [5]:
def ratio_extraction(v):
    """Turn a four-slot engagement counter into a total plus per-slot shares.

    ``v`` is indexed at positions 0 to 3 (like, retweet, retweet with comment,
    reply). The result is ``[total, like_ratio, retweet_ratio, rtc_ratio,
    reply_ratio]`` where ``total`` is ``sum(v)`` and every ratio is the slot
    divided by the total, rounded to three decimals. When the total is zero
    all four ratios are the integer 0.
    """
    total_engagement = sum(v)

    # Guard against division by zero for hashtags without any engagement.
    if total_engagement == 0:
        return [total_engagement, 0, 0, 0, 0]

    shares = [round(v[slot] / total_engagement, 3) for slot in range(4)]
    return [total_engagement] + shares

# Selection thresholds: a hashtag is kept for an engagement type only if its
# total engagement count is strictly above MIN_TOTAL_ENGAGEMENT and the share
# of that engagement type is strictly above MIN_ENGAGEMENT_RATIO.
MIN_TOTAL_ENGAGEMENT = 500
MIN_ENGAGEMENT_RATIO = 0.6

# hashtag -> [total, like_ratio, retweet_ratio, rtc_ratio, reply_ratio]
global_ratio2 = { k:ratio_extraction(v) for k,v in tqdm( global_hashtags_ratio.items() ) }

engagements = ['like_timestamp','retweet_timestamp','retweet_with_comment_timestamp','reply_timestamp']
computed_ratio = { name:{} for name in engagements }

for idx,engagement in enumerate(engagements):

    # Position 0 holds the total, so the ratio for engagement `idx` is at idx+1.
    ratio_position = idx + 1
    computed_ratio[engagement] = {
        k:(v[0], v[ratio_position])
        for k,v in tqdm(global_ratio2.items())
        if v[0] > MIN_TOTAL_ENGAGEMENT and v[ratio_position] > MIN_ENGAGEMENT_RATIO
    }


HBox(children=(FloatProgress(value=0.0, max=2360218.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2360218.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2360218.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2360218.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2360218.0), HTML(value='')))




In [6]:
# For each engagement type: list the selected hashtags, report how many there
# are, and map every hashtag to its position in that list.
like_id = list(computed_ratio['like_timestamp'])
print(len(like_id))
dictionnary_lk = dict(zip(like_id, range(len(like_id))))

retweet_id = list(computed_ratio['retweet_timestamp'])
print(len(retweet_id))
dictionnary_rt = dict(zip(retweet_id, range(len(retweet_id))))

rtc_id = list(computed_ratio['retweet_with_comment_timestamp'])
print(len(rtc_id))
dictionnary_rtc = dict(zip(rtc_id, range(len(rtc_id))))

reply_id = list(computed_ratio['reply_timestamp'])
print(len(reply_id))
dictionnary_rpl = dict(zip(reply_id, range(len(reply_id))))

# Only the like and retweet mappings are written, as two consecutive pickles
# in the same gzip stream: a reader must call pickle.load twice, in this order.
with gzip.open('/home/maxime/Desktop/RecSys2020/trends/hashtags_influence.pkl.gz','wb') as f:
    for mapping in (dictionnary_lk, dictionnary_rt):
        pkl.dump(mapping, f)


5365
264
0
0
