In [1]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
import collections
import re
import pprint as pp
import numpy as np
import collections

import multiprocessing as mp
from multiprocessing.pool import ThreadPool

import math
import gzip
import pickle as pkl
from datetime import datetime
import matplotlib.pyplot as plt

import fonctions
import itertools

from os import listdir
from os.path import isfile, join

import random
random.seed(0)

Using TensorFlow backend.


In [2]:
def trend_processing(x):
    """Split a tab-separated field into its individual entries.

    Parameters
    ----------
    x : object
        Raw cell value. Strings (including ``str`` subclasses) are split on
        runs of one or more tab characters.

    Returns
    -------
    list of str or float
        The list of entries for string input; ``float('nan')`` for any
        non-string input.
    """
    # isinstance instead of an exact type comparison, so str subclasses are
    # split as well rather than being silently turned into NaN.
    if isinstance(x, str):
        return re.split(r'\t+', x)
    return float('nan')

def get_trends(directory, doc_name, init):
    """Load one batch file and keep only the columns used for domain trends.

    Parameters
    ----------
    directory : str
        Folder containing the batch file, with or without a trailing
        path separator.
    doc_name : str
        Name of the batch file inside ``directory``.
    init : bool
        When truthy, the file is expected to hold the four engagement
        timestamp columns after the feature columns; otherwise only the
        feature columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``present_domains``, ``tweet_timestamp`` and ``tweet_id``
        (in that order). ``tweet_timestamp`` is rewritten as a
        ``'YYYY-MM-DD HH'`` string in UTC, i.e. truncated to the hour.

    Raises
    ------
    ValueError
        If the number of columns in the file does not match the expected
        schema (raised by pandas when the column names are assigned).
    """
    all_features = ["text_tokens", "hashtags", "tweet_id",
                    "present_media", "present_links",
                    "present_domains", "tweet_type", "language",
                    "tweet_timestamp", "engaged_with_user_id",
                    "engaged_with_user_follower_count", "engaged_with_user_following_count",
                    "engaged_with_user_is_verified", "engaged_with_user_account_creation",
                    "engaging_user_id", "engaging_user_follower_count",
                    "engaging_user_following_count", "engaging_user_is_verified",
                    "engaging_user_account_creation", "engagee_follows_engager"]
    labels = ['reply_timestamp', 'retweet_timestamp',
              'retweet_with_comment_timestamp', 'like_timestamp']

    # os.path.join copes with a directory given with or without a trailing
    # separator, unlike plain string concatenation.
    df = pd.read_csv(join(directory, doc_name), encoding="utf-8", sep='\x01', header=None)
    df.columns = (all_features + labels) if init else all_features

    # Vectorised epoch-seconds -> 'YYYY-MM-DD HH' (UTC) conversion. It yields
    # the same strings as str(datetime.utcfromtimestamp(ts))[0:13] without a
    # Python-level loop and without the deprecated utcfromtimestamp call.
    utc_times = pd.to_datetime(df['tweet_timestamp'].astype('int64'), unit='s')
    df['tweet_timestamp'] = utc_times.dt.strftime('%Y-%m-%d %H')

    return df.filter(['present_domains', 'tweet_timestamp', 'tweet_id'], axis=1)


def present_domains_on_chunk(directory, chunk, chunk_id, init,
                             output_dir='/home/maxime/Desktop/RecSys2020/trends/'):
    """Build and save the domain -> tweets index for one chunk of batch files.

    Each batch file is loaded with ``get_trends``; rows that have a
    ``present_domains`` value are expanded into one record per domain and
    grouped into ``{domain: [(tweet_id, hour), ...]}``. The per-batch results
    are merged, every list is deduplicated and ordered by ``sorting``, and the
    final dictionary is written as a gzip-compressed pickle.

    Parameters
    ----------
    directory : str
        Folder containing the batch files.
    chunk : iterable of str
        Names of the batch files to process.
    chunk_id : object
        Identifier inserted into the output file name.
    init : bool
        Forwarded to ``get_trends``; also selects the output file name
        (``present_domains_presence_<id>.pkl.gz`` when truthy,
        ``update1_present_domains_presence_<id>.pkl.gz`` otherwise).
    output_dir : str, optional
        Folder the pickle is written to. Defaults to the location that was
        previously hard-coded.

    Returns
    -------
    bool
        Always ``True`` once the file has been written.
    """
    present_domains_presence = {}

    for iteration, batch_file in enumerate(chunk, start=1):

        df = get_trends(directory, batch_file, init)
        # Explicit copy: the column assignment below must not write into a
        # view of the filtered frame.
        df = df[df['present_domains'].notna()].drop_duplicates().copy()
        df['present_domains'] = [trend_processing(x) for x in df['present_domains']]

        # One record per (domain, tweet) pair, in row order. Building the
        # records directly also works when the batch has no domain at all
        # (an empty 1-D NumPy array cannot be indexed with [:, 0]). Ids and
        # hours are stringified, which the former NumPy round-trip did
        # implicitly.
        records = [
            (domain, str(tweet_id), str(timestamp))
            for tweet_id, domains, timestamp
            in zip(df.tweet_id, df.present_domains, df.tweet_timestamp)
            for domain in domains
        ]
        df2 = pd.DataFrame(records, columns=['domains', 'tweet_id', 'tweet_timestamp'])
        dictio = reformater(df2)

        # Merge this batch into the running index: new domains are added,
        # known domains get their list extended.
        for domain, pairs in dictio.items():
            if domain in present_domains_presence:
                update_links(present_domains_presence, domain, pairs)
            else:
                present_domains_presence[domain] = pairs

        # Progress indicator: number of batch files handled so far.
        print(iteration)

    print('cutting and saving...')
    present_domains_presence = {k: sorting(v) for k, v in present_domains_presence.items()}

    # Runs with a falsy ``init`` are stored under an 'update1_' prefix so they
    # do not overwrite the output of the init run.
    prefix = '' if init else 'update1_'
    filename = '{}present_domains_presence_{}.pkl.gz'.format(prefix, chunk_id)
    with gzip.open(join(output_dir, filename), 'wb') as f:
        pkl.dump(present_domains_presence, f)

    return True

def reformater(df):
    """Group (tweet_id, tweet_timestamp) pairs by domain.

    Parameters
    ----------
    df : object
        Anything exposing the aligned iterables ``domains``, ``tweet_id`` and
        ``tweet_timestamp`` as attributes (e.g. a DataFrame with those
        columns).

    Returns
    -------
    collections.defaultdict
        Maps each domain to the list of ``(tweet_id, tweet_timestamp)``
        tuples seen for it, in input order.
    """
    grouped = collections.defaultdict(list)
    records = zip(df.domains, df.tweet_id, df.tweet_timestamp)
    for domain, tweet, stamp in records:
        # The defaultdict creates the list on first access of a new domain.
        grouped[domain].append((tweet, stamp))
    return grouped

def update_links(links_rank, k, v):
    """Append the items of ``v`` to the list stored under ``links_rank[k]``.

    The container is modified in place and nothing is returned. ``k`` must
    already be present in ``links_rank``.
    """
    existing = links_rank[k]
    existing.extend(v)

def sorting(v):
    """Deduplicate pairs and order them chronologically by their hour stamp.

    Parameters
    ----------
    v : iterable of tuple
        Hashable items whose second element, once converted with ``str``,
        is a timestamp formatted as ``'%Y-%m-%d %H'``.

    Returns
    -------
    list
        The distinct items of ``v`` sorted by timestamp. Items sharing the
        same timestamp keep the order of their first appearance in ``v``.

    Raises
    ------
    ValueError
        If a timestamp does not match ``'%Y-%m-%d %H'``.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order. Going
    # through a set made the order of items with equal timestamps depend on
    # hash iteration order, i.e. not reproducible from one process to the next.
    unique = list(dict.fromkeys(v))

    # Many items share the same hour: parse each distinct string only once.
    parsed = {}

    def hour_of(item):
        stamp = str(item[1])
        if stamp not in parsed:
            parsed[stamp] = datetime.strptime(stamp, '%Y-%m-%d %H')
        return parsed[stamp]

    return sorted(unique, key=hour_of)

In [6]:
%%time

# Every regular file in the batch folder is one input batch. The names are
# sorted because listdir returns them in arbitrary order, which would make
# the chunk assignment differ from one run to the next.
batch_path = '/home/maxime/Desktop/RecSys2020/data/batches'
batch_list = sorted(f for f in listdir(batch_path) if isfile(join(batch_path, f)))
chunks = fonctions.chunkIt(batch_list, 8)  # presumably 8 chunks, one per worker -- confirm in fonctions
# Same folder as batch_path, with a trailing separator.
directory = join(batch_path, '')

if __name__ == '__main__':

    # One worker process per chunk (init=True)
    processes = [
        mp.Process(target=present_domains_on_chunk, args=(directory, chunk, chunk_id, True))
        for chunk_id, chunk in enumerate(chunks)
    ]

    # Start all workers
    for p in processes:
        p.start()

    # Wait for every worker to finish
    for p in processes:
        p.join()

    # An exception inside a worker does not propagate to this process; it is
    # only visible through a non-zero exit code, so check it explicitly.
    failed = [p.name for p in processes if p.exitcode != 0]
    if failed:
        raise RuntimeError('worker process(es) failed: {}'.format(', '.join(failed)))

1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
4
4
4
4
4
4
4
4
5
5
5
5
5
5
5
5
6
6
6
6
6
6
6
6
7
7
7
7
7
7
7
7
8
8
8
8
8
8
8
8
9
9
9
9
9
9
9
9
10
10
10
10
10
10
10
10
11
11
11
11
11
11
11
11
12
12
12
12
12
12
12
12
13
13
13
13
13
13
13
13
14
14
14
14
14
14
14
14
15
15
15
15
15
15
15
16
16
15
16
16
16
16
16
17
16
17
17
17
17
17
17
18
17
18
18
18
18
18
18
19
18
19
19
19
19
19
19
20
20
20
20
19
20
20
20
21
21
21
21
20
21
21
21
22
22
22
21
22
22
22
22
23
23
23
22
23
23
23
23
24
24
24
24
24
24
23
24
25
25
25
25
24
25
25
25
26
26
26
26
26
25
26
26
27
27
27
27
27
26
27
27
28
28
28
28
28
28
27
28
29
29
29
29
29
28
29
30
29
30
30
30
30
29
30
30
31
31
31
31
31
31
30
32
31
32
32
32
32
32
31
33
32
33
33
33
33
33
32
34
33
34
34
34
34
33
34
35
34
35
35
35
35
34
35
35
36
36
36
36
36
35
36
37
37
36
37
37
37
36
37
38
37
38
38
38
38
38
37
39
39
39
38
39
39
39
38
40
40
40
39
40
40
40
39
40
41
41
41
41
41
41
40
41
42
42
42
42
42
42
41
43
42
43
43
43
42
43
43
44
43
44
44
44
43
44
44
45
44
45
45
45
45
44
4

283
284
283
285
285
284
285
284
284
285
284
286
286
285
286
286
285
285
285
287
287
287
286
286
286
287
288
286
287
288
288
287
287
288
287
289
289
289
288
288
288
289
288
290
290
290
289
289
289
290
291
291
289
290
290
290
291
291
292
290
292
291
291
292
292
291
291
293
292
293
292
293
293
292
294
293
294
292
293
294
294
293
294
293
295
295
295
294
294
295
296
295
294
296
296
295
295
296
296
297
295
297
297
296
296
297
297
298
296
298
298
297
297
298
298
299
297
299
299
298
298
299
299
300
298
300
300
299
299
300
300
299
301
301
300
301
300
301
301
302
300
302
301
302
301
302
301
302
303
303
303
302
303
302
302
303
304
304
303
304
304
303
305
304
303
305
305
304
305
304
305
304
306
306
306
305
306
305
306
307
305
307
307
306
307
306
308
306
307
308
308
307
308
307
308
309
307
309
309
308
309
308
309
308
310
310
310
309
310
309
309
310
311
311
311
310
311
310
310
312
311
312
311
312
311
312
311
313
313
312
313
312
312
312
313
314
314
313
314
313
313
314
315
313
314
315
315
314
315
314


In [3]:
%%time

# Every regular file in the test folder is one input batch. The names are
# sorted because listdir returns them in arbitrary order, which would make
# the chunk assignment differ from one run to the next.
batch_path = '/home/maxime/Desktop/RecSys2020/data/test'
batch_list = sorted(f for f in listdir(batch_path) if isfile(join(batch_path, f)))
chunks = fonctions.chunkIt(batch_list, 8)  # presumably 8 chunks, one per worker -- confirm in fonctions
# Same folder as batch_path, with a trailing separator.
directory = join(batch_path, '')

if __name__ == '__main__':

    # One worker process per chunk (init=False)
    processes = [
        mp.Process(target=present_domains_on_chunk, args=(directory, chunk, chunk_id, False))
        for chunk_id, chunk in enumerate(chunks)
    ]

    # Start all workers
    for p in processes:
        p.start()

    # Wait for every worker to finish
    for p in processes:
        p.join()

    # An exception inside a worker does not propagate to this process; it is
    # only visible through a non-zero exit code, so check it explicitly.
    failed = [p.name for p in processes if p.exitcode != 0]
    if failed:
        raise RuntimeError('worker process(es) failed: {}'.format(', '.join(failed)))

1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
4
4
4
4
4
4
4
4
5
5
5
5
5
5
5
5
6
6
6
6
6
6
6
6
7
7
7
7
7
7
7
7
8
8
8
8
8
8
8
8
9
9
9
9
9
9
9
9
10
10
10
10
10
10
10
10
11
11
11
11
11
11
11
11
12
12
12
12
12
12
12
12
13
13
13
13
13
13
13
13
14
14
14
14
14
14
14
14
15
15
15
15
15
15
15
15
16
16
16
16
16
16
16
17
16
17
17
17
17
17
17
18
17
18
18
18
18
18
18
19
18
19
19
19
19
20
19
19
19
20
20
20
20
21
20
20
20
21
21
21
21
22
21
21
22
21
22
22
22
23
22
22
23
23
22
23
23
24
23
23
23
24
24
24
24
24
25
24
25
24
25
25
25
26
25
25
26
26
26
26
25
27
26
26
27
27
27
26
27
27
28
27
28
28
28
28
27
28
28
29
29
29
29
28
29
29
30
30
29
30
30
30
29
30
31
31
30
31
31
30
31
32
31
31
32
32
32
32
31
32
33
32
33
33
33
34
34
33
32
33
33
34
35
33
34
35
34
34
34
35
36
36
35
34
35
35
35
36
35
37
37
36
36
36
36
38
37
36
38
37
37
37
37
39
37
38
39
38
38
38
38
40
38
39
39
40
39
39
39
41
40
39
41
40
40
40
40
42
41
41
42
41
40
41
41
43
42
43
42
42
41
42
43
42
43
44
44
42
43
43
43
44
44
45
45
44
43
44
45
46
44
45
46
4