In [3]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd

import collections
import re
import pprint as pp
import numpy as np
import collections

import multiprocessing as mp
from multiprocessing.pool import ThreadPool

import math
import gzip
import pickle as pkl
from datetime import datetime
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split 
from sklearn.metrics import precision_recall_curve, auc, log_loss
import joblib

import fonctions
import itertools
from tqdm.notebook import tqdm

from os import listdir
from os.path import isfile, join
from collections import defaultdict

import random
random.seed(0)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
def get_behavior(directory,doc_name,init):
    """Load one batch file and reduce it to per-row link/media indicators.

    The file is read as a headerless table whose fields are separated by the
    SOH (0x01) character. Four 0/1 indicator columns are derived from it and
    only those indicators plus the two user-id columns are returned.

    Parameters
    ----------
    directory : str
        Prefix prepended verbatim to ``doc_name`` (plain string concatenation),
        so it must already end with a path separator.
    doc_name : str
        Name of the batch file to read.
    init : bool
        When truthy the file is expected to hold the 20 feature columns
        followed by the four engagement-timestamp columns (24 in total);
        otherwise only the 20 feature columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``link``, ``photo``, ``video``, ``gif``,
        ``engaged_with_user_id`` and ``engaging_user_id``, one row per input row.

    Raises
    ------
    ValueError
        Raised by pandas when the number of columns in the file does not match
        the layout selected by ``init``.
    """
    feature_columns = ["text_tokens", "hashtags", "tweet_id",
                       "present_media", "present_links",
                       "present_domains", "tweet_type","language",
                       "tweet_timestamp", "engaged_with_user_id",
                       "engaged_with_user_follower_count", "engaged_with_user_following_count",
                       "engaged_with_user_is_verified", "engaged_with_user_account_creation",
                       "engaging_user_id", "engaging_user_follower_count",
                       "engaging_user_following_count", "engaging_user_is_verified",
                       "engaging_user_account_creation", "engagee_follows_engager"]
    label_columns = ['reply_timestamp','retweet_timestamp', 'retweet_with_comment_timestamp','like_timestamp']
    kept_columns = ['link','photo','video','gif','engaged_with_user_id', 'engaging_user_id']

    df = pd.read_csv(directory+doc_name, encoding="utf-8", sep='\x01', header=None)
    df.columns = (feature_columns + label_columns) if init else feature_columns

    def media_flags(token):
        # Empty fields are parsed as NaN (a float), hence the str check
        # before looking for the media token.
        return [1 if isinstance(x, str) and token in x else 0 for x in df['present_media']]

    df['link'] = [1 if isinstance(x, str) else 0 for x in df['present_links']]
    df['photo'] = media_flags('Photo')
    df['video'] = media_flags('Video')
    df['gif'] = media_flags('GIF')

    # The tweet timestamp is intentionally left untouched: it is not part of
    # the returned columns, so converting it only cost time and made the
    # loader fail on rows with a missing timestamp.
    return df.filter(kept_columns, axis=1)


def multimedia_on_chunk(directory, chunk, chunk_id, init,
                        output_dir='/home/maxime/Desktop/RecSys2020/trends'):
    """Accumulate per-author link/photo/video/gif counts over a chunk of batch files.

    Every file in ``chunk`` is loaded with ``get_behavior``, its indicator
    columns are summed per ``engaged_with_user_id``, and the sums are added
    into one dictionary that is finally written as a gzip-compressed pickle.

    Parameters
    ----------
    directory : str
        Passed through to ``get_behavior`` as the location of the batch files.
    chunk : iterable of str
        Names of the batch files to process.
    chunk_id : int or str
        Identifier embedded in the output file name.
    init : bool
        Passed through to ``get_behavior``; also selects the output file name
        (``multimedia_author_<id>.pkl.gz`` when truthy,
        ``update1_multimedia_author_<id>.pkl.gz`` otherwise).
    output_dir : str, optional
        Directory that receives the pickle. Defaults to the location that was
        previously hard-coded, so existing callers are unaffected.

    Returns
    -------
    bool
        Always ``True``.
    """
    import os  # local import keeps this definition self-contained

    count_columns = ['link', 'photo', 'video', 'gif']
    multimedias = {}

    for iteration, batch_file in enumerate(chunk, start=1):

        df = get_behavior(directory, batch_file, init)
        # Aggregate only the indicator columns; summing the whole frame would
        # also aggregate engaging_user_id, which is never read afterwards.
        per_author = df.groupby('engaged_with_user_id')[count_columns].sum()

        for author, link, photo, video, gif in zip(per_author.index,
                                                   per_author['link'], per_author['photo'],
                                                   per_author['video'], per_author['gif']):
            counts = [link, photo, video, gif]
            if author in multimedias:
                # Author already seen in an earlier batch: add element-wise.
                update_multimedia(multimedias, author, counts)
            else:
                multimedias[author] = counts

        # Progress trace: number of batch files processed so far in this chunk.
        print(iteration)

    print('saving...')

    if init:
        file_name = 'multimedia_author_{}.pkl.gz'.format(chunk_id)
    else:
        file_name = 'update1_multimedia_author_{}.pkl.gz'.format(chunk_id)

    with gzip.open(os.path.join(output_dir, file_name), 'wb') as f:
        pkl.dump(multimedias,f)

    return True

def update_multimedia(multimedias, k, v):
    """Replace ``multimedias[k]`` with its element-wise sum with ``v``.

    A new list is stored under ``k``; the list previously stored there is not
    modified. The result has as many entries as the stored list, so ``v``
    must provide at least that many values.
    """
    summed = []
    for position, stored in enumerate(multimedias[k]):
        summed.append(stored + v[position])
    multimedias[k] = summed


In [9]:
%%time

batch_path = '/home/maxime/Desktop/RecSys2020/data/batches'
batch_list = [f for f in listdir(batch_path) if isfile(join(batch_path, f))]
chunks = fonctions.chunkIt(batch_list, 8)

directory = '/home/maxime/Desktop/RecSys2020/data/batches/'
if __name__ == '__main__':
    
    # Setup a list of processes that we want to run
    processes = [ mp.Process(target=multimedia_on_chunk, args=(directory, chunk, chunk_id, True) ) for chunk_id, chunk in zip([0,1,2,3],[ chunks[0],chunks[1],chunks[2],chunks[3] ] ) ]

    # Run processes
    for p in processes:
        p.start()
        
    #Stop the processes
    for p in processes:
        p.join() 

1
1
1
1
2
2
2
2
3
3
3
3
4
4
4
4
5
5
5
5
6
6
6
6
7
7
7
7
8
8
8
8
9
9
9
9
10
10
10
11
10
11
11
11
12
12
13
12
12
13
14
13
13
14
14
14
15
15
15
16
15
16
16
17
16
17
17
18
17
18
18
19
18
19
20
19
19
20
21
20
20
21
22
21
21
22
22
23
22
23
23
24
23
24
25
24
24
25
26
25
25
26
27
26
26
27
27
27
28
28
29
28
28
29
30
29
29
30
31
30
30
31
31
32
31
32
32
32
33
33
33
34
33
34
34
35
35
34
35
36
36
35
36
37
37
36
37
38
37
38
38
39
38
39
39
40
40
39
40
41
41
40
41
42
42
41
42
43
43
42
43
44
43
44
44
45
44
45
46
46
45
45
47
47
46
46
48
48
47
47
49
49
48
48
50
49
49
50
51
50
50
51
52
52
51
51
53
53
52
52
54
54
53
53
55
55
54
54
56
56
55
55
57
56
56
57
57
58
57
58
59
59
58
58
60
60
59
59
61
60
61
60
62
61
62
61
63
62
63
62
63
64
63
64
64
64
65
65
66
66
65
65
67
67
66
66
68
68
67
67
68
69
69
68
69
70
70
69
70
70
71
71
71
71
72
72
72
73
73
72
74
74
73
73
75
74
75
74
76
75
76
75
77
76
77
76
78
77
77
78
78
78
79
79
79
79
80
80
80
80
81
81
81
82
82
81
82
82
83
83
83
84
83
84
84
84
85
85
85
85
86
86
86
87
86
8

In [5]:
%%time

batch_path = '/home/maxime/Desktop/RecSys2020/data/test'
batch_list = [f for f in listdir(batch_path) if isfile(join(batch_path, f))]
chunks = fonctions.chunkIt(batch_list, 8)

directory = '/home/maxime/Desktop/RecSys2020/data/test/'
if __name__ == '__main__':
    
    # Setup a list of processes that we want to run
    processes = [ mp.Process(target=multimedia_on_chunk, args=(directory, chunk, chunk_id, False) ) for chunk_id, chunk in enumerate(chunks) ]

    # Run processes
    for p in processes:
        p.start()
        
    #Stop the processes
    for p in processes:
        p.join() 

1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
4
4
4
4
4
4
4
4
5
5
5
5
5
5
5
5
6
6
6
6
6
6
6
6
7
7
7
7
7
7
7
7
8
8
8
8
8
8
8
8
9
9
9
9
9
9
9
9
10
10
10
10
10
10
10
10
11
11
11
11
11
11
11
11
12
12
12
12
12
12
12
12
13
13
13
13
13
13
13
14
13
14
14
14
14
14
14
15
15
15
14
15
15
15
15
15
16
16
16
16
16
16
16
17
17
17
16
17
17
17
17
18
18
17
18
18
18
18
18
19
19
19
18
19
19
19
19
19
20
20
20
20
20
20
21
20
20
21
21
21
21
21
21
22
21
22
22
22
22
22
22
23
22
23
23
23
23
23
23
23
24
24
24
24
24
24
24
25
24
25
25
25
25
25
25
26
25
26
26
26
26
26
26
27
26
27
27
27
27
27
27
27
28
28
28
28
28
28
28
28
29
29
29
29
29
29
29
30
29
30
30
30
30
30
30
30
31
31
31
31
31
31
31
31
32
32
32
32
32
32
32
32
33
33
33
33
33
33
33
33
34
34
34
34
34
34
34
34
35
35
35
35
35
35
35
35
36
36
36
36
36
36
37
36
36
37
37
37
37
37
37
37
38
38
38
38
38
38
38
38
39
39
39
39
39
39
39
39
40
40
40
40
40
40
40
40
41
41
41
41
41
41
41
41
42
42
42
42
42
42
42
42
43
43
43
43
43
43
43
43
44
44
44
44
44
44
44
44
45
45
45
45
45
4