In [5]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd

import collections
import re
import pprint as pp
import numpy as np
import collections

import multiprocessing as mp
from multiprocessing.pool import ThreadPool

import math
import gzip
import pickle as pkl
from datetime import datetime
import matplotlib.pyplot as plt

import fonctions
import itertools
from tqdm.notebook import tqdm

from os import listdir
from os.path import isfile, join
import time

import random
random.seed(0)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
def trend_processing(x):
    """Split a tab-separated string into its fields.

    Parameters
    ----------
    x : object
        Value to split. Only exact ``str`` instances are split; anything
        else (NaN floats, ``None``, numbers, ...) is treated as "no data".

    Returns
    -------
    list
        The pieces of ``x`` separated by runs of one or more tab characters,
        or an empty list when ``x`` is not a ``str``.
    """
    # Guard clause: non-string cells carry no fields.
    if type(x) is not str:
        return []
    fields = re.split(r'\t+', x)
    return fields

def get_trends(directory,doc_name):
    """Load one batch file and reduce it to binary engagement labels plus user ids.

    The file is read as a header-less UTF-8 table whose fields are separated
    by the 0x01 control character. It must contain exactly the 24 columns
    listed in ``all_features`` followed by ``labels`` (assigning the column
    names fails otherwise).

    Parameters
    ----------
    directory : str
        Folder containing the batch file. A trailing path separator is
        accepted but no longer required.
    doc_name : str
        File name of the batch inside ``directory``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns ``reply_timestamp``,
        ``retweet_timestamp``, ``retweet_with_comment_timestamp``,
        ``like_timestamp``, ``engaging_user_id`` and ``engaged_with_user_id``.
        Each ``*_timestamp`` column holds 1 where the file had a value and
        0 where the field was missing.
    """
    all_features = ["text_tokens", "hashtags", "tweet_id",
                    "present_media", "present_links",
                    "present_domains", "tweet_type","language",
                    "tweet_timestamp", "engaged_with_user_id",
                    "engaged_with_user_follower_count", "engaged_with_user_following_count",
                    "engaged_with_user_is_verified", "engaged_with_user_account_creation",
                    "engaging_user_id", "engaging_user_follower_count",
                    "engaging_user_following_count", "engaging_user_is_verified",
                    "engaging_user_account_creation", "engagee_follows_engager"]

    labels = ['reply_timestamp','retweet_timestamp', 'retweet_with_comment_timestamp','like_timestamp']
    all_variables = all_features + labels

    # join() gives the same path as plain concatenation when `directory`
    # already ends with a separator, and a valid one when it does not.
    df = pd.read_csv(join(directory, doc_name), encoding="utf-8", sep='\x01', header=None)
    df.columns = all_variables

    # Vectorized "timestamp present?" flag. Unlike a per-row math.isnan()
    # call this does not raise on non-float cells and avoids a Python loop
    # over every row of every label column.
    for label in labels:
        df[label] = df[label].notna().astype('int64')

    return df.filter(labels+['engaging_user_id','engaged_with_user_id'])

def taste(series):
    """Count how often each element occurs across all iterables in ``series``.

    ``series`` is an iterable of iterables; the inner iterables are flattened
    one level and the occurrences of each element are tallied.

    Returns a ``collections.Counter`` mapping element -> number of occurrences.
    """
    flattened = itertools.chain.from_iterable(series)
    return collections.Counter(flattened)

def author_taste(series):
    """Tally the occurrences of each value in ``series``.

    Returns a ``collections.Counter`` mapping value -> number of occurrences.
    """
    counts = collections.Counter()
    counts.update(series)
    return counts

def _merge_tastes(target, source):
    """Fold every per-user Counter of ``source`` into ``target`` in place."""
    for user_id, counts in source.items():
        existing = target.get(user_id)
        if existing is None:
            # First sighting of this user: adopt the Counter object as-is.
            target[user_id] = counts
        else:
            # In-place add. Gives the same totals as ``existing + counts``
            # for the strictly positive counts built from Counter(series),
            # without re-copying the (possibly large) accumulated Counter.
            existing.update(counts)


def user_tastes_on_chunk(chunk, chunk_id,
                         directory='/home/maxime/Desktop/RecSys2020/data/batches/',
                         output_dir='/home/maxime/Desktop/RecSys2020/trends/',
                         flush_every=50, first_flush=4):
    """Build, for every engaging user, a Counter of the authors they liked.

    Each batch file of ``chunk`` is loaded with ``get_trends``; rows with
    ``like_timestamp == 1`` are grouped by ``engaging_user_id`` and the
    ``engaged_with_user_id`` values of each group are counted with
    ``author_taste``. Per-batch results are accumulated in a small buffer
    that is periodically merged into the chunk-wide dictionary. The final
    dictionary is pickled (gzip-compressed) to
    ``<output_dir>/liked_author_tastes_<chunk_id>.pkl.gz``.

    Parameters
    ----------
    chunk : iterable of str
        Batch file names, relative to ``directory``.
    chunk_id : object
        Identifier inserted into the output file name.
    directory : str, optional
        Folder holding the batch files; passed unchanged to ``get_trends``,
        so keep the trailing path separator.
    output_dir : str, optional
        Folder that receives the pickled result.
    flush_every : int, optional
        Merge the buffer into the main dictionary every this many batches.
        A falsy value disables the periodic merge.
    first_flush : int, optional
        Extra, early merge point. With the defaults the merges happen at
        batches 4, 50, 100, ... exactly as the former hard-coded list did up
        to 550, and now keep happening past 550 instead of stopping there.

    Returns
    -------
    bool
        Always ``True`` once the result file has been written.
    """
    global_tastes = {}
    global_tastes_buff = {}

    for iteration, batch_file in enumerate(chunk, start=1):

        print(iteration)

        df = get_trends(directory, batch_file)

        # Per-user Counter of liked authors for this batch only.
        liked = df[df['like_timestamp'] == 1]
        per_user = liked.groupby(['engaging_user_id']).agg({'engaged_with_user_id': [author_taste]})
        per_user.columns = ['lk_authors']
        batch_tastes = {user_id: row['lk_authors']
                        for user_id, row in per_user.to_dict(orient='index').items()}

        _merge_tastes(global_tastes_buff, batch_tastes)

        if iteration == first_flush or (flush_every and iteration % flush_every == 0):
            _merge_tastes(global_tastes, global_tastes_buff)
            # Start a fresh buffer; its Counters now belong to global_tastes.
            global_tastes_buff = {}

            print(len(global_tastes))

    # Whatever is still buffered after the last batch.
    _merge_tastes(global_tastes, global_tastes_buff)

    print('saving...')

    output_path = join(output_dir, 'liked_author_tastes_{}.pkl.gz'.format(chunk_id))
    with gzip.open(output_path, 'wb') as f:
        pkl.dump(global_tastes, f)

    return True

def update_taste(global_tastes_buff, concat, k):
    """Add ``concat[k]`` to the Counter stored under ``k`` in ``global_tastes_buff``.

    A missing key is treated as an empty Counter. The sum is a new Counter
    that replaces the stored entry; nothing is returned.
    """
    if k in global_tastes_buff:
        previous = global_tastes_buff[k]
    else:
        previous = collections.Counter()
    global_tastes_buff[k] = previous + concat[k]
    
    
def update_taste_agg(global_tastes, k, v):
    """Replace ``global_tastes[k]`` with the sum of its current value and ``v``.

    ``k`` must already be present. The previous value is not modified in
    place: a new object produced by ``+`` is stored instead.
    """
    combined = global_tastes[k] + v
    global_tastes[k] = combined

In [None]:
%%time

# List the batch files (plain files only) and split them into 8 chunks.
batch_path = '/home/maxime/Desktop/RecSys2020/data/batches'
batch_list = [name for name in listdir(batch_path) if isfile(join(batch_path, name))]
chunks = fonctions.chunkIt(batch_list, 8)

if __name__ == '__main__':
    # This cell only handles chunks 6 and 7: one worker process per chunk,
    # each receiving its chunk and the chunk index used in the output name.
    processes = [
        mp.Process(target=user_tastes_on_chunk, args=(chunks[idx], idx))
        for idx in (6, 7)
    ]

    # Start every worker before waiting on any, so they run side by side.
    for p in processes:
        p.start()

    # Block until all workers have finished.
    for p in processes:
        p.join()

1
1
2
2
3
3
4
4
45052
5
45467
5
6
6
7
7
8
8
9
9
10
10
11
11
12
12
13
13
14
14
15
15
16
16
17
17
18
18
19
19
20
20
21
21
22
22
23
23
24
24
25
25
26
26
27
27
28
28
29
29
30
30
31
31
32
32
33
33
34
34
35
35
36
36
37
37
38
38
39
39
40
40
41
41
42
42
43
43
44
44
45
45
46
46
47
47
48
48
49
49
50
50
547754
51
548171
51
52
52
53
53
54
54
55
55
56
56
57
57
58
58
59
59
60
60
61
61
62
62
63
63
64
64
65
65
66
66
67
67
68
68
69
69
70
70
71
71
72
72
73
73
74
74
75
75
76
76
77
77
78
78
79
79
80
80
81
81
82
82
83
83
84
84
85
85
86
86
87
87
88
88
89
89
90
90
91
91
92
92
93
93
94
94
95
95
96
96
97
97
98
98
99
99
100
100
1077040
101
1072674
101
102
102
103
103
104
104
105
105
106
106
107
107
108
108
109
109
110
110
111
111
112
112
113
113
114
114
115
115
116
116
117
117
118
118
119
119
120
120
121
121
122
122
123
123
124
124
125
125
126
126
127
127
128
128
129
129
130
130
131
131
132
132
133
133
134
134
135
135
136
136
137
137
138
138
139
139
140
140
141
141
142
142
143
143
144
144
145
145
146
146
147
14