In [1]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd

import collections
import re
import pprint as pp
import numpy as np
import collections

import multiprocessing as mp
from multiprocessing.pool import ThreadPool

import math
import gzip
import pickle as pkl
from datetime import datetime
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split 
from sklearn.metrics import precision_recall_curve, auc, log_loss

import fonctions
import itertools
from tqdm.notebook import tqdm

from os import listdir
from os.path import isfile, join

import random
random.seed(0)


Using TensorFlow backend.


In [2]:
def get_behavior(directory,doc_name,init):
    """Load one batch file and keep only the columns used for language profiles.

    The file is read as a header-less table whose fields are separated by the
    0x01 control character. Column names are assigned positionally.

    Parameters
    ----------
    directory : str
        Prefix prepended verbatim to ``doc_name``; no path separator is
        inserted, so it has to end with one.
    doc_name : str
        File name of the batch to read.
    init : bool
        When true, the file is expected to carry four extra trailing
        engagement-timestamp columns. They are turned into 0/1 flags
        (1 = a timestamp is present) and kept in the result. When false,
        only the 20 feature columns are expected.

    Returns
    -------
    pandas.DataFrame
        The label flags (only when ``init`` is true) followed by
        ``language``, ``tweet_timestamp`` (as a ``YYYY-MM-DD`` UTC date
        string), ``engaged_with_user_id`` and ``engaging_user_id``.
    """
    all_features = ["text_tokens", "hashtags", "tweet_id",
                    "present_media", "present_links",
                    "present_domains", "tweet_type","language",
                    "tweet_timestamp", "engaged_with_user_id",
                    "engaged_with_user_follower_count", "engaged_with_user_following_count",
                    "engaged_with_user_is_verified", "engaged_with_user_account_creation",
                    "engaging_user_id", "engaging_user_follower_count",
                    "engaging_user_following_count", "engaging_user_is_verified",
                    "engaging_user_account_creation", "engagee_follows_engager"]
    labels = ['reply_timestamp','retweet_timestamp', 'retweet_with_comment_timestamp','like_timestamp']
    kept = ['language','tweet_timestamp','engaged_with_user_id', 'engaging_user_id']

    df = pd.read_csv(directory+doc_name, encoding="utf-8", sep='\x01', header=None)

    if init:
        df.columns = all_features + labels
        # A missing timestamp means the engagement did not happen. notna() is
        # vectorised and, unlike math.isnan, does not fail on non-float cells.
        for label in labels:
            df[label] = df[label].notna().astype(int)
        kept = labels + kept
    else:
        df.columns = all_features

    # Epoch seconds -> calendar day in UTC, done once for both file layouts.
    df['tweet_timestamp'] = (
        pd.to_datetime(df['tweet_timestamp'], unit='s', utc=True).dt.strftime('%Y-%m-%d')
    )

    return df.filter(kept, axis=1)

def behavior_on_chunk(directory, chunk, chunk_id, init,
                      output_dir='/home/maxime/Desktop/RecSys2020/trends/'):
    """Build per-user language counters over a chunk of batch files and save them.

    Every file in ``chunk`` is loaded with ``get_behavior``; the languages seen
    for each ``engaging_user_id`` are counted and accumulated across files.
    After the last file each user's counter is reduced with ``cut`` and the
    resulting dict is written as a gzip-compressed pickle.

    Parameters
    ----------
    directory : str
        Passed through to ``get_behavior`` as the path prefix of the files.
    chunk : iterable of str
        File names to process, in order.
    chunk_id : int
        Identifier inserted into the output file name.
    init : bool
        Passed through to ``get_behavior``. It also selects the output name:
        ``user_langue_<chunk_id>.pkl.gz`` when true,
        ``update1_user_langue_<chunk_id>.pkl.gz`` otherwise.
    output_dir : str, optional
        Directory receiving the pickle. Defaults to the location the
        notebook has always written to.

    Returns
    -------
    bool
        Always ``True``.
    """
    user_langue = {}

    for iteration, batch_file in enumerate(chunk, start=1):

        df = get_behavior(directory, batch_file, init)
        # One Counter of languages per engaging user for this file. The
        # aggregated column is named after the `langues` function.
        langue_ref = df.groupby(['engaging_user_id']).agg({'language':[langues]}).language.langues.to_dict()

        # Plain loop: the work is a side effect, so no collection is built.
        for user, counts in langue_ref.items():
            if user not in user_langue:
                user_langue[user] = collections.Counter()
            update_langue(user_langue, user, counts)

        # Progress marker: number of files processed so far in this chunk.
        print(iteration)

    print('cutting and saving...')
    user_langue = { k:cut(v) for k,v in user_langue.items()}

    prefix = '' if init else 'update1_'
    out_path = join(output_dir, '{}user_langue_{}.pkl.gz'.format(prefix, chunk_id))
    with gzip.open(out_path,'wb') as f:
        pkl.dump(user_langue,f)

    return True

def cut(v, n=3):
    """Return a new Counter holding only the ``n`` most common entries of ``v``.

    Parameters
    ----------
    v : collections.Counter
        Counter to reduce; it is not modified.
    n : int, optional
        Number of entries to keep. Defaults to 3, the value that used to be
        hard-coded.

    Returns
    -------
    collections.Counter
        The kept entries with their original counts.
    """
    return collections.Counter(dict(v.most_common(n)))

def langues(serie):
    """Count how many times each value occurs in ``serie``.

    Used as a groupby aggregation function; the aggregated column is looked
    up by this function's name, so the name must stay ``langues``.
    """
    counts = collections.Counter()
    counts.update(serie)
    return counts
    
def update_langue(user_presence, key,v):
    """Add ``v`` to the entry stored under ``key`` in ``user_presence``.

    The sum is computed with ``+`` and written back, so the entry is replaced
    by a new object rather than modified in place. ``key`` must already be
    present in ``user_presence``. Returns nothing.
    """
    merged = user_presence[key] + v
    user_presence[key] = merged

def update_agg(user_presence, k, v):
    """Replace ``user_presence[k]`` with ``user_presence[k] + v``.

    Behaves the same as ``update_langue``: ``k`` must already exist and the
    stored value is swapped for the newly built sum. Returns nothing.
    """
    user_presence[k] = user_presence[k] + v
    

In [6]:
%%time

# Location of the training batches: `directory` is the prefix handed to the
# workers, `batch_path` is the same folder as used for the listing below.
directory = '/home/maxime/Desktop/RecSys2020/data/batches/'
batch_path='/home/maxime/Desktop/RecSys2020/data/batches'

# Keep regular files only, then hand the list to the project helper with 16 as
# the chunk count (presumably it splits the list into 16 chunks; see
# fonctions.chunkIt).
batch_list = [entry for entry in listdir(batch_path) if isfile(join(batch_path, entry))]
chunks = fonctions.chunkIt(batch_list, 16)

if __name__ == '__main__':

    # One worker process per chunk. This run only covers chunks 12 to 15.
    # init=True makes the workers read the labelled file layout.
    processes = [
        mp.Process(target=behavior_on_chunk,
                   args=(directory, chunks[chunk_id], chunk_id, True))
        for chunk_id in (12, 13, 14, 15)
    ]

    # Start every worker before waiting on any of them so they run in parallel.
    for p in processes:
        p.start()

    # Wait for all workers to finish (join blocks; it does not stop them).
    for p in processes:
        p.join()

1
1
1
1
2
2
2
2
3
3
3
3
4
4
4
4
5
5
5
6
5
6
6
7
6
7
7
8
7
8
8
9
8
9
9
10
10
9
10
11
10
11
11
12
11
12
12
13
12
13
13
14
13
14
14
15
14
15
15
16
15
16
16
17
16
17
17
18
17
18
18
19
18
19
19
19
20
20
20
20
21
21
21
21
22
22
23
22
23
22
23
24
23
24
24
25
24
25
25
26
25
26
26
27
26
27
27
28
27
29
28
28
28
30
29
29
29
31
30
30
30
32
31
31
31
33
32
32
32
34
33
33
33
35
34
34
34
36
35
35
35
37
36
36
36
38
37
37
37
39
40
38
38
38
39
41
39
39
40
42
40
40
43
41
41
41
44
42
42
42
45
43
43
43
46
44
44
44
47
45
45
45
48
46
46
46
49
47
47
47
48
50
48
48
49
51
49
49
52
50
50
50
53
51
51
51
54
52
52
52
55
53
53
53
56
54
54
54
57
55
55
58
56
55
56
59
57
56
57
58
60
57
58
59
58
61
59
60
59
62
60
61
63
60
61
62
64
61
62
63
65
62
63
64
63
64
66
65
64
65
67
66
65
66
68
67
66
67
68
69
67
68
69
70
68
69
70
71
69
70
71
72
70
71
72
73
71
72
73
74
72
73
74
75
73
74
75
76
74
75
76
77
75
76
77
78
76
77
78
77
79
78
79
78
80
79
79
81
80
82
80
80
81
83
81
81
82
84
82
82
83
85
83
83
84
86
84
84
85
87
85
85
86
88
86
8

In [None]:
%%time

# Location of the test batches: `directory` is the prefix handed to the
# workers, `batch_path` is the same folder as used for the listing below.
directory = '/home/maxime/Desktop/RecSys2020/data/test/'
batch_path='/home/maxime/Desktop/RecSys2020/data/test/'

# Keep regular files only, then hand the list to the project helper with 8 as
# the chunk count (presumably it splits the list into 8 chunks; see
# fonctions.chunkIt).
batch_list = [entry for entry in listdir(batch_path) if isfile(join(batch_path, entry))]
chunks = fonctions.chunkIt(batch_list, 8)

if __name__ == '__main__':

    # One worker process per chunk, numbered from 0. init=False makes the
    # workers read the unlabelled file layout and write the update1_ files.
    processes = [
        mp.Process(target=behavior_on_chunk,
                   args=(directory, chunk, chunk_id, False))
        for chunk_id, chunk in enumerate(chunks)
    ]

    # Start every worker before waiting on any of them so they run in parallel.
    for p in processes:
        p.start()

    # Wait for all workers to finish (join blocks; it does not stop them).
    for p in processes:
        p.join()

1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
4
4
4
4
4
4
4
4
5
5
5
5
5
5
5
5
6
6
6
6
6
6
6
6
7
7
7
7
7
7
7
7
8
8
8
8
8
8
8
8
9
9
9
9
9
9
10
10
9
9
10
10
10
10
11
10
11
10
11
11
11
11
12
11
12
12
11
12
12
12
13
12
13
13
12
13
13
13
14
14
13
14
13
14
14
14
15
14
15
15
14
15
15
15
16
15
16
16
16
16
15
16
16
17
17
17
17
17
17
16
17
18
18
18
18
18
18
17
18
19
19
19
19
19
19
18
19
20
20
20
20
20
20
19
21
20
21
21
21
21
21
20
22
21
22
22
22
22
22
23
21
23
23
23
23
23
22
24
22
24
24
25
24
24
24
23
23
25
25
25
25
26
25
24
26
24
26
26
26
27
26
27
25
25
27
27
27
28
27
28
26
26
28
28
29
28
28
29
27
27
29
30
29
29
29
30
28
28
30
30
30
