In [1]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd

import collections
import re
import pprint as pp
import numpy as np
import collections

import multiprocessing as mp
from multiprocessing.pool import ThreadPool

import math
import gzip
import pickle as pkl
from datetime import datetime
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split 
from sklearn.metrics import precision_recall_curve, auc, log_loss

import fonctions
import itertools
from tqdm.notebook import tqdm

from os import listdir
from os.path import isfile, join

import random
random.seed(0)


Using TensorFlow backend.


In [2]:
def get_behavior(directory,doc_name,init):
    """Load one raw batch file and keep only the columns used for presence counts.

    The file is read as a header-less table whose fields are separated by the
    0x01 control character.

    Parameters
    ----------
    directory : str
        Prefix prepended verbatim to ``doc_name``; no path separator is
        inserted, so it has to end with one.
    doc_name : str
        Name of the batch file to read.
    init : bool
        When truthy the file is expected to hold 24 columns (20 features plus
        the four engagement timestamps); each engagement timestamp is turned
        into a 0/1 flag (1 when a timestamp is present) and kept in the
        result. Otherwise the file is expected to hold the 20 feature columns
        only.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: the four engagement flags (only when ``init`` is
        truthy), then ``language``, ``tweet_timestamp`` (converted from epoch
        seconds to a ``YYYY-MM-DD`` UTC date string), ``engaged_with_user_id``
        and ``engaging_user_id``.
    """
    all_features = ["text_tokens", "hashtags", "tweet_id",
                    "present_media", "present_links",
                    "present_domains", "tweet_type","language",
                    "tweet_timestamp", "engaged_with_user_id",
                    "engaged_with_user_follower_count", "engaged_with_user_following_count",
                    "engaged_with_user_is_verified", "engaged_with_user_account_creation",
                    "engaging_user_id", "engaging_user_follower_count",
                    "engaging_user_following_count", "engaging_user_is_verified",
                    "engaging_user_account_creation", "engagee_follows_engager"]
    labels = ['reply_timestamp','retweet_timestamp', 'retweet_with_comment_timestamp','like_timestamp']
    kept_features = ['language','tweet_timestamp','engaged_with_user_id', 'engaging_user_id']

    df = pd.read_csv(directory+doc_name, encoding="utf-8", sep='\x01', header=None)

    if init:
        df.columns = all_features + labels
        # A missing timestamp means "no engagement"; vectorised instead of a
        # Python-level loop over every row.
        for label in labels:
            df[label] = df[label].notna().astype('int64')
        kept_columns = labels + kept_features
    else:
        df.columns = all_features
        kept_columns = kept_features

    # Epoch seconds -> naive UTC calendar date, without the deprecated
    # datetime.utcfromtimestamp and without a per-row Python call.
    df['tweet_timestamp'] = pd.to_datetime(df['tweet_timestamp'], unit='s').dt.strftime('%Y-%m-%d')

    return df[kept_columns]

def behavior_on_chunk(directory, chunk, chunk_id, dates, init,
                      output_dir='/home/maxime/Desktop/RecSys2020/trends/',
                      flush_every=50):
    """Count, per engaging user and per date, the rows found in a chunk of batch files.

    Every file of ``chunk`` is loaded with ``get_behavior``. For each user in
    ``engaging_user_id`` a vector of counters is maintained, one slot per entry
    of ``dates``; slot ``i`` accumulates the number of rows whose
    ``tweet_timestamp`` equals ``dates[i]``. Counts are gathered in a
    temporary buffer that is folded into the running total every
    ``flush_every`` files and once more after the last file. The resulting
    dict is written as a gzip-compressed pickle.

    Parameters
    ----------
    directory : str
        Prefix passed through to ``get_behavior``.
    chunk : iterable of str
        Batch file names to process.
    chunk_id : int
        Identifier inserted in the output file name.
    dates : sequence of str
        Date strings compared for equality against ``tweet_timestamp``.
    init : bool
        Passed through to ``get_behavior``; also selects the output file name:
        ``user_presence_<chunk_id>.pkl.gz`` when truthy,
        ``update1_user_presence_<chunk_id>.pkl.gz`` otherwise.
    output_dir : str, optional
        Folder receiving the pickle. Defaults to the location that used to be
        hard-coded.
    flush_every : int, optional
        Number of files between two merges of the buffer into the running
        total. A falsy value disables intermediate merges. The final counts do
        not depend on this value because merging is a plain element-wise sum.

    Returns
    -------
    bool
        Always ``True``.
    """
    # Vectors historically had exactly 7 slots. That size is kept as a floor so
    # files produced for 7 (or fewer) dates keep their shape, while longer date
    # lists no longer raise IndexError.
    n_slots = max(len(dates), 7)

    user_presence = {}
    buff_user_presence = {}

    for iteration, batch_file in enumerate(chunk, start=1):

        df = get_behavior(directory, batch_file, init)

        # Register users not seen since the last flush with all-zero counters.
        for user in np.unique(df.engaging_user_id):
            if user not in buff_user_presence:
                buff_user_presence[user] = [0] * n_slots

        for idx, date in enumerate(dates):
            day_rows = df[df['tweet_timestamp'] == date]
            day_counts = collections.Counter(day_rows.engaging_user_id)
            for user, count in day_counts.items():
                update_date(buff_user_presence, user, count, idx)

        if flush_every and iteration % flush_every == 0:
            _merge_presence(user_presence, buff_user_presence)
            print(len(user_presence))
            buff_user_presence = {}

        print(iteration)

    # Fold in whatever was gathered since the last periodic flush.
    _merge_presence(user_presence, buff_user_presence)

    print('cutting and saving...')

    if init:
        file_name = 'user_presence_{}.pkl.gz'.format(chunk_id)
    else:
        file_name = 'update1_user_presence_{}.pkl.gz'.format(chunk_id)

    with gzip.open(join(output_dir, file_name), 'wb') as f:
        pkl.dump(user_presence, f)

    return True


def _merge_presence(total, pending):
    """Add the counters of ``pending`` into ``total``, inserting users that are new."""
    for user, counts in pending.items():
        if user in total:
            update_agg(total, user, counts)
        else:
            total[user] = counts


def update_date(user_presence, key,v, idx):
    """Add ``v`` to slot ``idx`` of the counter list stored under ``key``.

    The list held in ``user_presence[key]`` is modified in place; nothing is
    returned.
    """
    slots = user_presence[key]
    slots[idx] = slots[idx] + v


def update_agg(user_presence, k, v):
    """Replace ``user_presence[k]`` with its element-wise sum with ``v``.

    A new list is stored under ``k`` (the previous list object is left
    untouched). Pairing stops at the shorter of the two sequences.
    """
    summed = []
    for current, incoming in zip(user_presence[k], v):
        summed.append(current + incoming)
    user_presence[k] = summed
    

In [5]:
%%time

# Training batches: collect the names of the regular files in the batch folder
# and split them into 8 chunks (the split itself is done by fonctions.chunkIt).
batch_path = '/home/maxime/Desktop/RecSys2020/data/batches'
batch_list = list(filter(lambda name: isfile(join(batch_path, name)), listdir(batch_path)))
chunks = fonctions.chunkIt(batch_list, 8)

# Prefix handed to the workers; it is concatenated with each file name.
directory = '/home/maxime/Desktop/RecSys2020/data/batches/'

# One counter slot per day.
dates = [
    '2020-02-06',
    '2020-02-07',
    '2020-02-08',
    '2020-02-09',
    '2020-02-10',
    '2020-02-11',
    '2020-02-12',
]

if __name__ == '__main__':
    # One worker process per chunk; this run only covers chunks 4 to 7.
    processes = [
        mp.Process(
            target=behavior_on_chunk,
            args=(directory, chunks[chunk_id], chunk_id, dates, True),
        )
        for chunk_id in (4, 5, 6, 7)
    ]

    # Start every worker before waiting on any of them so they run concurrently.
    for p in processes:
        p.start()

    # Block until each worker has finished.
    for p in processes:
        p.join()

1


In [None]:
## update on validation
# Same counting as for the training batches, applied to the files stored under
# data/test and to the following week of dates. Workers are called with
# init=False, so no engagement columns are expected in the files.

batch_path = '/home/maxime/Desktop/RecSys2020/data/test'
batch_list = list(filter(lambda name: isfile(join(batch_path, name)), listdir(batch_path)))
chunks = fonctions.chunkIt(batch_list, 8)

# Prefix handed to the workers; it is concatenated with each file name.
directory = '/home/maxime/Desktop/RecSys2020/data/test/'

# The training run used 2020-02-06 .. 2020-02-12; this run uses the next 7 days.
dates = [
    '2020-02-13',
    '2020-02-14',
    '2020-02-15',
    '2020-02-16',
    '2020-02-17',
    '2020-02-18',
    '2020-02-19',
]

if __name__ == '__main__':
    # One worker process per chunk, identified by its position in `chunks`.
    processes = [
        mp.Process(
            target=behavior_on_chunk,
            args=(directory, chunk, chunk_id, dates, False),
        )
        for chunk_id, chunk in enumerate(chunks)
    ]

    # Start every worker before waiting on any of them so they run concurrently.
    for p in processes:
        p.start()

    # Block until each worker has finished.
    for p in processes:
        p.join()

1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
4
4
4
4
4
4
4
4
5
5
5
5
5
5
5
5
6
6
6
6
6
6
6
6
7
7
7
7
7
7
7
7
8
8
8
8
8
8
8
8
9
9
9
9
9
9
9
9
10
10
10
10
10
10
10
10
11
11
11
11
11
11
11
11
12
12
12
12
12
12
12
12
13
13
13
13
13
13
13
13
14
14
14
14
14
14
14
15
14
15
15
15
15
15
15
15
16
16
16
16
16
16
16
16
17
17
17
17
17
17
17
18
17
18
18
18
18
18
18
18
19
19
19
19
19
19
19
19
20
20
20
20
20
20
20
20
21
21
21
21
21
21
21
21
22
22
22
22
22
22
22
22
23
23
23
23
23
23
23
23
24
24
24
24
24
24
24
25
24
25
25
25
25
25
25
25
26
26
26
26
26
26
26
26
27
27
27
27
27
27
27
27
28
28
28
