In [1]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import time
import pandas as pd
import collections
import re
import pprint as pp
import numpy as np
import collections

import multiprocessing as mp
from multiprocessing.pool import ThreadPool

import math
import gzip
import pickle as pkl
from datetime import datetime
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split 
from sklearn.metrics import precision_recall_curve, auc, log_loss

import tensorflow as tf

import fonctions
import itertools

from os import listdir
from os.path import isfile, join

import random
random.seed(0)

Using TensorFlow backend.


In [2]:
def trend_processing(x):
    """Split a tab-separated string into its fields.

    Parameters
    ----------
    x : object
        Raw cell value. Only values whose type is exactly ``str`` are split.

    Returns
    -------
    list of str or float
        The pieces of ``x`` obtained by splitting on runs of one or more tab
        characters, or NaN when ``x`` is not a ``str``.
    """
    # Guard clause: anything that is not a plain string is reported as NaN.
    if type(x) is not str:
        return float('Nan')
    return re.split(r'\t+', x)

def get_trends(directory, doc_name, init):
    """Load one batch file and return its tweet ids with hour-truncated timestamps.

    The file is read as a header-less table whose fields are separated by the
    0x01 control character. Column names are assigned positionally from the
    feature list below, followed by the four engagement-timestamp label
    columns when ``init`` is truthy.

    Parameters
    ----------
    directory : str
        Folder containing the batch file; a trailing path separator is
        optional.
    doc_name : str
        Name of the batch file inside ``directory``.
    init : bool
        Truthy when the file carries the four label columns after the
        features, falsy when it only holds the feature columns.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``tweet_id`` and ``tweet_timestamp``. The timestamp is a
        string of the form ``'YYYY-MM-DD HH'`` (UTC), i.e. the Unix timestamp
        truncated to the hour.
    """
    all_features = ["text_tokens", "hashtags", "tweet_id",
                    "present_media", "present_links",
                    "present_domains", "tweet_type", "language",
                    "tweet_timestamp", "engaged_with_user_id",
                    "engaged_with_user_follower_count", "engaged_with_user_following_count",
                    "engaged_with_user_is_verified", "engaged_with_user_account_creation",
                    "engaging_user_id", "engaging_user_follower_count",
                    "engaging_user_following_count", "engaging_user_is_verified",
                    "engaging_user_account_creation", "engagee_follows_engager"]
    labels = ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']

    # join() tolerates a directory given with or without its trailing slash,
    # unlike plain string concatenation.
    df = pd.read_csv(join(directory, doc_name), encoding="utf-8", sep='\x01', header=None)

    df.columns = all_features + labels if init else all_features

    # Vectorised conversion of Unix seconds to naive UTC datetimes, then keep
    # only the date and the hour. This yields the same text as
    # str(datetime.utcfromtimestamp(ts))[0:13] without a per-row Python loop
    # and without the deprecated utcfromtimestamp call.
    epoch_seconds = df['tweet_timestamp'].astype('int64')
    df['tweet_timestamp'] = pd.to_datetime(epoch_seconds, unit='s').dt.strftime('%Y-%m-%d %H')

    return df[['tweet_id', 'tweet_timestamp']]


def present_tweet_on_chunk(directory, chunk, chunk_id, init,
                           output_dir='/home/maxime/Desktop/RecSys2020/trends/'):
    """Count tweet-id occurrences over a chunk of batch files and pickle the counts.

    Each batch file is loaded with ``get_trends`` and the number of rows per
    ``tweet_id`` is accumulated in a single ``collections.Counter``. After
    each file a progress line ``"<file index> <seconds> <distinct ids>"`` is
    printed. The final counter is written as a gzip-compressed pickle.

    Parameters
    ----------
    directory : str
        Folder containing the batch files; forwarded to ``get_trends``.
    chunk : iterable of str
        Names of the batch files to process.
    chunk_id : int
        Identifier inserted into the output file name.
    init : bool
        Forwarded to ``get_trends``. When truthy the result is saved as
        ``tweet_presence_<chunk_id>.pkl.gz``, otherwise as
        ``update1_tweet_presence_<chunk_id>.pkl.gz``.
    output_dir : str, optional
        Folder receiving the pickle. Defaults to the location that was
        previously hard-coded.

    Returns
    -------
    bool
        Always ``True`` once the counter has been saved.
    """
    tweet_presence = collections.Counter()

    for iteration, batch_file in enumerate(chunk, start=1):
        t1 = time.time()

        df = get_trends(directory, batch_file, init)
        # Counter.update adds the counts of ids already seen and inserts the
        # new ones, so no manual split between known and unknown ids is needed.
        tweet_presence.update(collections.Counter(df.tweet_id))

        t2 = time.time()

        print(str(iteration)+' '+str(round(t2-t1,3))+' '+str(len(tweet_presence)))

    print('saving...')

    # Same truthiness test as get_trends, so both functions always agree on
    # which kind of file is being processed.
    prefix = '' if init else 'update1_'
    out_path = join(output_dir, '{}tweet_presence_{}.pkl.gz'.format(prefix, chunk_id))
    with gzip.open(out_path, 'wb') as f:
        pkl.dump(tweet_presence, f)

    return True

def update(tweet_presence, k, v):
    """Add ``v`` to the value stored under key ``k`` of ``tweet_presence``.

    The current value is read with ``tweet_presence[k]``, so the mapping must
    either already contain ``k`` or supply a default for missing keys (as
    ``collections.Counter`` does). The mapping is modified in place and
    nothing is returned.
    """
    current = tweet_presence[k]
    tweet_presence[k] = current + v



In [4]:
%%time

# Training batches: every regular file found in the batches folder is one batch.
batch_path = '/home/maxime/Desktop/RecSys2020/data/batches'
batch_list = list(filter(lambda name: isfile(join(batch_path, name)), listdir(batch_path)))
chunks = fonctions.chunkIt(batch_list, 8)
directory = '/home/maxime/Desktop/RecSys2020/data/batches/'

if __name__ == '__main__':
    # One worker process per chunk; only chunks 4 to 7 are handled by this cell.
    processes = [
        mp.Process(target=present_tweet_on_chunk,
                   args=(directory, chunks[chunk_id], chunk_id, True))
        for chunk_id in range(4, 8)
    ]

    # Launch all workers so they run concurrently.
    for p in processes:
        p.start()

    # Block until every worker has finished.
    for p in processes:
        p.join()

1 0.46 29752
1 0.451 29775
1 0.546 29721
1 0.545 29716
2 0.434 59052
2 0.422 59033
2 0.561 59042
2 0.555 59029
3 0.555 88030
3 0.51 88012
3 0.558 88084
3 0.513 88004
4 0.449 116824
4 0.473 116848
4 0.502 116727
4 0.586 116746
5 0.489 145376
5 0.526 145322
5 0.523 145203
5 0.532 145235
6 0.42 173601
6 0.615 173699
6 0.441 173536
6 0.627 173467
7 0.449 201660
7 0.455 201563
7 0.537 201750
7 0.447 201535
8 0.435 229701
8 0.565 229563
8 0.603 229469
8 0.431 229465
9 0.471 257251
9 0.518 257464
9 0.474 257268
9 0.541 257328
10 0.472 284867
10 0.47 285086
10 0.445 284983
10 0.495 284836
11 0.452 312335
11 0.505 312583
11 0.549 312414
11 0.552 312245
12 0.495 339672
12 0.505 339880
12 0.526 339690
12 0.538 339528
13 0.474 366808
13 0.571 366933
13 0.504 366937
13 0.604 366721
14 0.514 393866
14 0.49 394023
14 0.469 394039
15 0.452 420749
14 0.528 393737
15 0.447 421015
15 0.485 420978
16 0.524 447668
15 0.495 420620
16 0.594 447812
16 0.553 447767
17 0.429 474446
16 0.483 447463
17 0.507 4744

122 0.455 2977776
126 0.548 3067925
124 0.489 3023228
124 0.48 3021792
123 0.482 2999909
127 0.468 3089984
125 0.473 3045313
124 0.466 3022048
125 0.557 3043964
126 0.453 3067380
128 0.663 3111827
126 0.45 3065943
125 0.559 3044298
127 0.473 3089453
127 0.472 3088002
129 0.626 3133806
126 0.576 3066517
128 0.492 3111496
128 0.47 3110182
130 0.548 3155826
127 0.496 3088646
129 0.578 3133450
129 0.576 3132125
128 0.573 3110721
131 0.602 3177833
130 0.529 3155359
130 0.49 3153929
129 0.477 3132752
132 0.472 3199786
131 0.455 3175922
131 0.529 3177467
130 0.531 3154680
133 0.515 3221818
132 0.606 3197886
132 0.615 3199443
131 0.445 3176613
134 0.47 3243778
133 0.484 3219780
133 0.485 3221347
132 0.496 3198593
135 0.495 3265662
134 0.461 3243121
136 0.45 3287721
134 0.592 3241823
133 0.587 3220591
135 0.48 3264859
137 0.524 3309569
134 0.469 3242380
135 0.591 3263718
136 0.5 3286667
136 0.458 3285592
138 0.573 3331423
135 0.562 3264320
137 0.441 3308545
137 0.498 3307628
136 0.583 3286163
1

235 0.526 5368281
239 0.487 5450740
241 0.548 5492549
239 0.497 5450361
236 0.591 5388718
240 0.519 5471168
242 0.534 5512928
240 0.538 5470601
237 0.494 5409118
241 0.642 5491498
243 0.533 5533305
241 0.556 5490804
238 0.471 5429303
242 0.513 5511801
244 0.52 5553591
239 0.507 5449728
242 0.621 5511031
243 0.505 5532102
245 0.556 5573837
240 0.556 5469976
243 0.506 5531331
244 0.558 5552295
241 0.498 5490294
244 0.524 5551590
245 0.564 5572665
246 1.083 5594085
242 0.562 5510482
245 0.55 5571697
247 0.481 5614277
243 0.492 5530742
246 0.605 5591935
246 1.121 5592815
248 0.576 5634504
244 0.498 5550976
247 0.495 5612977
249 0.505 5654545
245 0.51 5571159
247 0.973 5612071
248 0.486 5633108
250 0.535 5674613
246 0.602 5591337
248 0.488 5632403
249 0.569 5653404
251 0.534 5694750
249 0.559 5652690
250 0.59 5673504
252 0.554 5714791
247 1.04 5611548
250 0.601 5672899
251 0.503 5693765
253 0.509 5734861
248 0.518 5631750
251 0.485 5692994
252 0.543 5713712
254 0.557 5754948
249 0.485 56519

349 0.575 7619308
354 0.51 7718418
354 0.48 7714718
355 0.573 7738044
350 0.477 7638474
355 0.566 7737456
355 0.499 7733956
356 0.521 7757247
351 0.577 7657507
356 0.48 7756696
356 0.477 7753192
357 0.544 7776274
357 0.485 7775820
352 0.565 7676691
357 0.477 7772215
358 0.525 7795237
358 0.507 7794977
358 0.469 7791510
353 0.628 7695831
359 0.451 7814020
359 0.645 7814346
359 0.456 7810565
354 0.546 7714901
360 0.471 7833150
360 0.511 7829677
360 0.637 7833441
355 0.627 7734062
361 0.461 7852295
361 0.462 7848730
361 0.571 7852577
356 0.499 7753320
362 0.51 7871400
362 0.545 7867727
362 0.584 7871602
357 0.507 7772381
363 0.529 7890477
363 0.503 7886704
364 0.46 7909572
363 0.64 7890736
358 0.644 7791501
364 0.456 7905793
365 0.496 7928409
364 0.55 7909792
365 0.488 7924887
359 0.586 7810494
366 0.559 7947574
366 0.475 7943919
365 0.567 7928947
360 0.469 7829648
367 0.489 7966575
367 0.487 7962925
361 0.514 7848761
366 0.546 7948156
368 0.487 7982014
368 0.548 7985592
367 0.493 7967228

469 0.51 9873273
469 0.523 9870263
463 0.47 9760738
470 0.662 9893335
470 0.585 9891657
470 0.561 9888447
471 0.508 9911671
464 0.614 9778940
471 0.472 9910035
471 0.513 9906815
472 0.499 9929958
465 0.504 9797338
472 0.495 9928275
472 0.514 9925126
473 0.535 9948215
466 0.552 9815753
473 0.516 9946413
473 0.505 9943527
474 0.547 9966529
467 0.492 9834028
474 0.532 9964663
475 0.467 9984893
474 0.557 9961866
468 0.484 9852470
475 0.566 9982993
475 0.501 9980106
476 0.518 10003237
469 0.559 9870719
476 0.544 10001383
476 0.504 9998526
477 0.503 10021546
470 0.502 9889081
477 0.488 10019520
477 0.538 10017029
478 0.523 10039680
471 0.485 9907598
478 0.496 10037852
479 0.478 10057942
472 0.467 9925958
478 0.573 10035331
479 0.581 10056234
480 0.561 10076273
473 0.519 9944071
479 0.494 10053669
480 0.504 10074554
481 0.47 10094464
474 0.555 9962415
480 0.574 10071945
481 0.566 10092997
482 0.478 10112672
481 0.462 10090129
475 0.565 9980693
482 0.57 10111214
483 0.504 10130865
482 0.567 10

In [3]:
%%time

# Test batches: every regular file found in the test folder is one batch.
batch_path = '/home/maxime/Desktop/RecSys2020/data/test'
batch_list = list(filter(lambda name: isfile(join(batch_path, name)), listdir(batch_path)))
chunks = fonctions.chunkIt(batch_list, 8)
directory = '/home/maxime/Desktop/RecSys2020/data/test/'

if __name__ == '__main__':
    # One worker process per chunk, numbered by the chunk's position in the list.
    processes = [
        mp.Process(target=present_tweet_on_chunk,
                   args=(directory, chunks[chunk_id], chunk_id, False))
        for chunk_id in range(len(chunks))
    ]

    # Launch all workers so they run concurrently.
    for p in processes:
        p.start()

    # Block until every worker has finished.
    for p in processes:
        p.join()

1 0.774 29742
1 0.769 29748
1 0.794 29702
1 0.825 29705
1 0.827 29702
1 0.849 29720
1 0.882 29738
1 0.876 29727
2 0.714 58944
2 0.731 58980
2 0.768 58934
2 0.793 58961
2 0.804 58983
2 0.812 59037
2 0.854 58949
2 0.864 59002
3 0.746 87841
3 0.815 87859
3 0.776 87869
3 0.779 87904
3 0.78 87878
3 0.798 87880
3 0.841 87910
3 0.819 87870
4 0.724 116398
4 0.81 116420
4 0.796 116450
4 0.767 116461
4 0.744 116467
4 0.788 116545
4 0.825 116406
4 0.799 116487
5 0.77 144730
5 0.775 144768
5 0.737 144786
5 0.854 144812
5 0.872 144753
5 0.762 144820
5 0.783 144747
5 0.754 144781
6 0.874 172865
6 1.023 172788
6 0.954 172899
6 0.935 172925
6 1.084 172857
6 1.077 172859
6 1.046 172765
6 1.058 172789
7 0.972 200682
7 0.946 200724
7 0.886 200815
7 0.987 200802
7 0.86 200774
7 0.993 200731
7 0.947 200634
7 0.981 200637
8 0.753 228366
8 0.758 228326
8 0.764 228485
8 0.74 228479
8 0.846 228384
8 0.818 228257
8 0.869 228319
8 0.921 228297
9 0.81 255928
9 0.865 255836
9 0.862 255796
9 0.859 255914
9 0.797 25