In [1]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd

import collections
import re
import pprint as pp
import numpy as np
import collections

import multiprocessing as mp
from multiprocessing.pool import ThreadPool

import math
import gzip
import pickle as pkl
from datetime import datetime
import matplotlib.pyplot as plt

import fonctions
import metric_loading

import itertools
from tqdm.notebook import tqdm

from os import listdir
from os.path import isfile, join

import random
random.seed(0)


Using TensorFlow backend.


In [2]:
def trend_processing(x):
    """Split a tab-separated field into a list of its items.

    Only exact ``str`` values are split (on runs of one or more tabs); any
    other value, e.g. the NaN pandas uses for an empty field, yields NaN.
    """
    if type(x) is not str:
        return float('Nan')
    return re.split(r'\t+', x)

def get_trends(directory, doc_name, init):
    """Load one batch file and keep only the columns used for domain trends.

    Parameters
    ----------
    directory : str
        Folder holding the batch. It is concatenated as-is with ``doc_name``,
        so it must end with a path separator.
    doc_name : str
        File name of the batch inside ``directory``.
    init : bool
        True when the file carries the four trailing engagement-timestamp
        columns (24 columns in total); False when it only has the 20 feature
        columns.

    Returns
    -------
    pandas.DataFrame
        The four engagement columns converted to 0/1 flags (only when ``init``
        is true), ``present_domains`` (list of domains, or NaN when the field
        is empty) and ``tweet_timestamp`` formatted as ``YYYY-MM-DD HH`` (UTC).
    """
    # Local import: only this function needs it and the notebook's import cell
    # brings in `datetime` alone.
    from datetime import timezone

    all_features = ["text_tokens", "hashtags", "tweet_id",
                    "present_media", "present_links",
                    "present_domains", "tweet_type", "language",
                    "tweet_timestamp", "engaged_with_user_id",
                    "engaged_with_user_follower_count", "engaged_with_user_following_count",
                    "engaged_with_user_is_verified", "engaged_with_user_account_creation",
                    "engaging_user_id", "engaging_user_follower_count",
                    "engaging_user_following_count", "engaging_user_is_verified",
                    "engaging_user_account_creation", "engagee_follows_engager"]
    labels = ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']

    df = pd.read_csv(directory+doc_name, encoding="utf-8", sep='\x01', header=None)

    if init:
        df.columns = all_features + labels
        # A missing timestamp means "no engagement": turn each column into a 0/1 flag.
        for label in labels:
            df[label] = df[label].notna().astype(int)
        kept_labels = labels
    else:
        df.columns = all_features
        # Unlabelled files have no engagement columns to keep. Previously
        # `labels` was undefined on this path and the final filter raised
        # NameError.
        kept_labels = []

    df['present_domains'] = [trend_processing(x) for x in df['present_domains']]
    # Keep the first 13 characters of the UTC datetime: date plus hour.
    df['tweet_timestamp'] = [
        str(datetime.fromtimestamp(int(date), tz=timezone.utc))[0:13]
        for date in df['tweet_timestamp']
    ]

    return df.filter(items=kept_labels + ['present_domains', 'tweet_timestamp'], axis=1)


def _merge_domain_counts(totals, pending):
    """Add each counter list of ``pending`` into ``totals``, inserting unseen domains."""
    for domain, counts in pending.items():
        if domain in totals:
            update_agg(totals, domain, counts)
        else:
            totals[domain] = counts


def domains_on_chunk(directory, chunk, chunk_id, init,
                     output_dir='/home/maxime/Desktop/RecSys2020/trends/'):
    """Count, per domain, how many engaged tweets of each kind mention it.

    Every batch file of ``chunk`` is loaded with ``get_trends``. Tweets without
    domains are dropped, and for each domain a 4-slot counter is maintained in
    the order ``[like, retweet, retweet_with_comment, reply]``. The resulting
    dict is pickled (gzip) into ``output_dir``.

    Parameters
    ----------
    directory : str
        Folder containing the batch files (passed through to ``get_trends``).
    chunk : iterable of str
        Batch file names to process.
    chunk_id : int
        Identifier inserted into the output file name.
    init : bool
        Selects the output file name: ``domains_ratio_<id>.pkl.gz`` when true,
        ``update_domains_ratio_<id>.pkl.gz`` otherwise.
    output_dir : str, optional
        Destination folder, concatenated as-is with the file name (so it must
        end with a path separator). Defaults to the original hard-coded path.

    Returns
    -------
    bool
        Always True.
    """
    domains_ratio = {}
    buff_domains_ratio = {}
    engagements = ['like_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'reply_timestamp']

    for iteration, batch_file in enumerate(chunk, start=1):

        # NOTE(review): batches are always read as labelled (third argument True),
        # independently of `init`; the counting below needs the engagement
        # columns. Presumably "update" batches are labelled too; confirm.
        df = get_trends(directory, batch_file, True)
        df = df[df['present_domains'].notna()]

        # sorted(set(...)) keeps the alphabetical insertion order np.unique gave,
        # while storing plain `str` keys instead of numpy.str_ in the pickled dict.
        links = sorted(set(itertools.chain.from_iterable(df.present_domains)))
        for domain in links:
            buff_domains_ratio.setdefault(domain, [0, 0, 0, 0])

        for idx, engagement in enumerate(engagements):
            engaged = df[df[engagement] == 1]
            buff_presence = collections.Counter(itertools.chain.from_iterable(engaged.present_domains))
            for domain, count in buff_presence.items():
                update_eng(buff_domains_ratio, domain, count, idx)

        # Periodically fold the per-batch buffer into the running totals. The
        # original list (4, 50, 100, ..., 550) is extended to every multiple of
        # 50 so that chunks longer than 550 batches keep flushing too.
        if iteration == 4 or iteration % 50 == 0:
            _merge_domain_counts(domains_ratio, buff_domains_ratio)
            buff_domains_ratio = {}
            print(len(domains_ratio))

        print(iteration)

    # Fold whatever is left since the last checkpoint.
    _merge_domain_counts(domains_ratio, buff_domains_ratio)

    print('cutting and saving...')

    prefix = 'domains_ratio' if init else 'update_domains_ratio'
    with gzip.open('{}{}_{}.pkl.gz'.format(output_dir, prefix, chunk_id), 'wb') as f:
        pkl.dump(domains_ratio, f)

    return True

def update_eng(domains_ratio, key,v, idx):
    """Increase slot ``idx`` of the counter list stored under ``key`` by ``v`` (in place)."""
    counters = domains_ratio[key]
    counters[idx] = counters[idx] + v

def update_agg(domains_ratio, k, v):
    """Replace the list stored under ``k`` by its element-wise sum with ``v``.

    A new list is stored (the previous one is left untouched); pairing stops
    at the shorter of the two sequences.
    """
    summed = []
    for current, extra in zip(domains_ratio[k], v):
        summed.append(current + extra)
    domains_ratio[k] = summed
    

In [3]:

# Fan the batch files out over worker processes; each worker writes its own
# per-chunk pickle (see domains_on_chunk).
batch_path='/home/maxime/Desktop/RecSys2020/data/batches'
# os.listdir returns names in arbitrary order: sort them so the composition of
# each chunk is reproducible from one run to the next.
batch_list = sorted(f for f in listdir(batch_path) if isfile(join(batch_path, f)))
chunks = fonctions.chunkIt(batch_list, 8)
directory = '/home/maxime/Desktop/RecSys2020/data/batches/'

if __name__ == '__main__':

    # One process per chunk; chunk_id is the chunk's position in `chunks`.
    processes = [ mp.Process(target=domains_on_chunk, args=(directory, chunk, chunk_id, True) ) for chunk_id, chunk in enumerate(chunks) ]

    # Run processes
    for p in processes:
        p.start()

    # Wait for every worker to finish
    for p in processes:
        p.join()

    # A crashed worker leaves its pickle missing or stale; fail loudly here
    # instead of letting later cells aggregate incomplete data.
    failed_chunks = [chunk_id for chunk_id, p in enumerate(processes) if p.exitcode != 0]
    if failed_chunks:
        raise RuntimeError('domains_on_chunk failed for chunk(s): {}'.format(failed_chunks))

1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3


In [4]:
# Merge the per-chunk pickles into two notebook-level dictionaries.
engagements = ['like_timestamp','retweet_timestamp','retweet_with_comment_timestamp','reply_timestamp']
global_domains_ratio = {}
global_domains_presence = {}

for chunk_id in range(8):

    print(chunk_id)

    # Engagement counters per domain for this chunk.
    with gzip.open('/home/maxime/Desktop/RecSys2020/trends/domains_ratio_{}.pkl.gz'.format(chunk_id), 'rb') as f:
        domains_ratio = pkl.load(f)

    # Domains already known are summed element-wise; new ones are inserted as-is.
    for k, v in domains_ratio.items():
        if k in global_domains_ratio:
            update_agg(global_domains_ratio, k, v)
        else:
            global_domains_ratio[k] = v

    # NOTE(review): the present_domains_presence_* files are not written by any
    # cell visible here; they must already exist on disk.
    with gzip.open('/home/maxime/Desktop/RecSys2020/trends/present_domains_presence_{}.pkl.gz'.format(chunk_id), 'rb') as f:
        domains_presence = pkl.load(f)

    # NOTE(review): update_agg adds element-wise and stops at the shorter
    # sequence; confirm that this is the intended merge for presence values.
    for k, v in domains_presence.items():
        if k in global_domains_presence:
            update_agg(global_domains_presence, k, v)
        else:
            global_domains_presence[k] = v

# Keep only the size of each merged presence value.
global_domains_presence = {k:len(v) for k,v in tqdm( global_domains_presence.items() ) }


0
1
2
3
4
5
6
7


HBox(children=(FloatProgress(value=0.0, max=330050.0), HTML(value='')))




In [5]:
def ratio_extraction(v):
    """Turn four engagement counters into a total plus per-type shares.

    ``v`` holds the counters in the order like, retweet, retweet-with-comment,
    reply. Returns ``[total, like_ratio, retweet_ratio, rtc_ratio,
    reply_ratio]`` where each ratio is the counter divided by the total,
    rounded to 3 decimals; all ratios are 0 when the total is 0.
    """
    total_engagement = sum(v)

    # No engagement at all: avoid dividing by zero.
    if total_engagement == 0:
        return [total_engagement, 0, 0, 0, 0]

    shares = [round(v[position] / total_engagement, 3) for position in range(4)]
    return [total_engagement] + shares

# [total, like, retweet, rtc, reply] ratios for every domain.
global_ratio2 = {}
for k, v in tqdm( global_domains_ratio.items() ):
    global_ratio2[k] = ratio_extraction(v)

engagements = ['like_timestamp','retweet_timestamp','retweet_with_comment_timestamp','reply_timestamp']

# For each engagement type keep the domains with more than 100 engagements in
# total and a share above 0.6 for that type, stored as (total, share).
computed_ratio = {}
for idx,engagement in enumerate(engagements):

    ratio_position = idx + 1  # slot 0 of each entry is the total
    kept = {}
    for k, v in tqdm(global_ratio2.items()):
        if v[0]>100 and v[ratio_position]>0.6:
            kept[k] = (v[0], v[ratio_position])
    computed_ratio[engagement] = kept


HBox(children=(FloatProgress(value=0.0, max=330050.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=330050.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=330050.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=330050.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=330050.0), HTML(value='')))




In [6]:
# Give every selected domain an integer index, one lookup table per engagement type.
like_id = list(computed_ratio['like_timestamp'])
print(len(like_id))
dictionnary_lk = {domain: position for position, domain in enumerate(like_id)}

retweet_id = list(computed_ratio['retweet_timestamp'])
print(len(retweet_id))
dictionnary_rt = {domain: position for position, domain in enumerate(retweet_id)}

rtc_id = list(computed_ratio['retweet_with_comment_timestamp'])
print(len(rtc_id))
dictionnary_rtc = {domain: position for position, domain in enumerate(rtc_id)}

reply_id = list(computed_ratio['reply_timestamp'])
print(len(reply_id))
dictionnary_rpl = {domain: position for position, domain in enumerate(reply_id)}


# Two pickles are written back to back into the same gzip stream, so a reader
# has to call pkl.load twice, in this order.
# NOTE(review): the rtc and reply tables are built above but not saved; confirm
# this is intended.
with gzip.open('/home/maxime/Desktop/RecSys2020/trends/domains_influence.pkl.gz','wb') as f:
    pkl.dump(dictionnary_lk, f)
    pkl.dump(dictionnary_rt, f)


3690
69
0
0
