In [26]:
import pandas as pd
import csv
import pickle
import numpy as np
from IPython.display import display
import multiprocessing
from multiprocessing import Pool
import math
from tqdm import tqdm
import time
import traceback
import json

In [27]:
# Base directory for the pickle files loaded and saved below. File names are
# appended with plain string concatenation (`path + "name.pkl"`), so the
# trailing slash is required.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
path = "/Users/jay/MSC_WSBDA/MSc_Thesis/Msc_project/Data/"

In [28]:
def load_pickle_file(pickled_file):
    """Load and return the object stored in a pickle file.

    Prints the file being read and the number of entries loaded.

    Args:
        pickled_file: Path of the pickle file to read.

    Returns:
        The unpickled object. It must support ``len()`` because the entry
        count is reported after loading.

    Raises:
        OSError: If the file cannot be opened.
        TypeError: If the unpickled object has no length.

    Note:
        ``pickle.load`` can execute arbitrary code; only use this on files
        from a trusted source.
    """
    print(f'Loading data file from {pickled_file}')
    # The context manager closes the handle even if unpickling or len() fails.
    with open(pickled_file, 'rb') as infile:
        unpickled_file = pickle.load(infile)
    print(f'Loaded {len(unpickled_file)} entries')
    return unpickled_file
          
    
def save_pickle_file(path, data):
    """Pickle ``data`` to the file at ``path``, logging before and after.

    Args:
        path: Destination file path; an existing file is overwritten.
        data: Any picklable object.
    """
    start_message = 'Dumping data to path {}'.format(path)
    done_message = 'Finished dumping data to path {}'.format(path)

    print(start_message)
    with open(path, 'wb') as out_file:
        pickle.dump(data, out_file)
    print(done_message)


def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    An empty sequence yields 0.0 because the divisor is clamped to at least 1.
    """
    total = float(sum(numbers))
    count = len(numbers)
    divisor = count if count >= 1 else 1
    return total / divisor


def safe_division(x, y):
    """Return ``x / y``, or 0 when the divisor ``y`` equals zero."""
    return 0 if y == 0 else x / y



In [29]:
# Load the user table and renumber its rows 0..N-1 so that positional slicing
# and index labels agree.
users = load_pickle_file(path + "80000users_3.pkl").reset_index(drop=True)

Loading data file from /Users/jay/MSC_WSBDA/MSc_Thesis/Msc_project/Data/80000users_3.pkl
Loaded 70013 entries


In [30]:
# Empty frame that fixes the column layout of the simulation table; the rows
# appear once per-user columns are assigned to it.
network_simulation = pd.DataFrame(
    columns=(
        ['id', 'time_lapsed']
        # raw counters
        + ['favourites_count', 'followers_count', 'friends_count',
           'listed_count', 'statuses_count']
        + ['exposed_source_candidates', 'source_index', 'seed_index',
           'generation', 'time_since_seed', 'user_created_days']
        # normalized counters
        + ['normalized_statuses_count', 'normalized_followers_count',
           'normalized_favourites_count', 'normalized_listed_count',
           'normalized_friends_count']
    )
)


In [31]:
# Simulation column -> `users` column it is copied from. 'id' comes first, as
# in the original cell; assigning a Series to the still-empty frame is what
# gives it the users' row index.
user_column_for = {
    'id': 'user_id',
    'favourites_count': 'user_favourites_count',
    'followers_count': 'followers_count',
    'friends_count': 'friends_count',
    'listed_count': 'user_listed_count',
    'statuses_count': 'user_statuses_count',
    'user_created_days': 'user_created_days',
    'normalized_statuses_count': 'normalized_user_statuses_count',
    'normalized_followers_count': 'normalized_user_followers_count',
    'normalized_favourites_count': 'normalized_user_favourites_count',
    'normalized_listed_count': 'normalized_user_listed_count',
    'normalized_friends_count': 'normalized_user_friends_count',
}
for simulation_column, user_column in user_column_for.items():
    network_simulation[simulation_column] = users[user_column]

# Disabled: time-dependent columns. They rely on a `current_time` value that
# is not defined in the cells shown here, so these columns stay unpopulated.
#network_simulation['time_lapsed'] = users['time_lapsed'].apply(lambda x: x if x <= current_time else None)
#network_simulation['source_index'] = users.apply(lambda x: x['source_index'] if x['time_lapsed'] <= current_time else None,axis=1)
#network_simulation['seed_index'] = users.apply(lambda x: x['seed_index'] if x['time_lapsed'] <= current_time else None,axis=1)
#network_simulation['generation'] = users.apply(lambda x: x['generation'] if x['time_lapsed'] <= current_time else None,axis=1)
#network_simulation['time_since_seed'] = users.apply(lambda x: x['time_since_seed'] if x['time_lapsed'] <= current_time else None,axis=1)


In [32]:
# Extra column beyond the initial column list: the `source_candidates` entry of
# each user, stored under the name "friends_list".
network_simulation["friends_list"] = users['source_candidates']

In [33]:
# Extra column beyond the initial column list: the `target_nodes` entry of each
# user, stored under the name "followers_list" (process_data uses the same
# source column for its 'followers_list' feature).
network_simulation["followers_list"] = users['target_nodes']

In [34]:
# Per-user degree lists: in-degree is taken from friends_count and out-degree
# from followers_count.
in_degree = list(users.friends_count)
out_degree = list(users.followers_count)
# Total degree per user. Note that `in_degree + out_degree` on two Python
# lists concatenates them (2N entries) rather than adding them, so the sum is
# computed element-wise here.
degree = [k_in + k_out for k_in, k_out in zip(in_degree, out_degree)]

In [35]:
def process_data(start_index, end_index):
    """Build the initial feature table for ``users[start_index:end_index]``.

    Reads the module-level ``users`` DataFrame. For each user in the slice the
    profile attributes are copied into the ``UsM_*`` columns (plus ``user_id``
    and ``followers_list``); every other feature is a placeholder: ``False``
    for ``infected_status``, ``0`` for ``SNw_nFriendsInfected`` and ``None``
    for the rest.

    Args:
        start_index: First row position of the slice (inclusive).
        end_index: Last row position of the slice (exclusive).

    Returns:
        pandas.DataFrame with one row per user in the slice, a fresh 0..n-1
        index and 92 columns in a fixed order.

    Raises:
        KeyError: If ``users`` lacks one of the copied source columns (only
            raised when the slice is non-empty).
    """
    # Slice once and take its length directly instead of materialising every
    # row through iterrows() just to count them.
    chunk = users[start_index:end_index]
    n_rows = len(chunk)

    # UsM (user metadata) feature stem -> source column in `users`.
    usm_source = {
        'deltaDays': 'user_created_days',
        'statusesCount': 'user_statuses_count',
        'followersCount': 'followers_count',
        'favouritesCount': 'user_favourites_count',
        'friendsCount': 'friends_count',
        'listedCount': 'user_listed_count',
        'normalizedUserStatusesCount': 'normalized_user_statuses_count',
        'normalizedUserFollowersCount': 'normalized_user_followers_count',
        'normalizedUserFavouritesCount': 'normalized_user_favourites_count',
        'normalizedUserListedCount': 'normalized_user_listed_count',
        'normalizedUserFriendsCount': 'normalized_user_friends_count',
    }

    # Feature column -> `users` column it is copied from, row by row.
    copied_from_users = {'user_id': 'user_id', 'followers_list': 'target_nodes'}
    copied_from_users.update(
        {'UsM_' + stem: column for stem, column in usm_source.items()}
    )

    # Placeholder columns whose initial value is not None.
    constant_defaults = {'infected_status': False, 'SNw_nFriendsInfected': 0}

    # Full output column order; consumers may rely on it, so it is kept fixed.
    column_order = (
        # Columns added for the simulation, not used as model features
        ['user_id', 'infected_status', 'infection_time', 'followers_list']
        # UsM: user metadata for the user itself (''), then the '0' and '-1' variants
        + ['UsM_' + stem + suffix
           for suffix in ('', '0', '-1')
           for stem in usm_source]
        # TwM: Tweet metadata
        + ['TwM_t0', 'TwM_tSeed0', 'TwM_t-1', 'TwM_tSeed-1', 'TwM_tCurrent']
        # Nw: Network
        + ['Nw_' + stem + suffix
           for suffix in ('', '0', '-1', 'Seed0', 'Seed-1')
           for stem in ('degree', 'inDegree', 'outDegree')]
        # SNw: Spreading Network
        + ['SNw_nFriendsInfected',
           'SNw_friendsInfectedRatio',
           'SNw_generation0',
           'SNw_generation-1',
           'SNw_timeSinceSeed0',
           'SNw_timeSinceSeed-1',
           'SNw_totalNodesInfected',
           'SNw_nodeInfectedCentrality',
           'SNw_totalInDegree',
           'SNw_totalOutDegree',
           'SNw_inDegreeCentrality',
           'SNw_inDegreeCentrality0',
           'SNw_inDegreeCentrality-1',
           'SNw_outDegreeCentrality',
           'SNw_outDegreeCentrality0',
           'SNw_outDegreeCentrality-1',
           'SNw_inDegreeCentralitySeed0',
           'SNw_outDegreeCentralitySeed0',
           'SNw_inDegreeCentralitySeed-1',
           'SNw_outDegreeCentralitySeed-1']
        # Stat: Statistical
        + ['Stat_average_kOut', 'Stat_average_t']
        + ['Stat_average_' + stem for stem in usm_source]
        + ['Stat_max_kOut', 'Stat_min_kOut']
    )

    # Copy the per-user values, reporting progress one row at a time.
    copied_values = {name: [] for name in copied_from_users}
    with tqdm(total=n_rows) as pbar:
        for _, user_row in chunk.iterrows():
            for name, source_column in copied_from_users.items():
                copied_values[name].append(user_row[source_column])
            pbar.update(1)

    # Assemble the columns in their fixed order; anything not copied is a
    # constant placeholder column (None unless listed in constant_defaults).
    features = {}
    for name in column_order:
        if name in copied_values:
            features[name] = copied_values[name]
        else:
            features[name] = [constant_defaults.get(name)] * n_rows

    processed_dataframe = pd.DataFrame(features)
    return processed_dataframe

In [36]:
# One worker per CPU core; each worker builds the features for one contiguous
# [start_index, end_index) slice of `users`.
number_of_processes = multiprocessing.cpu_count()
print('Will start {} processes'.format(number_of_processes))

number_of_users = len(users.index)
task_size = math.ceil(number_of_users/number_of_processes)
parameters = []
for i in range(number_of_processes):
    start_index = i * task_size
    # Clamp the final slice so it never runs past the last user.
    end_index = min((i + 1) * task_size, number_of_users)
    parameters.append((start_index, end_index))

with Pool(number_of_processes) as pool:
    # starmap returns the per-slice frames in the same order as `parameters`.
    dataframe_results = pool.starmap(process_data, parameters)

# Stitch the per-slice frames into one table with a fresh 0..N-1 index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
result = pd.concat(dataframe_results, ignore_index=True)
save_pickle_file(path+"keynode_initial_features.pkl",result)
print('extracted {} of rows'.format(len(result.index)))
display(result)

# start_index = 0
# end_index = 70012

# features = process_data(start_index, end_index)
# save_pickle_file(path+"keynode_initial_features.pkl",features)
        
        

Will start 4 processes


100%|██████████| 17504/17504 [00:05<00:00, 3151.65it/s]
100%|██████████| 17501/17501 [00:05<00:00, 3172.00it/s]
100%|██████████| 17504/17504 [00:05<00:00, 3158.21it/s]
100%|██████████| 17504/17504 [00:05<00:00, 3139.90it/s]


Dumping data to path /Users/jay/MSC_WSBDA/MSc_Thesis/Msc_project/Data/keynode_initial_features.pkl
Finished dumping data to path /Users/jay/MSC_WSBDA/MSc_Thesis/Msc_project/Data/keynode_initial_features.pkl
extracted 70013 of rows


Unnamed: 0,user_id,infected_status,infection_time,followers_list,UsM_deltaDays,UsM_statusesCount,UsM_followersCount,UsM_favouritesCount,UsM_friendsCount,UsM_listedCount,...,Stat_average_favouritesCount,Stat_average_friendsCount,Stat_average_listedCount,Stat_average_normalizedUserStatusesCount,Stat_average_normalizedUserFollowersCount,Stat_average_normalizedUserFavouritesCount,Stat_average_normalizedUserListedCount,Stat_average_normalizedUserFriendsCount,Stat_max_kOut,Stat_min_kOut
0,214328887,False,,"[34428380, 56860418, 157829215, 158419434, 149...",2679,201,2090,548,46,65,...,,,,,,,,,,
1,34428380,False,,"[31331740, 21548772, 17759158, 17868918, 40981...",3246,5695,305257,38,181,4929,...,,,,,,,,,,
2,17116707,False,,"[28465635, 231238695, 121533789, 86221475, 808...",3417,5124,11756,31,498,93,...,,,,,,,,,,
3,28465635,False,,"[30971165, 297801196, 259842341, 123371682, 19...",3266,35499,57992,4449,1482,894,...,,,,,,,,,,
4,380580781,False,,"[18996905, 400689940, 31331740, 19358562, 2043...",2359,20720,15033,1968,219,45,...,,,,,,,,,,
5,18996905,False,,"[15023872, 18951737, 17868918, 34428380, 22462...",3344,14602,284075,2158,68,1934,...,,,,,,,,,,
6,221036078,False,,"[153460275, 22462180, 196327549, 117674417, 29...",2660,6933,2121,309,206,12,...,,,,,,,,,,
7,153460275,False,,"[43003845, 19493072, 22462180, 34428380, 15133...",2834,807,4710,74,35,49,...,,,,,,,,,,
8,107830991,False,,"[17868918, 40981798, 100581193, 461410856, 274...",2970,3021,11627,350,934,63,...,,,,,,,,,,
9,17868918,False,,"[31331740, 8088112, 117674417, 17759158, 40981...",3385,4153,16996,9,552,238,...,,,,,,,,,,


In [37]:
# Persist the simulation frame assembled above (per-user attributes plus the
# friends_list / followers_list columns) in the same data directory.
save_pickle_file(path+"network_simulation_keynode_initial.pkl",network_simulation)

Dumping data to path /Users/jay/MSC_WSBDA/MSc_Thesis/Msc_project/Data/network_simulation_keynode_initial.pkl
Finished dumping data to path /Users/jay/MSC_WSBDA/MSc_Thesis/Msc_project/Data/network_simulation_keynode_initial.pkl


In [50]:
# Check of the `time_lapsed` column. The column is declared when the frame is
# created, but the assignment that would fill it is commented out in the
# column-copy cell, so it appears to hold only missing values (recorded output: nan).
max(network_simulation['time_lapsed'])

nan

In [None]:
# Display the combined feature frame produced by the multiprocessing cell.
result