# Artificial Friendship Construction and Exploration of the Potential Social Information in the Friendship Network

In [2]:
import pandas as pd
import feather as ft
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from math import log, e
import time
import pickle

## Important interim results import

In [6]:
""" Mobility dataset import"""
# Check-in data with the rows lacking a location already removed.
df_wp = ft.read_dataframe('data/weeplace_checkins_without_loc_NA.feather')

# Starting from the original csv file (which still includes some NA
# locations) gives the same result once the NA rows are dropped:
# df_wp = pd.read_csv('data/weeplace_checkins.csv')
# df_wp = df_wp.dropna(subset=["placeid", 'userid', 'datetime'])

"""Previous results of meetup information import"""
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load pickle files
# produced by this project.
with open("meetup_store.pickle", "rb") as pickle_in:
    meetup_store = pickle.load(pickle_in)

In [192]:
# Stack the stored meetup tables into a single frame and rename the
# 'count' column to 'meetup'.
user_meetup = pd.concat(meetup_store).rename(columns={'count': 'meetup'})

# Append the result columns that the later cells fill in; reindexing with
# labels that do not exist yet adds them as empty (NaN) columns.
user_meetup = user_meetup.reindex(
    columns=list(user_meetup.columns) + [
        'SN-E ego', 'LZ-E ego', 'Pi ego',
        'CE alter', 'KL alter', 'Pi alter', 'CE ego+alter', 'Pi ego+alter',
        'Weight', 'Impact', 'Included Rank',
        'CCE ego+alters', 'CCE alters',
        'Pi ego+alters', 'Pi alters',
    ]
)

In [193]:
# Display the combined meetup table; the appended result columns are still
# empty at this point.
user_meetup

Unnamed: 0,userid_x,userid_y,meetup,percent,SN-E ego,LZ-E ego,Pi ego,CE alter,KL alter,Pi alter,CE ego+alter,Pi ego+alter,Weight,Impact,Included Rank,CCE ego+alters,CCE alters,Pi ego_alters,Pi alters
14,fred-wilson,andrew-parker,10,0.035088,,,,,,,,,,,,,,,
25,fred-wilson,bijan-sabet,6,0.021053,,,,,,,,,,,,,,,
34,fred-wilson,caroline-mccarthy,5,0.017544,,,,,,,,,,,,,,,
139,fred-wilson,mark-g,5,0.017544,,,,,,,,,,,,,,,
69,fred-wilson,eric-spiegelman,4,0.014035,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,mark-van-der-poel,simon-colijn,22,0.733333,,,,,,,,,,,,,,,
2,mark-van-der-poel,rene-sijnke,5,0.166667,,,,,,,,,,,,,,,
0,mark-van-der-poel,celine-lucas,1,0.033333,,,,,,,,,,,,,,,
1,mark-van-der-poel,jordi-wiegerink,1,0.033333,,,,,,,,,,,,,,,


## Obtain the users' temporal-spatial trajectories

In [20]:
# Distinct user ids, in order of first appearance in df_wp.
# Series.unique() is used instead of set() so the order of user_list (and
# therefore the processing order of any loop over it) is the same on every
# run; the iteration order of a set of strings can change between
# interpreter sessions.
user_list = df_wp['userid'].unique().tolist()
N_users = len(user_list)

In [21]:
# # # Stored as a dict keyed by user id, because dict lookups are much faster.
# # This produces every user's placeid sequence indexed and sorted by datetime.
# user_placeidT = {user_list[i]: df_wp[df_wp['userid'] ==user_list[i]].set_index('datetime').sort_index()[['placeid']] \
#                                for i in range(N_users)}

# # save pickle file
# pickle_out = open("user_placeidT.pickle", "wb")
# pickle.dump(user_placeidT, pickle_out)
# pickle_out.close()

In [24]:
# Read the pickle file holding each user's placeid sequence.
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load pickle files
# produced by this project.
with open("user_placeidT.pickle", "rb") as pickle_in:
    user_placeidT = pickle.load(pickle_in)

## Import useful functions

In [25]:
""" import useful functions"""
from entropy_functions import shannon_entropy, entropy, cross_entropy,LZ_entropy

""" Compute predictability given the lengths of sequences and the LZ-entropy"""
import mpmath

def getPredictability(N, S, e=100):
    """Return the predictability obtained by solving the Fano-type equation.

    N : value used both for the threshold test and as ``N - 1`` in the
        equation (callers in this file pass the length of a sequence).
    S : entropy estimate; enters the equation as ``2 ** (-S)``.
    e : threshold for N. The algorithm requires N to be large, so when N is
        below this threshold no root is computed and NaN is returned.

    The root is searched with ``mpmath.findroot`` starting from 1, and the
    real part of the solution is returned as a float.
    """
    if not (N >= e):
        return np.nan

    def fano_residual(x):
        # Zero of this expression is the predictability for the given N, S.
        return (((1 - x) / (N - 1)) ** (1 - x)) * x ** x - 2 ** (-S)

    solution = mpmath.findroot(fano_residual, 1)
    return float(solution.real)

In [None]:
""" compute different entropies, meetup, weights, and predictability"""
# Local import keeps this cell self-contained; bisect is standard library.
from bisect import bisect_left

for ego in user_list:
    # Boolean mask of the rows that belong to this ego. It is built once and
    # reused for every write below; userid_x / userid_y are never modified
    # in this loop, so the mask stays valid.
    ego_rows = user_meetup['userid_x'] == ego

    # Find all the alters for the ego, in the row order of user_meetup.
    # NOTE(review): 'Included Rank' below assumes user_meetup is already
    # sorted so that the first row is the most frequent meetup friend --
    # confirm against the code that produced meetup_store.
    alters = user_meetup.loc[ego_rows, 'userid_y'].tolist()
    if not alters:
        # No rows to fill for this ego, so every assignment below would be a
        # no-op; skip the entropy computations entirely.
        continue

    ego_temporal_placeid = user_placeidT[ego]
    ego_time = pd.to_datetime(ego_temporal_placeid.index).tolist()
    ego_placeid = ego_temporal_placeid['placeid'].tolist()
    length_ego = len(ego_placeid)

    # compute shannon entropy
    user_meetup.loc[ego_rows, 'SN-E ego'] = shannon_entropy(ego_placeid)

    # compute entropy (it is LZ-entropy without filter); the second call
    # (lambdas=True) returns the per-position values that are combined with
    # the alters' values further down.
    ego_LZ_entropy = entropy(ego_placeid)
    ego_L = entropy(ego_placeid, lambdas=True)
    user_meetup.loc[ego_rows, 'LZ-E ego'] = ego_LZ_entropy

    # compute predictability (use Fano equation and solve the equation)
    user_meetup.loc[ego_rows, 'Pi ego'] = getPredictability(length_ego, ego_LZ_entropy, e=2)

    # Per-alter accumulators, appended in rank order (index j <-> alters[j]).
    # For cumulative cross entropy we take the largest L over all alters seen
    # so far and then sum it.
    L = []
    weight = []
    impact = []
    length_alters = []

    for j, alter in enumerate(alters):
        # Rows for this (ego, alter) pair; reused for all writes below.
        pair_rows = ego_rows & (user_meetup['userid_y'] == alter)

        # included rank is j+1
        user_meetup.loc[pair_rows, 'Included Rank'] = j + 1

        alter_temporal_placeid = user_placeidT[alter]
        alter_time = pd.to_datetime(alter_temporal_placeid.index).tolist()
        alter_placeid = alter_temporal_placeid['placeid'].tolist()

        # Be careful: W1 in cross_entropy is B in the paper, W2 in
        # cross_entropy is A in the paper.
        # cross_entropy needs PTs: the position of each ego timestamp within
        # the merged, sorted ego+alter timeline. total_time is sorted and
        # contains every ego timestamp, so bisect_left returns exactly the
        # first-occurrence index that list.index() would, without the
        # linear scan per timestamp.
        total_time = sorted(ego_time + alter_time)
        PTs = [bisect_left(total_time, t) for t in ego_time]

        # compute cross entropy with this alter
        alter_CE = cross_entropy(alter_placeid, ego_placeid, PTs)
        user_meetup.loc[pair_rows, 'CE alter'] = alter_CE
        # compute KL divergence with this alter, cross entropy - entropy
        user_meetup.loc[pair_rows, 'KL alter'] = alter_CE - ego_LZ_entropy
        # compute the Pi if only use alter's information to predict ego's future
        user_meetup.loc[pair_rows, 'Pi alter'] = getPredictability(length_ego, alter_CE, e=2)

        # cross_entropy can also return L (lambdas=True); the cumulative
        # cross entropy is defined on the element-wise max of these.
        L.append(cross_entropy(alter_placeid, ego_placeid, PTs, lambdas=True))
        matches = np.array(L[j]) - 1
        # weight: how many times a subsequence of A appears in B. A length
        # > 1 means at least a (length-1) subsequence appears in the match
        # sequence.
        weight.append(np.count_nonzero(matches))
        # impact is total substring lengths appear in B
        impact.append(sum(matches))

        length_alters.append(len(alter_placeid))

        # for alters: this alter together with all higher-ranked alters
        alters_L = list(L)
        alters_Lmax = np.amax(alters_L, axis=0)
        alters_length = list(length_alters)
        CCE_alters = (1.0 * length_ego / sum(alters_Lmax)) * np.log2(np.mean(alters_length))
        user_meetup.loc[pair_rows, 'CCE alters'] = CCE_alters
        user_meetup.loc[pair_rows, 'Pi alters'] = getPredictability(length_ego, CCE_alters, e=2)

        # for only this alter and ego
        ego_alter_Lmax = np.amax([ego_L, L[j]], axis=0)
        bi_length = [length_ego, length_alters[j]]
        CE_ego_alter = (1.0 * length_ego / sum(ego_alter_Lmax)) * np.log2(np.mean(bi_length))
        user_meetup.loc[pair_rows, 'CE ego+alter'] = CE_ego_alter
        user_meetup.loc[pair_rows, 'Pi ego+alter'] = getPredictability(length_ego, CE_ego_alter, e=2)

        # for ego+alters: this alter, all higher-ranked alters, and the ego
        alters_L.append(ego_L)
        ego_alter_CLmax = np.amax(alters_L, axis=0)
        alters_length.append(length_ego)
        CCE_ego_alters = (1.0 * length_ego / sum(ego_alter_CLmax)) * np.log2(np.mean(alters_length))
        user_meetup.loc[pair_rows, 'CCE ego+alters'] = CCE_ego_alters
        user_meetup.loc[pair_rows, 'Pi ego+alters'] = getPredictability(length_ego, CCE_ego_alters, e=2)

    # Normalise weight and impact so they sum to 1 across this ego's alters;
    # the values line up with the ego's rows because `alters` was read from
    # those same rows in the same order.
    # NOTE(review): when the total is 0 this is a division by zero -- confirm
    # that the resulting values are what downstream analysis expects.
    user_meetup.loc[ego_rows, 'Weight'] = np.array(weight) / sum(weight)
    user_meetup.loc[ego_rows, 'Impact'] = np.array(impact) / sum(impact)

user_meetup.to_csv('user_meetup_full.csv', index=False)