In [17]:
import pandas as pd
import feather as ft
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from math import log, e
import time
import pickle

In [18]:
""" import useful functions"""
from entropy_functions import shannon_entropy, entropy, cross_entropy,LZ_entropy

""" Compute predictability given the lengths of sequences and the LZ-entropy"""
import mpmath

# The estimate is only computed for sufficiently large N: `e` is the minimum N.
# Below that threshold NaN is returned instead of solving the equation.
def getPredictability(N, S, e=100):
    """Solve ((1-x)/(N-1))**(1-x) * x**x == 2**(-S) for x.

    N: size used in the equation and compared against the threshold `e`.
    S: entropy value; it enters the equation as 2**(-S).
    e: minimum N for which the equation is solved.

    Returns the real part of the root found by mpmath.findroot (started at
    x = 1) as a float, or np.nan when N is not >= e.
    """
    if not N >= e:
        return np.nan

    def residual(x):
        # Left-hand side minus right-hand side of the equation above.
        return (((1-x)/(N-1)) **(1-x))* x**x - 2**(-S)

    solution = mpmath.findroot(residual, 1)
    return float(solution.real)

In [19]:
""" Mobility dataset import"""
df_wp = ft.read_dataframe('data/weeplace_checkins_without_loc_NA.feather') # Weeplace check-ins with NA locations already removed

# # The same data can be rebuilt from the original csv file:
# df_wp= pd.read_csv('data/weeplace_checkins.csv')  # original Weeplace dataset without any processing; it still contains some NA locations
# df_wp = df_wp.dropna(subset=["placeid",'userid', 'datetime'])

"""Previous results of meetup information import"""
# NOTE: pickle.load can execute arbitrary code from the file; only load pickles
# produced by this project. The context manager closes the handle even when
# unpickling raises.
with open("meetup_store.pickle", "rb") as pickle_in:
    meetup_store = pickle.load(pickle_in)

In [4]:
# Unique user ids in order of first appearance. Unlike list(set(...)), whose order
# for string ids can change between interpreter runs (hash randomization), this
# order is the same on every run, so positional slices of user_list are reproducible.
user_list = df_wp['userid'].unique().tolist()

In [5]:
# Read the pickle file holding the per-user place/time tables.
# NOTE: pickle.load can execute arbitrary code from the file; only load pickles
# produced by this project. The context manager closes the handle even when
# unpickling raises.
with open("user_placeidT.pickle", "rb") as pickle_in:
    user_placeidT = pickle.load(pickle_in)

In [None]:
""" meetup class define """

""" import useful functions"""
from entropy_functions import shannon_entropy, entropy, cross_entropy,LZ_entropy

""" Compute predictability given the lengths of sequences and the LZ-entropy"""
import mpmath
import numpy as np

# The equation is only solved when N reaches the threshold `e`; for smaller N the
# function returns NaN.
# NOTE(review): this repeats the getPredictability defined in an earlier cell;
# running this cell rebinds the name to this copy.
def getPredictability(N, S, e=100):
    """Return x solving ((1-x)/(N-1))**(1-x) * x**x == 2**(-S), or NaN for small N.

    N: size used in the equation and compared against the threshold `e`.
    S: entropy value; it enters the equation as 2**(-S).
    e: minimum N for which the equation is solved.

    The root is searched with mpmath.findroot starting at x = 1 and its real
    part is returned as a float. np.nan is returned when N is not >= e.
    """
    if not N >= e:
        return np.nan

    def gap(x):
        # Difference between both sides of the equation; zero at the solution.
        return (((1-x)/(N-1)) **(1-x))* x**x - 2**(-S)

    found = mpmath.findroot(gap, 1)
    return float(found.real)

class MeetupStrategy():
    """Cross-entropy and predictability of an ego's place sequence given its alters.

    The class relies on names defined outside of it: ``pd``, ``np``,
    ``cross_entropy``, ``LZ_entropy`` and ``getPredictability``.
    """

    def __init__(self, userlist, user_meetup, placeidT, epsilon=2):
        """Store the three inputs shared by every computation.

        They are passed in precomputed (rather than derived from the raw dataset)
        so that they are not rebuilt each time the class is used.

        userlist: sequence of user ids; user_info slices it.
        user_meetup: DataFrame with at least 'userid_x' (ego) and 'userid_y'
            (alter) columns.
        placeidT: mapping user id -> DataFrame with a 'placeid' column and an
            index that pd.to_datetime can convert.
        epsilon: forwarded as `e` to LZ_entropy and getPredictability.
        """
        self.userlist = userlist
        self.user_meetup = user_meetup
        self.placeidT = placeidT
        self.epsilon = epsilon

    def extract_info(self, user):
        """Extract the temporal-spatial information of one user.

        Returns (times, number of distinct place ids, number of visits, place ids),
        where times and place ids are lists in the order stored in placeidT[user].
        """
        user_temporal_placeid = self.placeidT[user]
        user_time = pd.to_datetime(user_temporal_placeid.index).tolist()
        user_placeid = user_temporal_placeid['placeid'].tolist()
        N_uniq_placeid = len(set(user_placeid))
        N_placeid = len(user_placeid)

        return user_time, N_uniq_placeid, N_placeid, user_placeid

    def cross_entropy_pair(self, length_ego, alters_L, ave_length):
        """Cumulative cross entropy from several match-length sequences.

        length_ego: length of the ego's visited place id sequence.
        alters_L: cross-parsed match lengths, one equally long sequence per source;
            the element-wise maximum over the sources is used.
        ave_length: weighted average length of the sources.
        """
        alters_Lmax = np.amax(alters_L, axis=0)
        return (1.0*length_ego/sum(alters_Lmax)) * np.log2(ave_length)

    def weight(self, ego_L, alter_L=None):
        """Weight of a source: len(ego_L) for the ego itself, otherwise the number
        of elements of ego_L that also occur in alter_L."""
        if alter_L is None:
            return len(ego_L)
        # count how many elements of ego_L are in alter_L
        return sum(x in alter_L for x in ego_L)

    @staticmethod
    def _weighted_length(lengths, weights):
        """Weighted average sum(w*l)/sum(w); NaN when the weights sum to zero."""
        total = float(np.sum(weights))
        if total == 0:
            return np.nan
        return float(np.dot(lengths, weights)) / total

    def cross_entropy_element(self, ego_time, ego_placeid, ego_L, alter, alters, L, length_alters, wb=None):
        """Compute one result row for `alter`, the (rank)-th alter of the ego.

        ego_time, ego_placeid: the ego's times and place ids (see extract_info).
        ego_L: the ego's own match lengths (LZ_entropy(..., lambdas=True)).
        alter: id of the alter handled by this call; alters: ordered alter ids.
        L, length_alters: per-ego lists indexed like `alters`. This call writes
            slot alters.index(alter) of both and reads all earlier slots, so
            alters must be processed in order.
        wb: optional per-ego list of weights shared across calls in the same way;
            when omitted, the weights of earlier alters are recomputed from L.

        Returns [alter, rank, weight, CE_alter, CCE_alters, CCE_ego_alter,
        CCE_ego_alters, Pi_alter, Pi_alters, Pi_ego_alter, Pi_ego_alters].

        NOTE(review): the sequence length is what is passed as N to
        getPredictability - confirm this is the intended N.
        """
        length_ego = len(ego_placeid)
        alterid = alters.index(alter)

        # included rank is j+1
        rank = alterid + 1

        alter_time, _, length_alter, alter_placeid = self.extract_info(alter)
        length_alters[alterid] = length_alter

        # Be careful: W1 in cross_entropy is B in the paper, W2 in cross_entropy is A.
        # cross_entropy needs PTs: the position of each ego time in the merged,
        # sorted ego+alter timeline (first occurrence when times repeat).
        total_time = sorted(ego_time + alter_time)
        first_position = {}
        for position, stamp in enumerate(total_time):
            first_position.setdefault(stamp, position)
        PTs = [first_position[stamp] for stamp in ego_time]

        # --- this alter only ---
        CE_alter = cross_entropy(alter_placeid, ego_placeid, PTs)
        Pi_alter = getPredictability(length_ego, CE_alter, e=self.epsilon)

        # cross-parsed match lengths of the ego against this alter
        L[alterid] = cross_entropy(alter_placeid, ego_placeid, PTs, lambdas=True)
        if wb is None:
            # No shared accumulator: rebuild the weights of the earlier alters.
            wb = [self.weight(ego_L, earlier_L) for earlier_L in L[:alterid]] + [None]
        wb[alterid] = self.weight(ego_L, L[alterid])
        ego_weight = self.weight(ego_L)

        # --- all alters up to and including this one ---
        # Copies, so that the caller's accumulators are not extended below.
        alters_L = list(L[:alterid+1])
        alters_length = list(length_alters[:alterid+1])
        alters_weight = list(wb[:alterid+1])

        ave_length = self._weighted_length(alters_length, alters_weight)
        CCE_alters = self.cross_entropy_pair(length_ego, alters_L, ave_length)
        Pi_alters = getPredictability(length_ego, CCE_alters, e=self.epsilon)

        # --- only this alter + ego ---
        ego_alter_L = [ego_L, L[alterid]]
        ave_length = self._weighted_length([length_alters[alterid], length_ego],
                                           [wb[alterid], ego_weight])
        CCE_ego_alter = self.cross_entropy_pair(length_ego, ego_alter_L, ave_length)
        Pi_ego_alter = getPredictability(length_ego, CCE_ego_alter, e=self.epsilon)

        # --- all alters up to this one + ego ---
        ave_length = self._weighted_length(alters_length + [length_ego],
                                           alters_weight + [ego_weight])
        CCE_ego_alters = self.cross_entropy_pair(length_ego, alters_L + [ego_L], ave_length)
        Pi_ego_alters = getPredictability(length_ego, CCE_ego_alters, e=self.epsilon)

        return [alter, rank, wb[alterid],
                CE_alter, CCE_alters, CCE_ego_alter, CCE_ego_alters,
                Pi_alter, Pi_alters, Pi_ego_alter, Pi_ego_alters,
               ]

    def ego_meetup(self, ego, tempsave=False):
        """Build the meetup table of one ego, one row per alter.

        The ego's rows of user_meetup are merged (on 'userid_y') with the values
        returned by cross_entropy_element. With tempsave=True the result is also
        appended, without header, to 'user-meetup-part.csv'.

        Alters are ranked in the row order of user_meetup.
        """
        # extract information of ego
        ego_time, length_ego_uni, length_ego, ego_placeid = self.extract_info(ego)

        df_ego_meetup = self.user_meetup[self.user_meetup['userid_x']==ego]
        alters = df_ego_meetup['userid_y'].tolist()

        ego_L = LZ_entropy(ego_placeid, e=self.epsilon, lambdas=True)

        # Accumulators filled slot by slot while the alters are processed in order.
        n_alters = len(alters)
        L = [None] * n_alters
        length_alters = [None] * n_alters
        wb = [None] * n_alters

        ego_info = [self.cross_entropy_element(ego_time, ego_placeid, ego_L, alter, alters,
                                               L, length_alters, wb) for alter in alters]
        # Column spellings are kept exactly as in the original notebook.
        ego_info = pd.DataFrame(ego_info, columns=[
            'userid_y', 'Included Rank', 'Weight',
            'CE_alter', 'CCE_alters', 'CCE_ego_alter', 'CCE_ego_atlers',
            'Pi_alter', 'Pi_alters', 'Pi_ego_alter', 'Pi_ego_atlers',
        ])

        # combine two parts of meetup information
        merged = pd.merge(df_ego_meetup, ego_info, on='userid_y')

        if tempsave:
            merged.to_csv('user-meetup-part.csv', index=False, mode='a', header=False)

        return merged

    def user_info(self, start=0, end=None):
        """Run ego_meetup for userlist[start:end], save and return the combined table.

        end=None (the default) means "up to the end of userlist". The combined
        table is written to the file 'user-meetup-full'.
        """
        if end is None:
            end = len(self.userlist)
        meetup_list = [self.ego_meetup(ego) for ego in self.userlist[start:end]]
        user_meetup = pd.concat(meetup_list)

        # save the file
        user_meetup.to_csv('user-meetup-full', index=False)

        return user_meetup

In [5]:
def func(x):
    """Return [x, x*x, x*x*x] as a list."""
    squared = x * x
    return [x, squared, squared * x]

In [21]:
# One [x, x*x, x*x*x] row per x in 0..9.
a = list(map(func, range(10)))

In [22]:
# Turn the list of rows into a frame with columns 'a', 'b', 'c'.
b = pd.DataFrame(data=a, columns=list('abc'))

In [23]:
# Display b as the cell output.
b

Unnamed: 0,a,b,c
0,0,0,0
1,1,1,1
2,2,4,8
3,3,9,27
4,4,16,64
5,5,25,125
6,6,36,216
7,7,49,343
8,8,64,512
9,9,81,729


In [11]:
# Write one "key,value" line per entry of my_dict to test.csv; the value is the
# str() of the nested list, so it contains commas of its own.
my_dict = {'a': [[1,2,4],[1,2,4]], 'b': [[2,4,5],[1,2,4]], 'c': [[3,4,5],[1,2,4]]}
with open('test.csv', 'w') as f:
    for key, rows in my_dict.items():
        f.write("%s,%s\n" % (key, rows))

In [15]:
# Check of the counting expression used by MeetupStrategy.weight: how many
# elements of the first list also occur in the second one.
# Dedicated names are used so that the DataFrame `b` built in an earlier cell is
# not replaced by a list (a later cell still calls b.to_csv).
ego_sample = [1, 1, 2, 1, 5]
alter_sample = [4, 3, 3, 2, 1]

sum(x in alter_sample for x in ego_sample)

4

In [31]:
# Append b's rows (no header, no index column) to my_csv.csv; running this cell
# again appends the same rows once more.
b.to_csv('my_csv.csv', header=False, index=False, mode='a')

Unnamed: 0,userid_x,userid_y,meetup,percent
14,fred-wilson,andrew-parker,10,0.035088
25,fred-wilson,bijan-sabet,6,0.021053
34,fred-wilson,caroline-mccarthy,5,0.017544
139,fred-wilson,mark-g,5,0.017544
69,fred-wilson,eric-spiegelman,4,0.014035
...,...,...,...,...
88,fred-wilson,jaime-punishill,1,0.003509
89,fred-wilson,jake-dwyer,1,0.003509
90,fred-wilson,james-sims,1,0.003509
92,fred-wilson,jamie-dubs-wilkinson,1,0.003509


In [34]:
# Small list used to check slicing bounds.
a = list(range(1, 5))

In [35]:
# The stop index is exclusive: this shows the first three elements.
a[:3]

[1, 2, 3]