In [2]:
import pandas as pd
import feather as ft
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from math import log, e
import time
import pickle

In [3]:
""" import useful functions"""
from entropy_functions import shannon_entropy, entropy, cross_entropy,LZ_entropy

""" Compute predictability given the lengths of sequences and the LZ-entropy"""
import mpmath

# As required by algorithm, N should be large, we set e as the threshold of N. 
# if it is smaller than threshold, we will just print NA
def getPredictability(N, S, e=100):
    if N >= e:
        f = lambda x: (((1-x)/(N-1)) **(1-x))* x**x - 2**(-S)
        root = mpmath.findroot(f, 1)
        return float(root.real)
    else: 
        return np.nan

In [4]:
""" Mobility dataset import"""
df_wp = ft.read_dataframe('data/weeplace_checkins_without_loc_NA.feather') # it is the dataset without NA location

# # it will be the same if you use the original csv file.
# df_wp= pd.read_csv('data/weeplace_checkins.csv')  # this is original Weeplace dataset without any processing, including some NA location
# df_wp = df_wp.dropna(subset=["placeid",'userid', 'datetime'])

"""Previous results of meetup information import"""
pickle_in = open("meetup_store.pickle", "rb")
meetup_store = pickle.load(pickle_in)
pickle_in.close()

In [31]:
user_list = list(set(df_wp['userid'].tolist()))

user_meetup=pd.concat(meetup_store)
user_meetup=user_meetup.rename(columns = {'count':'meetup'})

In [6]:
# read picle file
pickle_in = open("user_placeidT.pickle", "rb")
user_placeidT = pickle.load(pickle_in)
pickle_in.close()

In [73]:
""" meetup class define """

""" import useful functions"""
from entropy_functions import shannon_entropy, entropy, cross_entropy,LZ_entropy

""" Compute predictability given the lengths of sequences and the LZ-entropy"""
import mpmath
import numpy as np

# As required by algorithm, N should be large, we set e as the threshold of N. 
# if it is smaller than threshold, we will just print NA
def getPredictability(N, S, e=100):
    if N >= e:
        f = lambda x: (((1-x)/(N-1)) **(1-x))* x**x - 2**(-S)
        root = mpmath.findroot(f, 1)
        return float(root.real)
    else: 
        return np.nan

class MeetupStrategy:
    """
    Create a MeetupStrategy class to include all the computation
    """
    def __init__(self, userlist, user_meetup, placeidT, epsilon=2, user_stats=None):
        """
        MeetupStrategy needs to have several important inputs, 
        userlist: list, all userid
        user_meetup: DataFrame, cols = ['userid_x', 'userid_y', 'meetup', 'percentage']
        placeidT: dict, include all the users' temporal placeid, keys are the userids
        epsilon: int, shortest length we considered in our computation
        user_stats: DataFrame, cols = ['userid_x', 'userid_y', 'meetup', 'percentage', and other statas]
        
        The top three inputs can be replaced as the whole dataset, but it will cost more time any time call the calss
        """
        self.userlist = userlist 
        self.user_meetup = user_meetup
        self.placeidT = placeidT
        self.epsilon = epsilon
        self.user_stats = user_stats
        
        
    def extract_info(self, user):
        """ Public function: extract temporal-spatial information for each user 
        Arg:
            user: a string, a userid
        
        Return:
            user_time: datetime, user's timestamps
            N_uniq_placeid: int, the number user's unique visited placeids
            N_placeid: int, the number of user's visited placeids
            user_placeid: list, time-ordered visited placeid in a list
        """
        user_temporal_placeid = self.placeidT[user]
        user_time = pd.to_datetime(user_temporal_placeid.index).tolist()
        user_placeid = user_temporal_placeid['placeid'].tolist()
        N_uniq_placeid = len(set(user_placeid))
        N_placeid = len(user_placeid)
        
        return user_time, N_uniq_placeid, N_placeid, user_placeid
    
    
    def cross_entropy_pair(self, length_ego, alters_L, ave_length):
        """ public method: Compute cross entropy for a pair of ego and alters
        Args:
            length_ego: list, the length of the visited placedid sequence.
            alters_L: list, cross-parsed match legnths for the alters given
            ave_length: float, the weighted average lengths of all the users in B
        
        Return:
            float, cross entropy for a pair of ego and alters
        """
        alters_Lmax = np.amax(alters_L, axis=0)
        return float((1.0*length_ego/sum(alters_Lmax)) * np.log2(ave_length))
    

    def weight(self, ego_L, alter_L=None):
        """ Public method, compute how important of alter for ego"""
        if alter_L is None:
            return len(ego_L)
        else:
            # count how many elements of ego_L is in alter_L
            return sum(x in alter_L for x in ego_L)
    
    
    def __cross_entropy_element(self, ego_time, ego_placeid, ego_L, alter, alters, L, wb, length_alters):
        """ Private method (recursive structure): compute cross entropy related to statistics 
        Args:
            ego_time: datetime,
            ego_placeid: list,
            ego_L: list, match length for ego
            alter: string, selected alter
            alters: string list, all the alters for ego
            L: nested list, match legnths for all alters before the selected alter
            wb: list, weights for for all alters before the selected alter
            length_alters: list, length of visited placeids for all alters before the selected alter
        
        Return:
            alter related information
        """
        length_ego = len(ego_placeid)
        alterid = alters.index(alter)
        
        # included rank is j+1
        rank = alterid + 1
        
        alter_time, length_alter_uniq, length_alter, alter_placeid = self.extract_info(alter)
        
        alter_log2 = np.log2(length_alter_uniq)
        """Be careful: W1 in cross_entropy is B in the paper, W2 is cross_entropy is A in the paper """        
        # so we need to get the relative time order of ego in alter (abosulte position of ego+alter)
        # for function cross_entropy, we need to have PTs        
        total_time = sorted(ego_time + alter_time)
        PTs = [total_time.index(x) for x in ego_time]
        
        """ function cross_entropy can return L, as defintion of cumulative cross entropy, we need to get max """
        
        # compute cross entropy with only this alter
        """ For alter"""
        CE_alter = cross_entropy(alter_placeid,ego_placeid, PTs)
        Pi_alter = getPredictability(length_ego, CE_alter, e=self.epsilon)
        
        """ For all above alters """
        # Obtain the basic information to extend L, wb, length_alters
        # obtain the cross-parsed match length
        L[alterid] = cross_entropy(alter_placeid,ego_placeid, PTs, lambdas=True)
        # TODO: define weight 
        wb[alterid] = self.weight(ego_L, L[alterid])
        # length of alter placeid
        length_alters[alterid] = length_alter
        
        # for alters: top above all alters
        alters_L = L[:alterid+1]
        alters_length = length_alters[:alterid+1]
        wb_length = wb[:alterid+1]
        
        # average lengths
        ave_length = np.array(alters_length) * np.array(wb_length) / sum(wb_length)
        
        # CCE for all above alters
        CCE_alters = self.cross_entropy_pair(length_ego, alters_L, ave_length)
        Pi_alters = getPredictability(length_ego, CCE_alters, e=self.epsilon)
        
        """For only this alter + ego"""
        # for only this alter and ego
        ego_alter_L = [ego_L, L[alterid]]
        bi_length = np.array([length_alters[alterid], length_ego])
        # TODO: weight 
        bi_weight = np.array([wb[alterid], self.weight(ego_L)])
        ave_length = np.mean(bi_length * bi_weight / sum(bi_weight))
        CCE_ego_alter = self.cross_entropy_pair(length_ego, ego_alter_L, ave_length)
        Pi_ego_alter = getPredictability(length_ego, CCE_ego_alter, e=self.epsilon)
        
        """For all above alters + ego"""
        # for ego+alters: top above all alters + ego
        alters_L.append(ego_L)
        alters_length.append(length_ego)
        # TODO: weight
        ego_alters_weight = wb[:alterid+1] + [self.weight(ego_L)]
        ave_length = np.mean(np.array(alters_length) * np.array(ego_alters_weight) / sum(ego_alters_weight))
        CCE_ego_alters = self.cross_entropy_pair(length_ego, alters_L, ave_length)
        Pi_ego_alters = getPredictability(length_ego, CCE_ego_alters, e=self.epsilon)
        
        return [alter, rank, wb[alterid], alter_log2,
                CE_alter, CCE_alters, CCE_ego_alter, CCE_ego_alters,
                Pi_alter, Pi_alters, Pi_ego_alter, Pi_ego_alters,
               ]

    def ego_meetup(self, ego, tempsave=False):
        # extraact information of ego and compute all the statistics for all egos
        ego_time, length_ego_uni, length_ego, ego_placeid = self.extract_info(ego)
        
        # compute the cumulative cross entropy for an ego
        alters = self.user_meetup[self.user_meetup['userid_x']==ego]['userid_y'].tolist()
        df_ego_meetup = self.user_meetup[self.user_meetup['userid_x']==ego]
        N_alters = len(alters)
        
        ego_L = LZ_entropy(ego_placeid, e=self.epsilon, lambdas=True)
        
        # initial space
        L = [None] * N_alters
        wb = [None] * N_alters
        length_alters = [None] * N_alters
        
        ego_stats = [self.__cross_entropy_element(ego_time, ego_placeid, ego_L, alter, alters, \
                                               L, wb, length_alters) for alter in alters]
        ego_stats = pd.DataFrame(ego_stats, columns=[
            'userid_y', 'Included Rank', 'Weight', 'alter_info',
            'CE_alter', 'CCE_alters', 'CCE_ego_alter', 'CCE_ego_alters',
            'Pi_alter', 'Pi_alters', 'Pi_ego_alter', 'Pi_ego_alters',
        ])
        
        # combine two parts of meetup information
        meetup_ego = pd.merge(df_ego_meetup, ego_stats, on='userid_y')
        
        if tempsave:
            ego_meetup.to_csv('user-meetup-part.csv', index=False, mode='a', header=False)    

        return meetup_ego
        
    def ego_alter_info(self, start=0, end=None, tempsave=False):
        """ Produce all the ego-alter information"""
        if end is None:
            end = len(self.userlist)
        
        meetup_list = [self.ego_meetup(ego, tempsave=tempsave) for ego in self.egolist[start:end]]
        self.user_stats = pd.concat(meetup_list)
        
        # save the file 
        user_stats.to_csv('user-meetup-full.csv', index=False)
        
        return self.user_stats
    
    def ego_info(self, start=0, end=None, tempsave=False):
        
        if end is None:
            end = len(self.userlist)
            
        ego_time, length_ego_uni, length_ego, ego_placeid = zip(*[self.ego_info(ego) \
                                                                  for ego in self.egolist[start:end]])
        N =  end - start
        ego_LZ_entropy = [LZ_entropy(ego_placeid[i], e=self.epsilon) \
                          for i in range(N)]
        Pi_ego = [getPredictability(length_ego[i], ego_LZ_entropy[i], e=self.epsilon) \
                  for i in range(N)]
        ego_log2 = list(length_ego_uni)
        
        df_ego = [self.userlist[start:end], ego_log2, ego_LZ_entropy, Pi_ego]
        
        df_ego = pd.DataFrame(df_egof, columns=['userid_x', 'ego_info', 'LZ_entropy', 'Pi'])
        
        if tempsave:
            df_ego.to_csv('user-ego-info.csv', index=False)
        
        return df_ego

In [74]:
we_meet = MeetupStrategy(user_list, user_meetup, user_placeidT)
we_meet.ego_alter_info(end=2)

TypeError: only size-1 arrays can be converted to Python scalars

In [62]:
getPredictability(699, [9])

TypeError: bad operand type for unary -: 'list'

In [17]:
b[0:5][1]

2