In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import timeit
import math
import random
from scipy.optimize import fsolve
from mpmath import *
from datetime import datetime, timedelta
from scipy.optimize import curve_fit
#import xarray

In [2]:
# Load the check-in table and parse its 'datetime' column into pandas timestamps,
# so that per-user time lists can be sorted and shifted later on.
wf = pd.read_csv(
    "../../../Data_Sets/weeplaces/weeplaces/weeplace_checkins.csv"
).assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))

In [3]:
def Lempel_Ziv(ego, checkins=None):
    '''Entropy estimator for the time dependent string of locations of a given user.

    The user's check-ins are read in table order, each location being the
    (lon, lat) pair rounded to 4 decimals.  For every position i the parser
    grows a substring starting at i until one of three things happens:
      * it becomes longer than any substring recorded so far,
      * it reaches the last check-in, or
      * it is not among the recorded substrings of the same length.
    The substring is then recorded, and its length is added to Lambda_i.

    Parameters
    ----------
    ego : value compared against the 'userid' column.
    checkins : DataFrame with 'userid', 'lon' and 'lat' columns, optional.
        Defaults to the notebook-level table `wf`.
        NOTE(review): rows of one user are assumed to already be in
        chronological order; nothing here sorts them.

    Returns
    -------
    list with the elements
        [0] entropy estimate  n*log2(n)/Lambda_i  (nan when the user has no rows)
        [1] the dictionary of the ego: entry m-1 holds the recorded substrings of length m
        [2] Lambda_i, the summed substring lengths
        [3] n, the length of the location string
        [4] w_b, how many substrings stopped on a mismatch after at least one match
        [5] the entropy estimate after each position (entropy w.r.t. time)
    '''
    frame = wf if checkins is None else checkins
    # Filter once and reuse the rows for both coordinates.
    rows = frame[frame['userid'] == ego]
    track = list(zip(rows['lon'].round(4).to_list(), rows['lat'].round(4).to_list()))
    lenx = len(track)
    if lenx == 0:
        # Same values the formula yields for an empty string (0*log2(0)/0 -> nan),
        # returned explicitly so no numpy runtime warnings are raised.
        return [float('nan'), [], 0, 0, 0, [0]]

    # The first check-in seeds the dictionary with a substring of length 1.
    dict_ego = [[[track[0]]]]
    maxlen = 1
    Lambda_i = 1
    wb = 0
    h_t = [0]
    last = lenx - 1
    for i in range(1, lenx):
        substr = []
        j = 0
        while True:
            substr.append(track[i + j])
            strlen = len(substr)
            if strlen > maxlen:
                # Longer than anything recorded: open a new length bucket.
                maxlen += 1
                dict_ego.append([substr])
                break
            bucket = dict_ego[strlen - 1]
            if i + j == last:
                # Ran into the end of the string: record and stop.
                bucket.append(substr)
                break
            if substr in bucket:
                j += 1
                continue
            if j > 0:
                wb += 1
            bucket.append(substr)
            break
        Lambda_i += j + 1
        h_t.append((i + 1) * np.log2(i + 1) / Lambda_i)
    return [lenx * np.log2(lenx) / Lambda_i, dict_ego, Lambda_i, lenx, wb, h_t]

In [165]:
def _user_track(frame, user, delay_hours=False):
    '''Return (lons, lats, times) of one user's rows, coordinates rounded to 4 decimals.

    When `delay_hours` is truthy every timestamp is shifted later by that many hours.
    '''
    rows = frame[frame['userid'] == user]
    lons = rows['lon'].round(4).to_list()
    lats = rows['lat'].round(4).to_list()
    times = rows['datetime'].to_list()
    if delay_hours:
        shift = timedelta(hours=delay_hours)
        times = [stamp + shift for stamp in times]
    return lons, lats, times


def _cross_parse(ego_points, ego_times, alter_points, alter_times):
    '''Parse the ego's locations against the alter's substrings; return (w_b, Lambda).

    Both tracks are merged on their timestamps.  Alter positions record substrings;
    ego positions are matched against what the alter has recorded up to then.
    '''
    n_ego = len(ego_points)
    n_alter = len(alter_points)
    # 'A' marks an ego visit, 'B' an alter visit.  Sorting the (time, label) pairs
    # interleaves the two tracks; on equal timestamps 'A' sorts before 'B'.
    labels = ['B'] * n_alter + ['A'] * n_ego
    stamps = list(alter_times) + list(ego_times)
    labels = [who for _, who in sorted(zip(stamps, labels))]
    # Nothing after the ego's last visit can contribute, so stop there.  This
    # replaces re-scanning the remaining labels on every iteration.
    last_ego_pos = max((pos for pos, who in enumerate(labels) if who == 'A'), default=-1)

    seen_by_alter = []  # seen_by_alter[m-1]: alter substrings of length m recorded so far
    maxlen = 0          # longest substring length opened so far (by either track)
    matches = 0
    total_len = 0
    i_ego = 0
    i_alter = 0
    for pos in range(last_ego_pos + 1):
        substr = []
        j = 0
        if labels[pos] == 'A':
            while True:
                substr.append(ego_points[i_ego + j])
                strlen = len(substr)
                if strlen > maxlen:
                    # New maximal length: open an (empty) bucket on the alter side.
                    maxlen += 1
                    seen_by_alter.append([])
                    break
                if i_ego + j == n_ego - 1:
                    break
                if substr in seen_by_alter[strlen - 1]:
                    j += 1
                    continue
                if j > 0:
                    # Stopped on a mismatch after at least one matched prefix.
                    matches += 1
                break
            total_len += j + 1
            i_ego += 1
        else:
            while True:
                substr.append(alter_points[i_alter + j])
                strlen = len(substr)
                if strlen > maxlen:
                    maxlen += 1
                    seen_by_alter.append([substr])
                    break
                bucket = seen_by_alter[strlen - 1]
                if i_alter + j == n_alter - 1:
                    bucket.append(substr)
                    break
                if substr in bucket:
                    j += 1
                    continue
                bucket.append(substr)
                break
            i_alter += 1
    return matches, total_len


def CrossEntropy(ego, alters, **kwargs):
    '''Cross entropy of the ego's location string with respect to one or more alters.

    Parameters
    ----------
    ego : value compared against the 'userid' column.
    alters : a single user id, or a list/tuple of user ids.  With several alters
        the k-th returned entropy is cumulative over alters 0..k.
    kwargs:
        with_ego: bool, True implies we include the ego in the cumulative cross
            entropy (its self-parse becomes the first entry of wb and Lambda_i).
        temporal_control: bool, True shuffles the order of each alter's locations
            while keeping its timestamps, using the `random` module.
        delay: number of hours added to every alter timestamp before the tracks
            are merged (0/False means no shift).
        checkins: DataFrame with 'userid', 'lon', 'lat' and 'datetime' columns.
            Defaults to the notebook-level table `wf`.
            NOTE(review): rows of one user are assumed to already be in
            chronological order; only the ego/alter interleaving is sorted.

    Returns
    -------
    [cross_ent, wb, Lambda_i]
        cross_ent: list of (cumulative) cross entropies, one per alter; an entry is
            nan when no match has been counted up to that alter.
        wb: list of match counts, one per alter (plus a leading ego entry if with_ego).
        Lambda_i: list of summed cross-parsed match lengths, aligned with wb.
    '''
    temporal_control = kwargs.get('temporal_control', False)
    delay0 = kwargs.get('delay', False)
    with_ego = kwargs.get('with_ego', False)
    frame = kwargs.get('checkins')
    if frame is None:
        frame = wf

    alter_ids = list(alters) if isinstance(alters, (list, tuple)) else [alters]

    # One (points, times) pair per alter, so timestamps and locations stay aligned
    # by construction no matter how many alters are passed.
    alter_tracks = []
    for usr in alter_ids:
        lons, lats, times = _user_track(frame, usr, delay0)
        if temporal_control:
            # Shuffle the locations of *this* alter; the timestamps keep their order.
            order = list(range(len(lons)))
            random.shuffle(order)
            lons = [lons[p] for p in order]
            lats = [lats[p] for p in order]
        alter_tracks.append((list(zip(lons, lats)), times))
    N_alters = [len(points) for points, _ in alter_tracks]

    ego_lons, ego_lats, time_ego = _user_track(frame, ego)
    ego_points = list(zip(ego_lons, ego_lats))
    N_ego = len(ego_points)

    Lambda_i = []
    wb = []
    cross_ent = []
    if with_ego:
        # The ego parsed against itself acts as an extra, leading "alter".
        own = CrossEntropy(ego, ego, delay=delay0, checkins=frame)
        wb.append(own[1][0])
        Lambda_i.append(own[2][0])
        N_alters.insert(0, N_ego)

    for alter_points, alter_times in alter_tracks:
        matches, total_len = _cross_parse(ego_points, time_ego, alter_points, alter_times)
        wb.append(matches)
        Lambda_i.append(total_len)

        # Match-weighted average alter length over the alters handled so far.
        total_weight = np.sum(wb)
        if total_weight > 0:
            N_AB = np.sum(np.multiply(wb, N_alters[:len(wb)])) / total_weight
        else:
            N_AB = np.nan  # same value as 0/0, without the runtime warning
        Lambda_max = np.max(Lambda_i)
        cross_ent.append(N_ego * np.log2(N_AB) / Lambda_max)
    return [cross_ent, wb, Lambda_i]

In [176]:
def Fano(Pi_max, N, S):
    '''Fano Inequality, written as a residual in Pi_max.

    Algebraically the returned value equals
        H(Pi_max) + (1 - Pi_max)*log2(N - 1) - S,
    with H the binary entropy in bits, so it is zero exactly where the entropy S
    meets the Fano bound for N possible outcomes.
    NOTE(review): presumably meant to be handed to a root finder such as the
    imported fsolve; no such call is visible in this part of the notebook.
    '''
    others = N - 1
    odds_per_other = (1/Pi_max - 1)*(1/others)
    return np.log2(others) - S + Pi_max*np.log2(odds_per_other) - np.log2(1 - Pi_max)

In [5]:
# One entry per user: the 'userid' value taken from the first row of each user's group.
usrstest = (
    wf.groupby('userid')['userid']
    .head(1)
    .to_list()
)

In [174]:
# Entropy estimate (element 0 of the returned list) for the user at index 3 of usrstest.
Lempel_Ziv(usrstest[3])[0]

6.257254541065184

In [173]:
# Sanity check: ego and alter are the same user with no delay. The recorded output
# equals the Lempel_Ziv estimate printed for this user.
CrossEntropy(usrstest[3],usrstest[3],delay = 0)[0]

[6.257254541065184]

In [175]:
# Same user as ego and alter, with the ego's self-parse included (with_ego) and the
# alter's locations shuffled (temporal_control).
# NOTE(review): the shuffle uses random.shuffle and no seed is set in the visible
# cells, so re-running this cell can print a different value.
CrossEntropy(usrstest[3],usrstest[3], with_ego = True, temporal_control = True, delay = 0)[0]

[5.936521987465261]

In [171]:
# Temporal control only: the alter's locations are shuffled before the cross-parse.
# NOTE(review): unseeded random.shuffle - the printed value is not reproducible
# across runs unless a seed is set beforehand.
CrossEntropy(usrstest[3],usrstest[3],temporal_control = True)[0]

[5.774816171555542]