<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
def Lempel_Ziv(usr, df, lambdas=False, e=150):
    """Estimate the Lempel-Ziv entropy rate of one user's check-in sequence.

    The sequence of ``placeid`` values for ``usr`` is read from the
    module-level frame ``bf`` (``df == 'Brightkite'``) or ``wf``
    (``df == 'Weeplaces'``).  For every position ``i`` the match length
    ``Lambda_i`` is the length of the shortest run starting at ``i`` that does
    not occur in the first ``i`` symbols (capped by the end of the sequence).
    The entropy estimate is ``N * log2(N) / sum(Lambda_i)``.

    Kontoyiannis, I., Algoet, P. H., Suhov, Y. M., & Wyner, A. J. (1998).
    Nonparametric entropy estimation for stationary processes and random
    fields, with applications to English text. IEEE Transactions on Information
    Theory, 44(3), 1319-1327.

    Bagrow, James P., Xipei Liu, and Lewis Mitchell. "Information flow reveals
    prediction limits in online social activity." Nature human behaviour 3.2
    (2019): 122-128.

    Parameters
    ----------
    usr
        Value matched against the ``userid`` column.
    df : str
        Dataset name, ``'Brightkite'`` or ``'Weeplaces'``.
    lambdas : bool
        If true, return only the list of match lengths.
    e : int
        Minimum number of check-ins required; shorter sequences yield NaN.

    Returns
    -------
    float
        ``np.nan`` when the dataset name is not recognised or the user has
        fewer than ``e`` check-ins.
    list
        The match lengths when ``lambdas`` is true; otherwise
        ``[entropy, sum(L), N, wb, L, h_t, rate_h]`` where ``wb`` counts the
        positions whose match length exceeds 1, ``h_t`` is the running
        estimate ``i * log2(i) / sum(L[:i + 1])`` (two leading zeros for
        ``i < 2``) and ``rate_h`` is ``h_t`` differenced at lag 15.
    """
    if df == 'Brightkite':
        frame = bf
    elif df == 'Weeplaces':
        frame = wf
    else:
        return np.nan

    # Stringify the symbols so that numeric place ids do not break str.join.
    seq = [str(place) for place in frame[frame['userid'] == usr]['placeid'].to_list()]
    N = len(seq)
    if N < e:
        return np.nan

    L = []
    total = 0  # running sum(L); avoids re-summing the list on every step
    wb = 0
    h_t = [0, 0]
    for i in range(N):
        # Space-delimited history so that only whole symbols can match.
        prevSeq = " %s " % " ".join(seq[:i])
        c = i
        seen = True
        while seen and c < N:
            c += 1
            seen = (" %s " % " ".join(seq[i:c])) in prevSeq

        match_len = c - i
        if match_len > 1:
            wb += 1
        L.append(match_len)
        total += match_len
        if i > 1:
            h_t.append((1.0 * i / total) * np.log2(i))

    rate_h = np.subtract(h_t[15:], h_t[:-15])

    if lambdas:
        return L
    return [(1.0 * N / total) * np.log2(N), total, N, wb, L, h_t, rate_h]


In [2]:
def CrossEntropy(ego, alters, dataset, lambdas=False, e=150, **kwargs):
    """Estimate how well each alter's check-in history predicts the ego's.

    For every alter, the ego's and the alter's check-ins are merged in time
    order.  At each ego check-in the cross-parsed match length is the length
    of the shortest run of upcoming ego locations that does not occur in the
    alter's locations visited so far.  Those match lengths give a per-alter
    cross entropy and, combined over the alters processed so far, a cumulative
    cross entropy.

    Kontoyiannis, I., Algoet, P. H., Suhov, Y. M., & Wyner, A. J. (1998).
    Nonparametric entropy estimation for stationary processes and random
    fields, with applications to English text. IEEE Transactions on Information
    Theory, 44(3), 1319-1327.

    Bagrow, James P., Xipei Liu, and Lewis Mitchell. "Information flow reveals
    prediction limits in online social activity." Nature human behaviour 3.2
    (2019): 122-128.

    Parameters
    ----------
    ego
        Value matched against the ``userid`` column for the ego.
    alters : list or single user id
        Alter user id(s); a non-list value is treated as one alter.
    dataset : str
        ``'Brightkite'`` (module-level frame ``bf``) or ``'Weeplaces'``
        (module-level frame ``wf``).
    lambdas : bool
        If true, return only the per-sequence match-length lists.
    e : int
        Minimum number of ego check-ins; below it NaN is returned.
    **kwargs
        with_ego : bool
            Include the ego's own Lempel-Ziv match lengths in the cumulative
            cross entropy.
        temporal_control : bool
            Shuffle each alter's timestamps over its locations before parsing.

    Returns
    -------
    float
        ``np.nan`` when ``dataset`` is not recognised or the ego has fewer
        than ``e`` check-ins.
    list
        With ``lambdas`` true: the list of match-length lists (the ego's own
        first when ``with_ego`` is set, then one per alter).  Otherwise
        ``[cumcross_ent, cross_ents, wb, N_alters, h_t]``:
        cumulative cross entropy after each alter, per-alter cross entropy,
        number of match lengths > 1 per sequence (with the ego's count first
        when ``with_ego`` is set), usable alter sequence lengths, and the
        running estimate per alter.
    """
    if dataset == 'Brightkite':
        df = bf
    elif dataset == 'Weeplaces':
        # Must be `elif`: with two independent `if` statements the Brightkite
        # case fell through to the `else` and always returned NaN.
        df = wf
    else:
        return np.nan

    TempCont = kwargs.get('temporal_control', False)
    with_ego = kwargs.get('with_ego', False)
    # NOTE(review): the alter threshold is hard-coded and independent of `e`;
    # confirm whether it should follow `e`.
    min_alter_len = 150

    ego_rows = df[df['userid'] == ego]
    # Stringify the symbols so that numeric place ids do not break str.join.
    seq_ego = [str(place) for place in ego_rows['placeid'].to_list()]
    N_ego = len(seq_ego)
    # Check the length before the lookups below, which index element 0 and
    # would raise IndexError for an ego without any rows.
    if N_ego < e:
        return np.nan
    key_ego = ['A'] * N_ego
    time_ego = ego_rows['datetime'].to_list()
    # Timestamp of the last ego row whose 'cutoff%' is at most 0.3.
    cutoff30 = ego_rows[ego_rows['cutoff%'] <= .3]['datetime'].tail(1).to_list()[0]
    N_ego_uni = ego_rows['Unique Locs'].to_list()[0]

    # One code path for a list of alters and for a single alter id.
    alter_ids = alters if isinstance(alters, list) else [alters]
    seq_alters = []
    time_alters = []
    N_alters = []
    for usr in alter_ids:
        alter_rows = df[df['userid'] == usr]
        places = [str(place) for place in alter_rows['placeid'].to_list()]
        stamps = alter_rows['datetime'].to_list()
        if TempCont:
            # Temporal control: hand the timestamps out at random, then put
            # both lists back into chronological order so that stamps[0] is
            # the earliest visit and places[:n] are the first n visits.
            random.shuffle(stamps)
            reordered = sorted(zip(stamps, places))
            stamps = [stamp for stamp, _ in reordered]
            places = [place for _, place in reordered]
        seq_alters.append(places)
        time_alters.append(stamps)
        N_alters.append(len(places))

    L_i = []
    wb = []
    cumcross_ent = []
    cross_ents = []
    h_t = []
    ego_index = 0
    if with_ego:
        # Slot 0 of N_alters / wb / L_i then describes the ego itself.
        ego_lz = Lempel_Ziv(ego, dataset, e=e)
        N_alters.insert(0, ego_lz[2])
        wb.append(ego_lz[3])
        L_i.append(ego_lz[4])
        ego_index = 1

    for k, (alter_seq, alter_times) in enumerate(zip(seq_alters, time_alters)):
        slot = k + ego_index
        wb.append(0)

        # Owner labels ('A' = ego, 'B' = alter) of the time-merged check-ins.
        key = ['B'] * len(alter_seq) + key_ego
        times = alter_times + time_ego
        key = [owner for _, owner in sorted(zip(times, key))]

        # Alter check-ins after the ego's last one can never be matched:
        # drop them and do not count them in the alter's length.
        trailing = key[::-1].index('A')
        N_alters[slot] -= trailing
        key = key[:len(key) - trailing]

        # Two leading zeros: the running estimate starts at the third ego
        # check-in.
        h_t.append([0, 0])
        L_i.append([])

        if (N_alters[slot] < min_alter_len) or (alter_times[0] > cutoff30):
            # Alter is too short or starts too late: contribute nothing.
            # A match length of 1 everywhere is neutral for the elementwise
            # maximum taken below and keeps L_i rectangular.
            N_alters[slot] = 0
            wb[-1] = 0
            L_i[-1] = [1] * N_ego
            cross_ents.append(float('nan'))
        else:
            i_ego = 0
            i_alter = 0
            total = 0  # running sum of this alter's match lengths
            for owner in key:
                if owner == 'B':
                    i_alter += 1
                    continue
                # Space-delimited alter history so only whole symbols match.
                prevSeq = " %s " % " ".join(alter_seq[0:i_alter])
                c = i_ego
                seen = True
                while seen and c < N_ego:
                    c += 1
                    seen = (" %s " % " ".join(seq_ego[i_ego:c])) in prevSeq

                match_len = c - i_ego
                i_ego += 1
                L_i[-1].append(match_len)
                total += match_len
                if match_len > 1:
                    wb[-1] += 1
                if i_ego > 2:
                    h_t[-1].append((1.0 * i_ego / total) * np.log2(i_alter))

            if with_ego:
                n_matches = wb[0] + wb[-1]
                if n_matches == 0:
                    # Neither the ego nor the alter produced a match; the
                    # weighted length below would divide by zero.
                    cross_ents.append(float('nan'))
                else:
                    N_AB_one = (wb[0] * N_ego + wb[-1] * N_alters[slot]) / n_matches
                    Lambda_max_one = np.sum(np.nanmax([L_i[0], L_i[-1]], axis=0))
                    cross_ents.append(N_ego * np.log2(N_AB_one) / Lambda_max_one)
            elif wb[-1] == 0:
                cross_ents.append(float('nan'))
            else:
                cross_ents.append(N_ego * np.log2(N_alters[slot]) / total)

        # Discard alters whose cross entropy exceeds log2(unique ego locations).
        if cross_ents[-1] > np.log2(N_ego_uni):
            N_alters[slot] = 0
            wb[-1] = 0

        # Cumulative estimate over every sequence processed so far.
        N_AB = np.sum(np.multiply(wb, N_alters[:len(wb)])) / np.sum(wb)
        Lambda_max = np.sum(np.nanmax(L_i, axis=0))
        cumcross_ent.append(N_ego * np.log2(N_AB) / Lambda_max)

    if lambdas:
        return L_i
    return [cumcross_ent, cross_ents, wb, N_alters[ego_index:], h_t]


In [3]:
'''Fano inequality helpers.

Earlier draft of the solver dispatch, kept for reference only (not executed):

        if (S/np.log2(N) > thresh) or (S < 1):
            return fsolve(Fano,.1,(N,S))[0]
        else:
'''
def binaryEnt(x):
    """Return the binary entropy of ``x`` in bits.

    Computes ``-(x*log2(x) + (1-x)*log2(1-x))``.  No special-casing is done
    for the end points ``x == 0`` and ``x == 1``.
    """
    complement = 1 - x
    weighted_logs = x * np.log2(x) + complement * np.log2(complement)
    return -1 * weighted_logs

#def Fano(Pi_max, N, S):
#    return np.log2(N-1)-S+Pi_max*np.log2((1/Pi_max - 1)*(1/(N-1))) - np.log2(1-Pi_max)

def Fano(Pi_max, N, S):
    """Residual of the Fano relation for predictability ``Pi_max``.

    Returns ``(1 - Pi_max) * log2(N - 1) - S + binaryEnt(1 - Pi_max)``; a
    root in ``Pi_max`` is what ``CalcPi`` asks ``fsolve`` to find for a given
    ``N`` and entropy ``S``.
    """
    miss_probability = 1 - Pi_max
    spread_term = miss_probability * np.log2(N - 1)
    return spread_term - S + binaryEnt(miss_probability)

def CalcPi(N, S, thresh=.9):
    """Solve the Fano relation for the predictability given ``N`` and ``S``.

    Returns NaN when ``S`` is NaN, when ``N == 1`` or when ``S`` exceeds
    ``log2(N)``.  Otherwise ``fsolve`` is run on ``Fano`` with a starting
    guess of 0.9 when ``S < 1`` and 0.5 otherwise, and its first solution
    component is returned.  ``thresh`` is accepted but not used.
    """
    no_solution = math.isnan(S) or (N == 1) or (np.log2(N) < S)
    if no_solution:
        return float('nan')

    start_guess = .9 if S < 1 else .5
    return fsolve(Fano, start_guess, (N, S))[0]

In [4]:
def clearCCEwithEgo(Network):
    """Reset the with-ego result columns of ``Network`` to the sentinel -1.

    Besides 'CCE w/ Ego', 'CCP w/ Ego' is reset as well: writeCCE_with_Ego
    selects the egos it still has to process by ``Network['CCP w/ Ego'] == -1``,
    so that column has to exist and hold the sentinel.

    Returns the same ``Network`` object, modified in place.
    """
    for column in ('CCE w/ Ego', 'CCP w/ Ego'):
        Network[column] = -1

    return Network

def clearCCETempCont(Network):
    """Reset the temporal-control result columns of ``Network`` to -1.

    'CCE Temporal Control' is kept for backward compatibility.  'CCE Temp
    Control' and 'CCP Temp Control' are the column names that
    writeCCE_Temporal_Control actually writes to and filters on
    (``Network['CCP Temp Control'] == -1``), so they are reset too.

    Returns the same ``Network`` object, modified in place.
    """
    for column in ('CCE Temporal Control', 'CCE Temp Control', 'CCP Temp Control'):
        Network[column] = -1

    return Network

def clearCCE(Network):
    """Set the 'CCE' and 'CCP' columns of ``Network`` to the sentinel -1.

    Returns the same ``Network`` object, modified in place.
    """
    for column in ('CCE', 'CCP'):
        Network[column] = -1

    return Network

def clearCE(Network):
    """Set every cross-entropy result column of ``Network`` to the sentinel -1.

    The columns are 'Cross Entropy', 'CCE', 'wb', 'N_B',
    'Cross Predictability' and 'CCP'.  Returns the same ``Network`` object,
    modified in place.
    """
    reset_columns = (
        'Cross Entropy',
        'CCE',
        'wb',
        'N_B',
        'Cross Predictability',
        'CCP',
    )
    for column in reset_columns:
        Network[column] = -1

    return Network

def clearAll(Network):
    """Reset every result column of ``Network`` to the sentinel -1.

    In addition to the original columns, the sentinel columns that
    writeCCE_with_Ego ('CCP w/ Ego') and writeCCE_Temporal_Control
    ('CCE Temp Control', 'CCP Temp Control') read and write are reset, so
    that a cleared network can be passed to any of the write functions.

    Returns the same ``Network`` object, modified in place.
    """
    reset_columns = (
        'Ego Entropy',
        'Pi_Ego',
        'Cross Entropy',
        'CCE',
        'wb',
        'N_B',
        'Cross Predictability',
        'CCP',
        'CCE Temporal Control',
        'CCE w/ Ego',
        # Names used by the with-ego / temporal-control writers.
        'CCP w/ Ego',
        'CCE Temp Control',
        'CCP Temp Control',
    )
    for column in reset_columns:
        Network[column] = -1

    return Network

In [5]:
def writeAll(Network, datasource, **kwargs):
    """Fill in entropy and predictability columns for every pending ego.

    An ego is pending while one of its rows has ``'CCP' == -1``.  For each
    pending ego the function writes 'Ego Entropy' and 'Pi_Ego' (from
    Lempel_Ziv / CalcPi) and, per alter row, 'Cross Entropy', 'CCE', 'wb',
    'N_B', 'Cross Predictability' and 'CCP' (from CrossEntropy / CalcPi).
    When CrossEntropy returns NaN the cross-entropy related columns are set
    to NaN instead.  Progress and timing are printed after every ego.

    Parameters
    ----------
    Network : pandas.DataFrame
        One row per (ego, alter) pair; reads the columns 'Ego', 'Alters',
        'CCP', 'Unique Locs' and 'Total Friends'.
    datasource : str
        Dataset name forwarded to Lempel_Ziv and CrossEntropy.
    **kwargs
        Accepted but not used.

    Returns
    -------
    pandas.DataFrame
        The same ``Network`` object, modified in place.
    """
    start1 = timeit.default_timer()
    pending = Network[Network['CCP'] == -1].groupby('Ego').head(1)
    unilocs = pending['Unique Locs'].to_list()
    usrsleft = pending['Ego'].to_list()
    cumsumtotFriendss = Network[Network['Ego'].isin(usrsleft)].groupby('Ego').head(1)['Total Friends'].cumsum().to_list()

    for i, usr in enumerate(usrsleft):
        if i != 0:
            # stop2 still holds the finish time of the previous ego.
            print('Percent finished: ' + str(round(cumsumtotFriendss[i]/cumsumtotFriendss[-1],3)*100) + '%')
            print('Time Left: ' + str(round((stop2-start1)*(cumsumtotFriendss[-1]/cumsumtotFriendss[i] - 1)/3600,3)) + ' hour(s)')
        print(i,' out of ',len(usrsleft),' users remaining')
        start2 = timeit.default_timer()

        ego_rows = Network['Ego'] == usr
        wfdummy = Network[ego_rows]['Alters'].to_list()

        # Lempel_Ziv returns a bare NaN (not a list) for short histories or an
        # unknown dataset; indexing that with [0] would raise TypeError.
        lz_result = Lempel_Ziv(usr, df = datasource)
        LZ_Ent = lz_result if isinstance(lz_result, float) else lz_result[0]
        Network.loc[ego_rows,'Ego Entropy'] = LZ_Ent
        Network.loc[ego_rows,'Pi_Ego'] = CalcPi(unilocs[i],LZ_Ent)

        CCEdummy = CrossEntropy(usr,wfdummy,datasource)
        if isinstance(CCEdummy, float):
            Network.loc[ego_rows,['Cross Entropy','CCE','wb','N_B','Unique Locs Intersection','N_Ego (Seen by Friends)','Cumulative Unique Locs Intersection'
                     ,'LZ Intersection','Pi_Max Intersection','Cum. LZ Int.','Cum. Pi_Max Int.','Cross Predictability','CCP']] = float('nan')
        else:
            CCE, CE, wb, N_B = CCEdummy[0], CCEdummy[1], CCEdummy[2], CCEdummy[3]
            Network.loc[ego_rows,'Cross Entropy'] = CE
            Network.loc[ego_rows,'CCE'] = CCE
            Network.loc[ego_rows,'wb'] = wb
            Network.loc[ego_rows,'N_B'] = N_B

            # One predictability value per alter row of this ego.
            n_alters = len(wfdummy)
            CP = [CalcPi(unilocs[i], CE[j]) for j in range(n_alters)]
            CCP = [CalcPi(unilocs[i], CCE[j]) for j in range(n_alters)]
            Network.loc[ego_rows,'Cross Predictability'] = CP
            Network.loc[ego_rows,'CCP'] = CCP

        stop2 = timeit.default_timer()
        print("Time: " + str(round((stop2-start2)/60,3)) + ' min(s)')
        print("Total Time: "  + str(round((stop2-start1)/3600,4)) + ' hour(s)')
        print('')

    return Network


In [7]:
def writeCCE(Network, datasource, **kwargs):
    """Fill in 'CCE' and 'CCP' for every ego that still has ``'CCP' == -1``.

    For each such ego CrossEntropy is run against the ego's alters; the
    cumulative cross entropies go to 'CCE' and the values CalcPi derives from
    them go to 'CCP'.  When CrossEntropy returns NaN the cross-entropy
    related columns are set to NaN.  Progress and timing are printed after
    every ego.  ``**kwargs`` is accepted but not used.

    Returns the same ``Network`` object, modified in place.
    """
    start1 = timeit.default_timer()
    todo = Network[Network['CCP'] == -1].groupby('Ego').head(1)
    unilocs = todo['Unique Locs'].to_list()
    usrsleft = todo['Ego'].to_list()
    first_rows = Network[Network['Ego'].isin(usrsleft)].groupby('Ego').head(1)
    cumsumtotFriendss = first_rows['Total Friends'].cumsum().to_list()
    nan_columns = ['Cross Entropy','CCE','wb','N_B','Unique Locs Intersection','N_Ego (Seen by Friends)','Cumulative Unique Locs Intersection'
                   ,'LZ Intersection','Pi_Max Intersection','Cum. LZ Int.','Cum. Pi_Max Int.','Cross Predictability','CCP']

    for i, usr in enumerate(usrsleft):
        if i != 0:
            # stop2 still holds the finish time of the previous ego.
            done_share = cumsumtotFriendss[i]/cumsumtotFriendss[-1]
            print('Percent finished: ' + str(round(done_share,3)*100) + '%')
            print('Time Left: ' + str(round((stop2-start1)*(cumsumtotFriendss[-1]/cumsumtotFriendss[i] - 1)/3600,3)) + ' hour(s)')
        print(i,' out of ',len(usrsleft),' users remaining')
        start2 = timeit.default_timer()

        rows = Network['Ego'] == usr
        wfdummy = Network[rows]['Alters'].to_list()
        CCEdummy = CrossEntropy(usr,wfdummy,datasource)
        if type(CCEdummy) == float:
            Network.loc[rows, nan_columns] = float('nan')
        else:
            CCE = CCEdummy[0]
            Network.loc[rows,'CCE'] = CCE
            # One predictability value per alter row of this ego.
            Network.loc[rows,'CCP'] = [CalcPi(unilocs[i], CCE[j]) for j in range(len(wfdummy))]

        stop2 = timeit.default_timer()
        print("Time: " + str(round((stop2-start2)/60,3)) + ' min(s)')
        print("Total Time: "  + str(round((stop2-start1)/3600,4)) + ' hour(s)')
        print('')

    return Network


In [6]:
def writeCE(Network, datasource, **kwargs):
    """Fill in per-alter cross entropy columns for every pending ego.

    An ego is pending while one of its rows has
    ``'Cross Predictability' == -1``.  For each pending ego CrossEntropy is
    run against the ego's alters and 'Cross Entropy', 'wb', 'N_B' and
    'Cross Predictability' (via CalcPi) are written.  When CrossEntropy
    returns NaN the cross-entropy related columns are set to NaN.  Progress
    and timing are printed after every ego.  ``**kwargs`` is accepted but
    not used.

    Returns the same ``Network`` object, modified in place.
    """
    start1 = timeit.default_timer()
    todo = Network[Network['Cross Predictability'] == -1].groupby('Ego').head(1)
    unilocs = todo['Unique Locs'].to_list()
    usrsleft = todo['Ego'].to_list()
    first_rows = Network[Network['Ego'].isin(usrsleft)].groupby('Ego').head(1)
    cumsumtotFriendss = first_rows['Total Friends'].cumsum().to_list()
    nan_columns = ['Cross Entropy','CCE','wb','N_B','Unique Locs Intersection','N_Ego (Seen by Friends)','Cumulative Unique Locs Intersection'
                   ,'LZ Intersection','Pi_Max Intersection','Cum. LZ Int.','Cum. Pi_Max Int.','Cross Predictability','CCP']

    for i, usr in enumerate(usrsleft):
        if i != 0:
            # stop2 still holds the finish time of the previous ego.
            done_share = cumsumtotFriendss[i]/cumsumtotFriendss[-1]
            print('Percent finished: ' + str(round(done_share,3)*100) + '%')
            print('Time Left: ' + str(round((stop2-start1)*(cumsumtotFriendss[-1]/cumsumtotFriendss[i] - 1)/3600,3)) + ' hour(s)')
        print(i,' out of ',len(usrsleft),' users remaining')
        start2 = timeit.default_timer()

        rows = Network['Ego'] == usr
        wfdummy = Network[rows]['Alters'].to_list()
        CCEdummy = CrossEntropy(usr,wfdummy,datasource)
        if type(CCEdummy) == float:
            Network.loc[rows, nan_columns] = float('nan')
        else:
            CE, wb, N_B = CCEdummy[1], CCEdummy[2], CCEdummy[3]
            Network.loc[rows,'Cross Entropy'] = CE
            Network.loc[rows,'wb'] = wb
            Network.loc[rows,'N_B'] = N_B
            # One predictability value per alter row of this ego.
            Network.loc[rows,'Cross Predictability'] = [CalcPi(unilocs[i], CE[j]) for j in range(len(wfdummy))]

        stop2 = timeit.default_timer()
        print("Time: " + str(round((stop2-start2)/60,3)) + ' min(s)')
        print("Total Time: "  + str(round((stop2-start1)/3600,4)) + ' hour(s)')
        print('')

    return Network


In [8]:
def writeCCE_with_Ego(Network, datasource):
    """Fill in 'CCE w/ Ego' and 'CCP w/ Ego' for every pending ego.

    An ego is pending while one of its rows has ``'CCP w/ Ego' == -1``.  For
    each pending ego CrossEntropy is run with ``with_ego=True``; the
    cumulative cross entropies go to 'CCE w/ Ego' and the values CalcPi
    derives from them go to 'CCP w/ Ego'.  When CrossEntropy returns NaN the
    cross-entropy related columns, including this function's own two
    columns, are set to NaN.  Progress and timing are printed after every ego.

    Parameters
    ----------
    Network : pandas.DataFrame
        One row per (ego, alter) pair; reads the columns 'Ego', 'Alters',
        'CCP w/ Ego', 'Unique Locs' and 'Total Friends'.
    datasource : str
        Dataset name forwarded to CrossEntropy.

    Returns
    -------
    pandas.DataFrame
        The same ``Network`` object, modified in place.
    """
    start1 = timeit.default_timer()
    pending = Network[Network['CCP w/ Ego'] == -1].groupby('Ego').head(1)
    unilocs = pending['Unique Locs'].to_list()
    usrsleft = pending['Ego'].to_list()
    cumsumtotFriendss = Network[Network['Ego'].isin(usrsleft)].groupby('Ego').head(1)['Total Friends'].cumsum().to_list()

    for i, usr in enumerate(usrsleft):
        if i != 0:
            # stop2 still holds the finish time of the previous ego.
            print('Percent finished: ' + str(round(cumsumtotFriendss[i]/cumsumtotFriendss[-1],3)*100) + '%')
            print('Time Left: ' + str(round((stop2-start1)*(cumsumtotFriendss[-1]/cumsumtotFriendss[i] - 1)/3600,3)) + ' hour(s)')
        print(i,' out of ',len(usrsleft),' users remaining')
        start2 = timeit.default_timer()

        ego_rows = Network['Ego'] == usr
        wfdummy = Network[ego_rows]['Alters'].to_list()
        CCEdummy = CrossEntropy(usr,wfdummy,datasource, with_ego = True)

        if isinstance(CCEdummy, float):
            Network.loc[ego_rows,['Cross Entropy','CCE','wb','N_B','Unique Locs Intersection','N_Ego (Seen by Friends)','Cumulative Unique Locs Intersection'
                     ,'LZ Intersection','Pi_Max Intersection','Cum. LZ Int.','Cum. Pi_Max Int.','Cross Predictability','CCP']] = float('nan')
            # Also clear this function's own sentinel; otherwise the ego keeps
            # 'CCP w/ Ego' == -1 and is picked up again on every later run.
            Network.loc[ego_rows,'CCE w/ Ego'] = float('nan')
            Network.loc[ego_rows,'CCP w/ Ego'] = float('nan')
        else:
            CCE = CCEdummy[0]
            Network.loc[ego_rows,'CCE w/ Ego'] = CCE
            # One predictability value per alter row of this ego.
            CCP = [CalcPi(unilocs[i], CCE[j]) for j in range(len(wfdummy))]
            Network.loc[ego_rows,'CCP w/ Ego'] = CCP

        stop2 = timeit.default_timer()
        print("Time: " + str(round((stop2-start2)/60,3)) + ' min(s)')
        print("Total Time: "  + str(round((stop2-start1)/3600,4)) + ' hour(s)')
        print('')

    return Network


In [9]:
def writeCCE_Temporal_Control(Network, datasource):
    """Fill in 'CCE Temp Control' and 'CCP Temp Control' for pending egos.

    An ego is pending while one of its rows has
    ``'CCP Temp Control' == -1``.  For each pending ego CrossEntropy is run
    with ``temporal_control=True``; the cumulative cross entropies go to
    'CCE Temp Control' and the values CalcPi derives from them go to
    'CCP Temp Control'.  When CrossEntropy returns NaN the cross-entropy
    related columns, including this function's own two columns, are set to
    NaN.  Progress and timing are printed after every ego.

    Parameters
    ----------
    Network : pandas.DataFrame
        One row per (ego, alter) pair; reads the columns 'Ego', 'Alters',
        'CCP Temp Control', 'Unique Locs' and 'Total Friends'.
    datasource : str
        Dataset name forwarded to CrossEntropy.

    Returns
    -------
    pandas.DataFrame
        The same ``Network`` object, modified in place.
    """
    start1 = timeit.default_timer()
    pending = Network[Network['CCP Temp Control'] == -1].groupby('Ego').head(1)
    unilocs = pending['Unique Locs'].to_list()
    usrsleft = pending['Ego'].to_list()
    cumsumtotFriendss = Network[Network['Ego'].isin(usrsleft)].groupby('Ego').head(1)['Total Friends'].cumsum().to_list()

    for i, usr in enumerate(usrsleft):
        if i != 0:
            # stop2 still holds the finish time of the previous ego.
            print('Percent finished: ' + str(round(cumsumtotFriendss[i]/cumsumtotFriendss[-1],3)*100) + '%')
            print('Time Left: ' + str(round((stop2-start1)*(cumsumtotFriendss[-1]/cumsumtotFriendss[i] - 1)/3600,3)) + ' hour(s)')
        print(i,' out of ',len(usrsleft),' users remaining')
        start2 = timeit.default_timer()

        ego_rows = Network['Ego'] == usr
        wfdummy = Network[ego_rows]['Alters'].to_list()
        CCEdummy = CrossEntropy(usr,wfdummy,datasource, temporal_control = True)

        if isinstance(CCEdummy, float):
            Network.loc[ego_rows,['Cross Entropy','CCE','wb','N_B','Unique Locs Intersection','N_Ego (Seen by Friends)','Cumulative Unique Locs Intersection'
                     ,'LZ Intersection','Pi_Max Intersection','Cum. LZ Int.','Cum. Pi_Max Int.','Cross Predictability','CCP']] = float('nan')
            # Also clear this function's own sentinel; otherwise the ego keeps
            # 'CCP Temp Control' == -1 and is picked up again on every later run.
            Network.loc[ego_rows,'CCE Temp Control'] = float('nan')
            Network.loc[ego_rows,'CCP Temp Control'] = float('nan')
        else:
            CCE = CCEdummy[0]
            Network.loc[ego_rows,'CCE Temp Control'] = CCE
            # One predictability value per alter row of this ego.
            CCP = [CalcPi(unilocs[i], CCE[j]) for j in range(len(wfdummy))]
            Network.loc[ego_rows,'CCP Temp Control'] = CCP

        stop2 = timeit.default_timer()
        print("Time: " + str(round((stop2-start2)/60,3)) + ' min(s)')
        print("Total Time: "  + str(round((stop2-start1)/3600,4)) + ' hour(s)')
        print('')

    return Network
