## Multi-model inference

In [162]:
#Importing necessary libraries for project
import json
import pandas as pd
import numpy as np
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import time

In [163]:
#Calculating time required to run this algorithm
start = time.time()

In [164]:
#Converting the dataset from json format to a python dictionry
all_users = {}
with open( 'master-list-old-with-media.json', 'r' ) as f:
    all_users = json.load(f)
    
user_captions = {}
user_hashtags = {}

In [165]:
#Obtaining all the captions under every post for a particular user

for user in all_users.keys():
    #Parsing through all users    
    captions = []
    medias = all_users[user]['list_of_medias']
    #Parsing through every post under a particular user
    for post in medias:
        #Appending the captions to a list
        captions.append(post['caption_text'])
    user_captions[user] = captions

In [166]:
#Obtaining all the hashtags under every post for a particular user

for user in user_captions.keys():
    captions = user_captions[user]
    hashtags = []
    for caption in captions:
        #parsing through alll captions under a particualr user
        for word in caption.split(' '):
            if '#' in word:
                #if word starts with #, implies that the the given word is a hashtag
                hashtags.append(word.split('#')[1])
    user_hashtags[user] = hashtags

In [167]:
#Obtaining all the users who have liked posts for a particular user

user_liked = {}
for user in all_users.keys():
    temp_list = []
    liked_list = all_users[user]['list_of_medias']
    for id in liked_list:
        try:
            media_list = id['media_likers']
            for uid in media_list:
               temp_list.append(uid['id'])
        except:
            temp_list = []
    user_liked[user] = temp_list
    
    
#     for media in liked_list:
#         media_liked.append(media['media_likers'])
#     user_media[user] = media_liked


In [168]:
#Assuming user with id 14689113504 is the victim user for ue case
victim_hashtags = user_hashtags['14689113504']
victim_captions = user_captions['14689113504']
victim_liked = user_liked['14689113504']

#deleting victim user from obtained dictionaries
del user_hashtags['14689113504']
del user_captions['14689113504']
del user_liked['14689113504']

In [169]:
user_liked

{'6321704114': ['12277100520',
  '6072097785',
  '54585818660',
  '54342218449',
  '50330391517',
  '20681041128',
  '4573911130',
  '1705958376',
  '261618378',
  '18818649119',
  '1132840880',
  '16988897356',
  '1466260521',
  '3515043588',
  '1797684876',
  '6394217478',
  '503109994',
  '4060356086',
  '10825751876',
  '3420258277',
  '8031175733',
  '6952654775',
  '5354157810',
  '3529700225',
  '33917939215',
  '44606455462',
  '43936900536',
  '2359889745',
  '49250048260',
  '11828653694',
  '54585818660',
  '2242611704',
  '50330391517',
  '4573911130',
  '54063820521',
  '3118417047',
  '3934221905',
  '13088176462',
  '1705958376',
  '9220935822',
  '261618378',
  '4872174536',
  '325583882',
  '11677061522',
  '49570420281',
  '1409189693',
  '6298968308',
  '1698143674',
  '1797684876',
  '6394217478',
  '5792616315',
  '1422558839',
  '1995323794',
  '6952654775',
  '5783645068',
  '4570193455',
  '5354157810',
  '3529700225',
  '44606455462',
  '49088241806',
  '492500

In [170]:
#Counting the number of captions made by a particular user
caption_count = {}
for user in user_captions:
    count=0
    for _ in user_captions[user]:
        count = count+1
    caption_count[user] = count

In [171]:
#Counting the number of likes made by a particular user
like_count = {}
for user in user_liked:
    count=0
    for _ in user_liked[user]:
        count = count+1
    like_count[user] = count

In [172]:
#importing the model required to train the transformer on caption and hashtag data
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [173]:
#encoding the captions and the hashtags under the victim using the model
u_captions = model.encode(victim_captions)
u_hashtags = model.encode(victim_hashtags)

In [174]:
#finding the cosine similarity between victim captions and other user's captions
captions_score = {}
for user in user_captions:
    try:
        #running model over user's captions
        f_captions = model.encode(user_captions[user])
        
        #Cosine similarity for comparing user captions and victim captions
        res = cosine_similarity(u_captions, f_captions)
        
        #copmuting mean over all the values generatted by the model
        captions_score[user] = res.mean()
        
    except:
        #exception handling 
        captions_score[user] = 0.0
#len(captions_score)

#sorting the output of cosine similarity
sorted_captions = sorted([(value, key) for (key, value) in captions_score.items()])

In [175]:
#finding the cosine similarity between victim hashtags and other user's hashtags
hashtags_score = {}
for user in user_hashtags:
    try:
        #running model over user's hashtags
        f_hashtags = model.encode(user_hashtags[user])
        
        #Cosine similarity for comparing user hashtags and victim hashtags
        res = cosine_similarity(u_hashtags, f_hashtags)
        
        #copmuting mean over all the values generatted by the model
        hashtags_score[user] = res.mean()
        
    except:
        #exception handling 
        hashtags_score[user] = 0.0
#len(hashtags_score)

#sorting the output of cosine similarity
sorted_hashtags = sorted([(value, key) for (key, value) in hashtags_score.items()])

In [176]:
#Finding mutual likers between victim and other users across all posts
liked_score = {}
for user in user_liked:
    count = 0
    for id in user_liked[user]:
        for victim_id in victim_liked:
            if id == victim_id:
                #increasing count if there are mutual
                count = count+1
    #Appending liked score based on count
    liked_score[user] = count
sorted_liked = sorted([(value, key) for (key, value) in liked_score.items()])

In [177]:
#Dictionary to maintain score of every user obtained 
final_captions = {}
for user in sorted_captions:
    uid = user[1]
    final_captions[uid] = user[0]

In [178]:
#Dictionary to maintain score of every user obtained 
final_hashtags = {}
for user in sorted_hashtags:
    uid = user[1]
    final_hashtags[uid] = user[0]

In [179]:
#Master list to hold the cumuluative values for caption and hashtags
master_list = {}

In [180]:

#1. Increasing the 'sum' value for every for loop iteration and giving the particular value to the user.
#2. As the caption/hashtag dictionary is already in the sorted order, this will help by giving higher values to the user 
#who have the maximum influence over the victim user.


sum = 0
for user in final_captions:
    master_list[user] = sum
    sum += 1

In [181]:
#Appending 'sum' value to the master_list across hashtags
sum = 0
for user in final_hashtags:
    master_list[user] = master_list[user] + sum
    sum += 1

In [182]:
#Appending the number of mutual likers to the master_list
final_liked = {}
for user in sorted_liked:
    if user[0] != 0.0:
        final_liked[user[1]] = user[0]
#final_liked


#for the mutual media likers to the victim, it so happens that there are only 4 other users, with >0 mutual likers. 

In [183]:
#Giving the same weight as the number of mutuals in the list
for user in final_liked:
    master_list[user] = master_list[user] + final_liked[user]
    
#master_list

In [184]:
#Sorting the final list generated

sorted_final = sorted([(value, key) for (key, value) in master_list.items()])
#sorted_final

In [185]:
multi_model_time = time.time() - start
print("Time taken by multi-model inference: ", multi_model_time, "seconds")

Time taken by multi-model inference:  127.94560384750366 seconds


## Anonymization of graph network

In [186]:
import networkx as nx
import random as rn

In [187]:
start = time.time()

In [188]:
# Graph anonymization 
def graph_anonymization(orignal_Graph,k_degree):

  dv = [ d[1] for d in orignal_Graph.degree()]
  degree_sequence = np.sort(dv)[::-1]

  # step1: degree anonymizer using dynamic programming
  anonymised_sequence = degree_anonymization(degree_sequence,k_degree) 
  # step2: Construct graph using ConstructGraph algorithm
  Ga = construct_graph(anonymised_sequence,orignal_Graph)

  return Ga


In [189]:
# step1 : Degree Anonymization using Dynamic programming 
def degree_anonymization(degree_sequence,k_degree):

  C = anonymisation_cost(degree_sequence,k_degree)
  n = np.size(degree_sequence)
  Da = np.full(n,np.inf)
  sequences = [None] * n
  cost, anonymised_sequence = degree_anonymization_recursion(degree_sequence,k_degree,C,n,Da,sequences)

  return anonymised_sequence

In [190]:
def anonymisation_cost(degree_sequence,k):
    n = np.size(degree_sequence)
    C = np.full([n,n],np.inf)
    for i in range(n-1):
        for j in range(i+k-1,np.min([i+2*k,n])):
          if C[i,j-1] == np.inf:
            C[i,j] = assignment_cost_addition(degree_sequence[i:j+1])           
          else:
            C[i,j] = C[i,j-1] + degree_sequence[i] - degree_sequence[j]
    return C

In [191]:
def assignment_cost_addition(degree_sequence):
  return np.sum(degree_sequence[0]-degree_sequence)

In [192]:
def degree_anonymization_recursion(degree_sequence,k,C,n,Da,sequences):
  group_degree = degree_sequence[0]
  
  all_group_sequence = np.full(n,group_degree)
  all_group_cost = C[0,n-1]  
      
  if n < 2*k:
      return all_group_cost, all_group_sequence
  else:
    min_cost = np.inf
    min_cost_sequence = np.empty(0)
    
    for t in range(np.max([k-1,n-2*k]),n-k):
      
      if Da[t] == np.inf:
        cost, sequence = degree_anonymization_recursion(degree_sequence[0:t+1],k,C,t+1,Da,sequences)
        Da[t] = cost
        sequences[t] = sequence
      else:
        cost = Da[t]
        sequence = sequences[t]
        cost = cost + C[t+1,n-1]
      
      if cost < min_cost:
        min_cost = cost
        min_cost_sequence = np.concatenate((sequence,np.full(np.size(degree_sequence[t+1:]),degree_sequence[t+1])))                
    min_cost_squence_return = (min_cost, min_cost_sequence) if min_cost < all_group_cost else (all_group_cost, all_group_sequence)
  return min_cost_squence_return

In [193]:
#Step2: Graph construction 
def construct_graph(degree_sequence, original_graph):
  
  n = len(degree_sequence)
  if np.sum(degree_sequence) % 2 != 0:
    return None
            
  G = nx.empty_graph(n)
  vd = [(v,d) for v,d in enumerate(degree_sequence)]

  while True:
    
    vd.sort(key=lambda tup: tup[1], reverse=True)
    if vd[-1][1] < 0:
      return None
    
    tot_degree = 0
    for vertex in vd:
      tot_degree = tot_degree + vertex[1]
      
    if tot_degree == 0:
      return G
        
    remaining_vertices = [i for i,vertex in enumerate(vd) if vertex[1] > 0]
    idx = remaining_vertices[rn.randrange(len(remaining_vertices))]
    v = vd[idx][0]
    
    for i,u in enumerate(vd):
      
      if vd[idx][1] == 0:
        break
        
      if u[0] == v:
        continue
            
      if G.has_edge(u[0],v):
        continue
            
      if original_graph.has_edge(v,u[0]) and u[1] > 0:
        G.add_edge(v,u[0])     
        vd[i] = (u[0],u[1] - 1)      
        vd[idx] = (v,vd[idx][1] - 1)
                
        
    for i,u in enumerate(vd):
      
      if vd[idx][1] == 0:
        break
          
      if u[0] == v:
        continue
            
      if G.has_edge(v,u[0]):
        continue
            
      if not original_graph.has_edge(v,u[0]):
        G.add_edge(v,u[0])
        vd[i] = (u[0],u[1] - 1)
        vd[idx] = (v,vd[idx][1] - 1)

In [194]:
if __name__ == "__main__":
  
  # import data set and generate graph
  df = pd.read_csv('sample.csv',delimiter=" ")
  
  G = nx.from_pandas_edgelist(df,source='From',target='To')
  print(nx.number_of_nodes(G),nx.number_of_edges(G),nx.density(G))
  
  # k-degree graph anonymization
  k_degree=4 
  Ga = graph_anonymization(G,k_degree)

  Gb = nx.intersection(G,Ga)
  num_edges_G = len(set(G.edges()))
  num_edges_both = len(set(Gb.edges()))

  print("Edges overlap = " + str(100*num_edges_both/num_edges_G)+"%")
  print("Num edges original graph = " + str(nx.number_of_edges(G)))
  print("Num edges anonymised graph = " + str(nx.number_of_edges(Ga)))

1005 16706 0.03311331787278746
Edges overlap = 56.656291152879206%
Num edges original graph = 16706
Num edges anonymised graph = 17242


In [195]:
df_out = nx.to_pandas_edgelist(Ga)
df_out.to_csv('sample_out.csv', index=False)

In [196]:
anonymization_time = time.time() - start
print("Time taken by anonymimzation of graph: ", anonymization_time, "seconds")

Time taken by anonymimzation of graph:  1.08412766456604 seconds


## Obtaining edge weights

In [197]:
from numpy.linalg import norm

In [198]:
start = time.time()

In [199]:
all_users = {}
with open( 'master-list-old-with-media.json', 'r' ) as f:
    all_users = json.load(f)
    
user_captions = {}
user_hashtags = {}
user_media = {}
df = pd.DataFrame()

In [200]:
#obtaining the number of likers for each user

user_liker_count = {}
for user in all_users.keys():
    #Parsing through all users
    liker_count = {}
    medias = all_users[user]['list_of_medias']
    for posts in medias:
        #Parsing through every post of a user
        try:
             
            for user_likers in posts['media_likers']:
                if user_likers['id'] in liker_count:
                    #if user already exists increase count by 1
                    liker_count[user_likers['id']] = liker_count[user_likers['id']] + 1
                else:
                    liker_count[user_likers['id']] = 1
                
                #print(liker_count)
        except:
            pass
            
        #print(liker_count)
    user_liker_count[user] = liker_count
        #print(posts['media_likers'])

    

In [201]:
#obtaining the number of commenters for each user
user_comment_count = {}
for user in all_users.keys():
    #Parsing through all users
    comment_count = {}
    medias = all_users[user]['list_of_medias']
    for posts in medias:
        #Parsing through every post of a user
        try:
            for user_comments in posts['media_commenters']:
                if user_comments['id'] in comment_count:
                    #if user already exists increase count by 1
                    comment_count[user_comments['id']] = comment_count[user_comments['id']] + 1
                else:
                    comment_count[user_comments['id']] = 1
               
                #print(liker_count)
        except:
            pass
            
        #print(liker_count)
    user_comment_count[user] = comment_count
        #print(posts['media_likers'])

    

In [202]:
#getting the max number of comments generated 
counter = 0
for user in user_comment_count:
    
    for i in user_comment_count[user]:
        if user_comment_count[user][i]>counter:
            counter = user_comment_count[user][i]
#print(counter)

#maximum comments initialized to max_comms
max_comms = counter

In [203]:
#getting the max number of likes
counter = 0
for user in user_liker_count:
    
    for i in user_liker_count[user]:
        if user_liker_count[user][i]>counter:
            counter = user_liker_count[user][i]
#print(counter)

#maximum likes initialized to max_likes
max_likes = counter

In [204]:
# normalizing like count, by dividing every like count by max value obtained
for user in user_liker_count:    
    for i in user_liker_count[user]:
        user_liker_count[user][i] = user_liker_count[user][i]/max_likes

#user_liker_count

In [205]:
# normalizing comment count, by dividing every comment count by max value obtained
for user in user_comment_count:    
    for i in user_comment_count[user]:
        user_comment_count[user][i] = user_comment_count[user][i]/max_comms

#user_comment_count

In [206]:
#assigning weights decided by the user for weighted graph
comm_weight = 0.7
like_weight = 0.3

In [207]:
edge_weight_time = time.time() - start
print("Time taken by edge weight generation: ", edge_weight_time, "seconds")

Time taken by edge weight generation:  0.2212066650390625 seconds


## Differential Privacy on edge weights

In [208]:
from diffpriv_laplace import DiffPrivLaplaceMechanism

In [None]:
start = time.time()

In [209]:
def add_noise(epsilon, value):
  #print(value)
  # Using the class method
  anonymized = DiffPrivLaplaceMechanism.anonymize_count_with_budget(value, epsilon)

  # Using an instance
  anonymizer = DiffPrivLaplaceMechanism(epsilon)
  anonymized = anonymizer.anonymize_count(value)
  return anonymized

In [210]:
df = pd.read_csv('weighted_graph.csv', header = None)

In [211]:
#a python list to house the graph weights from original network
weights = []
val = df[2].to_numpy()
for i in val:
  weights.append(i)
#print(weights)

In [212]:
#Change epsilon value as required
epsilon = 0.1

weights_with_noise = add_noise(epsilon, weights)

In [213]:
print("Updated weight values: ")
print(weights_with_noise)


Updated weight values: 
[ 5.10168859 15.52400932 -9.28714956 ... -6.94150273 34.02572448
 -2.5820434 ]


In [214]:
#Updating network with new weights
df[3] = weights_with_noise
df.to_csv('diff_priv.csv')

In [None]:
diff_priv_time = time.time() - start
print("Time taken by differential privacy weights: ", diff_priv_time, "seconds")

## Metrics for Differential Privacy

In [239]:
import sklearn.metrics as sk

In [244]:
start = time.time()

In [240]:
#Reading csv file with edge weights after differential privacy is executed
df = pd.read_csv('diff_priv.csv')

In [241]:
original_weights = df['2']
dp_weights = df['3']

In [242]:
meen_absolute_error = sk.mean_absolute_error(original_weights, dp_weights)
mean_squared_error = sk.mean_squared_error(original_weights, dp_weights)
mean_absolute_percent_error = sk.mean_absolute_percentage_error(original_weights, dp_weights)
d2_absolute_error_score = sk.d2_absolute_error_score(original_weights,dp_weights)

AttributeError: module 'sklearn.metrics' has no attribute 'd2_absolute_error_score'

In [243]:
#calculating variation in data

original_mean = np.mean(original_weights)
original_std = np.std(original_weights)
dp_mean = np.mean(dp_weights)
dp_std =np.std(dp_weights)

original_var = original_std / original_mean
dp_var = dp_std / dp_mean

#print(original_var)
#print(dp_var)
    

1.3126660571223738
-99.42758837779611


In [245]:
diff_metrics_time = time.time() - start
print("Time taken by differential privacy metrics: ", diff_metrics_time, "seconds")

Time taken by differential privacy metrics:  7.862414360046387 seconds
