In [1]:
#Import modules
import pandas as pd
import csv
import numpy as np
from tqdm import tqdm
from datetime import datetime
import math
from numpy.linalg import norm
from multiprocessing import Pool
import scipy.spatial.distance as dist
import itertools
import warnings
import os
import re

import json
import random

from fastdtw import fastdtw

from collections import defaultdict




#Entropy function
def calculate_entropy(probabilities):
    probabilities=np.array(probabilities)
    s=np.sum(probabilities)
    if s>0:
        probabilities=probabilities/s
        probabilities = probabilities[probabilities > 0]  # Exclude zero probabilities
        return -np.sum(probabilities * np.log(probabilities))
    return 0

def update_entropy_incremental(h1, Y, m):
    """Update entropy after adding value m to list Y."""
    s=np.sum(Y)
    H=0
    if s+m>0:
        p1 = s /(s+ m)  # New total sum after adding m
        p2=  m /(s+ m)
        if s==0:
            return 0
        if p1+p2==0:
            H= 0
        elif p1==0:
            H=-p2*np.log(p2)
        elif p2==0:
            H=p1*h1-p1*np.log(p1)
        else:
            H=p1*h1-p1*np.log(p1)-p2*np.log(p2)
    return H

def Entropy1(X):
    nonzero_indices = np.nonzero(X)
    nonzero_values = X[nonzero_indices]
    h = np.sum(np.multiply(np.negative(nonzero_values),np.log(nonzero_values)))
    return h

def Entropy2(X,s):
    nonzero_indices = np.nonzero(X)
    nonzero_values = X[nonzero_indices]
    h = np.sum(np.multiply(np.divide(np.negative(nonzero_values),s),np.log(np.divide(nonzero_values,s))))
    return h


#JSD function
def JSD_Divergance(user_link1,user_link2):
    # removed this as it didn't seem to be used
    s1=np.sum(user_link1)
    s2=np.sum(user_link2)
    XX=np.multiply(np.add(np.divide(user_link1,s1),np.divide(user_link2,s2)),0.5)    
    JSD=Entropy1(XX)
    JSD=np.subtract(JSD,np.multiply(Entropy2(user_link1,s1),0.5))
    JSD=np.subtract(JSD,np.multiply(Entropy2(user_link2,s2),0.5))
    return JSD
    
# Function to find the maximum entropy partition
def find_max_entropy_partition(beha):
    max_value = -1
    max_partition = None
    h1=0
    h2=0
    E1=[]
    E2=[]
    Z=beha.copy()
    s=np.sum(beha)
    if s==0:
        return None
    beha=beha/s
    for pivot in range(len(beha)):
        pivot2=len(beha)-pivot
        if pivot==0 or pivot==len(beha):
            h1=0
        else:
            
            h1 = update_entropy_incremental(h1, beha[:pivot-1], beha[pivot-1])


        if pivot2==len(beha):
            h2=0
        else:
            h2 = update_entropy_incremental(h2, beha[pivot2:], beha[pivot2-1])
            
        E1.append(h1)
        E2.append(h2)
        
    for pivot in range(len(beha)):
        h1=E1[pivot]
        h2=E2[len(beha)-1-pivot]
        if h1 > h2 :
            if h1 > max_value:
                max_value = h1
                max_partition = (beha[:pivot], beha[pivot:])
        elif h2 > max_value:
            max_value = h2
            max_partition = (beha[:pivot], beha[pivot:])

    if max_partition:
        return max_partition[0] if len(max_partition[0]) > 0 else max_partition[1]
    return None

#Cosine similarity function
def Cosine_similarity(user_link1,user_link2):
    if norm(user_link1)*norm(user_link2)==0:
        return 0
    return np.dot(user_link1,user_link2)/(norm(user_link1)*norm(user_link2))

# In[712]:


#Assign Numerical ID
def assign_numerical_ID(P,field,ID_field):
    unique_set = P[field].unique()
    entity_to_id = {entity: id for id, entity in enumerate(unique_set)}
    P[ID_field] = P[field].map(entity_to_id)
    return P,len(unique_set)

def extract_hashtags(text: str) -> list[str]:
    "Extracts hashtags from text and returns a list."
    pattern = r"(#\w+)" #r"#[^\s!@#$%^&*()=+./,\[{\]};:'\"?><]+"
    return re.findall(pattern, text)

def extract_urls(text: str) -> list[str]:
    "Extracts urls from text and returns a list."
    pattern = r"[(http(s)?):\/\/(www\.)?a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b[-a-zA-Z0-9%_\+.~#?&//=]*"
    return re.findall(pattern, text)

#Read posts
def read_post(file_name, ignore_retweets):
    data = pd.read_csv(file_name, delimiter=',', skipinitialspace=True, engine='python',converters={"Post_ID":str, "Post_text":str,"User_ID":str,"Post_text":str})
    data['Post_text']=data['Post_text'].astype(str)
    if ignore_retweets:
        data = data.drop(data[data['Post_type'] == "Retweet"].index)

    # rather than using pd.read_csv we just read each line as this also works
    # in the case where the user has sent an empty file of excludes
    
    # these hold the column numbers so a) the columns can move around and
    # b) we only have to look them up once
    text_column = data.columns.get_loc('Post_text')
    user_column = data.columns.get_loc('User_ID')
    time_column = data.columns.get_loc('Post_time')
    post_column = data.columns.get_loc('Post_ID')

    hashtag_data = []
    for row in data.itertuples(index=False):
        text = row[text_column] # 'Post_text'
        hashtags = extract_hashtags(text.lower())
        if '#sackdoval' in hashtags:
            # make sure to remove any duplicates
            hashtags=set(hashtags)
            
            redate = re.compile('[^0-9]')

            for hashtag in hashtags:
                    hashtag_data.append({
                        'UserID': row[user_column], # 'User_ID'
                        'Hashtag': hashtag,
                        'Time': pd.Timestamp(row[time_column].split('+')[0]).timestamp(), # 'Post_time'
                        'PostID': row[post_column] # 'Post_ID'
                    })

    hashtag_df = pd.DataFrame(hashtag_data)

    hashtag_df.columns = ['UserID', 'Link', 'PostDate', 'PostID']
    Posts = hashtag_df.iloc[1:][['UserID', 'Link', 'PostDate','PostID']]
    display(Posts.dtypes)
    return Posts[['UserID', 'Link', 'PostDate','PostID']], data





# In[713]:


#Recursivly remove once repeated entitis and users
def recursive_remove(Posts):
    Flag=True
    while (Flag):
        len1=len(Posts)
        grouped = Posts.groupby('Link')
        once_shared = set()
        for _, group_df in grouped:
            user_ids = group_df['UserID'].unique()
            if len(user_ids) <= 1:
                once_shared.add(group_df['Link'].iloc[0])
        Posts = Posts[~np.isin(Posts['Link'],once_shared)]
        
        len2=len(Posts)
        if len1==len2:
            Flag=False

        grouped = Posts.groupby('UserID')
        shared_one_link = set()
        for _, group_df in grouped:
            links = group_df['Link'].unique()
            if len(links) <= 1:
                shared_one_link.add(group_df['UserID'].iloc[0])
        Posts = Posts[~np.isin(Posts['UserID'],shared_one_link)]

        len3=len(Posts)
        if len3==len2:
            Flag=False
        
        
    return Posts    


# In[714]:


#filter a percentage of users with toprecords of sharing
def filter_top_users(Posts,percentage):
    print("filtering users")
    user_sharing_counts = Posts['UserID'].value_counts().reset_index()
    user_sharing_counts.columns = ['UserID', 'User_sharing_count']
    user_sharing_counts.sort_values(by='User_sharing_count', ascending=False, inplace=True)
    k = int(len(user_sharing_counts) * (percentage/100))
    top_users = user_sharing_counts.head(k)
    Posts = Posts[np.isin(Posts['UserID'],top_users['UserID'])]
    return Posts

# In[715]:


#Generate a multi-edge bipartite graph
def multi_edge_graph(Posts):
    
    grouped = Posts.groupby('Link')
    
    max_time = 0
    total_time = 0
    counter = 0

    for name, group in grouped:
        group = group.sort_values(by='PostDate')

        t0 = group.iloc[0]['PostDate']
        tn = group.iloc[-1]['PostDate']
        timedecay = (tn - t0)
        #print(type(timedecay),type(max_time))
        if timedecay > max_time:
            max_time = timedecay

        total_time += timedecay
        counter += 1

    alpha = math.log(10000) / (total_time / counter)
    
    
    UserID = []
    Expt = []
    Link = []
    UserNID = []
    LinkNID = []
    PostID=[]

    for name, group in grouped:
        group = group.sort_values(by='PostDate')
        t0 = group.iloc[0]['PostDate']
        group['TimeDecay'] = group['PostDate'].sub(t0).apply(lambda x: math.exp(-alpha * x))
        mask = group['TimeDecay'] > 0.00001
        group = group[mask]

        UserID.extend(group['UserID'])
        Expt.extend(group['TimeDecay'])
        Link.extend(group['Link'])
        UserNID.extend(group['Numeric_UID'])
        LinkNID.extend(group['Numeric_LID'])
        PostID.extend(group['PostID'])

    data = {
        'UserID': UserID,
        'TimeDecay': Expt,
        'Link': Link,
        'Numeric_UID': UserNID,
        'Numeric_LID': LinkNID,
        'PostID': PostID
    }

    df = pd.DataFrame(data)
    return df

# In[716]:


#Convert Muti-edge Bipartite graph into Single_edge_Bipartite graph
def single_edge_graph_summation(Posts):
    grouped = Posts.groupby(['UserID', 'Link'])
    UserID = []
    UserNID = []
    Usage = []
    Link = []
    LinkNID = []
    Number_link_used = []
    PostIDs=[]

    for _, group in grouped:
        group_size = len(group)
        Usage.append(np.sum(group['TimeDecay']))
        Number_link_used.append(group_size)
        
        first_row = group.iloc[0]
        UserID.append(first_row['UserID'])
        Link.append(first_row['Link'])
        UserNID.append(first_row['Numeric_UID'])
        LinkNID.append(first_row['Numeric_LID'])
        PostIDs.append(group['PostID'].tolist())

    data = {
        'UserID': UserID,
        'Usage': Usage,
        'Link': Link,
        'Numeric_UID': UserNID,
        'Numeric_LID': LinkNID,
        'Number_link_used': Number_link_used,
        'PostIDs':PostIDs
    }

    df = pd.DataFrame(data)
    return df

# In[717]:


#Identify users suspected for coordination basd on individual sharing behaviour
def link_usage_behaviour_matrix(SEBgraph):
    user_behaviour=[]
    user_ID_behaviour=[]
    user_behaviour_on_link=[]
    PostIDs=[]

    
    grouped = SEBgraph.groupby('Numeric_LID')
    for group in grouped:
        group_df = group[1]
        numeric_lid = group_df['Numeric_LID'].values[0]
        beha = group_df['Usage'].tolist()
        user_ids = group_df['Numeric_UID'].tolist()
        post_ids=group_df['PostIDs'].tolist()

        if len(beha) > 1:
            sorted_indices = np.argsort(beha)[::-1]
            beha = np.array(beha)[sorted_indices].tolist()
            user_ids = np.array(user_ids)[sorted_indices].tolist() 
            post_ids = np.array(post_ids, dtype=object)[sorted_indices].tolist()
            vector = find_max_entropy_partition(beha)

            if len(vector)==len(beha):
                if abs(calculate_entropy(beha)-calculate_entropy(beha[:-1]))<abs(np.std(beha)-np.std(beha[:-1])):
                    vector=vector[:-1]

            coordination_size=len(vector)
 
            if coordination_size>1:
                user_ID_behaviour.append(user_ids[:coordination_size])
                user_behaviour.append(beha[:coordination_size])
                user_behaviour_on_link.append(numeric_lid)
                PostIDs.append(post_ids[:coordination_size])
    
    
    return user_behaviour,user_ID_behaviour,user_behaviour_on_link,PostIDs

# In[718]:


#Dynamic Time Wrapping
def dtw(dist_mat):
    
    N, M = dist_mat.shape
    
    cost_mat = np.zeros((N + 1, M + 1))
    for i in range(1, N + 1):
        cost_mat[i, 0] = np.inf
    for i in range(1, M + 1):
        cost_mat[0, i] = np.inf

    traceback_mat = np.zeros((N, M))
    for i in range(N):
        for j in range(M):
            penalty = [
                cost_mat[i, j],     
                cost_mat[i, j + 1],  
                cost_mat[i + 1, j]]  
            i_penalty = np.argmin(penalty)
            cost_mat[i + 1, j + 1] = dist_mat[i, j] + penalty[i_penalty]
            traceback_mat[i, j] = i_penalty

    
    cost_mat = cost_mat[1:, 1:]
    return  cost_mat

# In[719]:


#Create a dataframe with pairs of users, their behaviour in sharing and the entity
def create_edges(users_behaviour, edges_users, coordinated_link,PostIDs):
    edges_form, edges_to, beha_1, beha_2,posts_1,posts_2, coor_link = [], [], [], [], [], [], []
    for i in range(len(edges_users)):
        for j, k in itertools.combinations(range(len(edges_users[i])), 2):
            user1, user2 = edges_users[i][j], edges_users[i][k]
            b1, b2 = users_behaviour[i][j], users_behaviour[i][k]
            p1, p2= PostIDs[i][j], PostIDs[i][k]
            l = coordinated_link[i]

            if user1 < user2:
                edges_form.append(user1)
                edges_to.append(user2)
                beha_1.append(b1)
                beha_2.append(b2)
                posts_1.append(p1)
                posts_2.append(p2)
            else:
                edges_form.append(user2)
                edges_to.append(user1)
                beha_1.append(b2)
                beha_2.append(b1)
                posts_1.append(p2)
                posts_2.append(p1)
            coor_link.append(l)

    

    df_behaviour = pd.DataFrame({
        'From': edges_form,
        'To': edges_to,
        'Beha_1': beha_1,
        'Beha_2': beha_2,
        'Numeric_LID': coor_link,
        'PostIDs_1':posts_1,
        'PostIDs_2':posts_2
    })
    df_behaviour = df_behaviour.sort_values(['From', 'To'])
    return df_behaviour

# In[720]:


#Functions for the parallel process to assess pairwise sharing behaviour

#Chunk function
def unequal_chunks(list, chunk_size):
    chunks = []
    b=0
    for i in range(0, len(list), chunk_size):
        s=b+chunk_size
        tb=b+chunk_size+1
        while s<len(list) and list[s][0]==list[s-1][0]  and list[s][1]==list[s-1][1]:
            s+=1
            tb+=1
        
        if len(list[b:s])>0:
            chunks.append(list[b:s])
        b=s
    return chunks

#Function to calculate edges for each chunk
def calculated_edges(chunks,df_behaviour,MEBgraph):
    second_Edges_form=[]
    second_Edges_to=[]
    second_Edges_weight=[]
    second_beha_1=[]
    second_beha_2=[]
    second_post_1=[]
    second_post_2=[]
    second_coor_link=[]
    
    TmpFrom= chunks[:,0]
    TmpTo= chunks[:,1]

    chunk_userids_From=set(TmpFrom)
    chunk_userids_To=set(TmpTo)
    df_behaviour_temp=df_behaviour.loc[(df_behaviour['From'].isin(chunk_userids_From))&(df_behaviour['To'].isin(chunk_userids_To))].copy()

    mask=np.zeros(len(df_behaviour_temp),dtype=bool)
    preFrom=-1
    preTo=-1
    MEBgraph_temp=MEBgraph.loc[(MEBgraph.Numeric_UID.isin(chunk_userids_From))|(MEBgraph.Numeric_UID.isin(chunk_userids_To))&(MEBgraph.Numeric_LID.isin(set(chunks[:,4])))].copy()

    for i in range(len(TmpFrom)):
        v1=TmpFrom[i]
        v2=TmpTo[i]
        if v1!=preFrom or v2!=preTo:
            preFrom=v1
            preTo=v2
            
            mask=(df_behaviour_temp['From']==v1)&(df_behaviour_temp['To']==v2)    
            matrix=df_behaviour_temp[mask][['From','To','Beha_1','Beha_2','Numeric_LID','Numeric_LID','PostIDs_1','PostIDs_2']].to_numpy()
            if len(matrix)>1:
                ZXZX=[] 
                for row in matrix:

                    X=MEBgraph_temp.loc[(MEBgraph_temp.Numeric_UID==row[0])&(MEBgraph_temp.Numeric_LID==row[4])]['TimeDecay']

                    Y=MEBgraph_temp.loc[(MEBgraph_temp.Numeric_UID==row[1])&(MEBgraph_temp.Numeric_LID==row[4])]['TimeDecay']

                    x=np.array(sorted(X))
                    y=np.array(sorted(Y))
                    N = x.shape[0]
                    M = y.shape[0]

                    #with open("/home/ahmad/Downloads/Datasets/Coordination_groundtruth/array_shape.txt", "a") as file:
                    #    file.write(f"Shape of the array: {len(matrix)}  {row[0]}  {row[4]} {row[1]} {X} {Y} {N}    {M}  \n")
                    if N>1000 and M>1000:
                        ZXZX.append(1)
                    else:
                        dist_mat = np.zeros((N, M))
                        for i in range(N):
                            for j in range(M):
                                dist_mat[i, j] = abs(x[i] - y[j])
                        cost_mat = dtw(dist_mat)
                        ZXZX.append(float(1/(1+(cost_mat[N - 1, M - 1]))))
                added=np.concatenate([np.abs(matrix[:,2:3]-matrix[:,3:4])],axis=1)
                matrix=np.concatenate([matrix,added],axis=1)

                added2= np.array(ZXZX)
                matrix = np.hstack((matrix, added2[:, np.newaxis]))

                sorted_indices=np.argsort(matrix[:,8])
                matrix=matrix[sorted_indices]
                flag=True
                pivot=2

                while flag:
                    s1=matrix[:pivot,2]
                    s2=matrix[:pivot,3]
                    d1=matrix[:pivot,8]

                    cosine=Cosine_similarity(s1,s2)
                    diver=np.sum(np.multiply(d1,d1))
                    if diver==0:
                        diver=0.00000001
                    cosine=cosine/diver
                    if cosine<1 or pivot==len(matrix):
                        flag=False
                        if cosine<1:
                            pivot-=1
                    else:
                        pivot+=1

                if(pivot>1):
                    u1=np.min([matrix[0,0],matrix[0,1]])
                    u2=np.max([matrix[0,0],matrix[0,1]])
                        
                    for ctr in range(pivot):
                        second_Edges_form.append(u1)
                        second_Edges_to.append(u2)
                        second_beha_1.append(matrix[ctr][2])
                        second_beha_2.append(matrix[ctr][3])
                        second_coor_link.append(matrix[ctr][4])
                        second_post_1.append(matrix[ctr][6])
                        second_post_2.append(matrix[ctr][7])
                        second_Edges_weight.append(matrix[ctr][9])
    
    # Create a structured array to hold the lists
    result = np.empty(shape=(len(second_Edges_form),), dtype=[
        ('int_col1', int),
        ('int_col2', int),
        ('int_col3', int),
        ('float_col', float),
        ('list_col1', list),
        ('list_col2', list)
    ])

    # Assign values to the structured array
    result['int_col1'] = second_Edges_form
    result['int_col2'] = second_Edges_to
    result['int_col3'] = second_coor_link
    result['float_col'] = second_Edges_weight
    result['list_col1'] = second_post_1
    result['list_col2'] = second_post_2

    #result = np.empty(shape=(len(second_Edges_form), 6), dtype=[('',int),('',int),('',int),('',float),('',object),('',object)])
    #print(second_post_1,type(second_post_1))
    #result = np.empty((len(second_Edges_form), 6)) 
    #result[:,0]=second_Edges_form
    #result[:,1]=second_Edges_to
    #result[:,2]=second_coor_link
    #result[:,3]=second_Edges_weight
    #for i in range(len(second_Edges_form)):
    #    result[i,4]=second_post_1[i]
    #    result[i,5]=second_post_2[i]
    del(MEBgraph_temp)
    
    
    return result

#Multiprocess determing edges function
def multiprocess_edge_calculation(function_reference, file_chunks, arg1,arg2, num_process):
    pool = Pool(num_process)
    pbar = tqdm(total=len(file_chunks))
    def update(arg):
        pbar.update()
    results = []
    for i in range(pbar.total):
        result=pool.apply_async(function_reference, args=(file_chunks[i],)+ (arg1,arg2,),callback=update)      
        results.append(result)
        
    
    pool.close()
    pool.join()
    
    final_result =np.concatenate([r.get() for r in results], axis=0)
    return final_result

#Decompress list and call function to run
def calculate_edges_with_chunk(dataframe,MEBgraph,num_process,chunk_size):
    edge_list=dataframe.values
    chunks = unequal_chunks(edge_list, chunk_size)
    return multiprocess_edge_calculation(calculated_edges, chunks, dataframe,MEBgraph, num_process)


# In[721]:


#Creat user_link probability matrix for every users
def user_link_matrix(Posts, user_count, link_count):
    user_links = np.zeros((user_count, link_count))
    user_links[Posts['Numeric_UID'], Posts['Numeric_LID']] = 1
    return user_links


#calculate weight of edges between pairs of users (sum of the coordination weight for all entities)
def calualte_edge_weight(df):
    grouped = df.groupby(['From', 'To'])

    From_ID = []
    To_ID = []
    Weight = []

    for group, group_df in grouped:
        temp = np.sum(group_df['Weight'])
        From_ID.append(group[0])
        To_ID.append(group[1])
        Weight.append(temp)

    df_out = pd.DataFrame({'Source': From_ID, 'Target': To_ID, 'Weight': Weight})
    return df_out


#Assess the similarity of non-coordinated users and suspicious users
def divergance_assessment(Thirs_df_behaviour,link_count,user_count,user_link_prob,Result_df):
    warnings.filterwarnings("ignore")
    From_ID=[]
    To_ID=[]
    Numeric_LID=[]
    Weight=[]
    PostIDs_1=[]
    PostIDs_2=[]


    grouped=Result_df.groupby(['From','To'])

    for group in grouped:
            
        new_weight=1      
        matrix=(list(group)[1]).values.tolist()
        node_id1=int(matrix[0][0])
        node_id2=int(matrix[0][1])
        link_IDs = list(np.array(matrix, dtype=object)[:, 2].astype(int))
        
        max_weight1=np.sum(np.array(matrix, dtype=object)[:, 3]) 


        Z=np.zeros(link_count)
        temp_nodes_ids1 = np.where(np.all(user_link_prob[:, link_IDs], axis=1))[0]

        temp_nodes_ids1 = np.delete(temp_nodes_ids1, np.where((temp_nodes_ids1 == node_id1)|(temp_nodes_ids1 == node_id2)))

        len_temp_nodes_ids1=len(temp_nodes_ids1) 

        a0=Thirs_df_behaviour.loc[(Thirs_df_behaviour.Weight.values>=max_weight1)&(((Thirs_df_behaviour.Source.values==node_id1)&(np.isin(Thirs_df_behaviour.Target,temp_nodes_ids1)))|((np.isin(Thirs_df_behaviour.Source,temp_nodes_ids1))&(Thirs_df_behaviour.Target.values==node_id1)))]        
        b0=Thirs_df_behaviour.loc[(Thirs_df_behaviour.Weight.values>=max_weight1)&(((Thirs_df_behaviour.Source.values==node_id2)&(np.isin(Thirs_df_behaviour.Target,temp_nodes_ids1)))|((np.isin(Thirs_df_behaviour.Source,temp_nodes_ids1))&(Thirs_df_behaviour.Target.values==node_id2)))]
        c0 = pd.concat([a0, b0], ignore_index=True)
        Removed_node_IDs=list(set(c0.Source.unique()).union(set(c0.Target.unique())))
        temp_nodes_ids1 = np.delete(temp_nodes_ids1, np.where(np.isin(temp_nodes_ids1, Removed_node_IDs)))


        a=Thirs_df_behaviour.loc[((Thirs_df_behaviour.Source.values==node_id1)&(np.isin(Thirs_df_behaviour.Target,temp_nodes_ids1)))|((np.isin(Thirs_df_behaviour.Source,temp_nodes_ids1))&(Thirs_df_behaviour.Target.values==node_id1))]
        b=Thirs_df_behaviour.loc[((Thirs_df_behaviour.Source.values==node_id2)&(np.isin(Thirs_df_behaviour.Target,temp_nodes_ids1)))|((np.isin(Thirs_df_behaviour.Source,temp_nodes_ids1))&(Thirs_df_behaviour.Target.values==node_id2))]
        c = pd.concat([a, b], ignore_index=True)

        f1=set(c.Source.unique())
        f2=set(c.Target.unique())

        temp_nodes_weights_ids1 =list(f1.union(f2))

        Weight_vector=np.zeros(user_count)

        sum_weight=0
        if (len(temp_nodes_ids1)>0):
            for ctr2 in temp_nodes_ids1:
                X=0
                if ctr2 in temp_nodes_weights_ids1:
                    c1=c.loc[((np.isin(c.Source,[node_id1,node_id2]))&(c.Target.values==ctr2))|((c.Source.values==ctr2)&(np.isin(c.Target,[node_id1,node_id2])))]
                    X=c1['Weight'].max()

                X=(max_weight1-X)/max_weight1
                Weight_vector[ctr2]=X
                sum_weight+=X

            User_IDs=set(temp_nodes_ids1)
            R_temp=(np.multiply(user_link_prob[i, :],Weight_vector[i]) for i in User_IDs)
            Z=np.sum(R_temp, axis=0)
            Z=np.divide(Z,sum_weight)
            JSD1=JSD_Divergance(user_link_prob[node_id1],Z)
            JSD2=JSD_Divergance(user_link_prob[node_id2],Z)
            JSD3=JSD_Divergance(user_link_prob[node_id1],user_link_prob[node_id2])     
            new_weight=(np.subtract(np.min([JSD1,JSD2]),JSD3))
            
        for i in range(len(matrix)):
            From_ID.append(node_id1)
            To_ID.append(node_id2)
            Numeric_LID.append(matrix[i][2])
            x=matrix[i][3]
            if(len_temp_nodes_ids1>0):
                divided = sum_weight/len_temp_nodes_ids1
                x=x*(1-(divided))+x*new_weight*(divided)
            Weight.append(x)
            PostIDs_1.append(matrix[i][4])
            PostIDs_2.append(matrix[i][5])
            
    Result_df=pd.DataFrame({'From':From_ID,'To':To_ID,'Numeric_LID':Numeric_LID,'Weight':Weight,'PostIDs_from':PostIDs_1,'PostIDs_to':PostIDs_2})
    Result_df=Result_df.loc[Result_df.Weight.values>0]
    return Result_df

    

# In[722]:


#Convert numerical_Ids to original Ids and write the final results in outputfile
def generate_result(Result_df, Posts):
    distinct_pairs = Posts[['Link', 'Numeric_LID']].drop_duplicates()
    merged = pd.merge(Result_df, distinct_pairs, on='Numeric_LID', how='left')
    merged=merged.drop(columns=['Numeric_LID'])

    distinct_pairs = Posts[['UserID', 'Numeric_UID']].drop_duplicates()
    merged = pd.merge(merged, distinct_pairs, left_on='From',right_on='Numeric_UID', how='left')
    merged=merged.drop(columns=['From','Numeric_UID'])
    merged=merged.rename(columns={'UserID':'From'})

    
    distinct_pairs = Posts[['UserID', 'Numeric_UID']].drop_duplicates()
    merged = pd.merge(merged, distinct_pairs, left_on='To',right_on='Numeric_UID', how='left')
    merged=merged.drop(columns=['To','Numeric_UID'])
    merged=merged.rename(columns={'UserID':'To'})
    
    new_order = ['From', 'To', 'Link', 'Weight','PostIDs_from','PostIDs_to']  # Adjust the column names accordingly    
    merged = merged[new_order]

    return merged.sort_values(by=['Weight'], ascending=False) 
    
def generate_json_result(df, raw_data):

    node_names= (pd.concat([df['From'],df['To']])).unique()
    
    
    post_text = raw_data.groupby(["User_ID"]).agg(screen_name=('Screen_Name',"first"), posts=('Post_ID', list)).reset_index()
    post_text = post_text[np.isin(post_text['User_ID'],node_names)]

    nodes = []
    edges = []

    for row in post_text.itertuples(index=False):
        # 0 = UserID, 1 = screen_name, 2 = posts
        nodes.append(dict(key=str(row[0]), attributes=dict(label=str(row[1]), posts=row[2])))

    combined = pd.pivot_table(df, index=["From","To"], values=["Weight","Link","PostIDs_from","PostIDs_to"], aggfunc= {"Weight": ["sum",list],"Link":list,"PostIDs_from":list,"PostIDs_to":list})

    #combined["id"] = combined.index

    combined = combined.sort_values(by=("Weight","sum"), ascending=False)

    for row in combined.itertuples():
        # 0 = the index tuple, 5 = Weight/sum, 1 = Link/list, 4 = Weight/list, 2 = PostIDs_from, 3 = PostIDs_to
        edges.append(dict(source=str(row[0][0]),target=str(row[0][1]), attributes=dict(size=row[5]*1, hashtags=row[1], weights=row[4], source=row[2], target=row[3])))

    return dict(nodes=nodes,edges=edges)


In [2]:
def get_incremental_option():
    print("Is it an incremental running of the model:")
    print("1. No")
    print("2. Yes")


    # Ask the user for input
    while True:
        choice = input("Enter the number of your choice (1 or 2): ").strip()
        if choice in {'1', '2'}:
            return int(choice)
        else:
            print("Invalid input. Please enter 1 or 2.")


def get_speed_option():
    print("Please select one of the following options for speed:")
    print("1. Fast")
    print("2. Moderate")
    print("3. Slow")

    # Ask the user for input
    while True:
        choice = input("Enter the number of your choice (1, 2, or 3): ").strip()
        if choice in {'1', '2', '3'}:
            return int(choice)
        else:
            print("Invalid input. Please enter 1, 2, or 3.")

def accomulate_results(result,path_output):
    latest_update=pd.read_csv(path_output)
    merged_results = pd.concat([result, latest_update])

    merged_results = merged_results.groupby(['From', 'To','Link'], as_index=False)['Weight'].max()

    return merged_results


In [3]:
#def calcCoordination(jobId):


path_input = '/home/ahmad/Downloads/input_isis.csv'

incremental_option = get_incremental_option()
speed_option = get_speed_option()

# What percentage of users with the highest number of shared entities should be included?
# (a number between 1 to 100)
top_percent = 100
# How many threads should be used for multi-threaded execution?
num_process = int(os.environ.get("COORDINATION_WORKER_THREADS", "8"))
# How many chunks should each thread consider?
chunk_size = int(os.environ.get("COORDINATION_WORKER_CHUNK_SIZE", "1000"))

Posts,raw_data=read_post(path_input,"1") # options[0], including retweets or not
# don't filter if we actually want everyone anyway
if top_percent != 100:
    Posts=filter_top_users(Posts,top_percent)

Posts=recursive_remove(Posts)
Posts,user_count=assign_numerical_ID(Posts,'UserID','Numeric_UID')
Posts,link_count=assign_numerical_ID(Posts,'Link','Numeric_LID')

if len(Posts) == 0:
    print( "no overlap in sharing behaviour")



#Create a multi-edge bipartite graph
MEBgraph=multi_edge_graph(Posts)
#display(MEBgraph)
#Convert the multi-edge graph to a single-edge bipartite graph
SEBgraph=single_edge_graph_summation(MEBgraph)
#For each hashtag: identifying users sharing suspiciously and edeg between them  (individual level)
users_behaviour,edges_users,coordinated_link,PostIDs=link_usage_behaviour_matrix(SEBgraph)
df_behaviour=create_edges(users_behaviour, edges_users, coordinated_link,PostIDs)
if speed_option==1:
    Final_coordination_records=df_behaviour.copy()

    pair_counts = Final_coordination_records.groupby(['From', 'To']).size()
    more_once__pairs = pair_counts[pair_counts > 1].index
    Final_coordination_records = Final_coordination_records[Final_coordination_records.set_index(['From', 'To']).index.isin(more_once__pairs)]

    Final_coordination_records['Weight']=Final_coordination_records[['Beha_1', 'Beha_2']].min(axis=1)
    Final_coordination_records['PostIDs_from']= Final_coordination_records['PostIDs_1'].apply(lambda x: x[0] if len(x) > 0 else None)
    Final_coordination_records['PostIDs_to']= Final_coordination_records['PostIDs_2'].apply(lambda x: x[0] if len(x) > 0 else None)
    Final_coordination_records=Final_coordination_records[['From','To','Numeric_LID','Weight','PostIDs_from','PostIDs_to']]
    
    result = generate_result(Final_coordination_records, Posts)
    if incremental_option==2:
        result=accomulate_results(result,'path_output')    
    
else:
    #Identifying users sharing suspiciously (pairwise level)
    edges=calculate_edges_with_chunk(df_behaviour,MEBgraph,num_process,chunk_size)
    column_names = ['From', 'To', 'Numeric_LID', 'Weight','PostIDs_from','PostIDs_to']
    pairwis_coordination = pd.DataFrame(edges)#{col: edges[:, idx] for idx, col in enumerate(columns)})
    pairwis_coordination.columns=column_names
    
    
    if speed_option==2:
        Final_coordination_records=pairwis_coordination.copy()

        Final_coordination_records['PostIDs_from']= Final_coordination_records['PostIDs_from'].apply(lambda x: x[0] if len(x) > 0 else None)
        Final_coordination_records['PostIDs_to']= Final_coordination_records['PostIDs_to'].apply(lambda x: x[0] if len(x) > 0 else None)
        Final_coordination_records=Final_coordination_records[['From','To','Numeric_LID','Weight','PostIDs_from','PostIDs_to']]

    
        result = generate_result(Final_coordination_records, Posts)

    else:
        #Identifying users sharing suspiciously (group level)
        user_link_prob = user_link_matrix(Posts, user_count, link_count)
        group_coordination=calualte_edge_weight(pairwis_coordination)
        Final_coordination_records=divergance_assessment(group_coordination,link_count,user_count,user_link_prob,pairwis_coordination)

        Final_coordination_records['PostIDs_from']= Final_coordination_records['PostIDs_from'].apply(lambda x: x[0] if len(x) > 0 else None)
        Final_coordination_records['PostIDs_to']= Final_coordination_records['PostIDs_to'].apply(lambda x: x[0] if len(x) > 0 else None)
        Final_coordination_records=Final_coordination_records[['From','To','Numeric_LID','Weight','PostIDs_from','PostIDs_to']]
        
        result = generate_result(Final_coordination_records, Posts)
        


result = generate_result(Final_coordination_records, Posts)


#return None

Is it an incremental running of the model:
1. No
2. Yes
Please select one of the following options for speed:
1. Fast
2. Moderate
3. Slow


UserID       object
Link         object
PostDate    float64
PostID       object
dtype: object

100%|██████████| 1/1 [00:00<00:00,  2.60it/s]


In [5]:
result.sort_values(by='Weight')

Unnamed: 0,From,To,Link,Weight,PostIDs_from,PostIDs_to
3,Aabekosar,saleha_khan22,#sackdoval,0.121619,752502145606361089,752504933686149120
1,Aabekosar,FarahVaseem,#isis,0.125101,752504585957306368,752503173382897664
44,AyishaBaloch,saleha_khan22,#isis,0.128372,752503469597159424,752504941995057157
0,Aabekosar,FarahVaseem,#sackdoval,0.134626,752502145606361089,752502943761461248
53,KhatijahFatima,sadiayousaf14,#isis,0.152170,752508896544251904,752505817631514625
...,...,...,...,...,...,...
205,afia_jameel,taniasyed5,#iraq,0.918231,752522317629124609,752524152259698688
237,BadalSays,RebuildPAK,#un,0.940276,752532366997028864,752527820035858432
236,BadalSays,RebuildPAK,#dhaka,0.988011,752512696130691072,752512876452032512
30,shahidnazir4821,BadalSays,#sackdoval,0.990880,752510848116125696,752512696130691072
