# Midterm Project
Written by John Carlsson & Lukas Runt for Social Network Analysis, spring 2023.
The tasks are as follows:

# Task 1
Implement, if necessary, optimized versions of the social network mining algorithms seen during the course (diameter, triangles computation, clustering) and test these algorithms on the following datasets:
Facebook large
High-energy physics theory citation

In [None]:
# Task 1:
import networkx as nw
from joblib import Parallel, delayed, cpu_count
import pandas as pd
import itertools as it
import math
from scipy.sparse import linalg
import random
def diameter(G: "nw.Graph", sample=None):
    """Return the largest BFS depth reached from any node in ``sample``.

    A breadth-first search is started from every node of ``sample``.  The
    number of non-empty levels discovered from a start node is that node's
    eccentricity, and the maximum over all start nodes is returned.  When
    ``sample`` is every node of a connected graph, this is the diameter.

    Args:
        G: graph exposing ``G.nodes()`` and neighbour iteration via ``G[node]``.
        sample: iterable of start nodes; defaults to every node of ``G``.

    Returns:
        The maximum BFS depth over the sampled start nodes (0 when the graph
        or the sample is empty).  On a disconnected graph each search only
        covers the nodes reachable from its start node.
    """
    if sample is None:
        sample = G.nodes()

    diam = 0
    for source in sample:
        visited = {source}
        frontier = [source]
        depth = 0
        # Stop when the frontier is empty rather than when every node has
        # been visited: the latter never happens on a disconnected graph.
        while frontier:
            next_frontier = []
            for node in frontier:
                for neighbor in G[node]:
                    if neighbor not in visited:
                        visited.add(neighbor)
                        next_frontier.append(neighbor)
            # Only a level that actually contains nodes counts as a step.
            if next_frontier:
                depth += 1
            frontier = next_frontier
        if depth > diam:
            diam = depth
    return diam
def chunks(data, size):
    """Yield dictionaries that each hold at most ``size`` entries of ``data``.

    ``data`` is iterated once; every key produced by the iteration is looked
    up again with ``data[key]`` to obtain the value stored in the chunk.
    """
    remaining_keys = iter(data)
    # One pass per chunk; the shared iterator remembers where the previous
    # chunk stopped.
    for _ in range(0, len(data), size):
        piece = {}
        for key in it.islice(remaining_keys, size):
            piece[key] = data[key]
        yield piece
def parallel_diam(G,j = cpu_count()):
    """Compute ``diameter(G)`` by splitting the start nodes over ``j`` jobs.

    The node set is cut into ``j`` chunks of (almost) equal size, ``diameter``
    is run on each chunk in a separate joblib worker, and the largest partial
    result is returned.

    Args:
        G: graph passed unchanged to ``diameter``.
        j: number of parallel jobs; defaults to the CPU count read when the
            function is defined.

    Returns:
        The maximum of the per-chunk results, or 0 for a graph without nodes.
    """
    nodes = G.nodes()
    if len(nodes) == 0:
        # An empty graph would request a chunk size of 0 (rejected by
        # range()) and leave max() with nothing to aggregate.
        return 0

    chunk_size = math.ceil(len(nodes) / j)
    # Initialize the class Parallel with the number of available processes
    with Parallel(n_jobs=j) as parallel:
        # Each worker runs diameter() on the subset of start nodes it owns
        result = parallel(delayed(diameter)(G, X) for X in chunks(nodes, chunk_size))
    # Aggregate the partial results
    return max(result)

def less(G, edge):
    """Return the index (0 or 1) of the lower-ranked endpoint of ``edge``.

    Endpoints are ranked by degree, with ties broken by comparing the node
    identifiers themselves.  0 is returned when ``edge[0]`` ranks strictly
    below ``edge[1]``; otherwise 1 is returned.
    """
    first, second = edge[0], edge[1]
    first_degree = G.degree(first)
    second_degree = G.degree(second)
    if first_degree < second_degree or (first_degree == second_degree and first < second):
        return 0
    return 1
def triangles(G:nw.Graph):
    """Count the triangles of ``G`` by treating high-degree nodes separately.

    Nodes whose degree is at least sqrt(m), with m the number of edges, are
    "heavy hitters".  Triangles made only of heavy hitters are counted by
    testing every triple of them; every other triangle is counted once, from
    the edge that joins its two lowest-ranked vertices (ranking as in
    ``less``).

    Returns:
        The number of triangles found.
    """
    m = nw.number_of_edges(G)

    # Heavy hitters: there are at most about sqrt(m) of them because their
    # degrees sum to no more than 2m.  sqrt(m) balances the cost of the two
    # phases below: a larger threshold speeds up the all-heavy phase and slows
    # the remaining one, a smaller threshold does the opposite.
    threshold = math.sqrt(m)
    heavy_hitters = {node for node in G.nodes() if G.degree(node) >= threshold}

    num_triangles = 0

    # Phase 1: triangles among heavy hitters only.  Every triple is checked,
    # which costs O(sqrt(m)^3) = O(m*sqrt(m)).
    for a, b, c in it.combinations(heavy_hitters, 3):
        if G.has_edge(a, b) and G.has_edge(b, c) and G.has_edge(a, c):
            num_triangles += 1

    # Phase 2: the remaining triangles.  For each edge whose lower-ranked
    # endpoint is not a heavy hitter, scan that endpoint's neighbourhood
    # (size at most sqrt(m)) for a third vertex that does not rank below the
    # other endpoint and is adjacent to it.  Overall cost is O(m*sqrt(m)).
    for edge in G.edges():
        sel = less(G, edge)
        low, high = edge[sel], edge[1 - sel]
        if low in heavy_hitters:
            continue
        for u in G[low]:
            if less(G, [u, high]) and G.has_edge(u, high):
                num_triangles += 1

    return num_triangles

def spectral_clustering(G:nw.Graph):
    """Split the nodes of ``G`` into two clusters by the sign of a Laplacian eigenvector.

    Args:
        G: graph whose node labels can be sorted.

    Returns:
        A pair ``(c1, c2)`` of node sets: ``c1`` holds the nodes whose entry
        in the selected eigenvector is negative, ``c2`` all the others.
    """
    n = G.number_of_nodes()
    nodes = sorted(G.nodes())
    # Laplacian of a graph: diagonal entries are the node degrees, and
    # off-diagonal entries are -1 where an edge exists and 0 otherwise.
    # astype(float) replaces the former asfptype() call, which newer SciPy
    # sparse containers no longer provide.
    L = nw.laplacian_matrix(G, nodes).astype(float)

    # eigsh returns scalars w_1..w_k and vectors v_1..v_k with L v_i = w_i v_i;
    # the eigenvector of the i-th returned eigenvalue is the i-th column of v.
    # NOTE(review): with eigsh's defaults and k = n-1 the first column is
    # presumably the eigenvector of the second-smallest eigenvalue - confirm.
    # Requesting n-1 eigenpairs is expensive on large graphs.
    w, v = linalg.eigsh(L, n-1)

    # Partition by the sign of the eigenvector entries.  This is known to
    # approximate the sparsest cut: many edges inside each cluster and few
    # between them (unlike the minimum cut, which only asks for few edges
    # between the clusters).
    c1 = set()
    c2 = set()
    for i, node in enumerate(nodes):
        if v[i, 0] < 0:
            c1.add(node)
        else:
            c2.add(node)
    return (c1, c2)

def two_means(G):
    """Grow two clusters from two random, non-adjacent seed nodes.

    At every step one node is drawn uniformly at random among the unassigned
    nodes that touch a cluster.  It joins the first cluster if it has a
    neighbour there and the second cluster otherwise.

    The candidate nodes are kept in an incrementally updated frontier, so the
    whole run costs time proportional to the number of nodes plus edges
    instead of rescanning every node at every step.

    Args:
        G: graph exposing ``nodes()`` and neighbour iteration via ``G[node]``.

    Returns:
        A pair ``(cluster0, cluster1)`` of node sets.  Nodes that cannot be
        reached from either seed (disconnected graph) are left out of both.

    Raises:
        IndexError: if the graph is empty or the first seed has no
            non-neighbour to act as second seed.
    """
    # Choose two clusters represented by vertices that are not neighbors
    u = random.choice(list(G.nodes()))
    v = random.choice(list(nw.non_neighbors(G, u)))
    cluster0 = {u}
    cluster1 = {v}

    # Unassigned nodes adjacent to a cluster.  The list allows uniform
    # sampling by index; the dict gives each node's slot and O(1) membership.
    frontier = []
    slot_of = {}

    def add_unassigned_neighbors(node):
        for neighbor in G[node]:
            if neighbor in cluster0 or neighbor in cluster1 or neighbor in slot_of:
                continue
            slot_of[neighbor] = len(frontier)
            frontier.append(neighbor)

    add_unassigned_neighbors(u)
    add_unassigned_neighbors(v)

    while frontier:
        # Draw a candidate uniformly and remove it in O(1) by moving the
        # last element into its slot.
        index = random.randrange(len(frontier))
        x = frontier[index]
        last = frontier.pop()
        if index < len(frontier):
            frontier[index] = last
            slot_of[last] = index
        del slot_of[x]

        # A node adjacent to both clusters goes to cluster0.
        if any(neighbor in cluster0 for neighbor in G[x]):
            cluster0.add(x)
        else:
            cluster1.add(x)
        add_unassigned_neighbors(x)

    return cluster0, cluster1

# Build the undirected Facebook graph: every CSV row supplies the arguments
# of one add_edge call.
G_fb = nw.Graph()
fb_edge_rows = pd.read_csv('facebook_large/musae_facebook_edges.csv').values
for edge in fb_edge_rows:
    G_fb.add_edge(*edge)


# print(parallel_diam(G_fb)) # Takes around 10 minutes to run; returns the length of the longest shortest path in the graph, which is 15.
# print(triangles(G_fb)) # Takes about 1 second; returns the number of unique triangles in the graph, which is 797516.
# print(two_means(G_fb)) # Divides the graph into two subgraphs (works by magic); the run doesn't complete on this graph.


# Task 2
Implement the Shapley-closeness centrality measure as defined in Michalak et al. (JAIR 2013), Sec. 4.4.
Implement, if necessary, optimized versions of all studied centrality measures (degree, closeness, betweenness, PageRank, HITS-authority, HITS-hubness, HITS-both, VoteRank, Shapley-degree, Shapley-threshold, Shapley-closeness) and test them on the datasets indicated in Task 1.
The goal of this task is to shortlist the centrality measures based on efficiency and similarity of outcomes. Indeed, in the final project you may need to use centrality measures, and this task selects the set of measures that you will use in that final task.

In [None]:
# Task 2
import networkx as nw
import math
import pandas as pd
from multiprocessing import Pool, cpu_count

# Worker function
def worker(subset, G:nw.Graph, num_nodes):
    """Return the marginal contributions of every outside node to ``subset``.

    For each integer ``i`` in ``range(num_nodes)`` that is not in ``subset``,
    the gain ``closeness(subset + [i], G) - closeness(subset, G)`` is divided
    by ``binom(num_nodes - 1, len(subset))`` and stored at index ``i``.

    Args:
        subset: list of node ids forming the coalition.
        G: graph handed to ``closeness``.
        num_nodes: number of nodes; node ids are taken to be
            ``0 .. num_nodes - 1`` because they index the result list.

    Returns:
        A list of length ``num_nodes``; entries of nodes inside ``subset``
        stay 0.
    """
    shapley_values = [0] * num_nodes
    members = set(subset)  # O(1) membership tests instead of list scans
    outside = [i for i in range(num_nodes) if i not in members]
    if not outside:
        return shapley_values

    # Both values depend only on the coalition, so compute them once rather
    # than once per outside node.
    base_value = closeness(subset, G)
    weight = binom(num_nodes - 1, len(subset))
    for i in outside:
        marginal_contribution = (closeness(subset + [i], G) - base_value) / weight
        shapley_values[i] += marginal_contribution
    return shapley_values

# Parallel Shapley-closeness centrality function
def shapley_closeness_centrality_parallel(G):
    """Sum the per-coalition contributions computed by ``worker`` in a process pool.

    Every value produced by iterating ``all_subsets(G.nodes())`` whose length
    is strictly between 1 and the number of nodes becomes one ``worker`` task.
    The lists returned by the tasks are added together element by element.

    Returns:
        A list with one accumulated value per node position.
    """
    num_nodes = len(G.nodes())

    with Pool(processes=cpu_count()) as p:
        tasks = []
        for s in all_subsets(G.nodes()):
            if 1 < len(s) < num_nodes:
                tasks.append((s, G, num_nodes))
        results = p.starmap(worker, tasks)

    # Element-wise sum of all partial result lists
    totals = [0] * num_nodes
    for partial in results:
        totals = [running + extra for running, extra in zip(totals, partial)]

    return totals


# Helper function
def all_subsets(S):
    """Yield every subset of ``S`` as a list, from the empty set to ``S`` itself.

    ``S`` may be any iterable; it is copied to a list first so that elements
    are selected by position.  Subset number ``mask`` contains the element at
    position ``j`` exactly when bit ``j`` of ``mask`` is set.  There are
    ``2 ** len(S)`` subsets, so this is only practical for small inputs.
    """
    items = list(S)
    n = len(items)
    for mask in range(2 ** n):
        # Yield each subset; returning after the loop would hand back only
        # the last one (the full set).
        yield [items[j] for j in range(n) if mask & (1 << j)]
def closeness(S, G):
    """Return the inverse of the summed shortest-path lengths between the nodes of ``S``.

    Every unordered pair ``(i, j)`` of ``S`` with ``i < j`` contributes the
    length of a shortest path between the two nodes in ``G``.

    Args:
        S: iterable of mutually comparable node ids (it is iterated twice).
        G: graph in which the distances are measured.

    Returns:
        ``1.0 / total_distance``, or ``0.0`` when no pair contributes any
        distance (fewer than two distinct nodes in ``S``).
    """
    total_distance = 0
    for i in S:
        for j in S:
            if i < j:
                # shortest_path would return the list of nodes on the path;
                # the distance itself is what has to be summed.
                total_distance += nw.shortest_path_length(G, i, j)
    if total_distance == 0:
        # Avoid dividing by zero for groups that contain no pair of nodes.
        return 0.0
    return 1.0 / total_distance
def binom(n, k):
    """Return the binomial coefficient "n choose k" as an exact integer.

    Integer division is used because the quotient of the factorials is always
    a whole number; true division would round large results and overflow once
    they exceed the float range.

    Raises:
        ValueError: if ``k`` is negative or greater than ``n`` (raised by
            ``math.factorial`` on a negative argument).
    """
    return math.factorial(n) // (math.factorial(k) * math.factorial(n - k))
    

# Build the undirected Facebook graph for this cell: every CSV row supplies
# the arguments of one add_edge call.
G_fb = nw.Graph()
fb_edge_rows = pd.read_csv('facebook_large/musae_facebook_edges.csv').values
for edge in fb_edge_rows:
    G_fb.add_edge(*edge)


# Run the parallel Shapley computation on the whole graph.
shapley_values = shapley_closeness_centrality_parallel(G_fb)


# Task 3


Implement the VCG, MUDAN, and MUDAR mechanisms for selling multiple homogeneous items on a social network, with each agent requiring only a single item. The MUDAN and MUDAR algorithms are described in Fang et al. (2023).

In [87]:
# Task 3
def auction(k, seller_net, reports, bids, type = 'vcg'):
    """Auction ``k`` items over a social network (work in progress).

    Parameters:
    - k: the number of items to sell.
    - seller_net: a set of strings, each identifying a different bidder.
    - reports: a dictionary mapping each bidder's identifier to the set of
      bidders to whom that bidder reports the information about the auction.
    - bids: a dictionary mapping each bidder's identifier to that bidder's
      bid.
    - type: the type of auction that is taking place.

    Intended result (not produced yet):
    - allocation: a dictionary mapping each bidder that submitted a bid to
      True if the bidder is allocated one of the items and False otherwise.
    - payments: a dictionary mapping each bidder that submitted a bid to the
      price paid; a positive price is paid by the bidder to the seller, a
      negative price is paid by the seller to the bidder.

    Current behaviour: the return statement is still commented out, so the
    function returns None.  Every value of ``bids`` is overwritten in place
    with ``k`` random integers between 0 and 1000.
    """
    allocation = {}
    payments = {}

    # Replace every bidder's entry with k random values
    for bidder in bids:
        bids[bidder] = [random.randint(0, 1000) for _ in range(k)]

    # Position-wise totals over all bidders (computed but not used yet)
    sw = [sum(column) for column in zip(*bids.values())]

    #return allocation, payments

def vcg():
    """Placeholder for the VCG mechanism: takes no arguments and returns None."""
    return None




# Demo run: ten players, each reporting to a random selection of the others.
k = 10
seller_net = ["player " + str(number) for number in range(1, 11)]
reports, bids = {}, {}
last_index = len(seller_net) - 1
for s in seller_net:
    # The number of picks is drawn first, then that many random player
    # indices (repetitions are possible).
    nei = [random.randint(0, last_index) for _ in range(random.randint(0, last_index))]
    # A player never reports to itself.
    reports[s] = [seller_net[x] for x in nei if seller_net[x] != s]
    bids[s] = []

auction(3, seller_net, reports, bids)

# NOTE(review): this rebinds the builtin name ``type`` at module level.
type = 'vcg'

[3755, 4302, 3990]


# Task 4
