In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gzip
import json
import time
from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

In [2]:
class Relation:
    def __init__(self, type_relation, time, source, target, weight):
        self.type_relation_ = type_relation
        self.time_ = time
        self.source_ = source.get_ID
        self.target_ = target.get_ID
        self.weight_ = weight

    @property
    def get_type(self):
        return self.type_relation_
    
    @property
    def time(self):
        return self.time_
    
    @property
    def target(self):
        return self.target_
    
    @property
    def source(self):
        return self.source_
    
    @property
    def weight(self):
        return self.weight_
    
    def set_weight(self, weight):
        self.weight_ = weight
    
    def __str__(self): 
        return "{\"type_relation\": \"" + self.type_relation_ + "\", \"time\": " + str(self.time_) + ", \"source\": " + str(self.source_) + ", \"target\": " + \
        str(self.target_) + ", \"weight\": "+ str(self.weight_) + "}"
    
    def __repr__(self): 
        return self.__str__()
    
    
     

class User:
    def __init__(self, ID_user):
        self.ID_user = ID_user
        self.in_relation = dict()
        self.out_relation = dict()

    def add_in_relation(self, in_relation):
        if in_relation.time in self.in_relation:
            if in_relation.get_type not in self.in_relation[in_relation.time]:
                self.in_relation[in_relation.time][in_relation.get_type] = []
        else:
            self.in_relation[in_relation.time] = {in_relation.get_type: []}
        self.in_relation[in_relation.time][in_relation.get_type].append(in_relation)

    
    def add_out_relation(self, out_relation):
        if out_relation.time in self.out_relation:
            if out_relation.get_type not in self.out_relation[out_relation.time]:
                self.out_relation[out_relation.time][out_relation.get_type] = []
        else:
            self.out_relation[out_relation.time] = {out_relation.get_type: []}
        self.out_relation[out_relation.time][out_relation.get_type].append(out_relation)
    
    def set_in_relation(inRelations):
        self.in_relation = inRelations
    
    def set_out_relation(outRelation):
        self.out_relation = outRelation

    @classmethod
    def from_json(cls, json):
        return cls(json["ID_user"])
    
    @property
    def get_ID(self):
        return self.ID_user

    @property
    def get_in_relation(self):
        return self.in_relation
    
    @property
    def get_out_relation(self):
        return self.out_relation

    def __str__(self):
        return "{\"in_relation\": " + str(self.in_relation) +  ", \"out_relation\": " + str(self.out_relation) + "}"

    def to_json(self):
        return {"in_relation": self.in_relation, "out_relation":self.out_relation }

    def __repr__(self): 
        return self.__str__()

In [3]:
with open('graph.pickle', 'rb') as handle:
    users = pickle.load(handle)

In [4]:
def getMinUnvisited(unvisited, dist):
    result, dist_min = -1 , float('inf')
    
    aux = {key: dist[key] for key in unvisited}
    return min(aux, key=aux.get)

In [5]:
def convertDate(time):
    tmp = time.split("/")
    return int(tmp[1] + tmp[0])

In [6]:
def getShortestPath(source, target, prev, dist):
    path = [target]
    cost = dist[target]
    while target != source:
        if(target != (-1)):
            path.append(prev[target])
            target = prev[target]
    path.reverse()
    #print(path)
    return path, cost

In [73]:
def getNeighbors(node, graph, start, end):
    neighbors = dict()
    x = graph[node].get_in_relation
    for date in x.keys():
        #if start <= date <= end:
            for rel in x[date].keys():
                for edge in x[date][rel]:
                    #print(x[date][rel])
                    target = edge.target
                    weight = edge.weight
                    neighbors[target] = neighbors.get(target, weight) #+ weight
                    #neighbors[] = neighbors[x[date][rel][edge].target].get(x[date][rel][edge], )
                #neighbors.extend([x[date][rel][i].target for i in range(len(x[date][rel]))])
                #neighbors.update([x[date][rel][i].target for i in range(len(x[date][rel]))])
    return neighbors

In [41]:
getNeighbors(3, users, 200808, 200812)

{37749: 2, 2635: 4, 3305: 4, 2017: 6}

In [8]:
def overallWeight(graph,source, target,start, end):
    w = 0
    x = graph[source].get_out_relation
    for date in x.keys():
        if start <= date <= end: 
            for rel in x[date].keys():
                for i in range(len(x[date][rel])):
                    if x[date][rel][i].target == target:
                        w += x[date][rel][i].weight
    return w

In [9]:
def myDijkstra(graph, source, start, end):
    start = convertDate(start)
    end = convertDate(end)
    
    visited = set()
    unvisited = set(graph.keys())
    dist = dict()
    prev = dict()
    
    for u in unvisited:
        dist[u] = float('inf')
        prev[u] = -1
    
    dist[source] = 0    
    visited.add(source)

    while len(unvisited) > 0 or not set(neighbor.keys()).issubset(visited): 
        #print(len(unvisited))      
        current_node = getMinUnvisited(unvisited, dist)
         
        unvisited.remove(current_node)
            #print(unvisited, '\n',current_node) 
        visited.add(current_node)
        neighbor = getNeighbors(current_node,graph, start, end)
        
        for u in unvisited.intersection(set(neighbor.keys())):
            new_dist = dist[current_node] + neighbor[u]
            if new_dist < dist[u]:
                dist[u] = new_dist
                prev[u] = current_node      

    return prev, dist

## Functionality 2 - Find the best users!

The **Closeness Centrality** of a node **u** is the reciprocal of the average shortest path distance to **u** over all **n-1** nodes.

$$C(u) = \frac{n-1}{\sum_{v = 1}^{n-1}{d(v,u)}}$$

where **n-1** is the number of reachable nodes from **u** and **d(v,u)** is the shortest path distance between the node **u** and the node **v**. 


In [10]:
def GetCloseness(graph, source, start, end):
    distances = 0
    nodes = set(graph.keys())
    nodes.remove(source)
    _, dist = myDijkstra(graph, source, start, end) #the control about the interval of time is in GetNeighbors
    dist = np.array(list(dist.values()))
    position = np.where(dist != float('inf'))[0]
    if sum(dist[position]) == 0: return 0
    return round((len(position) - 1) / sum(dist[position]),3)

In [None]:
GetCloseness(users, 3, "08/2008", "09/2008")

The **Degree Centrality** of a node **u** counts how many neighbors a node has. The graph that we are considering is directed and weighted, so we have to calculate two measures: **In-Degree Centrality** and **Out-Degree Centrality**. The **Out-Degree Centrality** for a weighted graph is defined as: 

$$C_{outD}(u) = \frac{\sum_{v=1}^{n} w_{uv}}{N-1}$$

where **n** are the nodes that are the number of tail ends adjacent to the node **v**, $w_{uv}$ is the weight of the link between the node **u** and the node **v** and **N-1** is the max possible value of $C_{outD}(u)$.

In [59]:
def degreeCentrality(graph, source, start, end):
    start = convertDate(start)
    end = convertDate(end)
    weight = 0
    g = graph[source].get_out_relation
    for year in g:
        for rel in g[year]:
            for edge in g[year][rel]:
                if(start <= edge.time <= end):
                    weight += edge.weight
    return weight, weight/(len(graph)-1)

In [63]:
degreeCentrality(users, 3, "08/2008", "12/2008")[0]

8

The **PageRank Centrality** of a node **u** is defined as: 

$$PR(p_{i}) = \frac{1-d}{N} + d{\sum_{p_{j} \in M(p_i)}{\frac{PR(p_{j})}{L(pj)}}}$$

where **d** is the damping factor, $M(p_i)$ is the set of neighbors of the node $p_{i}$, $L(p_{j})$  is the number of outbound edges of the node $p_{j}$ and **N** is the total number of nodes.

In [74]:
def nOutRel(graph, source, start, end):
    g = graph[source].get_out_relation
    out = 0
    for year in g:
        for rel in g[year]:
            for edge in g[year][rel]:
                #if(start <= edge.time <= end):
                    out += edge.weight
    return out

In [90]:
def pagerank (graph, source, start,end):
    start, end = convertDate(start), convertDate(end)
    iterations, d = 200, 0.85
    nodes = set(graph)
    n = len(nodes) 
    out = {node: nOutRel(graph, node, start, end) for node in nodes}
    out = {k: v for k, v in out.items() if v != 0}
    pr = {node: 1/n for node in nodes}
    for i in range(iterations):
        for pi in nodes:  #scorre tutti i nodi del grafo
            neigh = list(getNeighbors(pi, graph, start, end)) #in relations
            if (len(neigh) == 0):
                pr[pi] = (1 - d)/n
            else:
                sumneigh = 0
                for pj in neigh:
                    if (pj in out):
                        sumneigh += (pr[pj]/out[pj])
            pr[pi] = ((1 - d)/n) + (d*sumneigh)
    finalpr = "{:.7f}".format(pr[source])
    return pr

In [91]:
pagerank (users, 8 , "01/2008", "12/2008")

{1: 7.944313386113643e-06,
 2: 8.247298518824461e-06,
 3: 8.854242565664356e-06,
 4: 8.037698266268485e-06,
 5: 7.95470796237973e-06,
 8: 7.95470796237973e-06,
 9: 8.357469880462064e-06,
 11: 8.357469880462064e-06,
 13: 7.92656196815559e-06,
 17: 7.946108239941505e-06,
 19: 7.913479293062518e-06,
 20: 7.983437248331402e-06,
 22: 8.388847307133412e-06,
 23: 8.648611249248653e-06,
 24: 1.3762572683586989e-05,
 25: 7.967226455660225e-06,
 26: 7.959190862203552e-06,
 27: 8.648611249248653e-06,
 29: 7.946267289158561e-06,
 30: 8.08979534546365e-06,
 32: 9.21958752589808e-06,
 33: 8.02530734624794e-06,
 34: 8.15242982233794e-06,
 35: 8.004193485902749e-06,
 36: 9.21958752589808e-06,
 37: 8.231515103894817e-06,
 38: 8.231515103894817e-06,
 39: 8.042462177033815e-06,
 40: 7.996368478510492e-06,
 41: 8.516749014955176e-06,
 42: 8.056140107465554e-06,
 43: 8.388847307133412e-06,
 44: 8.329978203223704e-06,
 45: 7.948304337841721e-06,
 46: 8.016724993739465e-06,
 48: 8.357469880462064e-06,
 49: 8

In [48]:
import networkx as nx
def graph_to_networkx(graph, type_graph): #if type_graph is all, this mean we must to add all type of node
    G = nx.DiGraph()
    for user in graph:
        for year in graph[user].get_out_relation:
            for type_relation in graph[user].get_out_relation[year]:
                if type_relation == type_graph or type_graph.lower() == 'all':
                    for relation in graph[user].get_out_relation[year][type_relation]:
                        G.add_nodes_from([user, relation.target])
                        G.add_edge(user, relation.target, weight=relation.weight, time=year)
    return G

In [49]:
G = graph_to_networkx(users, "all")

In [71]:
pr = nx.pagerank(G, alpha=0.85)

In [79]:
pr

{8: 9.812660603477211e-06,
 9: 5.01199527510044e-05,
 55: 0.00010896006518895596,
 39: 0.00013032870019274607,
 446: 1.85279086579691e-05,
 905: 0.0005069438743374312,
 16473: 0.00014482453176124023,
 18658: 0.0005256819774955248,
 356: 8.51680514339129e-05,
 19: 1.254017214734286e-05,
 72: 9.328460111573253e-05,
 115: 0.0007216519004858161,
 227: 0.00016241131589790948,
 234: 0.0010215723801362446,
 46: 9.813257838071739e-05,
 22: 5.6729877566173e-05,
 211367: 2.7618392612369904e-05,
 3583: 3.784121088332657e-05,
 4354: 1.4127478879110291e-05,
 4431: 4.710502315107065e-05,
 15401: 0.0010039967173463154,
 25714: 0.0001406464528988563,
 26682: 0.00023268401190753345,
 1: 0.0004622867694321946,
 13: 0.001682468554341967,
 17: 0.00045214964845271,
 91: 0.0007208527944541775,
 268: 0.0002774483930679826,
 212: 1.5080936343945257e-05,
 332: 0.00042756486165302195,
 489: 0.00010733608342507006,
 157: 0.0001933606466822629,
 1061: 8.134042404277007e-05,
 832: 0.000639759973568981,
 493: 0.000

In [72]:
"{:.7f}".format(pr[8])

'0.0000098'

The **Betweenness Centrality**  of a node **u** is the sum of the fraction of all-pairs shortest paths that pass through **u**:

$$C_{B}(u) = \sum_{s,t \in U}{\frac{\sigma(s,t|v)}{\sigma(s,t)}} $$

where **U** is the set of nodes, $\sigma(s,t|u)$ is the number of shortest (s, t)-paths and $\sigma(s,t)$ is the number of those paths passing through some node **u** other than **s**, **t**.

In [None]:
def BetweennessCentrality(graph, source, start, end):
    #start = convertDate(start)
    #end = convertDate(end)
    nodes = list(graph.keys())
    st, stv, el = 0, 0, 0
    for s in nodes:
        newnodes = nodes
        newnodes.remove(s)
        prev, dist = myDijkstra(graph, s, start, end)
        for t in newnodes:
            if(dist != float("inf")):
                seq, w = getShortestPath(s, t, prev, dist)
                st += len(seq)
            if source in seq:
                stv += 1
            el +=  (stv/st)
    return el 

In [None]:
BetweennessCentrality(users, 3, "10/2008", "11/2008")