In [15]:
from graph_tool.all import *
from graph_tool import topology
from graph_tool import draw

%matplotlib inline

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import time
import csv
import itertools

In [2]:
path_to_file = './data/wiki-Vote/wiki-Vote.txt'

# this way of loading the graph does not work as it mapped the original vertex index label to its own

```python
%%time
G_dir = load_graph_from_csv(path_to_file, directed=True,csv_options={'delimiter': '\t'})
G_undir = G_dir
G_undir.set_directed(False)

#G_undir = load_graph_from_csv(path_to_file, directed=False,csv_options={'delimiter': '\t'})
```

# manually load the graph

In [54]:
g = Graph(directed=True)

In [55]:
with open(path_to_file) as f:
    reader_network = csv.reader(f, delimiter='\t', skipinitialspace=True)
    g.add_edge_list(map(int, edge) for edge in reader_network)

In [56]:
print('vertices:', g.num_vertices(), 'edges:', g.num_edges())

vertices: 8298 edges: 103689


**Tenemos un problema que tambien pasaba en la primera version que subiste, los vertices son 7115 y no 8298. Lo mirare despues.**

In [43]:
#graph_draw(g, vertex_text=g.vertex_index, vertex_font_size=18, output_size=(200, 200))

## Calulate connected components

In [58]:
%%time
l = topology.label_largest_component(g, directed=True)
lscc = GraphView(g, vfilt=l)

CPU times: user 10 ms, sys: 0 ns, total: 10 ms
Wall time: 4.94 ms


In [63]:
%%time
w = topology.label_largest_component(g, directed=False)
lwcc = GraphView(g, vfilt=w) 

CPU times: user 10 ms, sys: 0 ns, total: 10 ms
Wall time: 4.68 ms


In [65]:
print('LSCC edges: \t', lscc.num_edges())
print('LSCC nodes: \t', lscc.num_vertices())
print()
print('LWCC edges: \t', lwcc.num_edges())
print('LWCC nodes: \t', lwcc.num_vertices())

LSCC edges: 	 39456
LSCC nodes: 	 1300

LWCC edges: 	 103663
LWCC nodes: 	 7066


## Calulate distances for LSCC

In [78]:
%%time
lscc_distances = topology.shortest_distance(lscc, directed=True)

CPU times: user 970 ms, sys: 150 ms, total: 1.12 s
Wall time: 445 ms


In [100]:
%%time
dist_dir = []
counter = 0 
for (v1, v2) in itertools.permutations(lscc.vertices(), 2):
    dist_dir.append(lscc_distances[v1][v2])
    
    if (counter%100000 == 0):
        print(counter / (lscc.num_vertices() ** 2) * 100, '%')
    counter = counter + 1

0.0 %
5.9171597633136095 %
11.834319526627219 %
17.75147928994083 %
23.668639053254438 %
29.585798816568047 %
35.50295857988166 %
41.42011834319527 %
47.337278106508876 %
53.25443786982249 %
59.171597633136095 %
65.08875739644971 %
71.00591715976331 %
76.92307692307693 %
82.84023668639054 %
88.75739644970415 %
94.67455621301775 %
CPU times: user 21.2 s, sys: 0 ns, total: 21.2 s
Wall time: 21.1 s


In [101]:
print('median distance:\t', np.percentile(dist_dir, 50))
print('mean distance:\t', np.mean(dist_dir))
print('diameter:\t', np.max(dist_dir))
print('effective diameter:\t', np.percentile(dist_dir, 90))

median distance:	 3.0
mean distance:	 2.87928288032
diameter:	 9
effective diameter:	 4.0


## Calculate distances for LWCC

In [103]:
%%time
lwcc_distances_undir = topology.shortest_distance(lwcc, directed=False)

CPU times: user 32.8 s, sys: 2.03 s, total: 34.8 s
Wall time: 9.35 s


**Aqui creo que son combinations y no permutations, porque los vertices son undirected.**

In [106]:
%%time
dist_undir = []
counter = 0 
for (v1, v2) in itertools.permutations(lwcc.vertices(), 2):
    dist_undir.append(lwcc_distances_undir[v1][v2])
    
    if (counter%1000000 == 0):
        print(counter / (lwcc.num_vertices() ** 2) * 100, '%')
    counter = counter + 1

0.0 %
2.002869872182453 %
4.005739744364906 %
6.008609616547359 %
8.011479488729812 %
10.014349360912265 %
12.017219233094718 %
14.02008910527717 %
16.022958977459623 %
18.025828849642075 %
20.02869872182453 %
22.03156859400698 %
24.034438466189435 %
26.037308338371883 %
28.04017821055434 %
30.04304808273679 %
32.04591795491925 %
34.0487878271017 %
36.05165769928415 %
38.0545275714666 %
40.05739744364906 %
42.06026731583151 %
44.06313718801396 %
46.06600706019641 %
48.06887693237887 %
50.07174680456132 %
52.074616676743766 %
54.077486548926224 %
56.08035642110868 %
58.08322629329113 %
60.08609616547358 %
62.088966037656036 %
64.0918359098385 %
66.09470578202095 %
68.0975756542034 %
70.10044552638584 %
72.1033153985683 %
74.10618527075076 %
76.1090551429332 %
78.11192501511566 %
80.11479488729812 %
82.11766475948056 %
84.12053463166302 %
86.12340450384546 %
88.12627437602792 %
90.12914424821038 %
92.13201412039282 %
94.13488399257528 %
96.13775386475774 %
98.14062373694019 %
CPU times: 

In [107]:
print('median distance:\t', np.percentile(dist_undir, 50))
print('mean distance:\t', np.mean(dist_undir))
print('diameter:\t', np.max(dist_undir))
print('effective diameter:\t', np.percentile(dist_undir, 90))

median distance:	 3.0
mean distance:	 3.24750999023
diameter:	 7
effective diameter:	 4.0
