In [1]:
import igraph as ig
import sys, time, re
from random import randint
import os
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import pickle
import pylab as plt
%matplotlib inline 
folder = '../data'

sys.path.append('../scripts')
from importlib import reload
import ml_utils as utils
reload(utils)

<module 'ml_utils' from '../scripts/ml_utils.py'>

### Read VK and Instagram data

In [2]:
# Load the combined table through the project helper and drop rows in which
# every column is missing (dropna(how='all')).
# NOTE(review): from_raw=False presumably reads a previously prepared/cached table — confirm in ml_utils.
df = utils.read_combine_df(from_raw=False).dropna(how='all')
df.shape

     uid_inst           uname          name_inst  uid_vk          name_vk
0   751026638       mrzelkin1     aleksej zelkin    5394    alexey zelkin
1     2744159         undruha  andrey gnelitskiy   22884  andre undrukhov
2  4157858483       vadimbhai     abhairov vadim   23754    vadim reutsky
3   389335890  superov_sergey     sergey superov   89831   sergey superov
4   199238507       smilychka       nastja gogol  103177     nastya gogol


(24659, 5)

### Generate and slice from Graph

In [3]:
# Quick look at an undirected Barabasi graph (10**4 vertices, m = 2, 'bag' implementation):
# print the number of neighbours of each of the first 10 vertices.
g = ig.Graph.Barabasi(n = 10**4, m = 2, implementation = 'bag', directed = False)
for i in range(10):
    print(len(g.vs[i].neighbors()))

1018
393
971
188
39
36
208
112
106
78


In [4]:
def generate_randomG(df, v_count = 10**4):
    """Build an undirected Barabasi graph whose vertices carry the rows of ``df``.

    Every vertex gets a ``name`` attribute (its index as a string) plus one
    attribute per column of ``df``. Vertex ``i`` receives row ``i % len(df)``,
    so rows are reused cyclically when the graph has more vertices than
    ``df`` has rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Source of the per-vertex attributes; must contain at least one row.
    v_count : int
        Number of vertices to generate.

    Returns
    -------
    igraph.Graph
        The generated graph with attributes attached to its vertices.
    """
    # Alternative generator left disabled by the author:
    #     ig.Graph.Erdos_Renyi(n = v_count, p = 2e-3)
    g = ig.Graph.Barabasi(n = v_count, m = 2, directed = False)

    n_rows = df.shape[0]
    columns = list(df.columns)

    # Register every attribute up front with an empty-string default.
    for attr in ['name', *columns]:
        g.vs[attr] = ''

    for i, v in tqdm(enumerate(g.vs), total=g.vcount()):
        row = df.iloc[i % n_rows]
        v['name'] = str(i)
        for c in columns:
            v[c] = row[c]
    return g

# Build the 10**5-vertex attributed graph, print its first vertex, then show
# the vertex attribute names together with the neighbour count of vertex 100.
g = generate_randomG(df, v_count=10**5)
print(g.vs[0])
g.vs.attribute_names(), len(g.vs[100].neighbors())


igraph.Vertex(<igraph.Graph object at 0x7fb5180a1a98>, 0, {'name': '0', 'uid_inst': 751026638, 'uname': 'mrzelkin1', 'name_inst': 'aleksej zelkin', 'uid_vk': 5394, 'name_vk': 'alexey zelkin'})


(['name', 'uid_inst', 'uname', 'name_inst', 'uid_vk', 'name_vk'], 34)

In [5]:
%%time
def sample_graph(g, t, s, is_vk = True):
    """Draw a random sub-network of ``g``.

    Vertices are kept independently with probability ~``t``. An edge of ``g``
    whose two endpoints were both kept is copied when either endpoint has
    degree < 2 in ``g``; otherwise it is copied with probability ~``s``.

    Parameters
    ----------
    g : igraph.Graph
        Source graph; its vertices must carry ``name`` (an integer stored as a
        string), ``name_vk`` and ``name_inst`` attributes.
    t : float
        Vertex retention threshold in [0, 1].
    s : float
        Edge retention threshold in [0, 1].
    is_vk : bool
        If true the sampled ``fname`` attribute is taken from ``name_vk``,
        otherwise from ``name_inst``.

    Returns
    -------
    igraph.Graph
        The sampled graph with vertex attributes ``name``, ``fname`` and ``uid``.
    """
    fname_k = 'name_vk' if is_vk else 'name_inst'
    max_int = 2 ** 15 - 1

    gs = ig.Graph()
    for c in ['name', 'fname', 'uid']:
        gs.vs[c] = ''

    for v in tqdm(g.vs, total = g.vcount()):
        if (randint(0, max_int) / max_int) < t:
            gs.add_vertex(name = v['name'], fname = v[fname_k], uid=int(v['name']))

    # Map a vertex name to its index in the *sampled* graph. Built by iterating
    # over vertices so it also works when nothing was sampled.
    name_to_indx = {v['name']: v.index for v in gs.vs}

    e_list = []
    for e in tqdm(g.es, total = g.ecount()):
        source = name_to_indx.get(g.vs[e.source]['name'])
        target = name_to_indx.get(g.vs[e.target]['name'])
        # Index 0 is a valid vertex, so compare against None explicitly.
        if source is None or target is None:
            continue
        # Degrees must be read from g with g's own indices (e.source / e.target);
        # `source` / `target` are positions in gs and point at unrelated vertices of g.
        if g.vs[e.source].degree() < 2 or g.vs[e.target].degree() < 2 or (randint(0, max_int) / max_int) < s:
            e_list.append((source, target))
    gs.add_edges(e_list)

    print('g:', g.vcount(), g.ecount(), 'gs:', gs.vcount(), gs.ecount())
    print('is_connected:', gs.is_connected(), 'components count:', len(gs.clusters()))
    return gs

def biggest_component(g):
    """Return the giant weakly connected component of ``g``.

    Also prints the component's vertex count, the vertex count of ``g`` and
    the ratio of the two.
    """
    giant = g.clusters(mode='weak').giant()
    giant_size = giant.vcount()
    total_size = g.vcount()
    print(giant_size, total_size, giant_size / total_size)
    return giant

# t is the vertex threshold and s the edge threshold forwarded to sample_graph.
t = 0.9
s = 0.4
# Two separate samples of the same graph g: is_vk = True takes fname from
# 'name_vk', is_vk = False takes it from 'name_inst'.
lg = sample_graph(g, t = t,s = s, is_vk = True)
rg = sample_graph(g, t = t,s = s, is_vk = False)
# Keep only the biggest connected component of each sample.
lg = biggest_component(lg)
rg = biggest_component(rg)
rg.vs.attribute_names()





g: 100000 199997 gs: 90025 64931
is_connected: False components count: 32081






g: 100000 199997 gs: 89862 63480
is_connected: False components count: 32782
51367 90025 0.5705859483476812
50261 89862 0.5593131690814805
CPU times: user 3.34 s, sys: 28 ms, total: 3.36 s
Wall time: 3.34 s


In [6]:
# Persist both sampled graphs. round() (not int()) is used for the percentage
# in the file name because e.g. 0.29 * 100 == 28.999999999999996 would be
# truncated to 28; the `with` block guarantees the file is flushed and closed.
pickle_name = 'G1_G2_t_%d_s_%s.pickle' % (round(t * 100), round(s * 100))
with open(os.path.join(folder, 'random_experiment', pickle_name), "wb") as pickle_file:
    pickle.dump((lg, rg), pickle_file)

### Test transfer to NetworkX

In [7]:
import networkx as nx

def igraph_to_nx(ig):
    """Convert an igraph graph into an undirected ``networkx.Graph``.

    Nodes of the result are ``int(name)`` for every vertex of the igraph graph;
    vertices without any edge are included as isolated nodes. Vertex and edge
    attributes other than ``name`` are not copied.

    Parameters
    ----------
    ig : igraph.Graph
        Graph whose vertices carry a ``name`` attribute convertible to ``int``.
        NOTE: inside this function the parameter shadows the module alias
        ``ig``; the name is kept so existing keyword callers keep working.

    Returns
    -------
    networkx.Graph
    """
    node_ids = [int(name) for name in ig.vs['name']]
    G = nx.Graph()
    # Add nodes explicitly: building from the edge list alone would silently
    # drop vertices that have no incident edge.
    G.add_nodes_from(node_ids)
    G.add_edges_from((node_ids[a], node_ids[b]) for a, b in ig.get_edgelist())
    return G

# networkx copy of the sampled graph lg.
G1x = igraph_to_nx(lg)

In [8]:
def describe(G1, G1x):
    """Print the sizes of an igraph graph next to those of a networkx graph.

    Printed order: igraph vertex count, networkx node count, igraph edge
    count, networkx edge count.
    """
    ig_vertices = G1.vcount()
    nx_nodes = len(G1x.nodes())
    ig_edges = G1.ecount()
    nx_edges = len(G1x.edges())
    print(ig_vertices, nx_nodes, ig_edges, nx_edges)
    
# Vertex and edge counts of lg and of its networkx copy, side by side.
describe(lg, G1x)

51367 51367 58353 58353


In [9]:
# Sanity check: print and count every vertex of lg whose integer name is not
# a node of the networkx graph G1x.
count = 0
for v in lg.vs:
    uid = int(v['name'])
    if uid not in G1x:
        print(v)
        count += 1
print('count', count)

count 0


In [10]:
import itertools  as it
# For the first 10 vertices of lg, look up the vertex with the same name in rg
# and print both fname values side by side; vertices absent from rg are skipped.
for l in it.islice(lg.vs, 10):
    try:
        r = rg.vs.find(name=l['name'])
    except ValueError:
        pass
    else:
        print(l['fname'], '  |  ', r['fname'])

alexey zelkin   |   aleksej zelkin
andre undrukhov   |   andrey gnelitskiy
vadim reutsky   |   abhairov vadim
nastya gogol   |   nastja gogol
dmitry napolskikh   |   dmitrij napolskih
renata smirnova   |   renata smirnova
natalia sokolova   |   belorusskaja kosmetika
anastasia garifullina   |   garifullina anastasia


In [11]:
# Print the number of neighbours of each of the first 20 vertices of lg.
for i in range(20):
    print(len(lg.vs[i].neighbors()))

1337
1001
26
116
804
376
97
109
413
330
105
9
177
62
77
10
43
316
136
52
