In [2]:
import igraph as ig
import sys, time, re
from random import randint
import os
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import pylab as plt
%matplotlib inline 
# Base directory for the input CSV files and generated artifacts; later cells join file names onto it.
folder = 'data'

### Read VK and Inst data

In [3]:
import cyrtranslit

# Compiled once instead of on every call; raw strings keep `\d` / `\s` from
# being parsed as (invalid) string escape sequences.
_INST_LINE_PAT = re.compile(r"(\d+),(.*),(.*)")
# NOTE(review): 'ё'/'Ё' lie outside the а-я / А-Я ranges, so they are stripped
# together with punctuation — confirm this is intended.
_INST_WORD_PAT = re.compile(r'[^a-zA-Zа-яА-Я\d\s]+')


def clean_lineinst(line):
    """Parse one row of the Instagram CSV into ``(uname, fname)``.

    The row must start with ``<digits>,<uname>,<full name>``.  The leading
    numeric field is discarded.  Because the ``.*`` groups are greedy, the
    full name is whatever follows the *last* comma on the line.

    The full name is stripped of every character that is not a Latin/Cyrillic
    letter, digit or whitespace, trimmed, lower-cased, passed through
    ``cyrtranslit.to_latin(..., 'ru')`` and finally has apostrophes removed.

    Parameters
    ----------
    line : str
        One raw line of the CSV (a trailing newline is fine).

    Returns
    -------
    tuple of (str, str)
        ``(uname, fname)``; ``uname`` is returned unmodified.

    Raises
    ------
    ValueError
        If the line does not have the expected shape.
    """
    match = _INST_LINE_PAT.match(line)
    if match is None:
        raise ValueError('Malformed Instagram row: {!r}'.format(line))

    _, uname, fname = match.groups()
    fname = _INST_WORD_PAT.sub('', fname).strip().lower()

    fname = cyrtranslit.to_latin(fname, 'ru').replace("'", '')
    return (uname, fname)

# Compiled once instead of on every call; raw strings keep `\d` / `\s` from
# being parsed as (invalid) string escape sequences.
_VK_LINE_PAT = re.compile(r"(\d+),(.*),(.*),(.*)")
# NOTE(review): 'ё'/'Ё' lie outside the а-я / А-Я ranges, so they are stripped
# together with punctuation — confirm this is intended.
_VK_WORD_PAT = re.compile(r'[^a-zA-Zа-яА-Я\d\s]+')


def clean_linevk(line):
    """Parse one row of the VK CSV into ``(uname, fname)``.

    The row must start with ``<digits>,<uname>,<name1>,<name2>``.  The leading
    numeric field is discarded.  Each name part is stripped of every character
    that is not a Latin/Cyrillic letter, digit or whitespace, trimmed and
    lower-cased; the two parts are joined with a single space, passed through
    ``cyrtranslit.to_latin(..., 'ru')`` and finally have apostrophes removed.

    Parameters
    ----------
    line : str
        One raw line of the CSV (a trailing newline is fine).

    Returns
    -------
    tuple of (str, str)
        ``(uname, fname)``; ``uname`` is returned unmodified.

    Raises
    ------
    ValueError
        If the line does not have the expected shape.  Previously the line
        was printed and execution fell through to an unbound ``fname``,
        which surfaced as an unrelated ``UnboundLocalError``.
    """
    match = _VK_LINE_PAT.match(line)
    if match is None:
        raise ValueError('Malformed VK row: {!r}'.format(line))

    _, uname, name1, name2 = match.groups()
    name1 = _VK_WORD_PAT.sub('', name1).strip().lower()
    name2 = _VK_WORD_PAT.sub('', name2).strip().lower()
    fname = name1 + ' ' + name2

    fname = cyrtranslit.to_latin(fname, 'ru').replace("'", '')
    return (uname, fname)

def read_clean_csv(fname, num_col):
    """Read a raw personal-data CSV from ``folder`` and return the cleaned rows.

    Every line of the file is passed to ``clean_lineinst`` when ``num_col`` is
    3 and to ``clean_linevk`` otherwise; each call yields one
    ``(uname, fname)`` tuple.

    Parameters
    ----------
    fname : str
        File name, resolved relative to the module-level ``folder``.
    num_col : int
        Number of comma-separated fields per row; selects the line parser.

    Returns
    -------
    pandas.DataFrame
        One row per input line, with default integer column labels
        (the caller assigns real names).  An empty file gives an empty frame.
    """
    clean_line = clean_lineinst if num_col == 3 else clean_linevk
    # NOTE(review): the file is opened with the platform default encoding, as before.
    with open(os.path.join(folder, fname), 'r') as f:
        lines = f.readlines()

    # Collect all records first and build the frame once: concatenating a
    # one-row frame per line copies the accumulated data on every iteration.
    records = [clean_line(line) for line in tqdm(lines)]
    return pd.DataFrame(records)

In [4]:
# Load the 3-field Instagram export and give the two cleaned columns readable names.
inst = read_clean_csv('inst_personal.csv', 3)
inst.columns = ['uname', 'inst_name']
inst.head()




Unnamed: 0,uname,inst_name
0,mrzelkin1,aleksej zelkin
1,undruha,andrey gnelitskiy
2,vadimbhai,abhairov vadim
3,superov_sergey,sergey superov
4,smilychka,nastja gogol


In [5]:
# Load the 4-field VK export and give the two cleaned columns readable names.
vk = read_clean_csv('vk_personal.csv', 4)
vk.columns = ['uname', 'vk_name']
vk.head()




Unnamed: 0,uname,vk_name
0,mrzelkin1,alexey zelkin
1,undruha,andre undrukhov
2,vadimbhai,vadim reutsky
3,superov_sergey,sergey superov
4,smilychka,nastya gogol


In [6]:
# Join the two tables on the shared user name (default inner join).
df = inst.merge(vk, on='uname')

In [7]:
# Ratio of merged rows to VK rows; note the printed value is a plain ratio, not multiplied by 100.
common_ratio = df.shape[0] / vk.shape[0]
print('Percentage of common nodes:', common_ratio)
df.shape, inst.shape, vk.shape

Percentage of common nodes: 0.9320851128737034


((24443, 3), (24859, 2), (26224, 2))

### Generate and slice from Graph

In [26]:
# Scratch check: build a Barabasi graph and print the neighbour count of its first ten vertices.
g = ig.Graph.Barabasi(n=10**5, m=1, implementation='bag', directed=False)
for i in range(10):
    neighbour_ids = g.neighbors(i)
    print(len(neighbour_ids))

522
461
232
320
93
51
247
161
64
2


In [27]:
def generate_randomG(df, v_count = 100000):
    """Build a random Barabasi graph whose vertices are labelled from ``df``.

    Vertex ``i`` gets ``name = str(i)`` and, for every column of ``df`` except
    the first, the value found in row ``i % len(df)``; the rows are reused
    cyclically when the graph has more vertices than ``df`` has rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Source of the vertex attributes.  The first column is skipped.
    v_count : int, optional
        Number of vertices to generate.  This argument used to be ignored in
        favour of a hard-coded ``10**5``; the default is that same value.

    Returns
    -------
    igraph.Graph
        Undirected graph with vertex attributes ``name`` plus one attribute
        per copied column.

    Raises
    ------
    ValueError
        If ``df`` has no rows (previously a ``ZeroDivisionError``).
    """
    df_size = df.shape[0]
    if df_size == 0:
        raise ValueError('df must contain at least one row to label the vertices')

    # Alternative model tried earlier: ig.Graph.Erdos_Renyi(n = v_count, p = 2e-3)
    g = ig.Graph.Barabasi(n = v_count, m = 2, directed = False)
    n_vertices = g.vcount()

    # Assign each attribute in one bulk call rather than vertex by vertex with
    # a per-row ``df.iloc`` lookup.
    g.vs['name'] = [str(i) for i in range(n_vertices)]
    for c in list(df.columns[1:]):
        values = df[c].tolist()
        g.vs[c] = [values[i % df_size] for i in range(n_vertices)]
    return g

# Build the labelled random graph from the merged name table, then display its first vertex.
g = generate_randomG(df)
g.vs[0]




igraph.Vertex(<igraph.Graph object at 0x7fc2a21938b8>, 0, {'name': '0', 'inst_name': 'aleksej zelkin', 'vk_name': 'alexey zelkin'})

In [12]:
# Sanity check: vertex attribute names and the neighbour count of vertex 100.
g.vs.attribute_names(), len(g.vs[100].neighbors())

(['name', 'inst_name', 'vk_name'], 33)

In [39]:
%%time
def sample_graph(g, t, s, is_vk = True, min_degree = 10):
    """Draw a random sub-sample of ``g`` carrying one of its two name attributes.

    Every vertex of ``g`` is kept independently with probability ``t``.  An
    edge whose two endpoints were both kept is copied when at least one
    endpoint has degree below ``min_degree`` in ``g``; otherwise it is copied
    with probability ``s``.

    Parameters
    ----------
    g : igraph.Graph
        Source graph with vertex attributes ``name``, ``vk_name`` and
        ``inst_name``.
    t : float
        Vertex keep probability.
    s : float
        Keep probability for edges between two vertices of degree
        ``>= min_degree``.
    is_vk : bool, optional
        Copy ``vk_name`` (True) or ``inst_name`` (False) into the sample's
        ``fname`` attribute.
    min_degree : int, optional
        Degree below which an endpoint forces its edges to be kept
        (was a hard-coded 10).

    Returns
    -------
    igraph.Graph
        Undirected graph with vertex attributes ``name`` and ``fname``.

    Notes
    -----
    Prints vertex/edge counts of both graphs and the connectivity of the
    sample.  Vertices are matched between ``g`` and the sample by index, so
    duplicate ``name`` values in ``g`` no longer collapse onto one vertex.
    """
    # Local import: only `randint` is imported from `random` at file level.
    from random import random

    fname_k = 'vk_name' if is_vk else 'inst_name'
    names = g.vs['name']
    fnames = g.vs[fname_k]

    # random() lies in [0, 1), so t = 1 keeps every vertex.  The previous
    # randint(0, N) / N draw could return exactly 1.0 and drop one.
    kept = [v_idx for v_idx in tqdm(range(g.vcount()), total = g.vcount())
            if random() < t]

    gs = ig.Graph()
    for c in ['name', 'fname']:
        gs.vs[c] = ''
    gs.add_vertices(len(kept))
    if kept:
        gs.vs['name'] = [names[v_idx] for v_idx in kept]
        gs.vs['fname'] = [fnames[v_idx] for v_idx in kept]

    # Vertices were added in `kept` order, so the position in `kept` is the
    # vertex index inside gs.
    g_to_gs = {g_idx: gs_idx for gs_idx, g_idx in enumerate(kept)}

    # Degrees are taken from g and indexed with g's own vertex ids.  The old
    # code indexed g.vs with gs indices, which is only right when t == 1.
    degrees = g.degree()

    e_list = []
    for e in tqdm(g.es, total = g.ecount()):
        source = g_to_gs.get(e.source)
        target = g_to_gs.get(e.target)
        if source is None or target is None:
            continue
        if (degrees[e.source] < min_degree
                or degrees[e.target] < min_degree
                or random() < s):
            e_list.append((source, target))
    gs.add_edges(e_list)
    print('g:', g.vcount(), g.ecount(), 'gs:', gs.vcount(), gs.ecount())
    print('is_connected:', gs.is_connected(), 'components count:', len(gs.clusters()))
    return gs

# Draw two separate samples of the same graph: lg carries the VK names, rg the Instagram names.
t, s = 1, 0.95
lg = sample_graph(g, t, s, is_vk = True)
rg = sample_graph(g, t, s, is_vk = False)
rg.vs.attribute_names()





g: 100000 199997 gs: 100000 199572
is_connected: True components count: 1






g: 100000 199997 gs: 100000 199594
is_connected: True components count: 1
CPU times: user 3.12 s, sys: 24 ms, total: 3.15 s
Wall time: 3.13 s


In [40]:
# Make sure the output directory exists first, so the writes do not fail on a fresh checkout.
experiment_dir = os.path.join(folder, 'random_experiment')
os.makedirs(experiment_dir, exist_ok=True)
lg.write_pickle(fname=os.path.join(experiment_dir, 'lg.pickle'))
rg.write_pickle(fname=os.path.join(experiment_dir, 'rg.pickle'))

In [41]:
import itertools  as it
# Print both name variants side by side for the first ten vertices of lg;
# a vertex whose name lookup in rg raises ValueError is skipped.
for l in it.islice(lg.vs, 10):
    try:
        r = rg.vs.find(name=l['name'])
    except ValueError:
        pass
    else:
        print(l['fname'], '  |  ', r['fname'])

alexey zelkin   |   aleksej zelkin
andre undrukhov   |   andrey gnelitskiy
vadim reutsky   |   abhairov vadim
sergey superov   |   sergey superov
nastya gogol   |   nastja gogol
ilnur shaydullin   |   none
adelina ashrapova   |   adelina fazulzjanova
dmitry napolskikh   |   dmitrij napolskih
renata smirnova   |   renata smirnova
natalia sokolova   |   belorusskaja kosmetika


In [42]:
# Print the neighbour count of the first twenty vertices of lg.
for i in range(20):
    neighbour_ids = lg.neighbors(i)
    print(len(neighbour_ids))

1820
3198
1485
1740
404
723
642
313
371
522
978
1001
33
231
311
103
81
5
1011
75


In [34]:
# Write lg to 'test.ncol' (relative path, so it lands in the current working directory).
lg.save('test.ncol')

  return writer(f, *args, **kwds)
