In [100]:
import igraph as ig
import sys, time, re
from random import randint
import os
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import pylab as plt
import numpy as np
from importlib import reload
import sys
import pickle
sys.path.append('../scripts')
%matplotlib inline 
folder = '../data/'

import ml_utils as utils
reload(utils)

<module 'ml_utils' from '../scripts/ml_utils.py'>

### Read graphs

In [83]:
import cyrtranslit

def read_edges(f_name):
    print(f_name)
    g = ig.Graph.Read_Ncol(f_name, names=True, directed=False)
    ig.summary(g)
    return g

def enrich_vk_graph(g):
    inst_dict = dict()
    pat = re.compile("(\d+),(.*),(.*),(.*)")
    pat_word = re.compile('[^a-zA-Zа-яА-Я\d\s]+')
    
    g.vs['fname'] = ''
    
    with open(folder + 'vk_personal2.csv', 'r') as f:
        for line in f:
            try:
                uid, uname, name1, name2 = pat.match(line).groups()
                name1 = re.sub(pat_word, '', name1).strip().lower()
                name2 = re.sub(pat_word, '', name2).strip().lower()
                inst_dict[uid] = (uname, name1 + ' ' + name2)
            except AttributeError:
                print(line)
    for v in g.vs:
        uname, fname = inst_dict[v['name']]
        v['name'] = uname
        v['fname'] = cyrtranslit.to_latin(fname, 'ru').replace("'", '')

def enrich_insta_graph(g):
    inst_dict = dict()
    pat = re.compile("(\d+),(.*),(.*)")
    pat_word = re.compile('[^a-zA-Zа-яА-Я\d\s]+')
    
    g.vs['fname'] = ''
    
    with open(folder + 'inst_personal.csv', 'r') as f:
        for line in f:
            uid, uname, fname = pat.match(line).groups()
            fname = re.sub(pat_word, '', fname).strip().lower()
            inst_dict[uid] = (uname, fname)

    for v in g.vs:
        uname, fname = inst_dict[v['name']]
        v['name'] = uname
        v['fname'] = cyrtranslit.to_latin(fname, 'ru').replace("'", '')

In [84]:
inst_g = read_edges(folder + 'inst_lid_rid.csv')
enrich_insta_graph(inst_g)

# vk_g = read_edges(folder + 'vk_lid_rid.csv')
# enrich_vk_graph(vk_g)

../data/inst_lid_rid.csv
IGRAPH UN-- 20794 240414 -- 
+ attr: name (v)


In [None]:
vk_g.write_pickle(fname=os.path.join(folder, 'vk.pickle'))
inst_g.write_pickle(fname=os.path.join(folder, 'inst.pickle'))

In [5]:
vk_g = ig.Graph.Read_Pickle(os.path.join(folder, 'vk.pickle'))
inst_g = ig.Graph.Read_Pickle(os.path.join(folder, 'inst.pickle'))

### matches to lid_rid

In [41]:
folder_matches = '../matches'

def read_matches(matches_file_name, threshold):
    fname = os.path.join(folder_matches, '%.3d' % threshold, matches_file_name)
    print(fname)
    matches = pickle.load(open(fname, 'rb'))
    print('matches len', len(matches))
    return matches

threshold=91
matches_file_name = 'matches_s_01_th_091_t_10-11_14:21.pickle'

matches = read_matches(matches_file_name, threshold)

../matches/091/matches_s_01_th_091_t_10-11_14:21.pickle
matches len 4885


In [107]:
def transform_matches_to_lid_rid(matches, df, to_log=False):    
    df = utils.read_combine_df(from_raw = False, merge_how='outer')
    lid_rid = []
    for lnode, rnode in tqdm(matches):
        if to_log:
            print(lnode, rnode)
            row = df[df['uname'] == vk_g.vs[lnode]['name']]
            print(row[['uname', 'uid_vk','name_vk']].values[0], vk_g.vs[lnode]['fname'])
            
            row = df[df['uname'] == inst_g.vs[rnode]['name']]
            print(row[['uname', 'uid_inst', 'name_inst']].values[0], inst_g.vs[rnode]['fname'])            
            print('----------------------------------')
        
        lid = df[df['uname'] == vk_g.vs[lnode]['name']]['uid_vk'].values[0]
        rid = df[df['uname'] == inst_g.vs[rnode]['name']]['uid_inst'].values[0]
        lid_rid.append((int(rid),int(lid)))
    print(len(matches), len(matches) == len(lid_rid))
    return lid_rid

lid_rid = transform_matches_to_lid_rid(matches, df)

       uid_inst           uname          name_inst    uid_vk          name_vk
0  7.510266e+08       mrzelkin1     aleksej zelkin    5394.0    alexey zelkin
1  2.744159e+06         undruha  andrey gnelitskiy   22884.0  andre undrukhov
2  4.157858e+09       vadimbhai     abhairov vadim   23754.0    vadim reutsky
3  3.893359e+08  superov_sergey     sergey superov   89831.0   sergey superov
4  1.992385e+08       smilychka       nastja gogol  103177.0     nastya gogol





ValueError: cannot convert float NaN to integer

In [106]:
reload(utils)
utils.precision_recall(lid_rid)

(0.0, 0.0)

### Analysis

In [None]:
def degree_dist(g):
    l = []
    for v in g.vs:
        l.append(v.degree())
    plt.hist(l, bins=30)
    plt.title('Degree distribution')
    plt.show()
    
def double_deg_dist(g):
    l = []
    for v in g.vs:
        for vn in v.neighbors():
            l.append(vn.degree())
    plt.hist(l, bins=30)
    plt.title('Double Degree distribution')
    plt.show()
    
def double_deg_dist_mean(g):
    l = []
    for v in g.vs:
        s = []
        for vn in v.neighbors():
            s.append(vn.degree())
        l.append(sum(s)/len(s))
    plt.hist(l, bins=30)
    plt.title('Mean of double degree distribution')
    plt.show()
    
def double_deg_dist_median(g):
    l = []
    for v in g.vs:
        s = []
        for vn in v.neighbors():
            s.append(vn.degree())
        l.append(sorted(s)[len(s)//2])
    plt.hist(l, bins=30)
    plt.title('Median of double degree distribution')
    plt.show()
    
def plot_dist(g):
    degree_dist(g)
    double_deg_dist_mean(g)
    double_deg_dist(g)
    double_deg_dist_median(g)

In [None]:
plot_dist(vk_g)

In [None]:
plot_dist(inst_g)

### Choose name similarity function

In [8]:
reload(utils)
df = utils.read_combine_df(from_raw = False)
df.head()







Unnamed: 0,uid_inst,uname,name_inst,uid_vk,name_vk
0,751026638,mrzelkin1,aleksej zelkin,5394,alexey zelkin
1,2744159,undruha,andrey gnelitskiy,22884,andre undrukhov
2,4157858483,vadimbhai,abhairov vadim,23754,vadim reutsky
3,389335890,superov_sergey,sergey superov,89831,sergey superov
4,199238507,smilychka,nastja gogol,103177,nastya gogol


In [10]:
def soundex(name, len=4):
    """ soundex module conforming to Knuth's algorithm
        implementation 2000-12-24 by Gregory Jorgensen
        public domain
    """

    # digits holds the soundex values for the alphabet
    digits = '01230120022455012623010202'
    sndx = ''
    fc = ''

    # translate alpha chars in name to soundex digits
    for c in name.upper():
        if c.isalpha():
            if not fc: fc = c   # remember first letter
            d = digits[ord(c)-ord('A')]
            # duplicate consecutive soundex digits are skipped
            if not sndx or (d != sndx[-1]):
                sndx += d

    # replace first digit with first alpha character
    sndx = fc + sndx[1:]

    # remove all 0s from the soundex code
    sndx = sndx.replace('0','')

    # return soundex code padded to len characters
    return (sndx + (len * '0'))[:len]

def soundex_sim(a,b):
    a = soundex(a)
    b = soundex(b)
    return fuzz.token_sort_ratio(a,b) / 100

soundex('ildar nurgaliev')

'I436'

In [15]:
from fuzzywuzzy import fuzz
import Levenshtein as lev
from difflib import SequenceMatcher
from time import time

a = 'Ildar Nurgaliev'
b = 'Nurgaliev Ildar'

s = time()
print(fuzz.ratio(a,b) / 100)
e = time()
print('\tfuzz.ratio', e-s)

s = time()
print(fuzz.partial_ratio(a,b) / 100)
e = time()
print('\tpartial_ratio', e-s)

s = time()
print(fuzz.token_sort_ratio(a,b) / 100)
e = time()
print('\ttoken_sort_ratio', e-s)

s = time()
print(fuzz.token_set_ratio(a,b) / 100)
e = time()
print('\ttoken_set_ratio', e-s)

s = time()
print(lev.ratio(a,b))
e = time()
print('\tlev.ratio', e-s)

s = time()
seq = SequenceMatcher(None, a, b)
print(seq.ratio())
e = time()
print('\tSequenceMatcher', e-s)

s = time()
print(soundex_sim(a,b))
e = time()
print('\tsound', e-s)

0.6
	fuzz.ratio 0.0001049041748046875
0.6
	partial_ratio 7.534027099609375e-05
1.0
	token_sort_ratio 8.535385131835938e-05
1.0
	token_set_ratio 9.012222290039062e-05
0.6
	lev.ratio 5.316734313964844e-05
0.6
	SequenceMatcher 0.00011563301086425781
0.25
	sound 0.0002810955047607422


In [21]:
df.head()

Unnamed: 0,uid_inst,uname,name_inst,uid_vk,name_vk
0,751026638,mrzelkin1,aleksej zelkin,5394,alexey zelkin
1,2744159,undruha,andrey gnelitskiy,22884,andre undrukhov
2,4157858483,vadimbhai,abhairov vadim,23754,vadim reutsky
3,389335890,superov_sergey,sergey superov,89831,sergey superov
4,199238507,smilychka,nastja gogol,103177,nastya gogol


In [31]:
def test_name_sim_fs():
    for row in df[['uname', 'name_inst', 'name_vk']].values[:150]:
        
        a,b = row[1:]
        s1 = lev.ratio(a,b)
        s2 = fuzz.token_sort_ratio(a,b) / 100
        s3 = fuzz.token_set_ratio(a,b) / 100
        c = s1 < s2
        if c:
            print('%s \t %s : %s' % (row[0], row[1], row[2]))
            print('!' if c else '','%.3f %.3f %.3f' % (s1, s2, s3))
        print()

test_name_sim_fs()



vadimbhai 	 abhairov vadim : vadim reutsky
! 0.370 0.520 0.560


smilychka 	 nastja gogol : nastya gogol
! 0.917 0.920 0.920





tatbeauty_ru 	 belorusskaja kosmetika : natalia sokolova
! 0.316 0.320 0.320

stasechka_ 	 garifullina anastasia : anastasia garifullina
! 0.524 1.000 1.000

iloshap 	 iloshap : ilona pekerman
! 0.476 0.480 0.480




more__life 	 valerya vorobeva : valeria vorobyeva
! 0.909 0.910 0.910










_alena_ko 	 alena pesoshina : alyona koroleva
! 0.467 0.470 0.470


retro_live_kzn 	 stanislav k : stanislav kotov
! 0.846 0.850 0.900

dj_xaker 	 djpate xakera : ildar khusainov
! 0.357 0.360 0.360






i.razetdinov 	 i r  ilgiz razetdinov : ilgiz razetdinov
! 0.865 0.890 1.000

nozdrina_t 	 tatjana nozdrina : tatyana nozdrina
! 0.938 0.940 0.940

alexandrievlev 	 aleksandr ievlev : alexander ievlev
! 0.875 0.880 0.880

a.petyhov 	 andrej petuhov : andrey petukhov
! 0.897 0.900 0.900


regina_aibyatova 	 regina : regina aybyatova
! 0.545 0.550 1.000











lo

### Distribution analysis

In [None]:
def run_permutation_test(pooled,sizeZ,sizeY,delta):
    np.random.shuffle(pooled)
    starZ = pooled[:sizeZ]
    starY = pooled[-sizeY:]
    return starZ.mean() - starY.mean()

def bootstrap_test(z,y):
    numSamples = 10000
    
    z = np.array(z)
    y = np.array(y)

    pooled = np.hstack([z,y])
    delta = z.mean() - y.mean()
    estimates = np.array(list(map(lambda x: run_permutation_test(pooled,z.size,y.size,delta),range(numSamples))))
    diffCount = len(np.where(estimates <= delta)[0])
    hat_asl_perm = 1.0 - (float(diffCount)/float(numSamples))
    return hat_asl_perm


bootstrap_test([94,197,16,38,99,141,23], [52,104,146,10,51,30,40,27,46])

In [None]:
def feature(vl, bins=31, size=50):
    feature_set = [0 for i in range(2 * bins)]
    _1hop = [v.degree() for v in vl.neighbors()]
    for h in _1hop:
        if h < bins * size:
            feature_set[int(h / size)] += 1
#     _2hop = []
#     for vs in vl.neighbors():
#         _2hop += [v.degree() for v in vs.neighbors()]
#     for h in _2hop:
#         if h < bins * size:
#             feature_set[bins + int(h / size)] += 1
    return feature_set

a = feature(v)

If the K-S statistic is small or the p-value is high, then we cannot reject the hypothesis that the distributions of the two samples are the same.

In [None]:
import itertools  as it
from scipy.stats import ks_2samp
from difflib import SequenceMatcher
import numpy as np
from random import random

def dist_sim(vr, vl):
    sl = [v.degree() for v in vl.neighbors()]
    sr = [v.degree() for v in vr.neighbors()]
    return ks_2samp(sl, sr)


def dist_sim2(vl, vr):
#     sl = []
#     for vs in vl.neighbors():
#         sl += [v.degree() for v in vs.neighbors()]
        
#     sr = []
#     for vs in vl.neighbors():
#         sr += [v.degree() for v in vs.neighbors()]
#     sl = [v.degree() for v in vl.neighbors()]
#     sr = [v.degree() for v in vr.neighbors()]
    bins=11
    size=50
    sl = feature(vl, bins, size)
    sr = feature(vr, bins, size)
    return bootstrap_test(sl, sr)

def neigbor_deg_dist(lg, rg):
    seq = SequenceMatcher()
#     for vl, vr in zip(it.islice(lg.vs, 40,50), it.islice(rg.vs, 10,30)):
    for vl in it.islice(lg.vs, 41, 42):
        vrt = rg.vs.find(name = vl['name'])
        
        for vr in [vrt] + vrt.neighbors():
            seq.set_seqs(vl['fname'], vr['fname'])
            t2 = dist_sim2(vl, vr)
            s = []
            for vn in vl.neighbors():
                s.append(vn.degree())
            f, axarr = plt.subplots(nrows = 1, ncols=2)
            axarr[0].hist(s, bins=30)
            axarr[0].set_title('%s %d\n %s %f\n %f' % (vl['name'], vl.degree(), vl['fname'], seq.ratio(), t2))
            for vn in vr.neighbors():
                s.append(vn.degree())
            axarr[1].hist(s, bins=30)

            plt.title('%s %d\n %s' % (vr['name'], vr.degree(), vr['fname']))
            plt.show()
            print(dist_sim(vl, vr))
    
neigbor_deg_dist(vk_g, inst_g)