In [1]:
# imports
import re
import time
import pickle
import logging
import gc
import os
import math
import functools
import requests

import pandas as pd
import numpy as np
import math as m
import matplotlib.pyplot as plt

import tensorflow as tf

from scipy import stats

from six.moves import xrange 
from pathlib import Path


log = logging.getLogger('log')
log.setLevel(logging.DEBUG)

lhnd = logging.StreamHandler()
lhnd.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
lhnd.setFormatter(formatter)

log.addHandler(lhnd)

%autonotify -a 30

In [2]:
ignore_dumps = False

def lmap(f, arr):
    return list(map(f, arr))

def lfilter(f, arr):
    return list(filter(f, arr))

def foreach(it, f):
    for e in it:
        f(e)
        
def dump(data, name):
    with open('data/' + name, 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        
def load(name):
    with open('data/' + name, 'rb') as f:
        return pickle.load(f)
    
def load_or_dump(path, func):
    if not Path('data/' + path).exists() or ignore_dumps:
        res = func()
    
        dump(res, path)
    else:
        res = load(path)
        
    return res


In [57]:
from time import sleep

with open('auth/token') as f:
    token = f.readline().strip()

def get_info(ids):
    sleep(0.2)
    mc = 'members_count'
    payload = {'v': '5.92', 'access_token': token, 'fields':mc}
    
    str_ids = functools.reduce(
        lambda x, y: x + y,
        lmap(lambda x: str(x) + ',', ids)
    )
    
    payload['group_ids'] = str_ids[0:- 1]
    
    r = requests.get('https://api.vk.com/method/groups.getById', 
                     params=payload)
    
    if (not 'response' in r.json()):
        print(r.json())
        
    res = lmap(lambda x: (x['name'], x['screen_name'], "{:,}".format(x[mc]) if mc in x else -1),r.json()['response'])
    return(res)

In [4]:
total = 947528

def raw_data_filter(file):
    # Mapping to events
    res = list()

    i = 0
    
    for line in file:
        cur = line.rstrip().split(',')
        cur = lmap(lambda p: (re.sub(';.*', '', p), re.sub('.*;', '', p)), cur)

        session = list()
        
        for j in range(0, len(cur)):
            try:
                session.append(int(cur[j][1]))
            except ValueError:
                None
                
        res.append(session)

        i = i + 1
                
        if (i % 100000 == 0):
            gc.collect()

            log.debug("%d %% of mapping is done.", i / total * 100)

    
    return res

In [5]:
raw_data = load_or_dump('raw', lambda: raw_data_filter(open("data/sessions_public.txt","r")))

log.info("Data loaded")


2019-03-17 16:13:14,563 DEBUG 10 % of mapping is done.
2019-03-17 16:13:26,355 DEBUG 21 % of mapping is done.
2019-03-17 16:13:44,403 DEBUG 31 % of mapping is done.
2019-03-17 16:14:00,651 DEBUG 42 % of mapping is done.
2019-03-17 16:14:16,835 DEBUG 52 % of mapping is done.
2019-03-17 16:14:35,332 DEBUG 63 % of mapping is done.
2019-03-17 16:14:53,979 DEBUG 73 % of mapping is done.
2019-03-17 16:16:11,982 DEBUG 84 % of mapping is done.
2019-03-17 16:17:05,157 DEBUG 94 % of mapping is done.
2019-03-17 16:17:09,733 INFO Data loaded


<IPython.core.display.Javascript object>

In [29]:
def group_count(data):
    total = dict()

    for i in data:
        for j in i[0]:
            if (j in total.keys()):
                total[j] = total[j] + 1
            else:
                total[j] = 1
                
    return total

In [34]:
min_session_size = 5

def initiail_mapping(lst, min_allowed):
    result = []
    groups = set()
    
    for session in lst:
        unsub = set()
        sub = set()
        malformed = set()
        
        for event in session:
            if (event < 0):
                sub_event = -event
                
                if (sub_event in sub or sub_event in malformed):
                    sub.discard(sub_event)
                    unsub.discard(sub_event)
                    malformed.add(sub_event)
                else:
                    unsub.add(sub_event)
            else:
                if (event in unsub or event in malformed):
                    unsub.discard(event)
                    sub.discard(event)
                    malformed.add(event)
                else:
                    sub.add(event)
        
        if (len(sub) >= min_session_size):
            for event in sub:
                groups.add(event)
            for event in unsub:
                groups.add(event)
            
            result.append((sub, unsub))
    
    return result, groups
    

def set_map(lst, cnt, min_allowed):
    result = []
    groups = set()
    
    for session in lst:
        unsub = set()
        sub = set() 
        
        for event in session[0]:
            if (cnt[event] > min_allowed):
                sub.add(event)
                
        for event in session[1]:
            if (cnt.get(event, -1) > min_allowed):
                unsub.add(event)    
        
        if (len(sub) >= min_session_size):
            for event in sub:
                groups.add(event)
            for event in unsub:
                groups.add(event)
            
            result.append((sub, unsub))
    
    return result, groups

def drop_uncommon(raw_data, min_allowed = 10):
    cnt = None
    sorted_cnt = None
    
    data, groups = initiail_mapping(raw_data, min_allowed)
    cnt = group_count(data) 
    sorted_cnt = sorted(list(cnt.values()))
    
    while (cnt == None or sorted_cnt[0] < min_allowed):
        data, groups = set_map(data, cnt, min_allowed)
                
        cnt = group_count(data) 
        sorted_cnt = sorted(list(cnt.values()))
        
        log.info("Length of data:   %d", len(data))
        log.info("Total length:     %d", 
                functools.reduce((lambda x, y: x + y), lmap(lambda a: len(a), data))
                )
        log.info("Number of groups: %d", len(groups))
        log.info("Minimum count:    %d\n", sorted_cnt[0])
        
    return data, groups

In [35]:
ignore_dumps = True
data, groups = load_or_dump('final_data', lambda: drop_uncommon(raw_data, 50))

most_common = sorted(group_count(data).items(), key=lambda x: x[1], reverse=True)

w2i = {w: i for i, w in enumerate(groups)}
i2w = {i: w for i, w in enumerate(groups)}

2019-03-17 16:55:32,370 INFO Length of data:   318524
2019-03-17 16:55:32,450 INFO Total length:     637048
2019-03-17 16:55:32,450 INFO Number of groups: 50062
2019-03-17 16:55:32,451 INFO Minimum count:    16

2019-03-17 16:55:45,852 INFO Length of data:   316823
2019-03-17 16:55:45,925 INFO Total length:     633646
2019-03-17 16:55:45,926 INFO Number of groups: 48561
2019-03-17 16:55:45,926 INFO Minimum count:    42

2019-03-17 16:56:03,204 INFO Length of data:   316710
2019-03-17 16:56:03,278 INFO Total length:     633420
2019-03-17 16:56:03,279 INFO Number of groups: 48490
2019-03-17 16:56:03,279 INFO Minimum count:    44

2019-03-17 16:56:20,517 INFO Length of data:   316686
2019-03-17 16:56:20,591 INFO Total length:     633372
2019-03-17 16:56:20,592 INFO Number of groups: 48482
2019-03-17 16:56:20,592 INFO Minimum count:    44

2019-03-17 16:56:38,089 INFO Length of data:   316666
2019-03-17 16:56:38,167 INFO Total length:     633332
2019-03-17 16:56:38,167 INFO Number of group

<IPython.core.display.Javascript object>

In [37]:
raw_data = None

gc.collect()

0

In [38]:
session_dex = 0
event_dex = 0

def generate_batch(batch_size, window_size = 1):
    assert min_session_size >= window_size * 2 + 1 
    assert batch_size % (window_size * 2) == 0
    
    global session_dex
    global event_dex
    
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    
    if (event_dex == 0):
        event_dex = window_size
    
    current = 0
    session = list(data[session_dex][0])
    
    while (current < batch_size):
        i = 0
        for j in range(-window_size, window_size + 1):
            if (j != 0):
                batch[current + i] = w2i[session[event_dex + j]]
                labels[current + i][0] = w2i[session[event_dex]]
                i += 1

        event_dex += 1
        current += window_size * 2

        if (event_dex + window_size >= len(session)):
            event_dex = window_size
            session_dex = session_dex + 1
            if (session_dex >= len(data)):
                session_dex = 0
            session = list(data[session_dex][0])

     
    return batch, labels


In [39]:
batch, labels = generate_batch(16, 2)

print(data[0])
print(data[1])

for i in range(10):
    print(batch[i], i2w[batch[i]], '->', labels[i, 0],
          i2w[labels[i, 0]])

({23372133, 1959, 20650061, 70034991, 22741624, 35540891, 75909948}, set())
({25794755, 91683885, 34812270, 39325103, 104237982, 34523318, 42533142, 46755517, 49128190}, set())
15333 23372133 -> 26475 20650061
745 1959 -> 26475 20650061
15795 70034991 -> 26475 20650061
24463 22741624 -> 26475 20650061
745 1959 -> 15795 70034991
26475 20650061 -> 15795 70034991
24463 22741624 -> 15795 70034991
7531 35540891 -> 15795 70034991
26475 20650061 -> 24463 22741624
15795 70034991 -> 24463 22741624


In [40]:
#raw_data = None

learning_rate = 1
vocabulary_size = len(groups)

window_size = 3
embedding_size = 48
num_sampled = 64
batch_size = 128

In [41]:
def get_closest(emb, index, f = None):
    p = emb[index]
    cnst = tf.constant(p, shape=[1, embedding_size])
    d = tf.matmul(cnst, emb, transpose_b=True).eval()[0]

    dxs = np.argsort(np.array(d))
    
    ids = []
    res = []
    
    for i in range(len(dxs) - 10, len(dxs)):
        ids.append(i2w[dxs[i]])
        res.append(d[dxs[i]])
        
    info = get_info(ids)
    
    for i in xrange(len(res)):
        print(ids[i], ' ', res[i], ' ', info[i])
        
        if (f != None):
            f.write(str(ids[i]) + ' ' + str(res[i]) + ' ' + str(info[i]) + '\n')

In [42]:
num_steps = 200000

def tf_train(window_size, embedding_size, num_sampled, batch_size):
    graph = tf.Graph()
    
    with graph.as_default():
        # Input data.
        with tf.name_scope('inputs'):
            train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
            train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

        # Ops and variables pinned to the CPU because of missing GPU implementation
        with tf.device('/cpu:0'):
            # Look up embeddings for inputs.
            with tf.name_scope('embeddings'):
                embeddings = tf.Variable(
                    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
                )
                embed = tf.nn.embedding_lookup(embeddings, train_inputs)

          # Construct the variables for the NCE loss
        with tf.name_scope('weights'):
            nce_weights = tf.Variable(
                tf.truncated_normal([vocabulary_size, embedding_size],
                stddev=1.0 / math.sqrt(embedding_size)))
        with tf.name_scope('biases'):
            nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

        # Compute the average NCE loss for the batch.
        # tf.nce_loss automatically draws a new sample of the negative labels each
        # time we evaluate the loss.
        # Explanation of the meaning of NCE loss:
        #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
        with tf.name_scope('loss'):
            loss = tf.reduce_mean(
                tf.nn.nce_loss(
                      weights=nce_weights,
                      biases=nce_biases,
                      labels=train_labels,
                      inputs=embed,
                      num_sampled=num_sampled,
                      num_classes=vocabulary_size))

            # Add the loss value as a scalar to summary.
        tf.summary.scalar('loss', loss)

        # Construct the SGD optimizer using a learning rate of 1.0.
        with tf.name_scope('optimizer'):
              optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

        # Compute the cosine similarity between minibatch examples and all
        # embeddings.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
        normalized_embeddings = embeddings / norm
        #valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
        #                                          valid_dataset)
        #similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

        # Merge all summaries.
        merged = tf.summary.merge_all()

        # Add variable initializer.
        init = tf.global_variables_initializer()

        # Create a saver.
        saver = tf.train.Saver()
        
    with tf.Session(graph=graph) as session:     
        
        # Open a writer to write summaries.
        writer = tf.summary.FileWriter("tmp", session.graph)

        # We must initialize all variables before we use them.
        init.run()
        log.info('Initialized. Embedding size: %s; Num sampled: %s; Window size: %s; Batch size: %s', embedding_size, num_sampled, window_size, batch_size)
        average_loss = 0
        for step in xrange(num_steps):
            batch_inputs, batch_labels = generate_batch(batch_size)
            feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

            # Define metadata variable.
            run_metadata = tf.RunMetadata()

            # We perform one update step by evaluating the optimizer op (including it
            # in the list of returned values for session.run()
            # Also, evaluate the merged op to get all summaries from the returned
            # "summary" variable. Feed metadata variable to session for visualizing
            # the graph in TensorBoard.
            _, summary, loss_val = session.run([optimizer, merged, loss],
                                             feed_dict=feed_dict,
                                             run_metadata=run_metadata)
            average_loss += loss_val

            # Add returned summaries to writer in each step.
            writer.add_summary(summary, step)
            # Add metadata to visualize the graph for the last run.
            if step == (num_steps - 1):
                writer.add_run_metadata(run_metadata, 'step%d' % step)

            if step % 2000 == 0:
                if step > 0:
                      average_loss /= 2000
                # The average loss is an estimate of the loss over the last 2000
                # batches.
                log.debug('Average loss at step %d: %.4f', step, average_loss)
                average_loss = 0

            #if step % 20000 == 0 and step != 0:
                #print('Most closest to ', most_common[0][0])
                #get_closest(normalized_embeddings.eval(), w2i[most_common[0][0]])

        final_embeddings = normalized_embeddings.eval()

    return graph, final_embeddings


In [43]:
graph, final_embeddings = tf_train(window_size, embedding_size, num_sampled, batch_size)

2019-03-17 17:33:02,152 INFO Initialized. Embedding size: 48; Num sampled: 64; Window size: 3; Batch size: 128
2019-03-17 17:33:02,237 DEBUG Average loss at step 0: 291.5664
2019-03-17 17:33:04,284 DEBUG Average loss at step 2000: 138.7917
2019-03-17 17:33:06,306 DEBUG Average loss at step 4000: 60.2892
2019-03-17 17:33:08,273 DEBUG Average loss at step 6000: 34.3937
2019-03-17 17:33:10,218 DEBUG Average loss at step 8000: 21.7610
2019-03-17 17:33:12,160 DEBUG Average loss at step 10000: 16.1726
2019-03-17 17:33:14,119 DEBUG Average loss at step 12000: 11.7242
2019-03-17 17:33:16,064 DEBUG Average loss at step 14000: 9.6428
2019-03-17 17:33:18,012 DEBUG Average loss at step 16000: 8.0565
2019-03-17 17:33:19,967 DEBUG Average loss at step 18000: 6.6830
2019-03-17 17:33:21,915 DEBUG Average loss at step 20000: 6.3439
2019-03-17 17:33:23,859 DEBUG Average loss at step 22000: 5.6677
2019-03-17 17:33:25,818 DEBUG Average loss at step 24000: 5.5724
2019-03-17 17:33:27,768 DEBUG Average loss 

<IPython.core.display.Javascript object>

In [44]:
from random import randint

def print_random(graph, final_embeddings, window_size, embedding_size, num_sampled):
    try:
        p = 'data/reports/' + 'es_' + str(embedding_size) + '_ns_' + str(num_sampled) + '_ws_' + str(window_size)
        with open(p, 'w') as f:
            for i in range(0, 10):
                with tf.Session(graph=graph) as session:
                    index = len(groups) * i // 10 + randint(0, 100)

                    f.write(str(index) + '\n')
                    print(index)

                    get_closest(final_embeddings, w2i[most_common[index][0]], f)

                f.write('\n\n')
                print('\n')
    except err:
        print(err)
        None


In [58]:
print_random(graph, final_embeddings, window_size, embedding_size, num_sampled)

29
77093415   0.76223373   ('Лайфхакум | Советы, хитрости, идеи', 'vk_lifehack_club', '1,266,203')
42440233   0.77038264   ('Музыка', 'exp.music', '1,789,340')
124999723   0.7732413   ('Строительство и ремонт', 'stroy_ok', '567,107')
54391852   0.77620864   ('Падик', 'club54391852', '1,641,563')
85087785   0.7827162   ('AuRuM TV ˖ Clash Royale ˖ Brawl Stars', 'aurum_tv', '735,320')
158473256   0.79303634   ('как так?', 'hzkaktak', '875,764')
26419239   0.79396236   ('Смейся до слёз :D', 'ifun', '11,186,708')
35061290   0.8380492   ('Эгоист', 'e_goist', '4,756,878')
94255146   0.880185   ('Реально смешно', 'onovoe', '1,984,204')
133668394   0.99999994   ('Заброшенное', 'zabroshenoevk', '3,096,028')


4867
94943864   0.69538283   ('12 стульев', 'twelve_h', '456,835')
106277494   0.7013999   ('Азбука ремонта | Строительство', 'azbuka.remonta', '1,173,632')
159450742   0.7036063   ('Рифмач', 'vizhu_rifmi', '877,194')
156222072   0.7179694   ('Позорно', 'pozornnoo', '923,786')
151414392   0

In [343]:
def train():
    embedding_sizes = [32, 48, 64]
    num_sampled_arr = [32, 48, 64]
    window_sizes = [1, 2, 3, 4, 5]
    batch_sizes = [128, 128, 132, 128, 130]
    for i in xrange(len(embedding_sizes)):
        for j in xrange(len(num_sampled_arr)):
            for k in xrange(len(window_sizes)):
                embedding_size = embedding_sizes[i]
                num_sampled = num_sampled_arr[j]
                window_size = window_sizes[k]
                batch_size = batch_sizes[k]
                
                graph, final_embeddings = tf_train(window_size, embedding_size, num_sampled, batch_size)
                
                print_random(window_size, embedding_size, num_sampled)

In [None]:
train()