In [None]:
# imports
import re
import time
import pickle
import logging
import gc
import os
import math
import functools
import requests

import pandas as pd
import numpy as np
import math as m
import matplotlib.pyplot as plt

import tensorflow as tf

from scipy import stats

from six.moves import xrange 


# Notebook-wide logger: DEBUG and above go to the stream handler below with a
# timestamped format.
log = logging.getLogger('log')
log.setLevel(logging.DEBUG)

# logging.getLogger('log') returns the same logger object on every call, so
# re-running this cell would otherwise stack one more handler each time and
# every message would be printed repeatedly. Drop stale handlers first.
for stale_handler in list(log.handlers):
    log.removeHandler(stale_handler)

lhnd = logging.StreamHandler()
lhnd.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
lhnd.setFormatter(formatter)

log.addHandler(lhnd)

%autonotify -a 30

In [None]:
def lmap(f, arr):
    """Apply ``f`` to every element of ``arr`` and return the results as a list."""
    mapped = []
    mapped.extend(map(f, arr))
    return mapped

def lfilter(f, arr):
    """Return a list of the elements of ``arr`` accepted by ``filter(f, ...)``."""
    kept = []
    kept.extend(filter(f, arr))
    return kept

def foreach(it, f):
    """Call ``f`` on each element of ``it`` in order; results are discarded."""
    for item in it:
        f(item)

In [None]:
# Read the API access token from the first line of the local file auth/token
# (surrounding whitespace stripped). get_info() sends it as `access_token`.
with open('auth/token') as f:
    token = f.readline().strip()
    


def get_info(ids):
    """Fetch name, screen name and member count for the given group ids.

    Sends one GET request to the VK ``groups.getById`` method with all ids
    joined by commas, authenticated with the module-level ``token``.

    Args:
        ids: iterable of group ids (anything ``str()`` can render).

    Returns:
        A list of ``(name, screen_name, members_count)`` tuples, one per entry
        of the ``response`` array; ``members_count`` is -1 when the entry has
        no such field. An empty ``ids`` yields ``[]`` without any request.

    Raises:
        RuntimeError: if the reply has no ``response`` key (e.g. it carries an
            API error object instead).
    """
    ids = list(ids)

    # Nothing to look up: skip the request instead of sending an empty id list.
    if not ids:
        return []

    mc = 'members_count'
    payload = {
        'v': '5.92',
        'access_token': token,
        'fields': mc,
        'group_ids': ','.join(str(group_id) for group_id in ids),
    }

    # Without a timeout a stalled connection would block the notebook forever.
    r = requests.get('https://api.vk.com/method/groups.getById',
                     params=payload,
                     timeout=30)
    body = r.json()

    if 'response' not in body:
        raise RuntimeError('groups.getById failed: %s' % (body.get('error', body),))

    return [(group['name'], group['screen_name'], group.get(mc, -1))
            for group in body['response']]

In [None]:
# Denominator for the progress percentage logged by raw_data_filter().
# Presumably the number of lines in data/sessions_public.txt -- TODO confirm.
total = 947528

def raw_data_filter(file):
    """Turn raw session lines into lists of integer events.

    Each line is split on commas; for every comma-separated token only the
    text after its last ';' is kept and parsed with ``int``. Tokens whose tail
    is not a valid integer are skipped.

    Args:
        file: iterable of text lines (e.g. an open text file).

    Returns:
        A list with one list of ints per input line, in input order. A line
        with no parsable token contributes an empty list.
    """
    res = []

    for i, line in enumerate(file, start=1):
        session = []

        for token_text in line.rstrip().split(','):
            # Only the part after the last ';' is used; the prefix before it
            # was previously extracted with a regex and then thrown away.
            value = token_text.rsplit(';', 1)[-1]
            try:
                session.append(int(value))
            except ValueError:
                # Best-effort parsing: non-numeric tokens are dropped on purpose.
                continue

        res.append(session)

        if i % 100000 == 0:
            # Periodically release garbage and report progress against the
            # module-level `total`.
            gc.collect()

            log.debug("%d %% of mapping is done.", i / total * 100)

    return res

In [None]:
# Parse the raw sessions file. The context manager closes the file handle as
# soon as parsing is done instead of leaving it open for the kernel's lifetime.
with open("data/sessions_public.txt", "r") as sessions_file:
    raw_data = raw_data_filter(sessions_file)
    


In [None]:
def group_count(data):
    """Count how many times each element occurs across all sessions.

    Args:
        data: iterable of sessions, each an iterable of hashable elements.

    Returns:
        dict mapping element -> number of occurrences, in first-seen order.
    """
    counts = {}

    for session in data:
        for group in session:
            counts[group] = counts.get(group, 0) + 1

    return counts

In [None]:
def set_map(lst, cnt, min_allowed, min_session_size):
    """Reduce each session to its set of still-subscribed groups.

    A negative event ``-g`` marks group ``g`` as unsubscribed: it is removed
    from the session's set and any later occurrence of ``g`` in the same
    session is ignored. A non-negative event is kept when ``cnt`` is None or
    when ``cnt[event]`` is strictly greater than ``min_allowed``.

    Args:
        lst: iterable of sessions (iterables of int events).
        cnt: mapping event -> count, or None to disable the count filter.
        min_allowed: count threshold (exclusive) applied when ``cnt`` is given.
        min_session_size: sessions with fewer surviving groups are dropped.

    Returns:
        ``(sessions, groups)``: the list of surviving sets and the set of all
        groups that appear in them.
    """
    kept_sessions = []
    seen_groups = set()
    no_count_filter = cnt is None

    for events in lst:
        left = set()
        joined = set()

        for ev in events:
            if ev < 0:
                group_id = -ev
                joined.discard(group_id)
                left.add(group_id)
                continue

            if ev in left:
                continue

            if no_count_filter or cnt[ev] > min_allowed:
                joined.add(ev)

        if len(joined) < min_session_size:
            continue

        seen_groups.update(joined)
        kept_sessions.append(joined)

    return kept_sessions, seen_groups

def drop_uncommon(raw_data, min_allowed = 10, min_session_size = 4):
    """Repeatedly prune rare groups and short sessions until counts settle.

    Each pass runs ``set_map`` (which keeps groups whose count from the
    previous pass is strictly greater than ``min_allowed`` and drops sessions
    smaller than ``min_session_size``) and then recounts with ``group_count``.
    The loop stops once the smallest count is at least ``min_allowed``, or as
    soon as nothing is left to count.

    Args:
        raw_data: iterable of sessions (iterables of int events).
        min_allowed: occurrence threshold for a group.
        min_session_size: minimum number of groups a session must retain.

    Returns:
        ``(data, groups)`` as produced by the last ``set_map`` pass. If the
        filtering removes everything, ``data`` is empty and ``groups`` is an
        empty set.
    """
    cnt = None
    min_count = None

    data = raw_data
    groups = None

    while cnt is None or min_count < min_allowed:
        data, groups = set_map(data, cnt, min_allowed, min_session_size)

        cnt = group_count(data)
        # Only the minimum is needed; default covers the nothing-left case.
        min_count = min(cnt.values(), default=0)

        log.info("Length of data:   %d", len(data))
        log.info("Total length:     %d", sum(len(session) for session in data))
        log.info("Number of groups: %d", len(groups))
        log.info("Minimum count:    %d", min_count)

        if not cnt:
            # Everything was filtered out; another pass cannot change that,
            # so stop instead of indexing into an empty count list.
            log.warning("No groups left after filtering.")
            break

    return data, groups

In [None]:
# Prune rare groups / short sessions with a count threshold of 10.
data, groups = drop_uncommon(raw_data, 10)

# (group, occurrence count) pairs, most frequent first.
most_common = sorted(group_count(data).items(), key=lambda pair: pair[1], reverse=True)

# Dense index <-> group id lookup tables, both derived from one enumeration
# of `groups`.
i2w = dict(enumerate(groups))
w2i = {w: i for i, w in i2w.items()}

In [None]:
# Cursor into `data` shared across generate_batch() calls: which session is
# being read and the position of the current window inside it.
session_dex = 0
event_dex = 0

def generate_batch(batch_size):
    """Produce the next ``batch_size`` (input, label) index pairs.

    Walks the module-level ``data`` sessions with a window of three
    neighbouring elements: the middle element is the input and the elements
    before and after it are its two labels, so every window contributes two
    consecutive rows. All ids are translated through ``w2i``. The read
    position is kept in the globals ``session_dex`` / ``event_dex`` and wraps
    around to the first session after the last one.

    Args:
        batch_size: number of rows to produce. When it is odd, the final
            window contributes only its first (input, previous-element) row.

    Returns:
        ``(batch, labels)``: int32 arrays of shape ``(batch_size,)`` and
        ``(batch_size, 1)``.
    """
    global session_dex
    global event_dex

    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    session = list(data[session_dex])
    row = 0

    while row < batch_size:
        center = w2i[session[event_dex + 1]]

        batch[row] = center
        labels[row][0] = w2i[session[event_dex]]

        # Guard the second row of the pair: with an odd batch_size it would
        # fall one past the end of the arrays.
        if row + 1 < batch_size:
            batch[row + 1] = center
            labels[row + 1][0] = w2i[session[(event_dex + 2) % len(session)]]

        row += 2
        event_dex += 1

        # Window would run past the session: move on to the next session,
        # wrapping to the first one at the end of the data.
        if event_dex + 2 >= len(session):
            event_dex = 0
            session_dex = session_dex + 1
            if session_dex >= len(data):
                session_dex = 0
            session = list(data[session_dex])

    return batch, labels


In [None]:
# Smoke test: pull one small batch and show it next to the first two sessions.
batch, labels = generate_batch(10)

print(data[0])
print(data[1])

# One line per row: input index, its group id, '->', label index, its group id.
for center, context in zip(batch, labels[:, 0]):
    print(center, i2w[center], '->', context,
          i2w[context])

In [None]:
# Training hyper-parameters.
batch_size = 128                # rows per training batch; fixes the placeholder shapes
embedding_size = 128            # length of each embedding vector
num_sampled = 64                # passed to tf.nn.nce_loss as num_sampled
vocabulary_size = len(groups)   # one embedding row per retained group

# Dedicated graph that the model below is built into.
graph = tf.Graph()

In [None]:
# Build the embedding model inside `graph`: an embedding table trained with an
# NCE loss by plain gradient descent, plus summary, initializer and saver ops.
with graph.as_default():
    # Input data: a batch of input indices and one label index per input.
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        # Embedding table initialised uniformly in [-1, 1); look up the rows
        # for the current inputs.
        with tf.name_scope('embeddings'):
            embeddings = tf.Variable(
                tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
            )
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the NCE loss.
    with tf.name_scope('weights'):
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
            stddev=1.0 / math.sqrt(embedding_size)))
    with tf.name_scope('biases'):
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    # tf.nn.nce_loss automatically draws a new sample of the negative labels
    # each time we evaluate the loss.
    # Explanation of the meaning of NCE loss:
    #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.nce_loss(
                  weights=nce_weights,
                  biases=nce_biases,
                  labels=train_labels,
                  inputs=embed,
                  num_sampled=num_sampled,
                  num_classes=vocabulary_size))

    # Add the loss value as a scalar to summary.
    tf.summary.scalar('loss', loss)

    # Construct the SGD optimizer using a learning rate of 1.0.
    with tf.name_scope('optimizer'):
          optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # L2-normalise every embedding row. Dot products between rows of
    # `normalized_embeddings` are therefore cosine similarities.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    # Disabled: similarity against a validation set (`valid_dataset` is not
    # defined in the visible cells).
    #valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
    #                                          valid_dataset)
    #similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

    # Merge all summaries.
    merged = tf.summary.merge_all()

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # Create a saver.
    saver = tf.train.Saver()



In [None]:
def get_closest(emb, index, top_n=10):
    """Print the ``top_n`` rows of ``emb`` with the largest dot product to row ``index``.

    Scores are plain dot products; they equal cosine similarities when the
    rows of ``emb`` are L2-normalised. Results are printed in ascending score
    order, one line per group: group id, score and the tuple returned by
    ``get_info`` for that group.

    The scores are computed with NumPy, so no TensorFlow ops are added to the
    graph and no active session is required.

    Args:
        emb: 2-D array-like of embeddings, one row per vocabulary index.
        index: row of ``emb`` to compare every row against.
        top_n: how many of the highest-scoring rows to report. If ``emb`` has
            fewer rows, all of them are reported.
    """
    emb = np.asarray(emb)
    d = np.dot(emb, emb[index])

    dxs = np.argsort(d)

    # Clamp the start so that fewer than top_n rows never yields negative
    # (wrapping) indices.
    nearest = dxs[max(len(dxs) - top_n, 0):]

    ids = [i2w[row] for row in nearest]
    res = [d[row] for row in nearest]

    if not ids:
        return

    info = get_info(ids)

    for i, (group_id, score) in enumerate(zip(ids, res)):
        print(group_id, ' ', score, ' ', info[i])



In [None]:
# Number of training batches to run.
num_steps = 337960

with tf.Session(graph=graph) as session:
    # Open a writer to write summaries.
    writer = tf.summary.FileWriter("tmp", session.graph)

    try:
        # We must initialize all variables before we use them.
        init.run()
        print('Initialized')

        average_loss = 0
        for step in xrange(num_steps):
            batch_inputs, batch_labels = generate_batch(batch_size)
            feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

            # Define metadata variable.
            run_metadata = tf.RunMetadata()

            # We perform one update step by evaluating the optimizer op
            # (including it in the list of returned values for session.run()).
            # Also, evaluate the merged op to get all summaries from the
            # returned "summary" variable. Feed metadata variable to session
            # for visualizing the graph in TensorBoard.
            _, summary, loss_val = session.run([optimizer, merged, loss],
                                               feed_dict=feed_dict,
                                               run_metadata=run_metadata)
            average_loss += loss_val

            # Add returned summaries to writer in each step.
            writer.add_summary(summary, step)
            # Add metadata to visualize the graph for the last run.
            if step == (num_steps - 1):
                writer.add_run_metadata(run_metadata, 'step%d' % step)

            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last
                # 2000 batches (at step 0 it is the loss of the first batch).
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0

            # Periodically show the current neighbours of the most frequent group.
            if step % 20000 == 0 and step != 0:
                print('Most closest to ', most_common[0][0])
                get_closest(normalized_embeddings.eval(), w2i[most_common[0][0]])

        final_embeddings = normalized_embeddings.eval()
    finally:
        # Flush and close the event file even if training is interrupted;
        # otherwise buffered summaries may never reach disk.
        writer.close()



In [15]:
# Print the nearest neighbours of the (up to) ten most frequent groups.
# A single session serves all calls, and slicing keeps this working when
# fewer than ten groups exist.
with tf.Session(graph=graph) as session:
    for group_id, _occurrences in most_common[:10]:
        get_closest(final_embeddings, w2i[group_id])

        print('\n')


150388347   0.576734   ('Модные Прически', 'modnue.pricheski', 1381005)
151779453   0.57750046   ('Я никогда не...', 'never_did', 691581)
73776762   0.5840418   ('Восемнадцать плюс', 'vosplus', 2003817)
71474813   0.5844791   ('Реальный Футбол | Битва поколений', 'refoot', 2531028)
28293246   0.59798276   ('Just love', 'vk.just.love', 3906799)
22741624   0.6135372   ('Улетные приколы', 'humour.page', 5214000)
30532220   0.61808896   ('СМС приколы :D', 'i4sms', 3272684)
29573241   0.6675201   ('NR', 'rapnewrap', 4741209)
26147450   0.6954052   ('Школа? Не, не слышали', 'onesc', 3863075)
27895931   0.9999999   ('Новинки Музыки 2019 | Новая Музыка', 'exclusive_muzic', 16384210)


23758942   0.5705687   ('Взрослей', 'pafos_oo', 2999892)
105999460   0.57192314   ('Мои аудиозаписи', 'my_audios', 2622243)
31613023   0.576375   ('Чёртов стыд', 'grebaniy_stid', 1598456)
149537884   0.59037364   ('Смотри что Я сделяль!', 'sdelyall', 1219087)
116779618   0.5985114   ('Кактус Коля', 'kaktuskola', 

In [25]:
# Probe one more embedding: the row at vocabulary index len(groups) // 4 * 3,
# i.e. about three quarters of the way through the index range. The session
# context is there because get_closest evaluates a TensorFlow op.
with tf.Session(graph=graph) as session:
    get_closest(final_embeddings, len(groups) // 4 * 3)

80701061   0.43979344   ('Apple Барахолка Саратов', 'applebarahlo64', 7344)
46007784   0.44030026   ('YouTube Angelville', 'yt_angelville', 23418)
43686317   0.4434111   ('Работа Вакансии Работодатели в Калининграде', 'jobs39', 90756)
63303836   0.44487107   ('kipish', 'kipish088', 244231)
152366773   0.44687134   ('Рыбалка с AliExpress', 'club152366773', 4800)
109496640   0.45852435   ('Барахолка Псков', 'club109496640', 12758)
102268559   0.46084452   ('7NN 🌴', '7noname', 39879)
135339893   0.4645276   ('CrockidUfa (КрокидУфа) Детская одежда в Уфе', 'crockidufa', 5438)
71458926   0.50234413   ('Бронницы 24', 'bron24', 8847)
8780658   1.0000001   ('Кубик Рубика. Спидкубинг (официальная группа)', 'ruspeedcubing', 45648)
