In [1]:
# imports
import re
import time
import pickle
import logging
import gc
import os
import math
import functools
import requests

import pandas as pd
import numpy as np
import math as m
import matplotlib.pyplot as plt

import tensorflow as tf

from scipy import stats

from six.moves import xrange 
from pathlib import Path


# Notebook-wide logger; DEBUG level so progress messages from long cells show.
log = logging.getLogger('log')
log.setLevel(logging.DEBUG)

# Stream handler that prefixes every record with a timestamp and its level.
lhnd = logging.StreamHandler()
lhnd.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
lhnd.setFormatter(formatter)

# logging.getLogger('log') returns the same logger object on every call, so
# re-running this cell would otherwise stack one more handler each time and
# print every record repeatedly.  Attach the handler only once.
if not log.handlers:
    log.addHandler(lhnd)

%autonotify -a 30

In [2]:
# When True, load_or_dump ignores any existing file under data/ and recomputes
# (and rewrites) the result instead of loading it.
ignore_dumps = False

def lmap(f, arr):
    """Apply ``f`` to every element of ``arr`` and return the results as a list."""
    return [f(item) for item in arr]

def lfilter(f, arr):
    """Return a list of the elements of ``arr`` for which ``f`` is truthy.

    As with the built-in ``filter``, passing ``None`` as ``f`` keeps the
    elements that are themselves truthy.
    """
    keep = bool if f is None else f
    return [item for item in arr if keep(item)]

def foreach(it, f):
    """Call ``f`` on each element of ``it`` in order, discarding the results."""
    for item in it:
        f(item)
        
def dump(data, name):
    """Pickle ``data`` into the file ``data/<name>`` with the highest protocol.

    The ``data/`` directory is expected to exist already; an existing file
    with the same name is overwritten.
    """
    target = 'data/' + name
    with open(target, 'wb') as sink:
        pickle.dump(data, sink, protocol=pickle.HIGHEST_PROTOCOL)
        
def load(name):
    """Unpickle and return the object stored in ``data/<name>``.

    NOTE: unpickling can execute arbitrary code, so this must only be used on
    files this notebook wrote itself.
    """
    source = 'data/' + name
    with open(source, 'rb') as fh:
        payload = pickle.load(fh)
    return payload
    
def load_or_dump(path, func):
    """Return the cached result stored at ``data/<path>``, computing it if needed.

    If the file exists and the notebook-level ``ignore_dumps`` flag is falsy,
    the pickled value is loaded and returned.  Otherwise ``func()`` is called
    with no arguments, its result is pickled to ``data/<path>`` and returned.
    """
    cached = Path('data/' + path).exists()
    if cached and not ignore_dumps:
        return load(path)

    res = func()
    dump(res, path)
    return res


In [3]:
from time import sleep

# Read the API access token from a local file instead of embedding it in the
# notebook; only the first line is used, with surrounding whitespace removed.
with open('auth/token') as f:
    token = f.readline().strip()

def get_info(ids):
    """Fetch name, screen name and member count for a set of VK groups.

    Parameters
    ----------
    ids : iterable
        Group identifiers.  Each one is converted with ``str`` and all of
        them are sent as a single comma-separated ``group_ids`` value.

    Returns
    -------
    list of tuple
        One ``(name, screen_name, members)`` tuple per entry of the API
        response.  ``members`` is the member count formatted with thousands
        separators, or the integer ``-1`` when the entry has no
        ``members_count`` field.  An empty ``ids`` yields ``[]`` without
        contacting the API.

    Raises
    ------
    KeyError
        If the reply has no ``'response'`` key; the reply is printed first.
    """
    ids = list(ids)
    if not ids:
        # Nothing to look up; the original reduce() call failed on empty input.
        return []

    # Pause before every request (presumably to stay under the API rate
    # limit -- TODO confirm the actual limit).
    sleep(0.2)

    mc = 'members_count'
    payload = {
        'v': '5.92',
        'access_token': token,
        'fields': mc,
        'group_ids': ','.join(str(group_id) for group_id in ids),
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get('https://api.vk.com/method/groups.getById',
                     params=payload,
                     timeout=30)

    # Decode the body once and reuse it.
    body = r.json()

    if 'response' not in body:
        # Show the error payload before the lookup below raises KeyError.
        print(body)

    return [
        (x['name'], x['screen_name'], "{:,}".format(x[mc]) if mc in x else -1)
        for x in body['response']
    ]

In [4]:
# Denominator of the progress percentage logged by raw_data_filter
# (presumably the number of lines in the raw sessions file -- TODO confirm).
total = 947528

def raw_data_filter(file):
    """Parse raw session lines into lists of integer events.

    Each line is stripped of trailing whitespace and split on ``','``.  For
    every resulting token only the text after its last ``';'`` is kept (the
    whole token when it contains no ``';'``) and converted with ``int``;
    tokens that are not valid integers are skipped.

    Parameters
    ----------
    file : iterable of str
        Lines to parse, e.g. an open text file.

    Returns
    -------
    list of list of int
        One list of events per input line, in input order.  A line without
        any valid integer yields an empty list.
    """
    # Compiled once instead of per token.  The original also computed the
    # text before the first ';' for every token and then discarded it.
    up_to_last_semicolon = re.compile('.*;')

    res = []

    for i, line in enumerate(file, start=1):
        session = []

        for token in line.rstrip().split(','):
            try:
                session.append(int(up_to_last_semicolon.sub('', token)))
            except ValueError:
                # Not an integer event; ignore this token.
                continue

        res.append(session)

        if i % 100000 == 0:
            gc.collect()

            # Progress is relative to the notebook-level `total`.
            log.debug("%d %% of mapping is done.", i / total * 100)

    return res

In [5]:
def _parse_raw_sessions():
    """Parse the raw session export, closing the file once it has been read."""
    # The original opened the file inside a lambda and never closed it.
    with open("data/sessions_public.txt","r") as f:
        return raw_data_filter(f)


# Parsed sessions are cached under data/raw; parsing only runs on a cache miss.
raw_data = load_or_dump('raw', _parse_raw_sessions)

log.info("Data loaded")


2019-03-18 10:38:48,502 INFO Data loaded


In [6]:
def group_count(data):
    """Count in how many sessions' first component each group occurs.

    Parameters
    ----------
    data : iterable
        Sessions; only element ``[0]`` of each session is iterated.

    Returns
    -------
    dict
        Maps every group seen in some ``session[0]`` to its number of
        occurrences, in first-seen order.
    """
    counts = {}

    for session in data:
        for group in session[0]:
            counts[group] = counts.get(group, 0) + 1

    return counts

In [7]:
# Minimum number of entries a session's `sub` set needs for the session to be
# kept by initiail_mapping / set_map; generate_batch additionally asserts that
# it is at least 2 * window_size + 1.
min_session_size = 5

def initiail_mapping(lst, min_allowed):
    """Turn raw event lists into ``(sub, unsub)`` set pairs per session.

    Within a session a negative event ``-g`` is recorded in ``unsub`` as ``g``
    and any other event ``g`` is recorded in ``sub``.  As soon as a group shows
    up on the side opposite to where it already is (or after it was already
    flagged), it is flagged as malformed: it is removed from both sets and
    stays excluded for the rest of that session.

    Only sessions whose ``sub`` set has at least ``min_session_size``
    (notebook-level) entries are kept.

    Parameters
    ----------
    lst : iterable of iterable of int
        Raw sessions.
    min_allowed
        Not used by this function.

    Returns
    -------
    (list, set)
        The kept ``(sub, unsub)`` pairs and the set of all groups occurring in
        either set of a kept session.
    """
    result = []
    groups = set()

    for session in lst:
        sub, unsub, malformed = set(), set(), set()

        for event in session:
            # Pick the set this event belongs to and the one it conflicts with.
            if event < 0:
                group, own_side, other_side = -event, unsub, sub
            else:
                group, own_side, other_side = event, sub, unsub

            if group in other_side or group in malformed:
                sub.discard(group)
                unsub.discard(group)
                malformed.add(group)
            else:
                own_side.add(group)

        if len(sub) < min_session_size:
            continue

        for group in sub:
            groups.add(group)
        for group in unsub:
            groups.add(group)

        result.append((sub, unsub))

    return result, groups
    

def set_map(lst, cnt, min_allowed):
    """Drop rare groups from every session and re-filter sessions by size.

    A group stays in ``sub`` when ``cnt[group] > min_allowed`` and in ``unsub``
    when ``cnt.get(group, -1) > min_allowed`` (so groups missing from ``cnt``
    are dropped from ``unsub``).  Sessions whose filtered ``sub`` set has fewer
    than ``min_session_size`` (notebook-level) entries are discarded.

    Parameters
    ----------
    lst : iterable
        Sessions; element ``[0]`` is the sub collection, ``[1]`` the unsub one.
    cnt : dict
        Occurrence count per group; must contain every group found in a
        session's ``[0]`` component.
    min_allowed : int
        Exclusive lower bound on the count of a kept group.

    Returns
    -------
    (list, set)
        The kept ``(sub, unsub)`` pairs and the set of all groups they contain.
    """
    result = []
    groups = set()

    for session in lst:
        sub = {group for group in session[0] if cnt[group] > min_allowed}
        unsub = {group for group in session[1] if cnt.get(group, -1) > min_allowed}

        if len(sub) < min_session_size:
            continue

        for group in sub:
            groups.add(group)
        for group in unsub:
            groups.add(group)

        result.append((sub, unsub))

    return result, groups

def drop_uncommon(raw_data, min_allowed = 10):
    """Iteratively remove rare groups and the sessions that become too short.

    The raw sessions are first converted with ``initiail_mapping``.  Then, as
    long as the least frequent group (per ``group_count``) occurs fewer than
    ``min_allowed`` times, ``set_map`` is applied again: removing groups can
    shrink sessions below the size limit, which in turn lowers other groups'
    counts, hence the loop.  Progress is logged after every pass.

    Parameters
    ----------
    raw_data : iterable of iterable of int
        Raw sessions as accepted by ``initiail_mapping``.
    min_allowed : int, optional
        Count threshold, 10 by default.

    Returns
    -------
    (list, set)
        The remaining ``(sub, unsub)`` sessions and the set of their groups.
        Both are empty if no session survives.
    """
    data, groups = initiail_mapping(raw_data, min_allowed)
    cnt = group_count(data)

    # NOTE(review): set_map keeps groups with count > min_allowed, while this
    # loop already stops at count >= min_allowed, so a group with exactly
    # min_allowed occurrences survives only if no further pass runs -- confirm
    # which bound is intended.
    #
    # An empty `cnt` means no session is left; stop instead of failing.
    while cnt and min(cnt.values()) < min_allowed:
        data, groups = set_map(data, cnt, min_allowed)
        cnt = group_count(data)

        log.info("Length of data:   %d", len(data))
        # Every session is a (sub, unsub) pair, so summing len(session) (as
        # the original did) always gave 2 * len(data).  Report the number of
        # group entries across both sets instead.
        log.info("Total length:     %d",
                 sum(len(sub) + len(unsub) for sub, unsub in data))
        log.info("Number of groups: %d", len(groups))
        if cnt:
            log.info("Minimum count:    %d\n", min(cnt.values()))

    return data, groups

In [8]:
# Use the cached result under data/final_data when it exists.
ignore_dumps = False
data, groups = load_or_dump(
    'final_data',
    lambda: drop_uncommon(raw_data, 50),
)

# (group, count) pairs, most frequent first.
most_common = sorted(
    group_count(data).items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Index <-> group lookups; both follow the iteration order of `groups`.
i2w = dict(enumerate(groups))
w2i = {group: index for index, group in i2w.items()}

In [9]:
# Drop the reference to the parsed raw sessions and run a collection pass so
# their memory can be reclaimed.
# NOTE(review): the lambda passed to load_or_dump for 'final_data' reads
# raw_data, so re-running that cell after this one with a cache miss (or with
# ignore_dumps = True) would hand None to drop_uncommon.
raw_data = None

gc.collect()

0

In [231]:
# Cursor shared by successive generate_batch calls: index of the current
# session in `data` and index of the current centre element inside it.
session_dex = 0
event_dex = 0

def generate_batch(batch_size, negative_size, window_size = 1):
    """Produce the next batch of (centre, context) training pairs.

    The first component of the current session (``data[session_dex][0]``) is
    turned into a list, and for every centre position one pair is emitted per
    neighbour within ``window_size`` positions on either side.  The cursor is
    kept in the notebook-level ``session_dex`` / ``event_dex`` so consecutive
    calls continue where the previous one stopped; after the last session it
    wraps around to the first.

    Parameters
    ----------
    batch_size : int
        Number of pairs to emit; must be a multiple of ``2 * window_size``.
    negative_size : int
        Number of negative indices to return.
    window_size : int, optional
        Neighbours taken on each side of the centre.  ``min_session_size``
        must be at least ``2 * window_size + 1``.

    Returns
    -------
    batch : np.ndarray of int32, shape ``(batch_size,)``
        ``w2i`` index of the centre of every pair.
    labels : np.ndarray of int32, shape ``(batch_size, 1)``
        ``w2i`` index of the matching neighbour.
    negative : np.ndarray of int32, shape ``(negative_size,)``
        ``w2i`` indices of the second component (``[1]``) of the session the
        cursor points at once the batch is filled, padded with uniformly
        random indices below ``len(groups)``.
    """
    assert min_session_size >= window_size * 2 + 1
    assert batch_size % (window_size * 2) == 0

    global session_dex
    global event_dex

    labels = np.empty(shape=(batch_size, 1), dtype=np.int32)
    batch = np.empty(shape=(batch_size,), dtype=np.int32)
    negative = np.empty(shape=(negative_size,), dtype=np.int32)

    # The cursor survives between calls; make sure it still points inside
    # `data` in case that was rebuilt in the meantime.
    if session_dex >= len(data):
        session_dex = 0

    session = list(data[session_dex][0])

    # A previous call may have used a different window_size, leaving a cursor
    # that is too close to either end of the session for this window (which
    # would index out of range or silently wrap to negative indices).  This
    # also covers the initial event_dex == 0.
    if not (window_size <= event_dex < len(session) - window_size):
        event_dex = window_size

    # Relative positions of the neighbours, centre excluded.
    offsets = [j for j in range(-window_size, window_size + 1) if j != 0]

    current = 0

    while current < batch_size:
        centre = w2i[session[event_dex]]

        for i, j in enumerate(offsets):
            labels[current + i][0] = w2i[session[event_dex + j]]
            batch[current + i] = centre

        event_dex += 1
        current += window_size * 2

        # Move to the next session once the window would run past the end.
        if event_dex + window_size >= len(session):
            event_dex = window_size
            session_dex = session_dex + 1
            if session_dex >= len(data):
                session_dex = 0
            session = list(data[session_dex][0])

    # Fill as many negative slots as possible from the current session's
    # second component; zip stops at negative_size, which also handles
    # negative_size == 0.
    neg = 0

    for slot, group in zip(range(negative_size), data[session_dex][1]):
        negative[slot] = w2i[group]
        neg = slot + 1

    # Pad the remaining slots with random group indices.
    negative[neg:] = np.random.randint(len(groups), size=negative_size - neg)

    return batch, labels, negative


In [232]:
# Smoke test of generate_batch: 16 pairs, 5 negatives, window of 2.
batch, labels, negative = generate_batch(16, 5, 2)

# The first two sessions, for comparison with the pairs printed below.
print(data[0])
print(data[1])

# First ten pairs as "<index> <group> -> <index> <group>" (centre -> neighbour).
for i in range(10):
    print(batch[i], i2w[batch[i]], '->', labels[i, 0],
          i2w[labels[i, 0]])
    
print(negative)

({23372133, 1959, 20650061, 70034991, 22741624, 35540891, 75909948}, set())
({25794755, 91683885, 34812270, 39325103, 49128190, 34523318, 42533142, 46755517, 104237982}, set())
26468 20650061 -> 15329 23372133
26468 20650061 -> 745 1959
26468 20650061 -> 15791 70034991
26468 20650061 -> 24456 22741624
15791 70034991 -> 745 1959
15791 70034991 -> 26468 20650061
15791 70034991 -> 24456 22741624
15791 70034991 -> 7531 35540891
24456 22741624 -> 26468 20650061
24456 22741624 -> 15791 70034991
[19650 26897 16391  8306 36492]


In [281]:
#raw_data = None

# Step size handed to the gradient-descent optimizer in tf_train.
learning_rate = 0.5
# Number of rows of the embedding and output-weight matrices in tf_train.
vocabulary_size = len(groups)

# Neighbours taken on each side of the centre element by generate_batch.
window_size = 1
# Width of each embedding vector.
embedding_size = 48
# Number of negative indices requested from generate_batch per batch.
num_sampled = 5
# Pairs per batch; generate_batch asserts it is a multiple of 2 * window_size.
batch_size = 4

In [282]:
def get_closest(emb, index, f = None):
    """Print the groups whose embeddings have the largest dot product with one row.

    The dot product of row ``index`` with every row of ``emb`` is computed,
    the (up to) ten largest are selected, and for each of them the group id,
    the score and the ``get_info`` entry are printed in ascending score order
    (so the best match comes last).

    Parameters
    ----------
    emb : array-like, shape ``(n_groups, width)``
        Embedding matrix; row ``i`` belongs to group ``i2w[i]``.
    index : int
        Row to compare against all rows.
    f : writable text file, optional
        When given, every printed line is also written to it.

    Returns
    -------
    None
    """
    emb = np.asarray(emb)

    # Plain matrix-vector product.  The original built a TensorFlow constant
    # shaped with the notebook-level `embedding_size`, which fails for
    # embeddings of any other width and requires an active session.
    d = np.dot(emb, emb[index])

    # Indices of the highest scores, ascending; slicing (rather than a
    # `len - 10` range) also works when there are fewer than ten rows.
    dxs = np.argsort(d)[-10:]

    ids = [i2w[i] for i in dxs]
    res = [d[i] for i in dxs]

    info = get_info(ids)

    # NOTE(review): assumes get_info returns one entry per id in request
    # order -- confirm against the API.
    for group_id, score, details in zip(ids, res, info):
        print(group_id, ' ', score, ' ', details)

        if f is not None:
            f.write(str(group_id) + ' ' + str(score) + ' ' + str(details) + '\n')

In [283]:
# Number of training steps (one batch each) that tf_train runs per call.
num_steps = 200000

def mk_negative_samples(train_labels, num_sampled, negative_labels):
    """Build the ``sampled_values`` triple from externally supplied negatives.

    The log-uniform candidate sampler draws ``num_sampled`` unique positions
    from ``[0, num_sampled)``; each position is then replaced by the entry of
    ``negative_labels`` at that position.
    NOTE(review): with ``unique=True`` and ``range_max == num_sampled`` every
    position is presumably drawn exactly once, and the expected counts still
    describe the positions rather than the substituted labels -- confirm
    against the sampler documentation.

    Returns
    -------
    tuple
        ``(sampled candidates, true_expected_count, sampled_expected_count)``.
    """
    sampler_out = tf.random.log_uniform_candidate_sampler(
        true_classes=train_labels,
        num_true=1,
        num_sampled=num_sampled,
        unique=True,
        range_max=num_sampled,
        seed=None,
        name=None,
    )

    def pick(position):
        # Swap a sampled position for the supplied negative label.
        return negative_labels[position]

    sampled = tf.map_fn(pick, sampler_out.sampled_candidates)

    return (sampled,
            sampler_out.true_expected_count,
            sampler_out.sampled_expected_count)

def loss_function(nce_weights, nce_biases, train_labels, negative_labels, embed, num_sampled, vocabulary_size):
    """Return the sampled-softmax loss tensor for one batch.

    The negative classes are not drawn by TensorFlow's default sampler; they
    come from ``mk_negative_samples``, which is passed as ``sampled_values``.
    The result is whatever ``tf.nn.sampled_softmax_loss`` returns (the caller
    reduces it with ``tf.reduce_mean``).
    """
    sampled_values = mk_negative_samples(train_labels, num_sampled, negative_labels)

    per_example_loss = tf.nn.sampled_softmax_loss(
        weights=nce_weights,
        biases=nce_biases,
        labels=train_labels,
        inputs=embed,
        num_sampled=num_sampled,
        num_classes=vocabulary_size,
        num_true=1,
        sampled_values=sampled_values,
        remove_accidental_hits=True,
        partition_strategy='mod',
        name='sampled_softmax_loss',
        seed=None,
    )
    return per_example_loss


def tf_train(window_size, embedding_size, num_sampled, batch_size):
    """Build the embedding model, train it and return the graph and embeddings.

    A fresh graph is created with an embedding matrix, sampled-softmax output
    weights and biases, and a gradient-descent optimizer using the
    notebook-level ``learning_rate``.  Training runs for the notebook-level
    ``num_steps`` steps on batches from ``generate_batch``.  The loss summary
    is written to the ``tmp`` directory for TensorBoard, and the average loss
    is logged every 2000 steps.

    Parameters
    ----------
    window_size : int
        Passed to ``generate_batch``.
    embedding_size : int
        Width of the embedding vectors.
    num_sampled : int
        Number of negative samples per batch.
    batch_size : int
        Number of training pairs per batch.

    Returns
    -------
    (tf.Graph, np.ndarray)
        The graph and the embeddings with every row divided by its L2 norm,
        shape ``(vocabulary_size, embedding_size)``.
    """
    graph = tf.Graph()

    with graph.as_default():
        # Input data.
        with tf.name_scope('inputs'):
            train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
            negative_samples = tf.placeholder(tf.int64, shape=[num_sampled])
            train_labels = tf.placeholder(tf.int64, shape=[batch_size, 1])

        # Ops and variables pinned to the CPU because of missing GPU implementation
        with tf.device('/cpu:0'):
            # Look up embeddings for inputs.
            with tf.name_scope('embeddings'):
                embeddings = tf.Variable(
                    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
                )
                embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Output weights and biases of the sampled-softmax loss (the `nce_`
        # prefix is historical; the loss used is sampled softmax).
        with tf.name_scope('weights'):
            nce_weights = tf.Variable(
                tf.truncated_normal([vocabulary_size, embedding_size],
                                    stddev=1.0 / math.sqrt(embedding_size)))
        with tf.name_scope('biases'):
            nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

        with tf.name_scope('loss'):
            loss = tf.reduce_mean(loss_function(nce_weights,
                                                nce_biases,
                                                train_labels,
                                                negative_samples,
                                                embed,
                                                num_sampled,
                                                vocabulary_size))

        # Add the loss value as a scalar to summary.
        tf.summary.scalar('loss', loss)

        # SGD optimizer using the notebook-level learning_rate.
        with tf.name_scope('optimizer'):
            optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

        # L2-normalise every embedding row so that dot products between rows
        # are cosine similarities.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
        normalized_embeddings = embeddings / norm

        # Merge all summaries.
        merged = tf.summary.merge_all()

        # Add variable initializer.
        init = tf.global_variables_initializer()

        # Create a saver (not used further inside this function).
        saver = tf.train.Saver()

    with tf.Session(graph=graph) as session:

        # Open a writer to write summaries.
        writer = tf.summary.FileWriter("tmp", session.graph)

        # The original never closed the writer, so buffered summaries could be
        # lost and the event file stayed open; close it even if training fails.
        try:
            # We must initialize all variables before we use them.
            init.run()
            log.info('Initialized. Embedding size: %s; Num sampled: %s; Window size: %s; Batch size: %s', embedding_size, num_sampled, window_size, batch_size)
            average_loss = 0
            for step in xrange(num_steps):
                batch_inputs, batch_labels, batch_negative = generate_batch(batch_size, num_sampled, window_size)
                feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels, negative_samples:batch_negative}

                # Define metadata variable.
                run_metadata = tf.RunMetadata()

                # One update step: evaluate the optimizer op together with the
                # merged summaries and the loss value.
                _, summary, loss_val = session.run([optimizer, merged, loss],
                                                   feed_dict=feed_dict,
                                                   run_metadata=run_metadata)
                average_loss += loss_val

                # Add returned summaries to writer in each step.
                writer.add_summary(summary, step)
                # Add metadata to visualize the graph for the last run.
                if step == (num_steps - 1):
                    writer.add_run_metadata(run_metadata, 'step%d' % step)

                if step % 2000 == 0:
                    if step > 0:
                        average_loss /= 2000
                    # The average loss is an estimate of the loss over the
                    # last 2000 batches (at step 0 it is the single first loss).
                    log.debug('Average loss at step %d: %.4f', step, average_loss)
                    average_loss = 0

            final_embeddings = normalized_embeddings.eval()
        finally:
            writer.close()

    return graph, final_embeddings


In [284]:
# Train with the notebook-level hyperparameters; keep the graph (later cells
# open sessions on it) and the normalized embeddings for inspection.
graph, final_embeddings = tf_train(window_size, embedding_size, num_sampled, batch_size)

2019-03-18 18:12:56,152 INFO Initialized. Embedding size: 48; Num sampled: 5; Window size: 1; Batch size: 4
2019-03-18 18:12:56,208 DEBUG Average loss at step 0: 0.0039
2019-03-18 18:12:57,685 DEBUG Average loss at step 2000: 0.0087
2019-03-18 18:12:59,207 DEBUG Average loss at step 4000: 0.0080
2019-03-18 18:13:00,761 DEBUG Average loss at step 6000: 0.0081
2019-03-18 18:13:02,229 DEBUG Average loss at step 8000: 0.0084
2019-03-18 18:13:03,753 DEBUG Average loss at step 10000: 0.0075
2019-03-18 18:13:05,256 DEBUG Average loss at step 12000: 0.0081
2019-03-18 18:13:06,718 DEBUG Average loss at step 14000: 0.0075
2019-03-18 18:13:08,279 DEBUG Average loss at step 16000: 0.0072
2019-03-18 18:13:09,741 DEBUG Average loss at step 18000: 0.0088
2019-03-18 18:13:11,191 DEBUG Average loss at step 20000: 0.0080
2019-03-18 18:13:12,648 DEBUG Average loss at step 22000: 0.0079
2019-03-18 18:13:14,091 DEBUG Average loss at step 24000: 0.0081
2019-03-18 18:13:15,548 DEBUG Average loss at step 2600

<IPython.core.display.Javascript object>

In [285]:
from random import randint

def print_random(graph, final_embeddings, window_size, embedding_size, num_sampled):
    """Write a nearest-neighbour report for ten groups spread over the popularity ranking.

    For ``i`` in 0..9 a rank ``len(groups) * i // 10 + randint(0, 100)`` into
    ``most_common`` is chosen, and the closest groups to the group at that
    rank are printed and written via ``get_closest``.  The report goes to
    ``data/reports/es_<embedding_size>_ns_<num_sampled>_ws_<window_size>``.

    Any error is printed instead of raised, so one failing report does not
    abort the surrounding run.

    Parameters
    ----------
    graph : tf.Graph
        Graph on which the per-lookup sessions are opened.
    final_embeddings : array-like
        Embedding matrix handed to ``get_closest``.
    window_size, embedding_size, num_sampled
        Only used to build the report file name.
    """
    try:
        p = 'data/reports/' + 'es_' + str(embedding_size) + '_ns_' + str(num_sampled) + '_ws_' + str(window_size)
        # Group names contain non-ASCII text, so do not depend on the
        # platform's default encoding.
        with open(p, 'w', encoding='utf-8') as f:
            for i in range(0, 10):
                # get_closest may evaluate TensorFlow ops, so keep a default
                # session active around the call.
                with tf.Session(graph=graph):
                    index = len(groups) * i // 10 + randint(0, 100)
                    # most_common can be shorter than the computed rank
                    # (up to 100 is added); stay inside it.
                    index = min(index, len(most_common) - 1)

                    f.write(str(index) + '\n')
                    print(index)

                    get_closest(final_embeddings, w2i[most_common[index][0]], f)

                f.write('\n\n')
                print('\n')
    except Exception as err:
        # The original `except err:` referenced an undefined name and turned
        # every failure into a NameError.
        print(err)


In [286]:
# Write and print the nearest-neighbour report for the embeddings trained above.
print_random(graph, final_embeddings, window_size, embedding_size, num_sampled)

17
105503338   0.49311572   ('СпортНяшечка', 'club105503338', '72,334')
132782438   0.49472433   ('Боевые Жигули', 'boevaja_classica', '517,595')
86526388   0.49519143   ('Работа Астана', 'job_astana', '33,609')
145233167   0.4958241   ('BTS', 'lovvekorean', '134,595')
94404133   0.5004687   ('Бесплатный Челябинск', 'ch_free', '31,827')
17154489   0.50466454   ('Психология отношений | Статусы ♂♀ ♥  by Love ♥', 'psiholog_otnosheniy', '77,035')
115954385   0.50707656   ('Одетая в Счастье. Психология/Эзотерика', 'club115954385', '18,484')
34084756   0.5082171   ('PRO FOTO', 'profotopro', '137,378')
70181160   0.5413219   ('Hearthstone с Хаппой', 'clubhappa', '37,261')
40567146   0.9999999   ('Лайфхак', 'lhack', '7,944,169')


4946
31955706   0.4758619   ('السكوت من ذهب', 'alishaisakova', '139,285')
139355889   0.47945952   ('Бесплатно за репост | Конкурсы', 'freefor_repost', -1)
145491320   0.48079926   ('Full HD', 'full_hd2', '23,488')
166991867   0.50103796   ('you disappoint me', 'club

In [287]:
# Hand-picked group ids to inspect.
test_ids=[129440544, 28261334, 92876084, 51016572, 91933860]

with tf.Session(graph=graph) as session:
    # Loop variable renamed from `id`, which shadowed the builtin for every
    # later cell of the notebook.
    for group_id in test_ids:
        get_closest(final_embeddings, w2i[group_id])
        print("\n")

96482844   0.49202156   ('Бетке айтамын◄◄', 'club96482844', '79,611')
95128329   0.49394947   ('КаРтЕлЬ', 'showroom121', '39,835')
93965789   0.49399173   ('Элитный клуб  "AlexGrom"', 'alexgrombet', '20,422')
132685380   0.49459141   ('STARRY SKY', 'starry_skyy', '68,922')
38594501   0.49731797   ('Берёзка', 'berezka', '159,671')
32194500   0.49982646   ('Брат, только держись♔', 'ceny_brat', '3,420,282')
129149255   0.5003519   ('Порно гифки', 'gif_sex_porn', '39,981')
44429130   0.501256   ('Девушка мечты - Красота Мода Стиль', 'devushka_m', '534,636')
76271043   0.5273528   ('Новый криптомир', 'ccgfund', '114,193')
129440544   1.0   ('eternal classic', 'eternalclassic', '132,568')


62611753   0.4772137   ('Очумелые ручки', 'crazy.hands', '474,380')
33833860   0.47859693   ('Бег | Музыка для тренировок', 'vk_run', '163,110')
103231362   0.47870976   ('our world shinobi', 'our_world_shinobi', '43,196')
9581149   0.48787045   ('Cultural and Education Section, British Embassy', 'rubriti

In [288]:
def train():
    """Train and report on every combination of a small hyperparameter grid.

    For each embedding size, number of negative samples and window size the
    model is trained with ``tf_train`` and a report is written with
    ``print_random``.  Every window size is paired with the batch size at the
    same position, each of which is a multiple of ``2 * window_size`` as
    ``generate_batch`` requires.

    NOTE(review): ``generate_batch`` also asserts
    ``min_session_size >= 2 * window_size + 1``; with ``min_session_size = 5``
    that fails for window sizes 3, 4 and 5 -- confirm the intended grid.
    """
    embedding_sizes = [32, 48, 64]
    num_sampled_arr = [32, 48, 64]
    window_sizes = [1, 2, 3, 4, 5]
    batch_sizes = [128, 128, 132, 128, 130]

    for embedding_size in embedding_sizes:
        for num_sampled in num_sampled_arr:
            for window_size, batch_size in zip(window_sizes, batch_sizes):
                graph, final_embeddings = tf_train(window_size, embedding_size, num_sampled, batch_size)

                # The original call omitted graph and final_embeddings, which
                # print_random requires as its first two arguments.
                print_random(graph, final_embeddings, window_size, embedding_size, num_sampled)