In [None]:
# imports
import re
import time
import pickle
import logging
import gc
import os
import math
import functools
import requests

import pandas as pd
import numpy as np
import math as m
import matplotlib.pyplot as plt

import tensorflow as tf

from scipy import stats

from six.moves import xrange 
from pathlib import Path


# Notebook-wide logger: everything from DEBUG upwards is written to the cell output
# through a single stream handler using the "time level message" format below.
log = logging.getLogger('log')
log.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')

lhnd = logging.StreamHandler()
lhnd.setLevel(logging.DEBUG)
lhnd.setFormatter(formatter)

# logging.getLogger returns the same object for the same name, so re-running this
# cell would otherwise stack one more handler each time and duplicate every message.
for _stale_handler in list(log.handlers):
    log.removeHandler(_stale_handler)

log.addHandler(lhnd)

%autonotify -a 30

In [None]:
# Pipeline selector read by later cells:
#   0 - load and filter the session data from data/public_sessions_2.txt
#   1 - load the plain-text word data from data/text8.txt
mode = 0

In [None]:
# Read by load_or_dump: when True the value is recomputed and the pickle under data/
# is overwritten even if a cached copy already exists.
ignore_dumps = False

def lmap(f, arr):
    """Apply ``f`` to every element of ``arr`` and return the results as a list."""
    return [f(item) for item in arr]

def lfilter(f, arr):
    """Return a list of the elements of ``arr`` for which ``f`` is truthy.

    As with the built-in ``filter``, passing ``None`` for ``f`` keeps the truthy elements.
    """
    keep = bool if f is None else f
    return [item for item in arr if keep(item)]

def foreach(it, f):
    """Call ``f`` once per element of ``it``, in order, purely for its side effects."""
    for item in it:
        f(item)
        
def dump(data, name):
    """Pickle ``data`` into the file ``data/<name>`` using the highest pickle protocol.

    The path is relative to the current working directory; an existing file is overwritten.
    """
    target = 'data/' + name
    with open(target, 'wb') as out:
        pickle.dump(data, out, protocol=pickle.HIGHEST_PROTOCOL)
        
def load(name):
    """Unpickle and return the object stored in ``data/<name>``.

    Unpickling can execute arbitrary code, so this should only be pointed at files
    this notebook produced itself.
    """
    source = 'data/' + name
    with open(source, 'rb') as src:
        payload = pickle.load(src)
    return payload
    
def load_or_dump(path, func):
    """Return the cached pickle ``data/<path>`` or compute and cache it.

    The cached copy is used only when the file exists and the module-level
    ``ignore_dumps`` flag is falsy. Otherwise ``func()`` is called, its result is
    written to ``data/<path>`` via ``dump``, and that result is returned.
    """
    cached = Path('data/' + path).exists()

    if cached and not ignore_dumps:
        return load(path)

    fresh = func()
    dump(fresh, path)
    return fresh


In [None]:
from time import sleep

# Read the API access token from the first line of the local file auth/token
# (surrounding whitespace stripped). get_info sends it as the access_token parameter.
with open('auth/token') as f:
    token = f.readline().strip()

def get_info(ids):
    """Fetch name, screen name and member count for groups via the groups.getById API method.

    Args:
        ids: iterable of group ids; each is converted with ``str`` and the results are
            sent comma-separated in the ``group_ids`` parameter.

    Returns:
        A list with one ``(name, screen_name, members)`` tuple per entry of the API
        response. ``members`` is the member count formatted with thousands separators,
        or the integer -1 when the entry has no ``members_count`` field. An empty
        ``ids`` yields ``[]`` without contacting the API.

    Raises:
        RuntimeError: the API reply has no ``response`` field; the decoded reply is
            included in the message.
    """
    id_strings = [str(group_id) for group_id in ids]

    if not id_strings:
        # Nothing to look up; skip the request entirely.
        return []

    sleep(0.2)  # pause before every request (presumably to stay under the API rate limit)
    mc = 'members_count'
    payload = {
        'v': '5.92',
        'access_token': token,
        'fields': mc,
        'group_ids': ','.join(id_strings),
    }

    # Without a timeout a stalled connection would block the notebook indefinitely.
    r = requests.get('https://api.vk.com/method/groups.getById',
                     params=payload, timeout=30)
    body = r.json()

    if 'response' not in body:
        raise RuntimeError('groups.getById returned no "response" field: {!r}'.format(body))

    return [
        (x['name'], x['screen_name'], "{:,}".format(x[mc]) if mc in x else -1)
        for x in body['response']
    ]

In [None]:
# Expected number of input lines; only used to turn the line counter into a
# progress percentage in raw_data_filter's log output.
total = 1015925

def raw_data_filter(file):
    """Parse raw session lines into lists of integer events.

    Each line is stripped of trailing whitespace and split on ','. For every field
    only the text after its last ';' is kept (the whole field when it has no ';') and
    converted with ``int``; fields that are not valid integers are skipped.

    Args:
        file: iterable of text lines (for example an open text file).

    Returns:
        A list with one list of ints per input line, in input order. A line without
        any valid integer produces an empty list.
    """
    res = []

    for line_number, line in enumerate(file, start=1):
        session = []

        for field in line.rstrip().split(','):
            raw_event = field.rpartition(';')[2]

            try:
                session.append(int(raw_event))
            except ValueError:
                # Not an integer event: deliberately skipped.
                continue

        res.append(session)

        if line_number % 100000 == 0:
            gc.collect()

            log.debug("%d %% of mapping is done.", line_number / total * 100)

    return res

In [None]:
if (mode == 0):
    def _read_raw_sessions():
        """Parse data/public_sessions_2.txt and close the file once parsing is done."""
        with open("data/public_sessions_2.txt", "r") as sessions_file:
            return raw_data_filter(sessions_file)

    # The parsed sessions are cached by load_or_dump under the name 'raw'.
    raw_data = load_or_dump('raw', _read_raw_sessions)

    log.info("Data loaded")


In [None]:
def group_count(data):
    """Count how often each event occurs in element 0 of every item of ``data``.

    Returns a dict mapping event -> number of occurrences; element 1 and beyond of
    each item are ignored.
    """
    counts = {}

    for record in data:
        for event in record[0]:
            counts[event] = counts.get(event, 0) + 1

    return counts

In [None]:
def load_words_data(file):
    """Return every whitespace-separated word of ``file`` as one flat list, in reading order.

    ``file`` is any iterable of text lines.
    """
    return [word for line in file for word in line.split()]

In [None]:
if (mode == 1):
    # Number of most frequent words that keep their own token.
    words_size = 50000

    # Force load_or_dump to rebuild the word list even if data/raw_txt exists.
    # The flag is module-level, so it stays True for later load_or_dump calls too.
    ignore_dumps = True

    def _read_words():
        """Read data/text8.txt into a word list and close the file afterwards."""
        with open("data/text8.txt", "r") as words_file:
            return load_words_data(words_file)

    data = load_or_dump('raw_txt', _read_words)
    groups = group_count([[data]])

    # Sort (word, count) pairs by ascending count and keep the last words_size words.
    dictlist = list(groups.items())
    dictlist.sort(key = lambda x: x[1])
    allowed = set(lmap(lambda x: x[0], dictlist[-words_size:]))

    # Every other word is replaced in place by the placeholder token '-1'.
    for i in xrange(len(data)):
        if data[i] not in allowed:
            data[i] = '-1'

    groups = group_count([[data]])

    # Wrap the word list as a single session: (events, empty second element).
    data = [[data, []]]

    print(len(groups))
    

In [None]:
# Inclusive bounds on the number of subscribed events a session must have to be kept.
min_session_size = 2
max_session_size = 20

def initiail_mapping(lst, min_allowed):
    """Split every raw session into a set of subscribed and a set of unsubscribed events.

    A negative event ``-x`` is an unsubscription of ``x``; any other event is a
    subscription. If an event shows up with both signs inside one session it is
    treated as malformed: it is removed from both sets and ignored for the rest of
    that session. A session is kept only when its number of subscribed events lies
    within ``[min_session_size, max_session_size]``.

    Args:
        lst: iterable of sessions, each an iterable of integer events.
        min_allowed: accepted for signature compatibility; not used here.

    Returns:
        ``(sessions, groups)`` where ``sessions`` is a list of ``(sub, unsub)`` set
        pairs and ``groups`` is the set of all events occurring in the kept sessions.
    """
    kept_sessions = []
    seen_events = set()

    for session in lst:
        sub, unsub, malformed = set(), set(), set()

        for event in session:
            if event < 0:
                key, same_kind, other_kind = -event, unsub, sub
            else:
                key, same_kind, other_kind = event, sub, unsub

            if key in other_kind or key in malformed:
                # Seen with the opposite sign (or already flagged): drop it everywhere.
                sub.discard(key)
                unsub.discard(key)
                malformed.add(key)
            else:
                same_kind.add(key)

        if not (min_session_size <= len(sub) <= max_session_size):
            continue

        seen_events.update(sub)
        seen_events.update(unsub)
        kept_sessions.append((sub, unsub))

    return kept_sessions, seen_events
    

def set_map(lst, cnt, min_allowed):
    """Drop rare events from ``(sub, unsub)`` sessions and discard sessions that get too short.

    A subscribed event is kept when ``cnt[event] > min_allowed``; an unsubscribed
    event is kept when ``cnt.get(event, -1) > min_allowed``. A session survives when
    at least ``min_session_size`` subscribed events remain.
    NOTE(review): the comparison is strict, so an event whose count equals
    ``min_allowed`` is dropped - confirm this is intended.

    Args:
        lst: iterable of sessions; element 0 holds subscribed and element 1
            unsubscribed events.
        cnt: mapping event -> count; it must contain every subscribed event.
        min_allowed: count threshold (exclusive).

    Returns:
        ``(sessions, groups)``: the list of filtered ``(sub, unsub)`` set pairs and the
        set of all events they contain.
    """
    kept_sessions = []
    seen_events = set()

    for session in lst:
        sub = {event for event in session[0] if cnt[event] > min_allowed}
        unsub = {event for event in session[1] if cnt.get(event, -1) > min_allowed}

        if len(sub) < min_session_size:
            continue

        seen_events |= sub
        seen_events |= unsub
        kept_sessions.append((sub, unsub))

    return kept_sessions, seen_events

def drop_uncommon(raw_data, min_allowed = 10):
    """Repeatedly remove rare events until every remaining event is frequent enough.

    The raw sessions are first split by ``initiail_mapping``. While the rarest
    subscribed event occurs fewer than ``min_allowed`` times, ``set_map`` filters the
    sessions again and the counts are recomputed with ``group_count``. Progress is
    logged after every pass.

    Args:
        raw_data: iterable of raw sessions (iterables of integer events).
        min_allowed: the loop stops once the smallest event count is at least this.

    Returns:
        ``(data, groups)``: the filtered ``(sub, unsub)`` sessions and the set of events
        they contain. Both are empty when no session survives the filtering.
    """
    data, groups = initiail_mapping(raw_data, min_allowed)
    cnt = group_count(data)

    # An empty count table means no events are left, so there is nothing to filter.
    while cnt and min(cnt.values()) < min_allowed:
        data, groups = set_map(data, cnt, min_allowed)
        cnt = group_count(data)

        log.info("Length of data:   %d", len(data))
        # Number of subscribed events over all sessions. Taking len() of each
        # (sub, unsub) pair instead would always contribute exactly 2 per session.
        log.info("Total length:     %d", sum(len(session[0]) for session in data))
        log.info("Number of groups: %d", len(groups))
        log.info("Minimum count:    %d\n", min(cnt.values(), default=0))

    return data, groups

In [None]:
if (mode == 0):
    # Forces load_or_dump to recompute and overwrite data/final_data even when a cached
    # copy exists. The flag is module-level, so it stays True for later calls as well.
    ignore_dumps = True
    data, groups = load_or_dump('final_data', lambda: drop_uncommon(raw_data, 50))

    # (event, count) pairs sorted by descending count; group_count only looks at
    # element 0 of each session.
    most_common = sorted(group_count(data).items(), key=lambda x: x[1], reverse=True)

In [None]:
# Two-way lookup between the elements of `groups` and dense integer indices
# (the position of each element in the iteration order of `groups`).
i2w = dict(enumerate(groups))
w2i = {w: i for i, w in i2w.items()}

In [None]:
# Quick check of the index mapping: show the element stored at index 0.
print(i2w[0])

In [None]:
# Drop the reference to the raw sessions and ask the garbage collector to run,
# so the memory can be reclaimed before training.
raw_data = None

gc.collect()

In [None]:
# Read cursor shared by successive generate_batch calls: index of the current session
# in `data` and index of the current event inside that session.
session_dex = 0
event_dex = 0

def generate_batch(batch_size, negative_size, window_size = 1):
    """Produce the next batch of (input, context) index pairs and advance the cursor.

    Events are read from element 0 of the sessions in the module-level ``data``,
    starting at the module-level cursor ``(session_dex, event_dex)``. For each input
    event the ``window_size`` events that follow it in the session (wrapping around
    at the end of the session) are its labels. Events are translated to indices
    through ``w2i``. After the last session the cursor wraps back to the first one.

    A label that would be identical to the input is replaced by the previous label
    of the same row; the first label has no predecessor and is left unchanged.

    Args:
        batch_size: number of input events in the batch.
        negative_size: currently unused - explicit negative sampling is disabled
            (see the commented-out code) and an empty list is returned in its place.
        window_size: number of labels per input.

    Returns:
        ``(batch, labels, [])`` where ``batch`` is an int32 array of shape
        ``(batch_size,)`` and ``labels`` an int32 array of shape
        ``(batch_size, window_size)``.
    """
    assert min_session_size > 1

    global session_dex
    global event_dex

    labels = np.empty(shape=(batch_size, window_size), dtype=np.int32)
    batch = np.empty(shape=(batch_size), dtype=np.int32)
#     negative = np.ndarray(shape=(batch_size, negative_size), dtype=np.int32)

    # The cursor is module-level state and can be stale, e.g. when session_dex was
    # reset from another cell without event_dex, or when `data` was replaced.
    if (session_dex >= len(data)):
        session_dex = 0

    session = list(data[session_dex][0])

    if (event_dex >= len(session)):
        event_dex = 0

    for i in range(0, batch_size):
        batch[i] = w2i[session[event_dex]]

        for j in range(1, window_size + 1):
            labels[i][j - 1] = w2i[session[(event_dex + j) % len(session)]]

            # Only fall back to the previous label when there is one: for j == 1,
            # labels[i][j - 2] would be the not-yet-written last column of the row.
            if (j > 1 and labels[i][j - 1] == batch[i]):
                labels[i][j - 1] = labels[i][j - 2]

#         neg = 0

#         for j in data[session_dex][1]:
#             negative[i][neg] = w2i[j]
#             neg += 1
#             if (neg == negative_size):
#                 break

#         rand_neg = np.random.randint(len(groups), size=negative_size - neg)

#         for j in range(0, negative_size - neg):
#             negative[i][neg + j] = rand_neg[j]

        event_dex += 1

        if (event_dex == len(session)):
            # End of session: move on to the next one, wrapping around at the end.
            event_dex = 0
            session_dex = session_dex + 1
            if (session_dex >= len(data)):
                session_dex = 0
            session = list(data[session_dex][0])

    return batch, labels, []
#     return batch, labels, negative


In [None]:
# Restart the batch cursor at the very first event. Both halves of the cursor are
# reset: a leftover event_dex from an earlier run could point past the end of session 0.
session_dex = 0
event_dex = 0

#print(data[session_dex])
#print(data[session_dex + 1])

batch, labels, negative = generate_batch(16, 1, 1)

# Show the first ten (input -> labels) pairs translated back through i2w.
for i in range(10):
    print(i2w[batch[i]], '->', lmap(lambda x: i2w[x], labels[i]), )
#      print(i2w[batch[i]], '->', lmap(lambda x: i2w[x], labels[i]), '-> (negative)', lmap(lambda x: i2w[x], negative[i]))

#print(negative)

In [None]:
#raw_data = None
# Probe items whose nearest neighbours are printed during and after training.
# Ids that are not present in w2i are dropped here.
test_ids=lfilter(lambda x: x in w2i, [129440544, 28261334, 92876084, 51016572, 91933860])

if (mode == 1):
    # Word mode: probe with words instead of numeric ids.
    test_ids = ['term', 'first', 'used', 'early', 'against', 'working']

# Module-level settings read by tf_train: the learning rate passed to the Adagrad
# optimizer and the number of rows of the embedding / NCE weight matrices.
learning_rate = 0.1
vocabulary_size = len(groups)

# Defaults passed to tf_train by the training cell:
window_size = 4        # labels per input (also num_true of the NCE loss)
embedding_size = 48    # width of each embedding vector
num_sampled = 10       # num_sampled of the NCE loss
batch_size = 128       # inputs per training step

In [None]:
def get_closest(emb, index, f = None):
    """Print the ten rows of ``emb`` with the largest dot product against row ``index``.

    The result is printed in ascending order of the dot product, so the best match
    comes last. With row-normalised embeddings the dot product is the cosine
    similarity. The computation is done with NumPy, so no TensorFlow session is
    needed and no ops are added to any graph.

    Args:
        emb: 2-D array-like of embeddings, one row per index of ``i2w``.
        index: row whose neighbours are wanted.
        f: optional writable text file; every printed line is also written to it.

    In mode 0 the matching ids are additionally resolved through ``get_info``;
    otherwise the ids themselves are printed in the info column.
    """
    emb = np.asarray(emb)
    d = emb.dot(emb[index])

    # argsort is ascending; the slice also copes with fewer than ten rows.
    dxs = np.argsort(d)[-10:]

    ids = [i2w[dx] for dx in dxs]
    res = [d[dx] for dx in dxs]

    if (mode == 0):
        info = get_info(ids)
    else:
        info = ids

    for i in range(len(res)):
        print(ids[i], ' ', res[i], ' ', info[i])

        if (f is not None):
            f.write(str(ids[i]) + ' ' + str(res[i]) + ' ' + str(info[i]) + '\n')

In [None]:
# Number of training steps run by tf_train.
num_steps = 200000

def loss_fn(batch_size, batch_inputs, batch_labels, batch_negative, embeddings): 
    """Build a per-example loss from explicit positive and negative samples.

    For every example i the row-normalised embedding of ``batch_inputs[i]`` is
    multiplied with the normalised embedding of each entry of ``batch_labels[i]``
    (positives) and of ``batch_negative[i]`` (negatives). The example's loss is
    ``-(sum(log(sigmoid(positive))) + sum(log(sigmoid(-negative))))``.

    The examples are handled in a Python loop, so the ops below are added to the
    graph ``batch_size`` times. In the visible source the only call site (inside
    tf_train) is commented out.

    Returns:
        The per-example losses stacked into one tensor with ``batch_size`` entries.
    """
    # Row-wise L2 normalisation of the embedding matrix.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
        
    log.info("loss_fn init")
    res_lst = []

    for i in xrange(batch_size):
        inp = batch_inputs[i]
        lbl = batch_labels[i]
        ng = batch_negative[i]

        # Product of the input embedding with each label embedding.
        # The local name `m` shadows the module-level `math as m` alias inside this function.
        m = tf.map_fn(lambda k: tf.matmul(tf.gather(normalized_embeddings, [inp]), 
                                            tf.gather(normalized_embeddings, [k]), 
                                            transpose_b=True), 
                        lbl,
                        dtype=tf.float32)
        # Same product against each negative sample.
        nm = tf.map_fn(lambda n: tf.matmul(tf.gather(normalized_embeddings, [inp]), 
                                                tf.gather(normalized_embeddings, [n]), 
                                                transpose_b=True),
                            ng,
                            dtype=tf.float32)
        

        # log(sigmoid(x)) for the positives ...
        s = tf.map_fn(lambda x: tf.log(tf.math.sigmoid(x)), m) 

        # ... and log(sigmoid(-x)) for the negatives.
        ns = tf.map_fn(lambda x: tf.log(tf.math.sigmoid(-x)), nm) 

        res = -(tf.math.reduce_sum(ns) + tf.math.reduce_sum(s))

        res_lst.append(res)

    return tf.stack(res_lst)

def tf_train(window_size, embedding_size, num_sampled, batch_size):
    """Build the embedding graph, train it with NCE loss and return the embeddings.

    The graph holds one embedding matrix of shape
    ``[vocabulary_size, embedding_size]`` plus NCE weights and biases, and is trained
    for ``num_steps`` steps with Adagrad (learning rate ``learning_rate``) on batches
    from ``generate_batch``. ``vocabulary_size``, ``learning_rate``, ``num_steps`` and
    ``test_ids`` are read from module level. Loss summaries are written to the
    directory ``tmp``. The average loss is logged every 2000 steps and the nearest
    neighbours of the probe ids are printed every 10000 steps.

    Args:
        window_size: labels per input; passed to nce_loss as ``num_true``.
        embedding_size: width of the embedding vectors.
        num_sampled: number of classes sampled per batch by nce_loss.
        batch_size: inputs per training step.

    Returns:
        ``(graph, final_embeddings)``: the ``tf.Graph`` that was built and the
        row-normalised embedding matrix evaluated after the last step.
    """
    graph = tf.Graph()

    with graph.as_default():
        # Input data.
        with tf.name_scope('inputs'):
            train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
            # Only needed by the disabled custom loss; never fed at the moment.
            negative_samples = tf.placeholder(tf.int64, shape=[batch_size, num_sampled])
            train_labels = tf.placeholder(tf.int64, shape=[batch_size, window_size])

        # Ops and variables pinned to the CPU because of missing GPU implementation
        with tf.device('/cpu:0'):
            # Look up embeddings for inputs.
            with tf.name_scope('embeddings'):
                embeddings = tf.Variable(
                    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
                )

                embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        with tf.name_scope('weights'):
            nce_weights = tf.Variable(
                tf.truncated_normal([vocabulary_size, embedding_size],
                                    stddev=1.0 / math.sqrt(embedding_size)))
        with tf.name_scope('biases'):
            nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

        with tf.name_scope('loss'):
            loss = tf.reduce_mean(
#                  loss_fn(
#                      batch_size=batch_size,
#                      batch_inputs=train_inputs,
#                      batch_labels=train_labels,
#                      batch_negative=negative_samples,
#                      embeddings=embeddings))
                tf.nn.nce_loss(
                    weights=nce_weights,
                    biases=nce_biases,
                    labels=train_labels,
                    inputs=embed,
                    num_sampled=num_sampled,
                    num_classes=vocabulary_size,
                    num_true=window_size))

        # Add the loss value as a scalar to summary.
        tf.summary.scalar('loss', loss)

        # Construct the Adagrad optimizer using the module-level learning_rate.
        with tf.name_scope('optimizer'):
            optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(loss)

        # Row-normalised embeddings. The op is built once here and only evaluated
        # later; creating it inside the training loop would add new nodes to the
        # graph at every checkpoint.
        with tf.name_scope('normalized_embeddings'):
            norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
            normalized_embeddings = embeddings / norm

        # Merge all summaries.
        merged = tf.summary.merge_all()

        # Add variable initializer.
        init = tf.global_variables_initializer()

        # Create a saver.
        saver = tf.train.Saver()

    with tf.Session(graph=graph) as session:

        # Open a writer to write summaries.
        writer = tf.summary.FileWriter("tmp", session.graph)

        try:
            # We must initialize all variables before we use them.
            init.run()
            log.info('Initialized. Embedding size: %s; Num sampled: %s; Window size: %s; Batch size: %s', embedding_size, num_sampled, window_size, batch_size)
            average_loss = 0

            for step in xrange(num_steps):
                batch_inputs, batch_labels, batch_negative = generate_batch(batch_size, num_sampled, window_size)
#             feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels, negative_samples:batch_negative}
                feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

                # Define metadata variable.
                run_metadata = tf.RunMetadata()

                # We perform one update step by evaluating the optimizer op (including it
                # in the list of returned values for session.run()
                # Also, evaluate the merged op to get all summaries from the returned
                # "summary" variable. Feed metadata variable to session for visualizing
                # the graph in TensorBoard.
                _, summary, loss_val = session.run([optimizer, merged, loss],
                                                   feed_dict=feed_dict,
                                                   run_metadata=run_metadata)

                average_loss += loss_val

#             tg = tf.gradients(loss, embeddings)
#             print(session.run(tg, feed_dict))

#             print(tf.train.AdagradOptimizer(learning_rate).compute_gradients(loss))
#             print(session.run(tf.train.AdagradOptimizer(learning_rate).compute_gradients(loss), feed_dict=feed_dict))
#             print("====")
#             print(session.run(tf.train.AdagradOptimizer(learning_rate).compute_gradients(loss, var_list=embeddings), feed_dict=feed_dict))

                # Add returned summaries to writer in each step.
                writer.add_summary(summary, step)
                # Add metadata to visualize the graph for the last run.
                if step == (num_steps - 1):
                    writer.add_run_metadata(run_metadata, 'step%d' % step)

                if step % 2000 == 0:
                    if step > 0:
                        average_loss /= 2000
                    # The average loss is an estimate of the loss over the last 2000
                    # batches.
                    log.debug('Average loss at step %d: %.4f', step, average_loss)
                    average_loss = 0

                if step % 10000 == 0 and test_ids:
                    # The embeddings do not change between probes, so evaluate them once.
                    current_embeddings = normalized_embeddings.eval()

                    for test_id in test_ids:
                        get_closest(current_embeddings, w2i[test_id])
                        print("\n")

            final_embeddings = normalized_embeddings.eval()
        finally:
            # Flush pending summaries and release the event file, even on failure.
            writer.close()

    return graph, final_embeddings


In [None]:
# Train once with the module-level hyperparameters; keeps the graph and the
# row-normalised embeddings returned by tf_train for the inspection cells below.
graph, final_embeddings = tf_train(window_size, embedding_size, num_sampled, batch_size)

In [None]:
from random import randint

def print_random(graph, final_embeddings, window_size, embedding_size, num_sampled):
    """Print and save the nearest neighbours of ten events spread over the popularity ranking.

    For ``i`` in 0..9 a rank ``len(groups) * i // 10 + randint(0, 100)`` into
    ``most_common`` is drawn (clamped to the last valid rank) and the neighbours of
    that event are printed through ``get_closest``. The same lines are written to
    ``data/reports/es_<embedding_size>_ns_<num_sampled>_ws_<window_size>``.

    This is best-effort reporting: any exception (for example a missing report
    directory) is printed and swallowed so that a surrounding run can continue.

    Args:
        graph: graph used for the session that stays open while ``get_closest`` runs.
        final_embeddings: embedding matrix passed on to ``get_closest``.
        window_size, embedding_size, num_sampled: only used to build the file name.
    """
    try:
        p = 'data/reports/' + 'es_' + str(embedding_size) + '_ns_' + str(num_sampled) + '_ws_' + str(window_size)

        with open(p, 'w') as f:
            # One session for the whole report instead of a new one per sample.
            with tf.Session(graph=graph):
                for i in range(0, 10):
                    index = len(groups) * i // 10 + randint(0, 100)
                    # The random offset can run past the end of the ranking.
                    index = min(index, len(most_common) - 1)

                    f.write(str(index) + '\n')
                    print(index)

                    get_closest(final_embeddings, w2i[most_common[index][0]], f)

                    f.write('\n\n')
                    print('\n')
    except Exception as err:
        print(err)


In [None]:
# Write and print the neighbour report for the embeddings trained above.
print_random(graph, final_embeddings, window_size, embedding_size, num_sampled)

In [None]:
# Print the nearest neighbours of every probe id using the trained embeddings.
# The loop variable is deliberately not called `id`: at module level that would
# replace the built-in id() for the rest of the kernel session.
with tf.Session(graph=graph) as session:
    for test_id in test_ids:
        get_closest(final_embeddings, w2i[test_id])
        print("\n")

In [None]:
def train():
    """Train and report one model per combination of embedding size, sample count and window size.

    Every window size is paired with the batch size at the same position of
    ``batch_sizes``. After each ``tf_train`` run the neighbour report for that model
    is produced with ``print_random``.
    """
    embedding_sizes = [32, 48, 64]
    num_sampled_arr = [32, 48, 64]
    window_sizes = [1, 2, 3, 4, 5]
    batch_sizes = [128, 128, 132, 128, 130]

    for embedding_size in embedding_sizes:
        for num_sampled in num_sampled_arr:
            for window_size, batch_size in zip(window_sizes, batch_sizes):
                graph, final_embeddings = tf_train(window_size, embedding_size, num_sampled, batch_size)

                # print_random needs the trained graph and embeddings as its first two arguments.
                print_random(graph, final_embeddings, window_size, embedding_size, num_sampled)

In [None]:
# with tf.Session(graph=graph) as session:
#     tf.train.AdagradOptimizer(learning_rate).compute_gradients(loss)

In [None]:
# Collect every session that contains one of the probe ids, stored as a list with
# that id moved to the last position.
lst = []
# NOTE: this rebinds test_ids without the `in w2i` filter used where it was first defined.
test_ids=[129440544, 28261334, 92876084, 51016572, 91933860]

for i in data:
    for j in test_ids:
        if j in i[0]:
            # Copy element 0 of the session into a list and move the probe id to the end.
            f = list(i[0])
            f.remove(j)
            f.append(j)
            lst.append(f)

In [None]:
# Number of collected sessions, and the length of the first one
# (the second line raises IndexError when nothing was collected).
print(len(lst))
print(len(lst[0]))

In [None]:
import random
# Shuffle the collected sessions in place. No seed is set, so the order differs between runs.
random.shuffle(lst)

In [None]:
# Print the API info for up to 100 of the collected sessions. Slicing instead of a
# fixed index range keeps this working when fewer than 100 sessions were collected.
for session_ids in lst[:100]:
    print(get_info(session_ids))
    print()
    print("==================================================")
    print()

In [None]:
# from time import sleep

# with open('auth/token') as f:
#     token = f.readline().strip()

# def get_info(ids):
#     sleep(0.2)
#     mc = 'members_count'
#     payload = {'v': '5.92', 'access_token': token, 'fields':mc}
    
#     str_ids = functools.reduce(
#         lambda x, y: x + y,
#         lmap(lambda x: str(x) + ',', ids)
#     )
    
#     payload['group_ids'] = str_ids[0:- 1]
    
#     r = requests.get('https://api.vk.com/method/groups.getById', 
#                      params=payload)
    
#     print(r)
#     if (not 'response' in r.json()):
#         print(r.json())
        
#     res = lmap(lambda x: (x['name'], x['screen_name'], "{:,}".format(x[mc]) if mc in x else -1),r.json()['response'])
#     return(res)